# Import libraries

In [176]:
import string
import re
import nltk
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    # If not present, download NLTK stopwords.
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

from nltk.corpus import stopwords as nltk_en_stopwords
nltk_stopwords = set(nltk_en_stopwords.words("english"))

In [47]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Helper functions

In [203]:
def remove_punctuations(s):
    """
    Replace all punctuations with whitespace
    Punctuations are exhaustive from string.punctuation library:
    !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    """
    for character in string.punctuation:
        s = s.replace(character, ' ')
    return s


def replace_digits(s):
    """
    Replaces all digits with whitespace.
    """
    return re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", s)

def remove_urls(s):
    """
    Replaces all URLs with whitespace
    URLs: All text that starts with:
    - http://
    - https://
    """
    return re.sub('http[s]?://\S+', '', s)

def remove_hashtags(s):
    """
    Replaces all hashtags with whitespace
    Hashtags are any words that starts with '#'
    """
    return re.sub("#[A-Za-z0-9_]+"," ", s)

def standardize_lowercase(s):
    return s.lower()

def remove_stopwords(s, sw=nltk_stopwords):
    """
    Replaces all NLTK stopwords with whitespace 
    """
    pattern = re.compile(r'\b(' + r'|'.join(set(sw)) + r')\b\s*')
    s = pattern.sub(' ', s)
    return s

def remove_excess_whitespace(s):
    """
    Ensure multiple whitespaces are converted to just one whitespace for text consistency
    """
    return " ".join(s.split()).strip()

def get_default_pipeline():
    return [remove_urls,
        remove_punctuations,
           replace_digits,
           remove_hashtags,
           standardize_lowercase,
           remove_stopwords,
           remove_excess_whitespace]

def overall_clean(df, pipeline=None):
    if not pipeline: # May allow future customization
        pipeline = get_default_pipeline()
    for task in pipeline:
        df["text"] = df["text"].apply(task)
    return df

In [218]:
df = pd.read_csv("../dataset/amazon_product.csv", header=None, encoding="unicode-escape").iloc[1:,:].rename(columns={0:"text"})
df

Unnamed: 0,text
1,I initially had trouble deciding between the p...
2,Allow me to preface this with a little history...
3,I am enjoying it so far. Great for reading. Ha...
4,I bought one of the first Paperwhites and have...
5,I have to say upfront - I don't like coroporat...
...,...
1593,This is not the same remote that I got for my ...
1594,I have had to change the batteries in this rem...
1595,"Remote did not activate, nor did it connect to..."
1596,It does the job but is super over priced. I fe...


In [219]:
%%time
df = overall_clean(df)
df

CPU times: total: 891 ms
Wall time: 868 ms


Unnamed: 0,text
1,initially trouble deciding paperwhite voyage r...
2,allow preface little history casual reader own...
3,enjoying far great reading original fire since...
4,bought one first paperwhites pleased constant ...
5,say upfront like coroporate hermetically close...
...,...
1593,remote got alexa echo control volume think rem...
1594,change batteries remote twice per month since ...
1595,remote activate connect box poorly designed re...
1596,job super priced feel like offer replacement r...


In [220]:
def remove_punctuation(text):
    for tok in string.punctuation:
        text = text.replace(tok, " ")
    return text

def remove_digits(text):
    for tok in "1234567890":
        text = text.replace(tok, " ")
    return text

def remove_whitespace(text):
    return " ".join(text.split())

def remove_stopwords(text):
    text_tokens = word_tokenize(text)
    filtered_sentence = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_sentence)

In [221]:
remove_pun

NameError: name 'remove_pun' is not defined

In [222]:
text_for_contextual = clean_for_contextual(df)[df.columns[0]].to_list()
text_for_bow = clean_for_bow(df)[df.columns[0]].to_list()

NameError: name 'clean_for_contextual' is not defined

# Topic modelling - GSDMM
1) Explore means of short text TM
2) Get overall topic docs and distribution
3) Get each text topic

In [223]:
docs = df.to_numpy()
docs

array([['initially trouble deciding paperwhite voyage reviews less said thing paperwhite great spending money go voyage fortunately friends owned ended buying paperwhite basis models ppi dollar jump turns pricey voyage page press always sensitive fine specific setting need auto light adjustment week loving paperwhite regrets touch screen receptive easy use keep light specific setting regardless time day case hard change setting either changing light level certain time day every reading also glad went international shipping option amazon extra expense delivery time tracking didnt need worry customs may used third party shipping service'],
       ['allow preface little history casual reader owned nook simple touch read harry potter series girl dragon tattoo series brave new world key titles fair say nook get much use many others may gotten fast forward today full week new kindle paperwhite admit love kindle reading let relate review love reading back kindle investment 00 experience recei

In [224]:
from gsdmm import MovieGroupProcess

# GSDMM only requires num of unique words and numpy array of text series
docs = df.to_numpy()
vocab = set(" ".join(df.text.to_list()).split())

# Declare GSDMM object where K: number of topcs
gsdmm = MovieGroupProcess(n_iters=50)

# Fit GSDMM model
y = gsdmm.fit(docs, len(vocab))

In stage 0: transferred 1368 clusters with 8 clusters populated
In stage 1: transferred 1289 clusters with 8 clusters populated
In stage 2: transferred 1215 clusters with 8 clusters populated
In stage 3: transferred 1133 clusters with 8 clusters populated
In stage 4: transferred 1000 clusters with 8 clusters populated
In stage 5: transferred 962 clusters with 8 clusters populated
In stage 6: transferred 845 clusters with 8 clusters populated
In stage 7: transferred 874 clusters with 8 clusters populated
In stage 8: transferred 860 clusters with 8 clusters populated
In stage 9: transferred 848 clusters with 8 clusters populated
In stage 10: transferred 843 clusters with 8 clusters populated
In stage 11: transferred 844 clusters with 8 clusters populated
In stage 12: transferred 830 clusters with 8 clusters populated
In stage 13: transferred 854 clusters with 8 clusters populated
In stage 14: transferred 827 clusters with 8 clusters populated
In stage 15: transferred 789 clusters with 8 

In [225]:
# Assigning cluster
df["cluster"] = y
df

Unnamed: 0,text,cluster
1,initially trouble deciding paperwhite voyage r...,6
2,allow preface little history casual reader own...,6
3,enjoying far great reading original fire since...,6
4,bought one first paperwhites pleased constant ...,5
5,say upfront like coroporate hermetically close...,6
...,...,...
1593,remote got alexa echo control volume think rem...,5
1594,change batteries remote twice per month since ...,6
1595,remote activate connect box poorly designed re...,5
1596,job super priced feel like offer replacement r...,3


In [226]:
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Number of documents per topic : [122  32  25  97   0 316 886 119]
Most important clusters (by number of docs inside): [6 5 0 7 3 1 2 4]


In [227]:
# Get c-TF-IDF

# Treat all documents in one cluster as one text
class_text = []
for i in range(len(doc_count)):
    class_text.append(" ".join(df[df["cluster"]==i]["text"].to_list()))

# Use sklearn's TF-IDF wrapper
tfidf_vectorizer = TfidfVectorizer()
c_tfidf = tfidf_vectorizer.fit_transform(raw_documents=class_text)

# Get highest weighted word per cluster
index_value={i[1]:i[0] for i in tfidf_vectorizer.vocabulary_.items()}
weighted_words = {}
for cluster ,row in enumerate(c_tfidf):
    words_weight_dict = {index_value[column]:value for (column,value) in zip(row.indices,row.data)}
    words_weight_dict = {k: v for k, v in sorted(words_weight_dict.items(), key=lambda item: item[1], reverse=True)}
    top10words = {k: words_weight_dict[k] for k in list(words_weight_dict)[:10]}
    weighted_words[cluster] = top10words

In [228]:
weighted_words

{0: {'amazon': 0.2741696389680442,
  'roku': 0.20461479797870435,
  'tv': 0.19705942800828177,
  'device': 0.19134756052978083,
  'box': 0.19032092987446605,
  'like': 0.17706789183352856,
  'fire': 0.17135602435502761,
  'kindle': 0.15993228939802578,
  'use': 0.1513644881802744,
  'great': 0.14565262070177348},
 1: {'kindle': 0.21472682023073322,
  'great': 0.21472682023073322,
  'echo': 0.20279755244013695,
  'remote': 0.20031523688898514,
  'amazon': 0.19086828464954064,
  'prime': 0.19086828464954064,
  'use': 0.17893901685894434,
  'voice': 0.16700974906834806,
  'one': 0.14315121348715548,
  'button': 0.13122194569655918},
 2: {'fire': 0.402928912262418,
  'hd': 0.22774242867006236,
  'model': 0.201464456131209,
  'year': 0.19270513195159122,
  'amazon': 0.19270513195159122,
  'one': 0.1664271594127379,
  'great': 0.1489085110535023,
  'kindle': 0.14014918687388453,
  'screen': 0.12263053851464895,
  'well': 0.12263053851464895},
 3: {'fire': 0.3245397969338852,
  'kindle': 0.26

# Topic Modelling - NMF

# Sentiment analysis
1) Get sentiment distribution
2) Get sentiment over topics
3) Visualization

In [33]:
df

Unnamed: 0,0
1,This tremendous 100% varietal wine hails from ...
2,"Ripe aromas of fig, blackberry and cassis are ..."
3,Mac Watson honors the memory of a wine once ma...
4,"This spent 20 months in 30% new French oak, an..."
5,"This is the top wine from La Bégude, named aft..."
...,...
4996,"This easy-drinking, value-minded wine offers n..."
4997,"Peach and powdered sugar aromas are first up, ..."
4998,Simple cherry and plum aromas lead to a modest...
4999,This 100% varietal wine is lightly aromatic wi...


# Aspect Based Sentiment Analysis
1) Get visualization

# Keyphrase Extraction
1) Per sentiment
2) Per cluster

# Most representative text
1. Textrank -> per sentiment