<a href="https://colab.research.google.com/github/crfernando/sumryviu/blob/dev/sumryvu_v1_bert_vader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
# drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [20]:
!pip install symspellpy
!pip install sentence_transformers
!pip install hdbscan
!pip install vaderSentiment
!pip install umap-learn

Collecting umap-learn
[?25l  Downloading https://files.pythonhosted.org/packages/75/69/85e7f950bb75792ad5d666d86c5f3e62eedbb942848e7e3126513af9999c/umap-learn-0.5.1.tar.gz (80kB)
[K     |████████████████████████████████| 81kB 3.4MB/s 
Collecting pynndescent>=0.5
[?25l  Downloading https://files.pythonhosted.org/packages/af/65/8189298dd3a05bbad716ee8e249764ff8800e365d8dc652ad2192ca01b4a/pynndescent-0.5.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.2MB 6.7MB/s 
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.1-cp37-none-any.whl size=76569 sha256=5e821f3158eef270bd6003d09f83a8294699ec33e1b9df6c2c4676191af81358
  Stored in directory: /root/.cache/pip/wheels/ad/df/d5/a3691296ff779f25cd1cf415a3af954b987fb53111e3392cf4
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.

In [1]:
import os
import pandas as pd; pd.set_option('display.max_colwidth', None)
import re
import json
import gensim
from gensim.parsing.preprocessing import strip_tags, strip_non_alphanum, strip_multiple_whitespaces, strip_short, remove_stopwords, split_alphanum, strip_numeric
import pkg_resources
from symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split
import spacy; nlp = spacy.load('en', disable=['parser', 'ner'])

#### Data loading and Preprocessing

In [2]:
def load_data(file_path):
    raw_data = pd.read_csv(file_path)
    raw_data.columns = map(str.lower, raw_data.columns)
    raw_data = raw_data.dropna(subset=['body']) # drop empty or null rows
    raw_data = raw_data.drop_duplicates('body') # drop duplicate rows
    raw_data = raw_data.groupby('asin').filter(lambda x:len(x) > 780) # get items length over n amount
    raw_data = raw_data.query('verified == True') # consider ony verified reviews
    raw_data = raw_data[raw_data['body'].str.split().str.len() > 25] # remove review length less than 5
    stage_data = raw_data[{'asin', 'body'}].rename(columns={'asin': 'item_id', 'body': 'review_text'}, errors='raise').copy().reset_index(drop=True)
    
    return stage_data.query("item_id == 'B00F2SKPIM'") # <-- THIS FILTER HAS TO BE REMOVED...

In [3]:
def expand_contractions(text, pattern, contraction_map):
    '''
    this function will expands the contraction of provided text by matching the pattern given
        text - sentence, phrase or word for expansion
        patter - regex pattern
        contraction_map - contraction mapping dictionary
    '''
    def replace(match):
        return contraction_map[match.group(0)]
    return pattern.sub(replace, text)

def sentence_preprocess(text):
    ''' 
    this function does simple text pre-processing such as, 
        - remove html tags
        - remove non-alphabetic 
        - remove punctuation
        - lowercase
    '''
    step_process_text = strip_tags(str(text))
#     step_process_text = split_alphanum(step_process_text)
    step_process_text = strip_numeric(step_process_text)
    step_process_text = strip_non_alphanum(step_process_text)
    step_process_text = strip_multiple_whitespaces(step_process_text)
    step_process_text = strip_short(step_process_text, minsize=2)
#     step_process_text = remove_stopwords(step_process_text)
    processed_text = step_process_text.strip()
    
    return processed_text

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def tokeninze(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.pos_ in allowed_postags]

    for idx, token in enumerate(tokens):
        suggestion = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=3)
        if suggestion:
            tokens[idx] = suggestion[0].term
    
    return tokens
 

In [4]:
work_dir = "/content/drive/MyDrive/sumryviu/"
data_file = os.path.join(work_dir, "data/amazon-cell-phone-reviews.csv")
contractions_file = os.path.join(work_dir, "data/contractions.json")

with open(contractions_file) as file:
    contraction_dict = {key.lower(): value.lower() for key, value in json.load(file).items()}

re_pattern = re.compile('({})'.format('|'.join(contraction_dict.keys())), flags=re.IGNORECASE)

reviews_data = load_data(data_file)
reviews_data.index.names = ['row_id']
reviews_data = reviews_data.reset_index()
reviews_data['processed_review'] = reviews_data['review_text'].str.lower().apply(lambda row: expand_contractions(row, re_pattern, contraction_dict)) # expand contraction
reviews_data['processed_review'] = reviews_data['processed_review'].apply(sentence_preprocess)
# reviews_data['processed_tokens'] = reviews_data['processed_review'].apply(lambda row: tokeninze(row, allowed_postags=['NOUN', 'VERB', 'ADV']))

In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    polarity_scores = analyser.polarity_scores(sentence)
    compound_score = polarity_scores['compound']

    return compound_score

In [6]:
reviews_data['compound_score'] = reviews_data['processed_review'].apply(sentiment_analyzer_scores)
reviews_data['polarity'] = reviews_data['compound_score'].apply(lambda s: "Positive" if s > 0 else "Negative")

In [7]:
train_review_set, test_review_set = train_test_split(reviews_data, test_size=0.2)
data = train_review_set.processed_review.to_numpy()
row_id = train_review_set.row_id.to_numpy()

In [8]:
from sentence_transformers import SentenceTransformer
import umap
import hdbscan

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') #distilbert-base-nli-mean-tokens
embeddings = model.encode(data, show_progress_bar=True)
reducer = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)
cluster = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom').fit(reducer)
# cluster.labels_

HBox(children=(FloatProgress(value=0.0, description='Batches', max=11.0, style=ProgressStyle(description_width…




In [9]:
docs_df = pd.DataFrame(list(zip(row_id, data)), columns=["row_id", "text"])
docs_df['label'] = cluster.labels_
docs_df.head()

Unnamed: 0,row_id,text,label
0,547,love this phone however the sim card received did not work so had to go to verizon store to get new sim card,1
1,423,do not use cell phone lot but love the functionality of this phone it is more like mini tablet the pen that comes with it is very handy and activates the phone when you pull it out it is also great for more precise selections and for note taking am very happy with the resolution of the screen and the response of both touch and pen buying the phone from amazon and setting up new verizon account was painless easily got the account and on line payment set up without hitch also transferred my at phone number to the phone without any problems will not reiterate the tech specs of the phone as you can read all of that on the product page just want to let you know what think of it as user of the phone at this point in time could not be happier with this phone or the verizon service update have had this phone for little over years and still have no desire to get new phone did however recently decide to switch my service provider from verizon to mobile because of the offerings of mobile which will not get into was assured by the mobile rep that as long as the phone was unlocked could replace the verizon sim card with the one from mobile and the phone would work with no problems after activating the mobile sim card am updating this review so that others do not experience what did if they want to change their provider first of all verizon has never locked any of their phones verizon uses cdma network which is also used by sprint and us cellular mobile and at use gsm network one difference is the polarity used so phone for each type of network are manufactured to use the specific polarity of the network the other difference that should be considered by the end user is that on the gsm network customer information is stored on the sim card this means that you can take the sim card from one phone and insert it into another and it will work gsm carriers must accept any gsm phone so they do not have control over the phone you are using cdma carriers use network based white lists so you can only switch phones with your carrier permission and the carrier does not have to accept any particular phone there are other differences between the two network types but those have mentioned affect the end user the most in my case had to buy new phone in order to switch to mobile as carrier was also told by verizon that this phone may work with another cdma carrier but it my lose functionality have had to buy another phone to use the gsm network and have decided to get the galaxy note since have to give up my note hope this helps other in choosing carrier and phone,0
2,517,bought it as gift and she loves it large screen fast gets updates and since she plays lot of games on it it is exactly what she needs,0
3,626,when bought work fine but days stopped working you are going to waste your money the only good thing is that amazon helped me get my money back,-1
4,794,phone arrived in time instructions to activate was very simple love my note sometimes have had issues with phone call quality but otherwise the phone is great buy love samsung products,0


In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(2, 2)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

docs_per_topic = docs_df.groupby(['label'], as_index = False).agg({'text': ' '.join})
tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(data))

In [11]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.label)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['label'])
                     .text
                     .count()
                     .reset_index()
                     .rename({"label": "Topic", "text": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes    

In [12]:
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

Unnamed: 0,Topic,Size
0,-1,153
1,0,123
2,1,56


In [14]:
for i in top_n_words.values():
  print(i)

[('love phone', 0.00908459117751686),
 ('battery life', 0.008811919877942697),
 ('great phone', 0.007432799122504856),
 ('samsung galaxy', 0.006754165247317133),
 ('best phone', 0.0061062139052438635)]

In [15]:
# from sklearn.metrics.pairwise import cosine_similarity

# for i in range(20):
#     # Calculate cosine similarity
#     similarities = cosine_similarity(tf_idf.T)
#     np.fill_diagonal(similarities, 0)

#     # Extract label to merge into and from where
#     topic_sizes = docs_df.groupby(['label']).count().sort_values("text", ascending=False).reset_index()
#     topic_to_merge = topic_sizes.iloc[-1].label
#     topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

#     # Adjust topics
#     docs_df.loc[docs_df.label == topic_to_merge, "Topic"] = topic_to_merge_into
#     old_topics = docs_df.sort_values("Topic").Topic.unique()
#     map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
#     docs_df.Topic = docs_df.Topic.map(map_topics)
#     docs_per_topic = docs_df.groupby(['label'], as_index = False).agg({'text': ' '.join})

#     # Calculate new topic words
#     m = len(data)
#     tf_idf, count = c_tf_idf(docs_per_topic.text.values, m)
#     top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

# topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

In [17]:
#  sentences = docs_df.head()

In [115]:
# sentences = train_review_set.sample(3).processed_review.values.tolist()
sentences_dict = train_review_set.sample(3).set_index('row_id')['processed_review'].to_dict()
sentences_dict

{487: 'without getting all technical if you do not need the latest greatest edge note this thing is powerhouse unto itself am very happy with my decision and for the price with yr contract how could not be',
 625: 'twas the day before christmas sat by my tree and opened the amazon box that contained my note the box it was beautiful the phone even more followed the guide to the letter and was online by four downloaded my apps from the great google play christmas had come early on that wonderful day there is so much to learn about this beast of mine will write more about it some other time for now it is just calls and music and text will have to say it is just as good sex',
 674: 'works perfectly really clear screen best samsung model phone ever only problem the replacement bezel used in this refurbishment is even cheaper than original samsung one silver paint chips off even when in protective case'}

In [119]:
# doc = nlp(str(sentences))
# tokens = [token.lemma_.strip() for token in doc if not token.is_stop]

# def normalize(comment, lowercase, remove_stopwords):
#   if lowercase:
#       comment = comment.lower()
#   comment = nlp(comment)
#   lemmatized = []
#   for word in comment:
#       lemma = word.lemma_.strip()
#       if lemma:
#           if not remove_stopwords or (remove_stopwords and lemma not in stops):
#               lemmatized.append(lemma)
#   return " ".join(lemmatized)

token_list = []
for sentence in sentences_dict.values():
  doc = nlp(sentence)
  tokens = [token.lemma_.strip() for token in doc if not token.is_stop]
  token_list.append(tokens)

from itertools import chain
corpus = sorted(set(chain(*token_list)))
# corpus

# keyword_vect = [1 if token in ("battery", "life") else 0 for token in corpus]

# print(keyword_vect)

In [125]:
# for i in top_n_words.values():
#   for word, *args in i:
#     print(word)

from itertools import chain
# corpus = sorted(set(chain(*token_list)))

topic_vect_dict = {}
for topic, topic_words in top_n_words.items():
  aspects = []
  for keywords, score in topic_words:
    aspects.append(keywords.split())
    aspects_ls = sorted(set(chain(*aspects)))
  topic_vect = [1 if token in aspects_ls else 0 for token in corpus]
  topic_vect_dict[topic] = topic_vect
print(topic_vect_dict)

{-1: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]}


In [67]:
# text = 'bought it as gift and she loves it large screen fast gets updates and since she plays lot of games on it it is exactly what she needs'
# text_vect = [1 if token in text.split() else 0 for token in corpus]

# print(text_vect)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
sentence_vect_dict = {}
for id, sentence in sentences_dict.items():
  text_vect = [1 if token in sentence.split() else 0 for token in corpus]
  sentence_vect_dict[id] = text_vect

In [126]:
cos_sim = lambda x, y: dot(x,y)/(norm(x)*norm(y))
for r_id, s_vec in sentence_vect_dict.items():
  print(r_id)
  for t_id, t_vec in topic_vect_dict.items():
    print('\t', t_id, cos_sim(s_vec, t_vec))

674
	 -1 0.1951800145897066
	 0 0.3464101615137754
	 1 0.14907119849998599
625
	 -1 0.29649972666444047
	 0 0.2631174057921088
	 1 0.22645540682891918
487
	 -1 0.3273268353539886
	 0 0.12909944487358055
	 1 0.0


In [80]:
from numpy import dot
from numpy.linalg import norm

# cos_sim = lambda x, y: dot(x,y)/(norm(x)*norm(y))
cos_sim(text_vect, keyword_vect)
cos_sim=np.dot(text_vect,keyword_vect)/(np.linalg.norm(text_vect)*np.linalg.norm(keyword_vect))

  after removing the cwd from sys.path.


nan

In [74]:
np.isfinite(keyword_vect).all()

True

In [75]:
np.isfinite(text_vect).all()

True

In [128]:
docs_df.head()

Unnamed: 0,row_id,text,label
0,547,love this phone however the sim card received did not work so had to go to verizon store to get new sim card,1
1,423,do not use cell phone lot but love the functionality of this phone it is more like mini tablet the pen that comes with it is very handy and activates the phone when you pull it out it is also great for more precise selections and for note taking am very happy with the resolution of the screen and the response of both touch and pen buying the phone from amazon and setting up new verizon account was painless easily got the account and on line payment set up without hitch also transferred my at phone number to the phone without any problems will not reiterate the tech specs of the phone as you can read all of that on the product page just want to let you know what think of it as user of the phone at this point in time could not be happier with this phone or the verizon service update have had this phone for little over years and still have no desire to get new phone did however recently decide to switch my service provider from verizon to mobile because of the offerings of mobile which will not get into was assured by the mobile rep that as long as the phone was unlocked could replace the verizon sim card with the one from mobile and the phone would work with no problems after activating the mobile sim card am updating this review so that others do not experience what did if they want to change their provider first of all verizon has never locked any of their phones verizon uses cdma network which is also used by sprint and us cellular mobile and at use gsm network one difference is the polarity used so phone for each type of network are manufactured to use the specific polarity of the network the other difference that should be considered by the end user is that on the gsm network customer information is stored on the sim card this means that you can take the sim card from one phone and insert it into another and it will work gsm carriers must accept any gsm phone so they do not have control over the phone you are using cdma carriers use network based white lists so you can only switch phones with your carrier permission and the carrier does not have to accept any particular phone there are other differences between the two network types but those have mentioned affect the end user the most in my case had to buy new phone in order to switch to mobile as carrier was also told by verizon that this phone may work with another cdma carrier but it my lose functionality have had to buy another phone to use the gsm network and have decided to get the galaxy note since have to give up my note hope this helps other in choosing carrier and phone,0
2,517,bought it as gift and she loves it large screen fast gets updates and since she plays lot of games on it it is exactly what she needs,0
3,626,when bought work fine but days stopped working you are going to waste your money the only good thing is that amazon helped me get my money back,-1
4,794,phone arrived in time instructions to activate was very simple love my note sometimes have had issues with phone call quality but otherwise the phone is great buy love samsung products,0


In [130]:
def topic_sentence_similarity_score(sentence, topic, corpus):
  t_vec = topic_vect_dict[topic]
  s_vec = [1 if token in sentence.split() else 0 for token in corpus]
  return cos_sim(s_vec, t_vec)

In [132]:
docs_df['score'] = docs_df[['text', 'label']].apply(lambda row: topic_sentence_similarity_score(row['text'], row['label'], corpus), axis=1)

  """Entry point for launching an IPython kernel.


In [134]:
docs_df.sample(10)

# row_id (721)

Unnamed: 0,row_id,text,label,score
195,721,concerned the product was sold as new and believe it is refurbished only had couple months and already having issues son mad at me yes he is an it and works on these he has note feels it was misrepresented,1,0.0
239,499,went through lot of extra trouble to get refurbished new note huge fan of the phone but am giving because for refurbished phone it will not even read micro sd card at all its missing pin connecter,1,0.333333
48,658,one of the best android phones out there it only takes about half hour to charge and takes almost two days to be completely dead atleast that is how it is for me have had this phone for about four years jumping from phone service carrier to other phone service carriers and still enjoy it even though had repurchased it again when switching to different phone service provider because loved it so much if your looking for an electronic easy access hand notebook with efficient camera and strong battery this is great phone choice for you,-1,0.534522
121,766,this phone is great just matter of getting used to its capabilities overall no comparison to other phones in the market even the iphones the best phone out there period,-1,0.534522
119,449,thought this phone would be much faster but its no faster than my old phone the picture quality is ok just disappointed over all plus the seller sent me the wrong charger so had to go out and purchase one not satisfied customer,1,0.57735
225,717,very good looking and solid phone almost like new no issues in the first months good price like it even more than the note that it replaced the screen is slightly bigger than the note screen,-1,0.845154
86,647,decided to wait utnil reviewed this phone we ordered four of these for our family and we have had them for five months now one of the phone had to be replaced after we got it because it would not hold charge so we got replacement for that one now after five months all four phones are having charging problems my wife phone will not charge anymore no matter what cord we use all other phones you have to jiggle the cord and have it placed in certain way to charge and then it takes way too long to charge this is an obvious manufacture defect one phone can understand but four phones or actually five if you count the one that was replaced that is unnaceptable and poor quality control and design on samsung part this will be the last samsung phone we buy it is unfortunate as the phone has great features but it does no good if you can not charge the phones properly so beware you could have battery charging problems after few months,-1,0.507093
325,443,top dog imo best phone out there great for multitasking excellent screen fast lightweight camera takes great pictures in good lighting pen works great speaker output nice and crisp battery life really good if you turn off some un needed functions like gestures turn off gps when your not needing it turn off auto sync and set screen time out to sec been getting easy days without charging with light usage like streaming youtube web browsing and some email call quality very good can hear callers loud and sharp on verizon network that is blazing fast on verizon wifi antenna can be little stronger overall great phone by samsung,0,0.676123
212,788,love my gn the screen is huge and beautiful so clear have only had it couple of weeks so still have much to learn no regrets,0,0.223607
218,775,samsung is the best note is the best its not too big get an otterbox to protect sign up for class to learn all about it,0,0.447214
