In [40]:
# Lots of this code is taken from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import string
import spacy
import re

parser = spacy.load('en')

# A custom stoplist
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + \
          ["-----", "---", "...", "“", "”", "'", "'s"]

In [2]:
# Every step in a pipeline needs to be a "transformer". 
# Define a custom transformer to clean text using spaCy
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [3]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    
    # import a dictionary of English contractions from another file
    from contractions import english_contractions
    contraction_dict = english_contractions()

    # replace the contractions with their expanded form
    for contraction, expansion in contraction_dict.items():
        text = text.replace(contraction.lower(),expansion.lower())
    
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")

    # lowercase
    text = text.lower()

    return text

In [4]:
# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() 
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
    
    return tokens

In [5]:
# extract reviews from the json file

def parse(path):
    g = open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

reviews_df = getDF('data/reviews_Electronics_5_first1000.json')
reviews_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [49]:
test_asin  = reviews_df['asin'].value_counts().idxmax()
#test_asin  = '0528881469'
test_df   = reviews_df[reviews_df['asin'] == test_asin]
test_df   = test_df.replace('', np.nan)
test_df   = test_df.dropna()
test_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
19,A2IDCSC6NVONIZ,972683275,2Cents!,"[1, 1]",This mount is just what I needed. It is stron...,5.0,Perfect,1367280000,"04 30, 2013"
20,A1EDI0X3GI1SK7,972683275,AGW,"[0, 1]","Great deal, easy to mount and it appears to be...",5.0,32 In TV Mount,1323993600,"12 16, 2011"
21,A3BMUBUC1N77U8,972683275,ahoffoss,"[0, 0]",This mount works really well once you get it u...,4.0,"Pretty simple, but definitely good!",1385164800,"11 23, 2013"
22,AVRFGGCCCR6QU,972683275,"Alberto Dieguez ""premiere purchaser of random...","[3, 4]",This wall mount does everything it's supposed ...,4.0,Fairly good wall mount,1283126400,"08 30, 2010"
23,A3UOSOCRKS3WIH,972683275,Allen Coberly,"[0, 0]",for the price you just cant beat this item. I ...,5.0,Cant Beat it!,1368316800,"05 12, 2013"


In [51]:
def define_model(df):
    # define the number features, topics, and how many 
    # words/documents to display later on
    n_features      = 1000
    n_topics        = min(int(df['reviewText'].size/2),10)
    n_top_words     = 10
    n_top_documents = min(int(df['reviewText'].size/2),3)

    # Use tf-idf vectorizer
    vectorizer = TfidfVectorizer(max_features=n_features,
                                 tokenizer=tokenizeText,
                                 stop_words='english', 
                                 ngram_range=(1,2),
                                 max_df=0.9, min_df=3)

    # use NMF model with the Frobenius norm
    clf = NMF(n_components=n_topics, random_state=1, 
              solver='mu', beta_loss='frobenius')

    # put it all in a pipeline
    pipe = Pipeline([('cleanText', CleanTextTransformer()), 
                     ('vectorizer', vectorizer), 
                     ('nmf', clf)
                    ])
    
    return vectorizer, clf, pipe, n_top_words, n_top_documents

vectorizer, clf, pipe, n_top_words, n_top_documents = define_model(test_df)

In [52]:
# Fit the model
pipe.fit(test_df['reviewText']);

In [53]:
# grab term-document matrix
transform = pipe.fit_transform(test_df['reviewText'])

In [80]:
def display_topics(vectorizer, nmf, W, df, n_top_words, n_top_documents):
    ''' Print out topics discovered by a model '''
    
    feature_names = vectorizer.get_feature_names()
    analyser = SentimentIntensityAnalyzer()    
    
    for topic_id, topic in enumerate(nmf.components_):
        print('Topic #%02d: %s' % (topic_id+1, ' '.join([str(feature_names[i])
                        for i in topic.argsort()[:-n_top_words - 1:-1]])))
        
        test_words = []
        for i in range(3):
            test_words.append(feature_names[topic.argsort()[:-n_top_words - 1:-1][i]])
        
        top_doc_indices = np.argsort( W[:,topic_id] )[::-1][0:n_top_documents]
        avg_polarity = 0
        for doc_index in top_doc_indices:
            doc = TextBlob(df['reviewText'].iloc[doc_index])
            
            vader = analyser.polarity_scores(df['reviewText'].iloc[doc_index])
            avg_polarity += vader['compound']
            
            print('User %20s on %s with rating %s' % (df['reviewerName'].iloc[doc_index][:20], 
                                     df['reviewTime'].iloc[doc_index], df['overall'].iloc[doc_index]))            
            print("TextBlob: %f,  VADER: %f" % (doc.sentiment[0], vader['compound']))
            
            found, review = False, df['reviewText'].iloc[doc_index].lower()
            for test_word in test_words:
                if test_word in review:
                    found = True
            if not found:
                print("WARNING: TOPIC NOT IN DOCUMENT")
            print(doc_index)    
            print(df['reviewText'].iloc[doc_index]+"\n")
        print("Average polarity: %4.2f" % (avg_polarity / n_top_documents))
        print()

display_topics(vectorizer, clf, transform, test_df, n_top_words, n_top_documents)

Topic #01: mount tv wall tv mount mount tv arm wall mount attach screw hold
User            m m smits on 08 15, 2012 with rating 4.0
TextBlob: 0.196767,  VADER: 0.939400
141
Got this to replace a Peerless Paramount wall mount when I got my new TV.  Let me start off by saying it works as advertised.  But that won't get you 5 stars.  The problem is installation.  Maybe it's my own fault for thinking all TV wall mounts worked like the Peerless I had.  See, that one has a separate TV plate that you screw to the TV first.  Then you have the second piece (the actual wall mount) that you screw to the wall.  Once that's done, using the bracket mounted on the TV, you hang the TV on the wall mount and secure it with a single bolt.  One person can easily put a TV on the wall.Not so much with this one.  Yes, I will admit that once it's mounted, it's a more secure connection, but it's the mounting that's the problem.  You can't do it alone.  Depending on the weight of the TV, you will need at least

# Extract Relevant Sentences

In [94]:
from nltk import sent_tokenize
new_df = test_df[test_df['reviewerName'] == 'so cal'].head()
test_review = test_df['reviewText'].iloc[189]
sentences = sent_tokenize(test_review)
print(sentences)

['got this to hang a 32" sony lcd in my bedroom.', 'used my stud finder to locate the studs, drilled 6 pilot holes and hung it up.', 'and extra set of hands is helpful when hanging the mount and mounting the tv.', 'holes on mount lined up perfectly with my tv.', 'screws fit just right.no complaints.']


In [95]:
analyser = SentimentIntensityAnalyzer()
for sentence in sentences:
    snt = analyser.polarity_scores(sentence)
    print(sentence)
    print(snt)

got this to hang a 32" sony lcd in my bedroom.
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
used my stud finder to locate the studs, drilled 6 pilot holes and hung it up.
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
and extra set of hands is helpful when hanging the mount and mounting the tv.
{'neg': 0.0, 'neu': 0.833, 'pos': 0.167, 'compound': 0.4215}
holes on mount lined up perfectly with my tv.
{'neg': 0.0, 'neu': 0.656, 'pos': 0.344, 'compound': 0.6369}
screws fit just right.no complaints.
{'neg': 0.511, 'neu': 0.217, 'pos': 0.272, 'compound': -0.296}


# Testing if Documents are Mislabeled

In [46]:
def find_mislabeled(vectorizer, nmf, W, df, n_top_words, n_top_documents):
    ''' Print out topics discovered by a model '''

    n_mislabeled = 0
    feature_names = vectorizer.get_feature_names()
    
    for topic_id, topic in enumerate(nmf.components_):
        
        # grab the topic words to test
        test_words = []
        for i in range(3):
            test_words.append(feature_names[topic.argsort()[:-n_top_words - 1:-1][i]])

        # check if those words are in a review
        top_doc_indices = np.argsort( W[:,topic_id] )[::-1][0:n_top_documents]
        for doc_index in top_doc_indices:
            found, review = False, df['reviewText'].iloc[doc_index].lower()
            for test_word in test_words:
                if test_word in review:
                    found = True
            if not found:
                n_mislabeled += 1
                
    return n_mislabeled

In [32]:
# test model for many reviews
asin_list = list(reviews_df['asin'].unique())
print(len(asin_list))

# loop over asin
total_mis = 0
for asin in asin_list:
    misclassified = 0
    
    test_df   = reviews_df[reviews_df['asin'] == test_asin]
    test_df   = test_df.replace('', np.nan)
    test_df   = test_df.dropna()

    # fit model
    vectorizer, clf, pipe, n_top_words, n_top_documents = define_model(test_df)
    pipe.fit(test_df['reviewText']);
    transform = pipe.fit_transform(test_df['reviewText'])
    
    # count how many reviews are mis-classified
    misclassified += find_mislabeled(vectorizer, clf, transform, 
                                     test_df, n_top_words, n_top_documents)

    print(asin, misclassified)
#    total_mis += misclassified
print("Total Mislabeled: "+total_mis)

40
0528881469 0
0594451647 0
0594481813 0
0972683275 0
1400501466 0
1400501520 0
1400501776 0
1400532620 0
1400532655 0
140053271X 0
1400532736 0
1400599997 0
1400698987 0
1400699169 0
1615527613 0
3744295508 0
3930992868 0
3936710058 0
6301977173 0
7214047977 0
7507825604 0
7799813393 0
8862935293 0
8862936826 0
8918010656 0
9043413585 0
9573212900 0
9573212919 0
9575871979 0
9625993428 0
9862510447 0
9876050621 0
9888002198 0
9966338926 0
9966541551 0
9966569863 0
9966694544 0
9981739588 0
9983891204 0
9983891212 0
0
