In [16]:
# Lots of this code is taken from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk import sent_tokenize
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import string
import spacy

parser = spacy.load('en')

# A custom stoplist
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + \
          ["-----", "---", "...", "“", "”", "'", "'s"]

In [17]:
# Every step in a pipeline needs to be a "transformer". 
# Define a custom transformer to clean text using spaCy
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [18]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    
    # import a dictionary of English contractions from another file
    from contractions import english_contractions
    contraction_dict = english_contractions()

    # replace the contractions with their expanded form
    for contraction, expansion in contraction_dict.items():
        text = text.replace(contraction.lower(),expansion.lower())
    
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")

    # lowercase
    text = text.lower()

    return text

In [19]:
# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() 
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
    
    return tokens

In [20]:
# extract reviews from the json file

def parse(path):
    g = open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

reviews_df = getDF('data/reviews_Electronics_5_first1000.json')
reviews_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [30]:
test_asin  = reviews_df['asin'].value_counts().idxmax()
#test_asin  = '0528881469'
test_asin  = '9983891204'
test_df   = reviews_df[reviews_df['asin'] == test_asin]
test_df   = test_df.replace('', np.nan)
test_df   = test_df.dropna()
test_df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
900,A2NEASNMHL1A4O,9983891204,"Amazon Customer ""Raja""","[1, 1]",Prompt shipping. On time delivery. Excellent q...,5.0,Good cable,1346630400,"09 3, 2012"
901,AB5S5DS16EVMJ,9983891204,apg231,"[0, 0]",What pay $50 for a Canon brand cable when you ...,5.0,Premium 10 feet Canon HTC-100 Upgrade Replacem...,1366761600,"04 24, 2013"
902,A3OJNGNBWD7KNN,9983891204,Be the Plunk,"[1, 1]",this cable lets me show pictures and videos fr...,5.0,Works as it should,1360454400,"02 10, 2013"
903,A2BX0E26599UPQ,9983891204,Birdieman,"[0, 0]",We use this cable to hook our camera to the RV...,5.0,Hook camera to RV TV.,1385942400,"12 2, 2013"
904,A5U3O5KSTUOY0,9983891204,Charles E McAllister,"[0, 0]",Can view my pictures right from my Cannon 4Ti ...,5.0,Works Great,1367798400,"05 6, 2013"
905,A1TND9THMFVTQG,9983891204,"Denver Consumer ""bluetinkerbell""","[0, 1]",And I like the length....makes it much easier ...,4.0,Not yet tried but seems substantial,1367884800,"05 7, 2013"
906,A18A4AMMJ5KV7T,9983891204,D. Fournier,"[1, 1]",This HDMI cable is perfect for sending image a...,5.0,ideal,1318723200,"10 16, 2011"
907,A39JL3OBJIBIX7,9983891204,Don L. Binder,"[0, 0]",Works flawlessly to connect my new Canon camer...,5.0,Works like a charm!,1388707200,"01 3, 2014"
908,A1LY7Z60TNV629,9983891204,John Coleman,"[1, 1]",Works just like my shorter cables only with mo...,5.0,Cable is fine,1358726400,"01 21, 2013"
909,A3VBORHS4H8LES,9983891204,"John E. Wente ""okiebikinguy""","[0, 0]",I bought this to use with my Canon G10 camera....,5.0,Does the job - no reason to pay more,1376179200,"08 11, 2013"


In [36]:
for review in test_df[test_df['reviewerName']=='Mr Satisfied']['reviewText']:
    print(review)

The cable is doing just what it is suppose to do. The connectors fit snuggly into the HDMI input on the TV and the output of my Canon 5D, Mark II. Saved a lot of money vs. the cable offered by Canon.


In [23]:
def define_model(df):
    # define the number features, topics, and how many 
    # words/documents to display later on
    n_features      = 1000
    n_topics        = min(int(df['reviewText'].size/2),10)
    n_top_words     = 3
    n_top_documents = min(int(df['reviewText'].size/2),3)

    # Use tf-idf vectorizer
    vectorizer = TfidfVectorizer(max_features=n_features,
                                 tokenizer=tokenizeText,
                                 stop_words='english', 
                                 ngram_range=(1,2),
                                 max_df=0.9, min_df=3)

    # use NMF model with the Frobenius norm
    clf = NMF(n_components=n_topics, random_state=1, 
              solver='mu', beta_loss='frobenius')

    # put it all in a pipeline
    pipe = Pipeline([('cleanText', CleanTextTransformer()), 
                     ('vectorizer', vectorizer), 
                     ('nmf', clf)
                    ])
    
    return vectorizer, clf, pipe, n_top_words, n_top_documents

vectorizer, clf, pipe, n_top_words, n_top_documents = define_model(test_df)

In [24]:
# Fit the model
pipe.fit(test_df['reviewText']);

In [25]:
# grab term-document matrix
transform = pipe.fit_transform(test_df['reviewText'])

In [26]:
def display_topics(vectorizer, nmf, W, df, n_top_words, n_top_documents):
    ''' Print out topics discovered by a model '''
    
    # get list of feature names
    feature_names = vectorizer.get_feature_names()
    
    # get VADER sentiment analyzer
    analyser      = SentimentIntensityAnalyzer()    
    
    # list of topics and reviews to return
    topics, reviews = [], []
    
    # loop over all the topics
    for topic_id, topic in enumerate(nmf.components_):
        
        # grab the list of words describing the topic
        word_list = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            word_list.append(feature_names[i])
        
        # split words in case there are some bigrams and get unique set
        split_list = []
        for word in word_list:
            for split in word.split():
                split_list.append(split)
        topic_words = list(set(split_list))
        
        # append topic words as a single string
        topics.append(' '.join([word for word in topic_words]))
        
        # print topic number and topic words
        print('Topic #%02d: %s' % (topic_id+1, ' '.join([word for word in topic_words])))

        # loop over reviews for each topic
        top_doc_indices = np.argsort( W[:,topic_id] )[::-1][0:n_top_documents]
        for doc_index in top_doc_indices:

            # check that the review contains one of the topic words
            if any(word in df['reviewText'].iloc[doc_index].lower() for word in topic_words):
                
                # seniment analysis
                vader = analyser.polarity_scores(df['reviewText'].iloc[doc_index])
                
                # append current review to the list 
                reviews.append(df.iloc[doc_index].to_dict())
                reviews[-1]['topic']     = topic_id
                reviews[-1]['sentiment'] = vader['compound']
                
                print('User %20s on %s with rating %s' % (df['reviewerName'].iloc[doc_index][:20], 
                                     df['reviewTime'].iloc[doc_index], df['overall'].iloc[doc_index]))            
                print("VADER: %f" % vader['compound'])
            else:
                print("WARNING: TOPIC NOT IN DOCUMENT")
            
        print()
        
    return topics, reviews

topics, reviews = display_topics(vectorizer, clf, transform, 
                                 test_df, n_top_words, n_top_documents)

Topic #01: canon money cable
User         Mr Satisfied on 12 18, 2012 with rating 5.0
VADER: 0.648600
User              Rutgers on 03 22, 2014 with rating 4.0
VADER: 0.594500
User ProPhotographer1 "ka on 07 28, 2012 with rating 5.0
VADER: -0.487400

Topic #02: tv picture great
User Charles E McAllister on 05 6, 2013 with rating 5.0
VADER: 0.943200
User         Be the Plunk on 02 10, 2013 with rating 5.0
VADER: 0.680800
User         John Coleman on 01 21, 2013 with rating 5.0
VADER: 0.794300

Topic #03: work connect great
User        Don L. Binder on 01 3, 2014 with rating 5.0
VADER: 0.709600
User          Paul Fisher on 01 4, 2014 with rating 3.0
VADER: 0.924600
User John E. Wente "okieb on 08 11, 2013 with rating 5.0
VADER: 0.381800

Topic #04: quality use good
User                Salem on 03 21, 2013 with rating 5.0
VADER: 0.591600
User Amazon Customer "Raj on 09 3, 2012 with rating 5.0
VADER: 0.920000
User               apg231 on 04 24, 2013 with rating 5.0
VADER: 0.647800

Topic #0

In [39]:
# extract relevant sentences from a review for this topic
# do sentiment analysis just for those sentences
analyser = SentimentIntensityAnalyzer() 
for i, review in enumerate(reviews):
    summary     = []
    sentences   = sent_tokenize(review['reviewText'])
    topic_words = topics[review['topic']].split()
    for sentence in sentences:
        if any(word in sentence.lower() for word in topic_words):
            summary.append(sentence)

    # save info for summarized reviews
    reviews[i]['summarized_reviewText'] = ' '.join([sent for sent in summary])
    vader = analyser.polarity_scores(reviews[i]['summarized_reviewText'])
    reviews[i]['summary_seniment']      = vader['compound']
    
    # print information
    print(topic_words, review['reviewerName'])
    print(reviews[i]['summarized_reviewText'])
    print("Vader: %f\n" % vader['compound'])
    

['canon', 'money', 'cable'] Mr Satisfied
The cable is doing just what it is suppose to do. The connectors fit snuggly into the HDMI input on the TV and the output of my Canon 5D, Mark II. Saved a lot of money vs. the cable offered by Canon.
Vader: 0.648600

['canon', 'money', 'cable'] Rutgers
the cable itself is very flimsy compared to the OEM Canon. Seems to work ok so represents exceptional value for money.
Vader: 0.594500

['canon', 'money', 'cable'] ProPhotographer1 "kaisphotographycom"
this cable does work with my 5D Mark iii and my canon point and shoot...Audio seems a little off ... not sure if it is the cable or not...
Vader: -0.241100

['tv', 'picture', 'great'] Charles E McAllister
Can view my pictures right from my Cannon 4Ti on my big screen TV with this. It works great, but be sure to follow Cannon's directions. You must have the cable connected and have the TV on before turning on the camera. It is great to look at the pictures on the big screen before downloading them so

# Testing if Documents are Mislabeled

In [28]:
def find_mislabeled(vectorizer, nmf, W, df, n_top_words, n_top_documents):
    ''' Print out topics discovered by a model '''

    n_mislabeled = 0
    feature_names = vectorizer.get_feature_names()
    
    for topic_id, topic in enumerate(nmf.components_):
        
        # grab the topic words to test
        test_words = []
        for i in range(3):
            test_words.append(feature_names[topic.argsort()[:-n_top_words - 1:-1][i]])

        # check if those words are in a review
        top_doc_indices = np.argsort( W[:,topic_id] )[::-1][0:n_top_documents]
        for doc_index in top_doc_indices:
            found, review = False, df['reviewText'].iloc[doc_index].lower()
            for test_word in test_words:
                if test_word in review:
                    found = True
            if not found:
                n_mislabeled += 1
                
    return n_mislabeled

In [29]:
# test model for many reviews
asin_list = list(reviews_df['asin'].unique())
print(len(asin_list))

# loop over asin
total_mis = 0
for asin in asin_list:
    misclassified = 0
    
    test_df   = reviews_df[reviews_df['asin'] == test_asin]
    test_df   = test_df.replace('', np.nan)
    test_df   = test_df.dropna()

    # fit model
    vectorizer, clf, pipe, n_top_words, n_top_documents = define_model(test_df)
    pipe.fit(test_df['reviewText']);
    transform = pipe.fit_transform(test_df['reviewText'])
    
    # count how many reviews are mis-classified
    misclassified += find_mislabeled(vectorizer, clf, transform, 
                                     test_df, n_top_words, n_top_documents)

    print(asin, misclassified)
#    total_mis += misclassified
print("Total Mislabeled: "+total_mis)

40
0528881469 1
0594451647 1
0594481813 1
0972683275 1
1400501466 1
1400501520 1
1400501776 1
1400532620 1
1400532655 1
140053271X 1
1400532736 1
1400599997 1
1400698987 1
1400699169 1
1615527613 1
3744295508 1
3930992868 1
3936710058 1
6301977173 1
7214047977 1
7507825604 1
7799813393 1
8862935293 1
8862936826 1
8918010656 1
9043413585 1
9573212900 1
9573212919 1
9575871979 1
9625993428 1
9862510447 1
9876050621 1
9888002198 1
9966338926 1
9966541551 1
9966569863 1
9966694544 1
9981739588 1
9983891204 1
9983891212 1


TypeError: must be str, not int