In [1]:
from __future__ import print_function
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

In [2]:
# Load the two CSV exports used by the topic models below. Both are read with
# the ISO-8859-1 codec and take their column names from the first row.

# One row per review.
reviews = pd.read_csv(
    'home_products_additional_features.csv',
    encoding="ISO-8859-1",
    header=0,
)

# The same reviews broken out by sentence.
sentences = pd.read_csv(
    'sentence_home_products_additional_features.csv',
    encoding="ISO-8859-1",
    header=0,
)

In [28]:
#reviews.columns.values
#reviews['Review Rating'][3]

#reviews_20 = reviews[reviews['Review Rating']==20]
#reviews_40 = reviews[reviews['Review Rating']==40]
#reviews_60 = reviews[reviews['Review Rating']==60]
#reviews_80 = reviews[reviews['Review Rating']==80]
#reviews_100 = reviews[reviews['Review Rating']==100]

#reviews[reviews['Review Rating']==100]
#reviews['Review Rating'].unique()

In [3]:
def run_nmf(nmf_features, nmf_topics, nmf_top_words, nmf_data_samples, nmf_max_df, nmf_min_df, nmf_alpha, nmf_l1_ratio):
    """Fit an NMF topic model on tf-idf features and print each topic's top words.

    Parameters
    ----------
    nmf_features : forwarded to TfidfVectorizer as ``max_features``.
    nmf_topics : forwarded to NMF as ``n_components`` (number of topics).
    nmf_top_words : how many words are printed for each topic.
    nmf_data_samples : the text documents handed to the vectorizer.
    nmf_max_df, nmf_min_df : forwarded to TfidfVectorizer as ``max_df`` / ``min_df``.
    nmf_alpha : regularization strength forwarded to NMF (see the note below).
    nmf_l1_ratio : forwarded to NMF as ``l1_ratio``.

    Returns
    -------
    None. The topics are written to stdout via ``print_top_words``.
    """
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=nmf_max_df, min_df=nmf_min_df,
                                       max_features=nmf_features,
                                       stop_words='english')

    tfidf = tfidf_vectorizer.fit_transform(nmf_data_samples)

    nmf_kwargs = dict(n_components=nmf_topics, random_state=1,
                      l1_ratio=nmf_l1_ratio)
    try:
        # Legacy keyword first, so installations where the original call worked
        # keep producing the same model.
        nmf = NMF(alpha=nmf_alpha, **nmf_kwargs)
    except TypeError:
        # scikit-learn >= 1.2 removed `alpha`; `alpha_W` (with `alpha_H`
        # defaulting to the same value) is its replacement.
        # NOTE: per the scikit-learn docs the newer penalty terms are scaled
        # differently, so topics may differ from the legacy ones -- re-tune
        # nmf_alpha if needed.
        nmf = NMF(alpha_W=nmf_alpha, **nmf_kwargs)
    nmf.fit(tfidf)

    print("\nTopics in NMF model:")
    # get_feature_names() was removed in scikit-learn 1.2; prefer its successor
    # when the installed version provides it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, nmf_top_words)

In [9]:
def run_lda(lda_features, lda_topics, lda_top_words, lda_data_samples, lda_max_df, lda_min_df, lda_max_iter, lda_learning_offset):
    """Fit an online LDA topic model on term counts and print each topic's top words.

    Parameters
    ----------
    lda_features : forwarded to CountVectorizer as ``max_features``.
    lda_topics : number of topics for LatentDirichletAllocation.
    lda_top_words : how many words are printed for each topic.
    lda_data_samples : the text documents handed to the vectorizer.
    lda_max_df, lda_min_df : forwarded to CountVectorizer as ``max_df`` / ``min_df``.
    lda_max_iter : forwarded to LatentDirichletAllocation as ``max_iter``.
    lda_learning_offset : forwarded as ``learning_offset``.

    Returns
    -------
    None. The topics are written to stdout via ``print_top_words``.
    """
    print("Fitting LDA models with tf features...")
    tf_vectorizer = CountVectorizer(max_df=lda_max_df, min_df=lda_min_df,
                                    max_features=lda_features,
                                    stop_words='english')

    tf = tf_vectorizer.fit_transform(lda_data_samples)

    lda_kwargs = dict(max_iter=lda_max_iter,
                      learning_method='online',
                      learning_offset=lda_learning_offset,
                      random_state=0)
    try:
        # `n_components` is the current name of the topic-count parameter.
        lda = LatentDirichletAllocation(n_components=lda_topics, **lda_kwargs)
    except TypeError:
        # Very old scikit-learn releases only know the former name `n_topics`
        # (which newer releases have removed).
        lda = LatentDirichletAllocation(n_topics=lda_topics, **lda_kwargs)

    lda.fit(tf)

    print("\nTopics in LDA model:")
    # get_feature_names() was removed in scikit-learn 1.2; prefer its successor
    # when the installed version provides it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, lda_top_words)

In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by one line with the ``n_top_words`` entries of
    ``feature_names`` that carry the largest weights in that row, heaviest
    first. A single blank line is printed after the last topic.
    """
    # Walking the ascending argsort backwards down to this stop index yields
    # the positions of the n_top_words largest weights, largest first.
    stop = -n_top_words - 1
    for topic_idx, topic in enumerate(model.components_):
        ranked_positions = topic.argsort()[:stop:-1]
        top_words = [feature_names[position] for position in ranked_positions]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_words))
    print()

In [6]:
def remove_nulls(df, column):
    """Return the values of ``column`` as a list, leaving out rows where it is null.

    ``df`` itself is not modified; only the named column is inspected for
    missing values.
    """
    rows_with_value = df.dropna(subset=[column])
    return list(rows_with_value[column])

In [7]:
# ---------------- NMF: pick the input text, set the parameters, run the model ----------------

# Exactly one `nmf_data_samples = ...` line should be active; uncomment a
# different one to run the model on another column of `reviews`.
#nmf_data_samples = remove_nulls(reviews, 'Text')
#nmf_data_samples = remove_nulls(reviews, 'Title')
nmf_data_samples = remove_nulls(reviews, 'text_and_title')
#nmf_data_samples = remove_nulls(reviews, 'double_title')
#nmf_data_samples = remove_nulls(reviews, 'text_and_title_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'double_title_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'text_and_title_negation')
#nmf_data_samples = remove_nulls(reviews, 'double_title_negation')
#nmf_data_samples = remove_nulls(reviews, 'text_and_title_negation_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'double_title_negation_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'lemma_text_title_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'lemma_double_title_no_stops')
#nmf_data_samples = remove_nulls(reviews, 'nouns_and_adjectives')

# Size of the vocabulary.
nmf_features = 12000
# Number of topics.
nmf_topics = 75
# Words to include when printing each topic.
nmf_top_words = 9
# Ignore terms whose document frequency is strictly higher than this threshold
# (a float is a fraction of the documents, an int is an absolute count).
nmf_max_df = 0.95
# Ignore terms whose document frequency is strictly lower than this threshold
# (a float is a fraction of the documents, an int is an absolute count).
nmf_min_df = 2
# Constant that multiplies the regularization terms; zero means no regularization.
nmf_alpha = 0.1
# Regularization mixing parameter, 0 <= l1_ratio <= 1.
nmf_l1_ratio = 0.5

run_nmf(
    nmf_features=nmf_features,
    nmf_topics=nmf_topics,
    nmf_top_words=nmf_top_words,
    nmf_data_samples=nmf_data_samples,
    nmf_max_df=nmf_max_df,
    nmf_min_df=nmf_min_df,
    nmf_alpha=nmf_alpha,
    nmf_l1_ratio=nmf_l1_ratio,
)

Extracting tf-idf features for NMF...

Topics in NMF model:
Topic #0:
clorox wipes family trust bleach thank know love household
Topic #1:
value pack money multi excellent containers useful super came
Topic #2:
easy makes use grab cleanup super wipes effective store
Topic #3:
smell fresh strong pleasant bleach harsh lemon makes chemicals
Topic #4:
stars described thank worked service excellent advertised ok didn
Topic #5:
love absolutely wipes especially em room having ease clorox
Topic #6:
good stuff quality pretty say size overall keyboard expensive
Topic #7:
clean way know fresh disinfected help ups feel surfaces
Topic #8:
best market thing tried disinfectant ve far brands cleaners
Topic #9:
bathroom sink toilet boys container especially kitchen room cleanings
Topic #10:
convenient super wipes powerful effective extremely way canister especially
Topic #11:
kitchen bath grease bathroom daily day surfaces room stove
Topic #12:
cleaning makes power easier surfaces powerful wiping breez

In [20]:
# ---------------- LDA: pick the input text, set the parameters, run the model ----------------

# Exactly one `lda_data_samples = ...` line should be active; uncomment a
# different one to run the model on another data set.
# NOTE: reviews_20 / reviews_100 are only created by commented-out code in an
# earlier cell, so that code has to be re-enabled before the lines using them
# will run.

#lda_data_samples = remove_nulls(reviews, 'Text')
#lda_data_samples = remove_nulls(reviews, 'Title')
#lda_data_samples = remove_nulls(reviews, 'text_and_title')
#lda_data_samples = remove_nulls(reviews, 'double_title')
#lda_data_samples = remove_nulls(reviews, 'text_and_title_no_stops')
#lda_data_samples = remove_nulls(reviews, 'double_title_no_stops')
#lda_data_samples = remove_nulls(reviews, 'text_and_title_negation')
#lda_data_samples = remove_nulls(reviews, 'double_title_negation')
#lda_data_samples = remove_nulls(reviews, 'text_and_title_negation_no_stops')
#lda_data_samples = remove_nulls(reviews, 'double_title_negation_no_stops')
#lda_data_samples = remove_nulls(reviews, 'lemma_text_title_no_stops')

#lda_data_samples = remove_nulls(reviews, 'lemma_double_title_no_stops')
#lda_data_samples = remove_nulls(reviews_20, 'lemma_double_title_no_stops')

#lda_data_samples = remove_nulls(reviews, 'nouns_and_adjectives')
#lda_data_samples = remove_nulls(reviews_100, 'nouns_and_adjectives')


#lda_data_samples = remove_nulls(sentences, 'Sentence')
#lda_data_samples = remove_nulls(sentences, 'sentence_no_stops')
#lda_data_samples = remove_nulls(sentences, 'sentence_lemma_no_stops')
lda_data_samples = remove_nulls(sentences, 'sentence_nouns_and_adjectives')


# Size of the vocabulary.
lda_features = 12000
# Number of topics.
lda_topics = 75
# Words to include when printing each topic.
lda_top_words = 9
# Ignore terms whose document frequency is strictly higher than this threshold
# (a float is a fraction of the documents, an int is an absolute count).
lda_max_df = 0.80
# Ignore terms whose document frequency is strictly lower than this threshold
# (a float is a fraction of the documents, an int is an absolute count).
lda_min_df = 15
# Number of iterations to compute.
lda_max_iter = 6
# Downweights early iterations in online learning; should be > 1.
lda_learning_offset = 40.0

run_lda(
    lda_features=lda_features,
    lda_topics=lda_topics,
    lda_top_words=lda_top_words,
    lda_data_samples=lda_data_samples,
    lda_max_df=lda_max_df,
    lda_min_df=lda_min_df,
    lda_max_iter=lda_max_iter,
    lda_learning_offset=lda_learning_offset,
)

Fitting LDA models with tf features...

Topics in LDA model:
Topic #0:
great clorox wipes price excellent switches beat bonus true
Topic #1:
counters office phone desk disinfecting purse uses desks cat
Topic #2:
lots sturdy trick ton convienent described bath baths general
Topic #3:
order months live packs green cheap damage hot task
Topic #4:
value dogs continue feet dish play linoleum sense ball
Topic #5:
room need easier size children better wipes bleach spots
Topic #6:
delivery pair options mother chemicals close trust costco cat
Topic #7:
favorite wipes wish liquid end reason disappointed roll eyeglass
Topic #8:
handy stainless steel wipes stains look scrub marks pets
Topic #9:
bathroom water everyday try keeps case soap seat past
Topic #10:
review grandson helpful reading tube packs ability reasons thank
Topic #11:
right recommend regular wipes action dual contact routine sides
Topic #12:
wife complaint rip bargain biggest unable feed waste process
Topic #13:
kitchen lysol wipes 