In [173]:
import pandas as pd
import numpy as np
import string
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.linear_model import LinearRegression

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import Ridge

from sklearn.ensemble import RandomForestRegressor 

# 1. LOAD DATA

In [27]:
import pyspark as ps    # for the pyspark suite
import os               # for environ variables in Part 3

%load_ext autoreload
%autoreload 2

spark = ps.sql.SparkSession.builder \
            .appName('capstone') \
            .getOrCreate()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
df_reviews = spark.read.csv('Data/amazon_small.tsv', sep='\t', header=True, inferSchema=True)

In [8]:
df_beauty = df_reviews.toPandas()

# 2. Data cleaning

In [28]:
def data_cleaning(url):
    
    ''' 
    1. create new columns with review headline + review text
    2. create new df  
    3. Remove products with no reviews
    4. Calls clean text, to clean the text reviews
    5. Drops old text review
    6. Calculates the sentiment 
    7. Calls get_aspects to get key words
    
    
    '''
    cols=['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'] 

    df = pd.read_csv(url, sep = '\t', names = cols)
    df['review_text'] = df['review_headline']+". "+ df['review_body']
    df = pd.concat([df['product_id'], 
                    df['product_title'], 
                    df['review_text'],
                    df['star_rating']],axis=1)
    
    
    df = df[pd.notnull(df['review_text'])]
    
    df['clean_text'] = df['review_text'].apply(text_cleaner)
    
    df = df.drop('review_text', axis=1)
    
    df['sentiment_score'] = df['clean_text'].apply(sentiment_analyzer_scores)
    
    df['key_words'] = df['clean_text'].apply(get_aspects)
    
    df['key_words'] = df['key_words'].apply(', '.join)

    
    return df 
    

In [29]:
import unicodedata
import re
import numpy as np 


def text_cleaner(name): 
    
    
    ''' 
    Text cleaner is called in the clean data function. 
    Takes in text and cleans it. 
    
    '''
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', name)
    cleantext = str(cleantext).lower()
    cleaned = re.sub(r'[?|!|\'|"|#|$|%]',r'',cleantext)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = str(cleaned).lower()
    
    #removing accented characters
    cleaned = unicodedata.normalize('NFKD', cleaned).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    return cleaned


In [30]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    
    
    ''' 
    Returns Compound score for each sentence.
    The Compound score is a metric that calculates 
    the sum of all the lexicon ratings 
    which have been normalized between 
    -1(most extreme negative) and +1 (most extreme positive)
    
    '''
    score = analyser.polarity_scores(sentence)
    return score['compound']

In [49]:
n_samples = 10000
n_features = 10000
n_components = 10
n_top_words = 10


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)


# Use tf-idf features for NMF.

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(prod_B["clean_text"])




# Fit the NMF model with tf-idf features

nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)


# Topics in NMF model (Frobenius norm)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


NameError: name 'NMF' is not defined

In [48]:
#Fitting LDA models with tf features

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(prod_B["clean_text"])



lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)



lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

NameError: name 'LatentDirichletAllocation' is not defined

In [95]:
## NEED TO FIGURE OUT THE CORRECT AMOUNT OF FEATURES: USING GRID SEARCH?

# Part 3: Practing with two products

## Sentiment Analysis 

In [161]:
prod_A = data_cleaning('Data/test_product_A.tsv')


In [162]:
prod_B = data_cleaning('Data/test_product_B.tsv')

In [165]:
prodab = pd.concat([prod_A, prod_B])


In [164]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()



"""Create a list of common words to remove"""
stop_words=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
            "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
            "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", 
            "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", 
            "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", 
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", 
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", 
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", 
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "nice"]


"""Load the pre-trained NLP model in spacy"""
nlp = en_core_web_sm.load()


"""Define a function to extract keywords"""

def get_aspects(x):

    
    """Apply the function to get aspects from reviews"""

    doc=nlp(x) ## Tokenize and extract grammatical components
    doc=[i.text for i in doc if i.text not in stop_words and i.pos_=="NOUN"] ## Remove common words and retain only nouns
    doc=list(map(lambda i: i.lower(),doc)) ## Normalize text to lower case
    doc=pd.Series(doc)
    doc=doc.value_counts().head(11).index.tolist() ## Get 5 most frequent nouns
    return doc




"""Apply the function to get aspects from reviews"""
get_aspects(prod_B['clean_text'][10])



['one', 'travel', 'hair', 'bag', 'style', 'flatiron', 'cloth']

In [157]:
prod_A

Unnamed: 0,product_id,product_title,star_rating,clean_text,sentiment_score,key_words
0,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",3,great to keep them white not so great to get ...,0.9185,"teeth, product, step, gel, crest, whitestrips,..."
1,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",3,not so great for sensitive teeth i was super ...,0.9359,"step, day, teeth, system, use, mission, coffee..."
2,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",5,fresh from the dentist clean smooth white te...,0.9896,"teeth, system, brilliance, cleansing, dentist,..."
3,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",4,i liked how this product cleaned my teeth and ...,0.8689,"product, teeth, whitening, tube, tubes, mouth"
4,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",4,great at home whitening maintenance ive been ...,0.9891,"home, whitening, brilliance, cleansing, thanks..."
5,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",2,good product just not for people with sensiti...,-0.6385,"teeth, product, people, mouth, parts, crest, w..."
6,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",5,easy system i love this system i recieved th...,0.9659,"system, teeth, diffrent, time, way, mouthwashe..."
7,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",5,great whitening product great whitening produ...,0.9359,"product, whitening, kit, dentist, results"
8,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",5,smileverous not only did i love this product ...,0.9851,"product, friends, mission, word, smile, review..."
9,B00ZKLLZAI,"Crest 3D White Brilliance Toothpaste, Teeth Wh...",3,its okay i dont really see any benefits from ...,0.0303,"product, step, whitening, week, time, products..."


In [38]:

from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(prod_A['key_words'], prod_A['sentiment_score'], test_size=0.33, random_state=42)

## Linear Regression with TfidfVectorizer 

In [59]:
tf_vectorizer = TfidfVectorizer()
training_features = tf_vectorizer.fit_transform(X_train)
test_features = tf_vectorizer.transform(X_test)

L = tf_vectorizer.get_feature_names() 

Dict_features = {idx: value for idx, value in enumerate(L)}



In [103]:

tf_model = LinearRegression()
tf_model.fit(training_features, y_train)
y_pred = tf_model.predict(test_features)


top_ten = np.argsort(tf_model.coef_)[675:685]

top_ten_list = []
for i in top_ten:
    top_ten_list.append(Dict_features[i])
    
tf_model.coef_

array([ 2.73105836e-01,  8.67692450e-02, -2.67565179e+00, -1.01156233e+00,
       -5.24347454e-01, -1.95636754e+00,  1.71816638e+00, -1.30800537e+00,
        3.96952693e-01,  4.29011529e-02,  8.69669185e-01,  2.58242110e+00,
        1.86586599e+00,  1.66118320e+00, -7.99797585e-02,  9.85207657e-01,
        8.57766967e-02,  1.81292634e+00,  6.95542944e-02, -7.80531437e-01,
       -3.07918632e-01, -2.49524447e-01,  4.50096064e-01, -1.39557674e+00,
       -4.06438600e-01,  3.28305372e-01, -6.09422191e-01,  1.32194915e+00,
        2.86343378e-01, -2.55827630e+00,  1.13609406e+00, -9.06403203e-01,
       -1.23594487e-01, -1.75100495e+00, -1.81602657e+00, -2.41444696e+00,
       -3.76939526e-02,  7.94027439e-01, -2.74109041e-01,  8.84823120e-02,
        8.37469506e-02, -6.42603179e-01, -3.76882581e-01,  1.24636113e+00,
       -1.35771034e+00,  1.04661060e+00,  6.90929270e-01,  6.11466446e-01,
        1.04676790e-01,  6.89892528e-01, -4.74573190e-01,  4.12993679e-01,
       -6.26455092e-01, -

In [83]:
## RIDGE 
from sklearn.linear_model import Ridge

clf = Ridge(alpha=1.0)
clf.fit(training_features, y_train) 
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

R_y_pred = clf.predict(test_features)


top_ten = np.argsort(clf.coef_)[0:10]

ttop_ten_list = []
for i in top_ten:
    top_ten_list.append(Dict_features[i])
    
top_ten_list 


['future',
 'minutes',
 'spots',
 'crest',
 'paste',
 'time',
 'system',
 'staining',
 'brush',
 'smile']

## Linear Regression with Count Vectorizer 
##### TDIF counts less common words. Perhaps do count vectorizer 

In [62]:


# Transform each text into a vector of word counts
C_vectorizer = CountVectorizer()
cv_training_features = C_vectorizer.fit_transform(X_train)
cv_test_features = C_vectorizer.transform(X_test)


L2 = C_vectorizer.get_feature_names() 

Dict_features_2 = {idx: value for idx, value in enumerate(L2)}

In [90]:

cv_model = LinearRegression()
cv_model.fit(cv_training_features, y_train)
y_pred = cv_model.predict(cv_test_features)


top_ten2 = np.argsort(cv_model.coef_)[675:685]

top_ten_list2 = []
for i in top_ten2:
    top_ten_list2.append(Dict_features_2[i])
    
top_ten_list2

['aftertaste',
 'favor',
 'ffor',
 'buzzagent',
 'vanity',
 'lineup',
 'photos',
 'coke',
 'opiniom',
 'bzzzagent']

In [101]:
## RIDGE 
from sklearn.linear_model import Ridge

clf2 = Ridge(alpha=1.0)
clf2.fit(cv_training_features, y_train) 
Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

R_y_pred = clf2.predict(cv_test_features)


top_ten = np.argsort(clf2.coef_)[675:685]

top_ten_list_4 = []
for i in top_ten:
    top_ten_list_4.append(Dict_features_2[i])
    
top_ten_list_4

['agent',
 'brush',
 'testers',
 'staining',
 'smoking',
 'freebie',
 'bonus',
 'spots',
 'alternative',
 'car']

In [None]:
# See if these features are negative or positive
# Maybe split reviews in negative or positive 

In [362]:
def RF(product, vectorizer=CountVectorizer()):


    ''' 

    Input product data frame
    Returns train/test split 
    Vectorizes data, and creates dictionary of {index:feature_name}
    Inputs vectorized data into a ridge regression to get coefficents of each features
    Returns top negative values and top positive values

    ''' 

    #Returns train/test split 
    
    y = product['sentiment_score']


    #Vectorizes data, and creates dictionary of {index:feature_name}

    X = vectorizer.fit_transform(product['key_words'])
    

    L = vectorizer.get_feature_names() 
    Dict_features = {idx: value for idx, value in enumerate(L)}


    
    regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=100)
    regr.fit(X, y)
    RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
               max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
    

    #Gets top ten features based on largest coefficents!!! (Here should return largest/ smallest for negative & positive)
    top_ten = np.argsort(regr.feature_importances_)
    top_ten_list = []
    for i in top_ten:
        top_ten_list.append(Dict_features[i])
    
    return top_ten_list

In [352]:
def sent_dict(prod_a, prod_b, lst):
    sent_dict_a = {}
    sent_dict_b = {}
    for word in lst:
        sent_dict_a[word] = [prod_a[prod_a['key_words'].str.contains(word)]['sentiment_score'].mean(),
                      prod_a[prod_a['key_words'].str.contains(word)]['key_words'].count()] 
        
        sent_dict_b[word] = [prod_b[prod_b['key_words'].str.contains(word)]['sentiment_score'].mean(),
                      prod_b[prod_b['key_words'].str.contains(word)]['key_words'].count()] 
        
    df_a = pd.DataFrame.from_dict(sent_dict_a, orient='index', 
                                  columns=['Product_A_avg_sentiment_score', 'Product_A_review_count'])
    
    df_b = pd.DataFrame.from_dict(sent_dict_b, orient='index',
                                  columns=['Product_B_avg_sentiment_score', 'Product_B_review_count'])
    
    
    result = pd.concat([df_a, df_b], axis=1)
                       
 
    return result
    

In [372]:
sent_dict(prod_A, prod_B, Vec_Ridge(prodab))

Unnamed: 0,Product_A_avg_sentiment_score,Product_A_review_count,Product_B_avg_sentiment_score,Product_B_review_count
alternative,0.9456,1,0.9413,1
courtesy,0.9176,9,,0
staining,0.900493,15,,0
timing,0.9542,2,,0
freebie,0.8496,1,,0
whiteners,0.963529,7,,0
places,0.7784,1,,0
diamond,0.900033,3,0.995,1
commitment,0.8267,1,,0
dollars,0.9896,1,0.9576,1


In [370]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge


def Vec_Ridge(product, vectorizer=CountVectorizer()):


    ''' 

    Input product data frame
    Returns train/test split 
    Vectorizes data, and creates dictionary of {index:feature_name}
    Inputs vectorized data into a ridge regression to get coefficents of each features
    Returns top negative values and top positive values

    ''' 

    #Returns train/test split 
    X_train, X_test, y_train, y_test = train_test_split(product['key_words'], product['sentiment_score'], test_size=0.33, random_state=42)


    #Vectorizes data, and creates dictionary of {index:feature_name}

    training_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    L = vectorizer.get_feature_names() 
    Dict_features = {idx: value for idx, value in enumerate(L)}


    #Inputs vectorized data into a ridge regression to get coefficents of each features
    clf = Ridge(alpha=1.0)
    clf.fit(training_features, y_train) 
    Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='auto', tol=0.001)

    y_pred = clf.predict(test_features)

    #Gets top ten features based on largest coefficents!!! (Here should return largest/ smallest for negative & positive)
    top_ten = np.argsort(clf.coef_)[:-10 - 1:-1]
    top_ten_list = []
    for i in top_ten:
        top_ten_list.append(Dict_features[i])
    
    return top_ten_list





In [371]:
Vec_Ridge(prodab)

['alternative',
 'courtesy',
 'staining',
 'timing',
 'freebie',
 'whiteners',
 'places',
 'diamond',
 'commitment',
 'dollars']