<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Words-indicative-of-(5-star)-and-(1-star)-reviews" data-toc-modified-id="Words-indicative-of-(5-star)-and-(1-star)-reviews-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Words indicative of (5-star) and (1-star) reviews</a></span></li><li><span><a href="#Customer-segmentation" data-toc-modified-id="Customer-segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Customer segmentation</a></span></li><li><span><a href="#Customer-segmentation-w/-PCA-&amp;-Scaling-before-clustering" data-toc-modified-id="Customer-segmentation-w/-PCA-&amp;-Scaling-before-clustering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Customer segmentation w/ PCA &amp; Scaling before clustering</a></span></li></ul></div>

# Words indicative of (5-star) and (1-star) reviews

In [1]:
%matplotlib inline
import json
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daviderickson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_reviews(size='small'):
    """Load a line-delimited JSON review file into a DataFrame.

    Parameters
    ----------
    size : {'small', 'intermediate', 'full'}, default 'small'
        Which review file under ``../../data`` to read.

    Returns
    -------
    pandas.DataFrame
        One row per JSON record (one record per non-blank line of the file).

    Raises
    ------
    ValueError
        If ``size`` is not one of the supported names.
    """
    filenames = {
        'small': r'../../data/small-review.json',
        'intermediate': r'../../data/intermediate-review.json',
        'full': r'../../data/review.json',
    }
    if size not in filenames:
        raise ValueError(
            "Unknown size {!r}; expected one of {}".format(size, sorted(filenames))
        )

    # The context manager closes the file even if a line fails to parse.
    with open(filenames[size], encoding='utf-8') as review_file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        records = [json.loads(line) for line in review_file if line.strip()]
    return pd.DataFrame.from_records(records)

dfreviews = load_reviews(size='intermediate')

In [3]:
dfreviews.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
1,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
3,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
4,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ


In [4]:
dfreviews.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')

In [5]:
dfreviews['text'][0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [6]:
# For simplicity, drop anything that isn't a letter
# Numbers and symbols may carry interesting meaning and could be explored later

def lettersOnly(string):
    """Return ``string`` with every non-ASCII-letter character replaced by one space.

    Replacement is one-for-one, so the result has the same length as the input.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", string)

dfreviews['text'] = dfreviews['text'].apply(lettersOnly)


In [7]:
dfreviews['text'][0]

'Total bill for this horrible service  Over   Gs  These crooks actually had the nerve to charge us     for   pills  I checked online the pills can be had for    cents EACH  Avoid Hospital ERs at all costs '

In [8]:
_STOPWORDS_BY_LANGUAGE = {}  # language name -> frozenset of stopwords, filled on first use


def _cached_stopwords(language='english'):
    """Return NLTK's stopword list for ``language`` as a frozenset, loading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review_to_words(string, stops=None):
    """Normalise one review: keep letters only, lowercase, and drop stopwords.

    Parameters
    ----------
    string : str
        Raw review text.
    stops : collection of str, optional
        Words to remove. Defaults to NLTK's English stopword list, which is
        loaded once and reused across calls rather than rebuilt per review.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces.
    """
    if stops is None:
        stops = _cached_stopwords('english')
    letters_only = re.sub("[^a-zA-Z]", " ", string)  # keep only letters. more complex model possible later
    words = letters_only.lower().split()  # lowercase, then split on whitespace
    return " ".join(w for w in words if w not in stops)
    
# dfreviews['text'] = dfreviews['text'].apply(review_to_words) # apply to reviews in dataframe


In [9]:
dfreviews['text'] = dfreviews['text'].apply(review_to_words)

In [10]:
dfreviews['text'][0]

'total bill horrible service gs crooks actually nerve charge us pills checked online pills cents avoid hospital ers costs'

In [11]:
print("Creating the TFIDF...\n")
from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn's TF-IDF vectorizer, configured to:
#   * ignore terms found in more than 10% of the reviews (max_df=0.1),
#   * ignore terms found in fewer than 15 reviews (min_df=15),
#   * keep at most 5000 terms (max_features=5000).
vectorizer = TfidfVectorizer(
    max_df=0.1,
    min_df=15,
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the cleaned review text and
# returns the TF-IDF matrix; .toarray() then turns that result into a dense
# NumPy array, which the later cells index and append to directly.
train_data_features = vectorizer.fit_transform(dfreviews['text']).toarray()

Creating the TFIDF...



In [12]:
print(train_data_features.shape)

(100000, 5000)


In [13]:
# Take a look at the words in the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Show the size and a sample rather than dumping every term into the output.
print(len(vocab), 'terms; first 50:')
print(vocab[:50])



# Customer segmentation

In [14]:
# train_data_features has tfidf features
# Add other data to each review (stars, useful, ...)
# GroupBy User and avg across each review
# Cluster users using K-means
# Interpret user clusters


# Add non-text data back to feature matrix
review_features = ['cool', 'funny', 'stars', 'useful', 'user_id']

# get_feature_names() was removed in scikit-learn 1.2. Use the replacement when
# available and convert to a plain list so `+` concatenates instead of broadcasting.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_labels = vectorizer.get_feature_names_out().tolist()
else:
    tfidf_labels = list(vectorizer.get_feature_names())
all_features_labels = tfidf_labels + review_features

# Column-wise append (axis=1). NOTE(review): user_id holds strings, so the
# combined array is presumably object dtype; the numeric columns are cast back
# to float64 in the aggregation cell.
all_features = np.append(train_data_features, dfreviews[review_features].to_numpy(), axis=1)


In [None]:
# GroupBy User and avg across each review
all_features_df = pd.DataFrame(data=all_features, columns=all_features_labels)

# Set the trailing user_id column aside, cast everything before it to float64,
# then re-attach user_id as the last column.
user_ids = all_features_df['user_id']
all_features_df = all_features_df.iloc[:, :-1].astype('float64')
all_features_df['user_id'] = user_ids
del user_ids

# One row per user: the mean of every feature over that user's reviews.
all_features_User = all_features_df.groupby(by='user_id').mean()

In [None]:
all_features_User.head()

In [None]:
# Cluster users using K-means
from sklearn.cluster import KMeans

# Fit one model per K and record its inertia (within-cluster sum of squares).
# range(1, max_clusters) covers K = 1 .. max_clusters - 1.
max_clusters = 10
kmeans_cost = []
for num_clusters in range(1, max_clusters):
    # Fixed seed: K-means initialisation is random, so without it the curve
    # changes from run to run.
    k_means_clutering = KMeans(n_clusters=num_clusters, random_state=0)
    k_means_clutering.fit(all_features_User)
    kmeans_cost.append(k_means_clutering.inertia_)
    

In [None]:
# Determine the best value of K to use (the number of clusters)
# by plotting the K-means cost against K and looking for the "elbow".
# The x-axis is derived from the recorded costs so the two can never disagree.
k_values = range(1, len(kmeans_cost) + 1)
plt.plot(k_values, kmeans_cost, color='g', linewidth=3)
plt.title('K-means cost vs. number of user clusters', fontsize=20)
plt.xlabel("Value of K")
plt.ylabel("Squared Error (Cost)")
plt.show()  # render the figure

In [None]:
# Cluster users using K-means
# Interpret user clusters

num_clusters = 4
# Fixed seed so cluster assignments and centres are the same on every re-run.
k_means_clutering = KMeans(n_clusters=num_clusters, random_state=0)
k_means_clutering.fit(all_features_User)

In [None]:
# One row per cluster centre, labelled with the per-user feature names.
user_cluster_centers_df = pd.DataFrame(
    k_means_clutering.cluster_centers_,
    columns=all_features_User.columns,
)
user_cluster_centers_df.head()

In [None]:
user_cluster_centers_df.iloc[0].to_dict()

In [None]:
from wordcloud import WordCloud

# Drop the last four columns positionally, keeping only the word weights.
# NOTE(review): those four are presumably cool / funny / stars / useful; a
# name-based drop could also hit vocabulary words with the same spelling.
word_weights = user_cluster_centers_df.iloc[:, :-4]

for cluster in range(len(word_weights)):
    frequencies = word_weights.iloc[cluster].to_dict()
    wordcloud = WordCloud().generate_from_frequencies(frequencies)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Cluster: {}'.format(cluster), fontsize=20)
    plt.show()
    

# Customer segmentation w/ PCA & Scaling before clustering

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Rescale every per-user feature with MinMaxScaler, then fit a PCA with the
# default n_components on the scaled matrix to inspect its variance spectrum.
scaler = MinMaxScaler()
pca = PCA()
pca.fit(scaler.fit_transform(all_features_User))

In [None]:
# Scree plot: explained-variance ratio per principal component on a log y-axis,
# with the x-axis limited to 0-500.
plt.bar(x=range(len(pca.explained_variance_ratio_)), height=pca.explained_variance_ratio_, log=True)
plt.xlim((0,500))
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio (log scale)')
plt.title('PCA explained variance by component')
plt.show()

In [None]:
# Project users onto 200 principal components.
# PCA is fitted on the MinMax-scaled features, the same input used for the
# explained-variance analysis, rather than on the raw matrix. The seed keeps
# the projection reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=200, random_state=0)
pca_user_features = pca.fit_transform(scaler.fit_transform(all_features_User))

In [None]:
# Cluster users using K-means
from sklearn.cluster import KMeans

# Fit one model per K on the PCA-reduced features and record its inertia.
# range(1, max_clusters) covers K = 1 .. max_clusters - 1.
max_clusters = 10
kmeans_cost = []
for num_clusters in range(1, max_clusters):
    # Fixed seed: K-means initialisation is random, so without it the curve
    # changes from run to run.
    k_means_clutering = KMeans(n_clusters=num_clusters, random_state=0)
    k_means_clutering.fit(pca_user_features)
    kmeans_cost.append(k_means_clutering.inertia_)
    

In [None]:
# Determine the best value of K to use (the number of clusters)
# by plotting the K-means cost against K and looking for the "elbow".
# The x-axis is derived from the recorded costs so the two can never disagree.
k_values = range(1, len(kmeans_cost) + 1)
plt.plot(k_values, kmeans_cost, color='g', linewidth=3)
plt.title('K-means cost vs. number of user clusters (PCA features)', fontsize=20)
plt.xlabel("Value of K")
plt.ylabel("Squared Error (Cost)")
plt.show()  # render the figure

In [None]:
# Cluster users using K-means
# Interpret user clusters

num_clusters = 4
# Fixed seed so cluster assignments are the same on every re-run.
k_means_clutering = KMeans(n_clusters=num_clusters, random_state=0)
k_means_clutering.fit(pca_user_features)

In [None]:
# Find the center of each cluster and report.
# The clustering ran in PCA space, so the centres are reported as per-cluster
# means of the original (interpretable) user features.
# Grouping by an index-aligned label Series avoids copying the whole user
# matrix and avoids writing a 'cluster' column that could overwrite a
# vocabulary term with the same name.
cluster_labels = pd.Series(
    k_means_clutering.labels_, index=all_features_User.index, name='cluster'
)
pca_user_cluster_centers_df = all_features_User.groupby(cluster_labels).mean()
pca_user_cluster_centers_df.head()

In [None]:
from wordcloud import WordCloud

# Drop the last four columns positionally, keeping only the word weights.
# NOTE(review): those four are presumably cool / funny / stars / useful; a
# name-based drop could also hit vocabulary words with the same spelling.
word_weights = pca_user_cluster_centers_df.iloc[:, :-4]

for cluster in range(len(word_weights)):
    frequencies = word_weights.iloc[cluster].to_dict()
    wordcloud = WordCloud().generate_from_frequencies(frequencies)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Cluster: {}'.format(cluster), fontsize=20)
    plt.show()
    