# Yelp Data Challenge - Clustering and PCA


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [3]:
df = pd.read_csv('../dataset/2017_restaurant_reviews.csv')

In [5]:
df.head(10)

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-02-14,0,VETXTwMw6qxzOVDlXfe6Tg,5,went for dinner tonight. Amazing my husband ha...,0,ymlnR8UeFvB4FZL56tCZsA
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-04,0,S8-8uZ7fa5YbjnEtaW15ng,5,This was an amazing dinning experience! ORDER ...,0,9pSSL6X6lFpY3FCRLEH3og
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-05-20,0,Pnkrj90xfykhHyo4BSFRsw,5,ABSOLUTE MUST IN VEGAS! Loved everything my bo...,0,cZVQGCZ_fHtTdfiyGVJPdg
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-30,0,Oeh7e6U2xaDQI9L9i4x_Gw,2,I had high hopes for Delmonico's Steakhouse in...,0,li2cBZl60vgqihDJJG7jeA
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g
5,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-04-02,0,kjhk2-OENBAP55Y3KfbpyA,5,In Vegas a few nights seeking a great steak di...,0,P1O81PTvXaR9kYKaP3_qwg
6,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2018-01-04,0,zwhQh7xRqEFlMr5ZDErIgg,5,My husband and I came out to Vegas for a mini ...,1,SRVUSTaHsxw9jrWA6mrylA
7,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-05-08,0,ftrTcWgixeSzIhUxdtfueg,5,Wonderful pre-show dinner! We enjoyed the han...,0,f0apW99d7gn2ztFkF8d_Zg
8,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-08-30,0,VL1e4ZrVH_21SfWeuMkMxQ,5,Went here for my husbands 40th birthday last y...,0,y0cleYfYK1FISjZeX5elrA
9,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2018-01-24,0,nGtACcrQLEhz7H1V5HRfkQ,2,"Very bland room, very bland sides, steak was t...",1,xSG7VajnEch1oA33FAZ0Jw


## 1. Cluster the review text data for all the restaurants

### Define feature variables

In [6]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

### Define target variable

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [7]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = (df['stars']>4)
target = df['favorable'].values

In [8]:
target[:10]

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
       False])

#### You may want to look at the statistic of the target variable

In [9]:
target.mean(), target.std()

(0.49914993914337047, 0.49999927739601785)

### Create training dataset and test dataset

In [10]:
from sklearn.cross_validation import train_test_split



In [11]:
# documents is X, target is y
# Now split the data to training set and test set
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size = 0.2,
    random_state = 3)

### Get NLP representation of the documents

#### Fit TfidfVectorizer with training data only, then tranform all the data to tf-idf

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Create TfidfVectorizer, and name it vectorizer, choose a reasonable max_features, e.g. 1000
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)

In [15]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [16]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [18]:
vectors_train.shape

(248452, 1000)

In [19]:
# Use the trained model to transform all the reviews
vectors_documents = vectorizer.transform(documents).toarray()

### Cluster reviews with KMeans

#### Fit k-means clustering with the training vectors and apply it on all the data

In [20]:
from sklearn.cluster import KMeans

kmeans = KMeans()
kmeans.fit(vectors_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

#### Make predictions on all your data

In [21]:
assigned_cluster = kmeans.predict(vectors_documents)

#### Inspect the centroids
To find out what "topics" Kmeans has discovered we must inspect the centroids. Print out the centroids of the Kmeans clustering.

   These centroids are simply a bunch of vectors.  To make any sense of them we need to map these vectors back into our 'word space'.  Think of each feature/dimension of the centroid vector as representing the "average" review or the average occurances of words for that cluster.

In [23]:
print("Cluster centers:")
print(kmeans.cluster_centers_.shape)

Cluster centers:
(8, 1000)


#### Find the top 10 features for each cluster.
For topics we are only really interested in the most present words, i.e. features/dimensions with the greatest representation in the centroid.  Print out the top ten words for each centroid.

* Sort each centroid vector to find the top 10 features
* Go back to your vectorizer object to find out what words each of these features corresponds to.


In [24]:
n_feat = 10
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-n_feat:-1]

In [25]:
print ("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ",".join(words[i] for i in centroid)))

top features for each cluster:
0: chicken,good,fried,food,rice,ordered,place,great,delicious
1: sushi,place,rolls,roll,ayce,good,great,service,fresh
2: burger,fries,burgers,good,place,food,cheese,shake,great
3: good,food,place,like,really,just,service,ordered,nice
4: great,food,service,place,good,friendly,staff,atmosphere,definitely
5: order,food,time,minutes,service,just,came,asked,said
6: amazing,food,best,place,vegas,love,service,delicious,friendly
7: pizza,crust,good,place,great,cheese,slice,order,best


#### Try different k
If you set k == to a different number, how does the top features change?

In [26]:
kmeans = KMeans(n_clusters = 10)
kmeans.fit(vectors_train)
assigned_cluster = kmeans.predict(vectors_documents)

In [27]:
n_feat = 10
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-n_feat:-1]

In [28]:
print ("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ",".join(words[i] for i in centroid)))

top features for each cluster:
0: vegas,las,best,food,place,great,time,service,restaurant
1: sushi,place,rolls,roll,ayce,great,good,service,fresh
2: great,food,service,place,friendly,awesome,staff,atmosphere,definitely
3: order,food,minutes,time,service,just,came,asked,said
4: pizza,crust,good,place,great,cheese,slice,order,ordered
5: chicken,fried,good,food,rice,ordered,place,great,sauce
6: burger,fries,burgers,good,place,cheese,food,shake,great
7: amazing,food,service,place,great,best,definitely,love,delicious
8: place,food,delicious,like,just,love,service,ordered,time
9: good,food,really,place,service,nice,pretty,like,great


#### Print out the rating and review of a random sample of the reviews assigned to each cluster to get a sense of the cluster.

In [30]:
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, vectors_documents.shape[0])[assigned_cluster == i]
    sample_reviews = np.random.choice(cluster, 2, replace = False)
    print("="*10)
    print("Cluster %d:" %i)
    for review_index in sample_reviews:
        print("   %s -" % df.loc[review_index]['stars'])
        print("%s" % df.loc[review_index]['text'])
    print

Cluster 0:
   5 -
This is probably my favorite restaurant in Las Vegas. My wife and I have been coming for years.

We arrived in Las Vegas and went straight here for dinner. We had the stuffed mushrooms with chorizo, stuffed dates, manchego cheese / apple cheese salad, ceviche and camarones diablo. All delicious and reasonably priced.

The drinks are refreshing and delicious. I started off with the Passion Fruit Mojito and ended with a Watermelon flavored mojito. Both delicious but entirely different flavors. The passion fruit sour and watermelon sweet.

The waiters and waitresses are very knowledgeable and service was impeccable. Can not wait to come back once visiting Vegas again.
   5 -
Wednesday, January 3rd 2018 3 PM

This review will be my last of Glazier's since Smith's bought the building..  We've been shopping here over 6 years now and we drive out of our way to shop there.  I will miss this East Coast family market. They had the best steak dinners for under $10!  The carried 

## 2. Cluster all the reviews of the most reviewed restaurant
Let's find the most reviewed restaurant and analyze its reviews

In [31]:
# Find the business who got most reviews, get your filtered df, name it df_top_restaurant
most_reviewed_restaurant= df ['business_id'].value_counts().index[0]

We can also load restaurant profile information from the business dataset (optional)

In [43]:
# Load business dataset 
# Take a look at the most reviewed restaurant's profile 
df_top_restaurant = df[df['business_id'] == most_reviewed_restaurant].copy().reset_index()
df_top_restaurant

Unnamed: 0,index,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id,favorable
0,135917,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-09-09,0,mQfl6ci46mu0xaZrkRUhlA,5,"This buffet is amazing. Yes, it is expensive,...",0,f638AHA_GoHbyDB7VFMz7A,True
1,135918,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-02-08,0,lMarDJDg4-e_0YoJOKJoWA,2,This place....lol our server was nice. But fo...,0,A21zMqdN76ueLZFpmbue0Q,False
2,135919,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-12-22,0,30xmXTzJwHPcqt0uvSLQhQ,3,One star knocked off for the cold air conditio...,0,uNHEnP28MMmVy96ZSJKaMA,False
3,135920,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-09-22,0,SOUuNn4f1fHKxFHntYzonw,3,Was torn between 2 and 3. Caught the last of ...,0,WvVqnHU_eVBUfL-CI9efdw,False
4,135921,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-06-18,0,nnmcNHGLa6TTZ4KP7ZrU7A,1,"Food is subpar. There's a line to get to pay, ...",0,KIbfAG_JrTEPt6aQXcnQJg,False
5,135922,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,1,2017-02-28,0,NXOgf4JQNbskd2XYhPJB7A,5,"This buffet was amazing! It was a bit pricey, ...",1,wHeH7PwKHubMjmx9SroN4w,True
6,135923,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-07-02,0,Xci8QJaDOB7OazJIP9U0Cw,5,I'm staying at Caesar's Palace for 4th of July...,0,J90_7u_8vPYa03GMohNxAA,True
7,135924,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-11-26,0,n6rHMUT-AWKkAro8Ho6-Fw,4,Sunday brunch at Caesar's is pricey. The food ...,2,sfpgA58pdGKebocTe4N_-w,False
8,135925,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-04-06,0,vAohFUTT0d8GkMPyItcT5Q,5,Best buffet in Vegas! By far this buffet in i...,0,Hc0v8M1RP6sUuFQ2_zLP6Q,True
9,135926,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",4.0,0,2017-05-21,0,bwi8o0poN-38SrLla5cPCg,1,"Below average, most items not served hot. Too ...",0,sY3dmQqrXChk_Xu8Oq3bNw,False


### Vectorize the text feature

In [44]:
# Take the values of the column that contains review text data, save to a variable named "documents_top_restaurant"
documents_top_restaurant = df_top_restaurant['text'].values

In [45]:
documents_top_restaurant.dtype, documents_top_restaurant.shape

(dtype('O'), (2269,))

### Define your target variable (for later classification use)

#### Again, we look at perfect (5 stars) and imperfect (1-4 stars) rating

In [46]:
df_top_restaurant['favorable'] = (df_top_restaurant['stars'] > 4)

In [47]:
target_top_restaurant = df_top_restaurant['favorable'].values.astype(int)
target_top_restaurant

array([1, 0, 0, ..., 0, 0, 1])

#### Check the statistic of the target variable

In [48]:
target_top_restaurant.mean()

0.3869546055531071

In [50]:
documents_top_restaurant.shape, target_top_restaurant.shape

((2269,), (2269,))

### Create training dataset and test dataset

In [51]:
from sklearn.cross_validation import train_test_split

In [52]:
# documents_top_restaurant is X, target_top_restaurant is y
# Now split the data to training set and test set
X_train, X_test, y_train, Y_test = train_test_split(
    documents_top_restaurant, target_top_restaurant, test_size = 0.2, random_state = 3)

### Get NLP representation of the documents

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)

In [57]:
# Train the model with training data
vectors_train = vectorizer.fit_transform(X_train).toarray()

In [58]:
# Get the vocab of tfidf
words = vectorizer.get_feature_names()

In [61]:
vectors_train.shape

(1815, 1000)

In [62]:
# Use the trained model to transform the test data
vectors_test = vectorizer.transform(X_test).toarray()

In [63]:
# Use the trained model to transform all the data
vectors_documents_top_restaurant = vectorizer.transform(documents_top_restaurant).toarray()

### Cluster reviews with KMeans

#### Fit k-means clustering on the training vectors and make predictions on all data

In [65]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 5)
kmeans.fit(vectors_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

#### Make predictions on all your data

In [66]:
assigned_cluster = kmeans.predict(vectors_documents_top_restaurant)

#### Inspect the centroids

In [67]:
print("Cluster centers:")
print(kmeans.cluster_centers_.shape)

Cluster centers:
(5, 1000)


#### Find the top 10 features for each cluster.

In [68]:
n_feat = 10
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-n_feat:-1]

#### Print out the rating and review of a random sample of the reviews assigned to each cluster to get a sense of the cluster.

In [69]:
print ("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ",".join(words[i] for i in centroid)))

top features for each cluster:
0: wait,line,time,food,long,buffet,hour,seated,worth
1: food,good,buffet,great,service,place,worth,just,price
2: crab,legs,buffet,good,food,king,just,worth,oysters
3: best,buffet,vegas,food,buffets,quality,ve,great,las
4: buffet,seafood,food,station,good,dessert,section,asian,vegas


## 3. Use PCA to reduce dimensionality

### Stardardize features
Your X_train and X_test

In [None]:
from sklearn.preprocessing import StandardScaler

# To be implemented
pass


### Use PCA to transform data (train and test) and get princial components

In [None]:
from sklearn.decomposition import PCA

# Let's pick a n_components
n_components = 50

# To be implemented
pass


### See how much (and how much percentage of) variance the principal components explain

In [None]:
# To be implemented
pass

In [None]:
# To be implemented
pass

### Viz: plot proportion of variance explained with top principal components

For clear display, you may start with plotting <=20 principal components

In [None]:
# To be implemented
pass

## Classifying positive/negative review with PCA preprocessing

### Logistic Regression Classifier
#### Use standardized tf-idf vectors as features

In [None]:
# Build a Logistic Regression Classifier, train with standardized tf-idf vectors

from sklearn.linear_model import LogisticRegression

# To be implemented
pass

In [None]:
# Get score for training set
pass

In [None]:
# Get score for test set
pass

#### Use (Stardardized + PCA) tf-idf vectors as features

In [None]:
# Build a Logistic Regression Classifier, train with PCA tranformed X

from sklearn.linear_model import LogisticRegression

# To be implemented
pass

In [None]:
# Get score for training set
pass

In [None]:
# Get score for test set, REMEMBER to use PCA-transformed X!
pass

#### Q: What do you see from the training score and the test score? How do you compare the results from PCA and non-PCA preprocessing?

A: (insert your comments here)

#### You can plot the coefficients against principal components


In [None]:
# To be implemented
pass

### Random Forest Classifier
#### Use standardized tf-idf vectors as features

In [None]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# To be implemented
pass

In [None]:
# Get score for training set
pass

In [None]:
# Get score for test set
pass

#### Use (Stardardized + PCA) tf-idf vectors as features

In [None]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# To be implemented
pass

In [None]:
# Get score for training set
pass

In [None]:
# Get score for test set, REMEMBER to use PCA-transformed X!
pass

#### Q: What do you see from the training result and the test result?

A: (insert your comments here)

#### You can plot the feature importances against principal components


In [None]:
# To be implemented
pass

## Extra Credit #1: Can you cluster restaurants from their category information?
Hint: a business may have mutiple categories, e.g. a restaurant can have both "Restaurants" and "Korean"

In [None]:
# To be implemented

## Extra Credit #2: Can you try different distance/similarity metrics for clusterings, e.g. Pearson correlation, Jaccard distance, etc. 

Hint: You can take a look at [scipy](http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html#scipy.spatial.distance.pdist) documentations to use other distances

#### Q: How do you compare with Cosine distance or Euclidean distance?

In [None]:
# To be implemented

## Extra Credit #3: Can you cluster categories from business entities? What does it mean by a cluster?
Hint: Think the example where words can be clustered from the transposed tf-idf matrix.

In [None]:
# To be implemented

## Extra Credit #4: What are the characteristics of each of the clustered  ? For each cluster, which restaurant can best represent ("define") its cluster?
Hint: how to interpret "best"?

In [None]:
# To be implemented

## Extra Credit #5: Can you think of other use cases that clustering can be used? 
Hint: of course you can make use of other yelp dataset. You can try anything you want as long as you can explain it.

In [None]:
# To be implemented