# yelp - sentiment extraction using supervised learning methods

## Data preparation

In [14]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [15]:
# read yelp.csv into a DataFrame
yelp = pd.read_csv('yelp.csv')

In [16]:
yelp.shape

(10000, 10)

In [17]:
yelp.columns

Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')

In [18]:
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [19]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp["sentiment"] = yelp["stars"]

In [20]:
yelp["sentiment"].unique()

array([5, 4, 2, 3, 1], dtype=int64)

In [21]:
yelp["sentiment"] = yelp["sentiment"].replace([1,2,3,4,5],["bad","bad","neutral","good","good"])

In [22]:
yelp.stars.value_counts()

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [23]:
yelp["sentiment"].value_counts()

good       6863
bad        1676
neutral    1461
Name: sentiment, dtype: int64

In [24]:
# Create training data
X = yelp[["text","sentiment"]]
X.shape

(10000, 2)

## Vectorize Text using Count Vectorization

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# initialize vectorizer

vect = CountVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)

In [27]:
# fit the count vectorizer and get the feature matrix with frequency of words in each text

vect.fit(X["text"])
X_features = vect.transform(X["text"])

In [15]:
X_features.shape

(10000, 1010)

### Reduce the no of features in count vector by increasing min_df

In [43]:
# initialize vectorizer

vect = CountVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.05)

In [44]:
vect.fit(X["text"])
X_features = vect.transform(X["text"])

In [45]:
X_features.shape

(10000, 156)

In [46]:
# When min_df is increased (from 0.01 to 0.05) then number of features decreases from (1010 to 156)
# min_df controls the number of features and size of the matrix
# min_df of 0.05 means take the word as a feature only if it appears in 5% of the reviews, 
# ie if it appears in 5% of 10000 ie 500 reviews.

In [47]:
### Get the list of features

len(vect.get_feature_names())

156

In [49]:
# list all the class variables and method in the count vectorizer class

dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_sort_features',
 '_stop_words_id',
 '_validate_custom_analyzer',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words'

In [50]:
# list the dictionary of words and indices in vocabulary
vect.vocabulary_

{'took': 138,
 'excellent': 36,
 'perfect': 96,
 'outside': 93,
 'food': 42,
 'like': 68,
 'place': 99,
 'pretty': 101,
 'better': 13,
 'best': 12,
 've': 144,
 'sure': 127,
 'fresh': 44,
 'order': 91,
 'amazing': 2,
 'menu': 82,
 'tasty': 131,
 'delicious': 26,
 'came': 17,
 'bread': 16,
 'meal': 80,
 'wait': 146,
 'people': 95,
 'bad': 9,
 'probably': 104,
 'friend': 45,
 'thought': 135,
 'said': 112,
 'll': 70,
 'got': 52,
 'drink': 32,
 'prices': 103,
 'good': 51,
 'wanted': 148,
 'small': 120,
 'pizza': 98,
 'try': 141,
 'huge': 61,
 'awesome': 8,
 'sauce': 115,
 'home': 57,
 'great': 53,
 'things': 133,
 'love': 76,
 'selection': 118,
 'lot': 75,
 'scottsdale': 117,
 'clean': 21,
 'area': 3,
 'let': 67,
 'don': 31,
 'just': 63,
 'did': 27,
 'say': 116,
 'staff': 123,
 'inside': 62,
 'restaurant': 109,
 'want': 147,
 'eat': 34,
 'didn': 28,
 'ordered': 92,
 'salad': 113,
 'know': 65,
 'tried': 140,
 'service': 119,
 'long': 72,
 'time': 136,
 'minutes': 83,
 'really': 107,
 'wine'

In [62]:
#list all the stop words in the text data
vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### Get the no of features in count vector by using max_features parameter

In [63]:
# usually the max no of of features is used to set the required number of features. Ex: 500, 5000

vect = CountVectorizer(ngram_range=(1,2),stop_words='english', max_features=500)

In [64]:
vect.fit(X["text"])
X_features = vect.transform(X["text"])
X_features.shape

(10000, 500)

In [65]:
type(X_features)

scipy.sparse.csr.csr_matrix

In [70]:
# Create a dataframe with each text as each input row, 
# the columns as the features obtained out of the count vectorization of text column,
# the dataframe values as the count/frequency of the words in text for each row

In [67]:
#Analyze the kind of words that are obtained using count vectorization
#word cloud is a word representation which is usually used for analysis of common words in a text
#convert the X features into an array first and then into a dataframe
#the columns in the dataframe are the feature_names obatined from count vectorization

X_features_df = pd.DataFrame(X_features.toarray())
X_features_df.columns = vect.get_feature_names()

In [68]:
X_features_df.shape

(10000, 500)

In [69]:
#most of the values for each column are zeros, because not all words in each text will be present in the features

X_features_df.sample(10)

Unnamed: 0,00,10,12,15,20,30,50,able,absolutely,actually,...,working,worth,wouldn,wow,wrong,year,years,yelp,yes,yummy
8701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3936,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
424,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1481,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8321,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
6092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Get frequent words

In [72]:
# How often each of the words in the dataset are appearing?
# From the dataset, find the average count or average frequency of each of these words.
# Apply mean on every single column , ie every feature in the dataframe,
# Sort the data to find out the words that appeared most based on their average value
# and then find the average frequency of the words

In [73]:
word_freq = X_features_df.mean().sort_values(ascending=False)

In [74]:
#shape of word frequency should be same as no of features obtained from count vectorizer
word_freq.shape

(500,)

In [31]:
# Average frequency of each word in descending order gives the most frequent words in descending order
#The top 10 most frequent words are :
word_freq[0:10]

good       0.6801
place      0.6662
food       0.6184
great      0.5127
like       0.5041
just       0.4567
time       0.3504
really     0.3366
service    0.3169
ve         0.2594
dtype: float64

In [75]:
#we can look only at the indices if we want only the most frequent words and not the frequencies of the words
word_freq.index[0:100]

Index(['good', 'place', 'food', 'great', 'like', 'just', 'time', 'really',
       'service', 've', 'don', 'love', 'little', 'nice', 'best', 'got',
       'pretty', 'restaurant', 'chicken', 'try', 'ordered', 'menu', 'people',
       'know', 'bar', 'order', 'didn', 'think', 'better', 'friendly', 'make',
       'went', 'staff', 'night', 'did', 'way', 'going', 'cheese', 'pizza',
       'right', 'delicious', 'came', 'say', 'want', 'salad', 'lunch', 'come',
       'new', 'day', 'fresh', 'sauce', 'eat', 'll', 'experience', 'sure',
       'definitely', 'happy', 'wait', 'times', 'amazing', 'bit', 'meal',
       'area', 'said', 'bad', 'table', 'location', 'dinner', 'thing', 'lot',
       'hour', 'phoenix', 'small', 'big', 'prices', 'home', 'favorite',
       'sandwich', 'wasn', 'hot', 'tasty', 'burger', 'stars', 'drinks', 'feel',
       'drink', 'sweet', 'things', 'awesome', 'fries', 'store', 'beer', 'long',
       'wine', 'price', 'looking', 'atmosphere', 'minutes', 'worth', 'bread'],
      dty

In [76]:
#list of all the words and their frequencies
word_freq

good             0.6801
place            0.6662
food             0.6184
great            0.5127
like             0.5041
just             0.4567
time             0.3504
really           0.3366
service          0.3169
ve               0.2594
don              0.2518
love             0.2250
little           0.2221
nice             0.2217
best             0.1952
got              0.1831
pretty           0.1812
restaurant       0.1750
chicken          0.1746
try              0.1721
ordered          0.1705
menu             0.1678
people           0.1675
know             0.1613
bar              0.1602
order            0.1589
didn             0.1583
think            0.1552
better           0.1541
friendly         0.1527
                  ...  
feel like        0.0241
onion            0.0241
neighborhood     0.0240
eggs             0.0239
crispy           0.0239
pho              0.0238
corn             0.0237
doing            0.0237
service great    0.0235
girl             0.0234
glad            

## Building a classification model

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, X["sentiment"], random_state=2)

In [85]:
# import and instantiate a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)

In [86]:
# train the model using X_train_dtm
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [87]:
# make class predictions 
y_pred_class = rf.predict(X_test)

In [88]:
y_pred_class

array(['good', 'good', 'good', ..., 'good', 'good', 'bad'], dtype=object)

In [89]:
pred_df = pd.DataFrame({"actual":y_test, "pred":y_pred_class})

In [90]:
pred_df

Unnamed: 0,actual,pred
7878,good,good
3224,good,good
1919,bad,good
4432,good,good
4835,neutral,bad
4895,neutral,good
7269,good,good
1451,neutral,good
1742,good,good
4628,good,good


In [91]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7384

In [92]:
pd.crosstab(pred_df["actual"],pred_df["pred"])

pred,bad,good,neutral
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,152,270,8
good,28,1682,11
neutral,37,300,12


### Re-train the model after stemming the document. 
### Vectorization using tf-idf vectorizer

In [93]:
from nltk import word_tokenize          
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Stemming

In [94]:
stem = PorterStemmer()

In [95]:
text_stem = []
for review in yelp.text:
    text_stem.append(" ".join([stem.stem(x) for x in word_tokenize(review)]))

In [96]:
text_stem[0]

"My wife took me here on my birthday for breakfast and it wa excel . the weather wa perfect which made sit outsid overlook their ground an absolut pleasur . our waitress wa excel and our food arriv quickli on the semi-busi saturday morn . It look like the place fill up pretti quickli so the earlier you get here the better . Do yourself a favor and get their bloodi mari . It wa phenomen and simpli the best I 've ever had . I 'm pretti sure they onli use ingredi from their garden and blend them fresh when you order it . It wa amaz . while everyth on the menu look excel , I had the white truffl scrambl egg veget skillet and it wa tasti and delici . It came with 2 piec of their griddl bread with wa amaz and it absolut made the meal complet . It wa the best `` toast '' I 've ever had . anyway , I ca n't wait to go back !"

### Build a tf-idf vectorizer and vectorize the text

In [119]:
#vect = CountVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)
vect = TfidfVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)
vect.fit(text_stem)
X_features = vect.transform(text_stem)
X_features.shape

(10000, 1043)

### text classification

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_features, X["sentiment"], random_state=2)

In [100]:
# import and instantiate a Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)

In [101]:
# train the model using X_train_dtm
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [104]:
# make class predictions 
y_pred_class = rf.predict(X_test)

In [103]:
y_pred_class

array(['good', 'good', 'good', ..., 'good', 'good', 'bad'], dtype=object)

In [105]:
pred_df = pd.DataFrame({"actual":y_test, "pred":y_pred_class})

In [107]:
pred_df.head()

Unnamed: 0,actual,pred
7878,good,good
3224,good,good
1919,bad,good
4432,good,good
4835,neutral,bad


In [108]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7476

In [109]:
pd.crosstab(pred_df["actual"],pred_df["pred"])

pred,bad,good,neutral
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,151,274,5
good,11,1705,5
neutral,23,313,13


### Re-train the model after Lemmatizing the document.
### Vectorization using tf-idf vectorizer

In [112]:
from nltk import word_tokenize # tokenize the word
from nltk import pos_tag # identify the pos tags of words
from nltk.corpus import wordnet # wordnet is used for finding correct synonym of the POS tags
from nltk.stem import WordNetLemmatizer  # Lemmatize the word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Lemmatization

In [113]:
# function to return pos tags of words in a text

def get_wordnet_pos_tag(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [114]:
lemma = WordNetLemmatizer()

In [116]:
text_pos = []
text_words = []
text_postags = []
text_lemma = []
for review in yelp.text:
    text_pos = pos_tag(word_tokenize(review))
    text_words = [x[0] for x in text_pos]
    text_postags = [x[1] for x in text_pos]
    text_lemma.append(" ".join([lemma.lemmatize(a,get_wordnet_pos_tag(b)) for a,b in zip(text_words, text_postags)]))

In [117]:
text_lemma[0]

"My wife take me here on my birthday for breakfast and it be excellent . The weather be perfect which make sit outside overlook their ground an absolute pleasure . Our waitress be excellent and our food arrive quickly on the semi-busy Saturday morning . It look like the place fill up pretty quickly so the early you get here the good . Do yourself a favor and get their Bloody Mary . It be phenomenal and simply the best I 've ever have . I 'm pretty sure they only use ingredient from their garden and blend them fresh when you order it . It be amaze . While EVERYTHING on the menu look excellent , I have the white truffle scramble egg vegetable skillet and it be tasty and delicious . It come with 2 piece of their griddle bread with be amaze and it absolutely make the meal complete . It be the best `` toast '' I 've ever have . Anyway , I ca n't wait to go back !"

### Build a tf-idf vectorizer and vectorize the text

In [120]:
#vect = CountVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)
vect = TfidfVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)
vect.fit(text_lemma)
X_features = vect.transform(text_lemma)
X_features.shape

(10000, 915)

### text classification

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X_features, X["sentiment"], random_state=2)

In [122]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)

In [123]:
# train the model using X_train_dtm
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [124]:
# make class predictions 
y_pred_class = rf.predict(X_test)

In [125]:
y_pred_class

array(['good', 'good', 'good', ..., 'good', 'good', 'bad'], dtype=object)

In [129]:
pred_df = pd.DataFrame({"actual":y_test, "pred":y_pred_class})

In [130]:
pred_df.head()

Unnamed: 0,actual,pred
7878,good,good
3224,good,good
1919,bad,good
4432,good,good
4835,neutral,bad


In [127]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7428

In [128]:
pd.crosstab(pred_df["actual"],pred_df["pred"])

pred,bad,good,neutral
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,153,269,8
good,20,1693,8
neutral,27,311,11


### Information extraction using chunking

In [131]:
#import ghostscript
import os

In [132]:
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\gs\\gs9.51\\bin\\"

In [133]:
import nltk
from nltk import pos_tag, word_tokenize

In [134]:
# function to return noun phrases as chunks from a text

def return_np_chunks(sentence):
    grammar = "NP: {<JJ><NN>}"
    chunker = nltk.RegexpParser(grammar) 
    result = chunker.parse(pos_tag(word_tokenize(sentence))) 
    
    noun_phrases = []
    for n in result:
        if isinstance(n, nltk.tree.Tree):               
            if n.label() == 'NP':
                temp_phrase = list(n)
                temp_phrase = ' '.join([x[0] for x in temp_phrase])
                noun_phrases.append(temp_phrase)
    return noun_phrases

In [135]:
## Running function on all yelp reviews

In [136]:
yelp_phrases = [return_np_chunks(x) for x in yelp["text"]]

In [137]:
yelp_phrases[0:5]

[['absolute pleasure', 'pretty sure', 'white truffle', 'vegetable skillet'],
 ['own fault', 'sweetish sauce'],
 [],
 ['wonderful job'],
 ['good egg', 'awesome staff']]

In [138]:
flat_list = []
for sublist in yelp_phrases:
    for item in sublist:
        flat_list.append(item)

In [139]:
pd.Series(flat_list).value_counts().head()

happy hour     480
first time     353
next time      244
great place    222
last night     193
dtype: int64

In [140]:
res = pd.DataFrame(pd.Series(flat_list).value_counts())

In [141]:
res.head()

Unnamed: 0,0
happy hour,480
first time,353
next time,244
great place,222
last night,193


In [142]:
res.to_csv("res.csv")