In [116]:

import nltk

In [None]:
# Step 0: read the raw Yelp review dump into a DataFrame.
import pandas as pd

reviews = pd.read_csv(filepath_or_buffer='yelp_review.csv')

In [None]:
# Work on the first 200,000 rows only (presumably to keep the later
# vectorisation and model fitting manageable -- confirm before changing).
xyz = reviews[:200000]
# A notebook cell renders only its final expression, so the original
# `xyz.head()` preview in the middle of the cell was silently discarded.
# Print the shape explicitly and leave the preview last so both are shown.
print(xyz.shape)
xyz.head()


In [None]:
# Keep just the two columns used downstream: the review body and its rating.
relevant_columns = ['text', 'stars']
review = xyz[relevant_columns]
review.head()

In [None]:
# Class-balance check: how many reviews fall under each star rating.
review['stars'].value_counts()

In [None]:
# Features are the raw review texts; the target is the star rating.
X = review["text"]
y = review.stars
# Only a cell's final expression is rendered, so the original bare `X.shape`
# line produced no output. Show both shapes together instead.
X.shape, y.shape

In [67]:
#STEP 1/2: PREPROCESSING AND FEATURES EXTRACTION.
#STEMMING OF DOC USING NLTK
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

# Single shared Snowball stemmer, reused by every analyzer built below.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer output is run through the English stemmer.

    The parent analyzer is used unchanged; each item it yields for a document
    is passed to ``english_stemmer.stem`` as-is.
    """

    def build_analyzer(self):
        """Return a callable mapping one document to its list of stemmed items."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [english_stemmer.stem(item) for item in base_analyzer(doc)]

        return stemmed_analyzer


In [68]:

# The TF-IDF vectorizer combines a count vectorizer with a TF-IDF transformer,
# so a single object handles both preprocessing and feature extraction.
from sklearn.model_selection import train_test_split

# Hold out a quarter of the reviews for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

# Unigram features
vectorizer_1 = StemmedTfidfVectorizer(stop_words='english')

In [69]:
# Fit the unigram vectorizer on the training texts and vectorise them in the
# same call.
X_train_dtm = vectorizer_1.fit_transform(X_train)
# "dtm" stands for document-term matrix.
# fit_transform does the fitting and the transformation in a single step.


In [70]:
# Vocabulary learned by the unigram vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new accessor when it exists and
# fall back to the old one otherwise. .tolist() keeps `tokens` a plain list of
# str, exactly as the old method returned.
if hasattr(vectorizer_1, "get_feature_names_out"):
    tokens = vectorizer_1.get_feature_names_out().tolist()
else:
    tokens = vectorizer_1.get_feature_names()
print(len(tokens))  # number of features in the unigram model

81363


In [71]:
# Vectorise the held-out texts with the vectorizer already fitted on the
# training split (transform only -- nothing is refitted on test data).
X_test_dtm = vectorizer_1.transform(X_test)

In [72]:
# Repeat the feature extraction with bigrams added: ngram_range=(1,2) keeps
# the unigrams and adds two-word sequences on top of them.
vectorizer_2 =  StemmedTfidfVectorizer(stop_words="english", ngram_range=(1,2))
X_train_dtm_2 = vectorizer_2.fit_transform(X_train)


In [73]:
# Vocabulary learned by the unigram+bigram vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new accessor when it exists and
# fall back to the old one otherwise. .tolist() keeps `tokens_2` a plain list
# of str so later slicing and printing behave as before.
if hasattr(vectorizer_2, "get_feature_names_out"):
    tokens_2 = vectorizer_2.get_feature_names_out().tolist()
else:
    tokens_2 = vectorizer_2.get_feature_names()
print(len(tokens_2))  # number of features in the unigram+bigram model

2849324


In [74]:
# Vectorise the held-out texts with the unigram+bigram vectorizer fitted on
# the training split (transform only).
X_test_dtm_2 = vectorizer_2.transform(X_test)

In [75]:
# Spot-check an arbitrary slice of the unigram+bigram vocabulary.
print(tokens_2[200000:200059])

['bag 25', 'bag 30', 'bag 3rd', 'bag 48hr', 'bag 50', 'bag 60', 'bag 600', 'bag 75', 'bag 99', 'bag abercrombi', 'bag absurd', 'bag accessori', 'bag accus', 'bag actu', 'bag ad', 'bag adjust', 'bag adv', 'bag afterward', 'bag ag', 'bag aliant', 'bag alleg', 'bag allegi', 'bag almond', 'bag amaz', 'bag amen', 'bag anxi', 'bag apolog', 'bag appear', 'bag appet', 'bag appl', 'bag approach', 'bag area', 'bag arriv', 'bag ask', 'bag ass', 'bag assassin', 'bag assort', 'bag athletet', 'bag attach', 'bag attempt', 'bag authent', 'bag avail', 'bag avocado', 'bag aw', 'bag away', 'bag awesom', 'bag babi', 'bag bad', 'bag bag', 'bag bagel', 'bag bak', 'bag balaclava', 'bag bar', 'bag barbequ', 'bag bargain', 'bag bas', 'bag bean', 'bag beef', 'bag beer']


In [76]:
# Step 3: supervised learning and evaluation.
# Multinomial naive Bayes trained on the unigram TF-IDF features.
from sklearn.naive_bayes import MultinomialNB

nb1 = MultinomialNB().fit(X_train_dtm, y_train)
nb1  # fit() returns the estimator itself; echo it as the cell output



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
# Multinomial naive Bayes trained on the unigram+bigram TF-IDF features.
nb2 = MultinomialNB().fit(X_train_dtm_2, y_train)
nb2  # fit() returns the estimator itself; echo it as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [78]:
# Predicted star ratings for the held-out reviews (unigram naive Bayes).
y_pred_nb1 = nb1.predict(X_test_dtm)


In [79]:
# Predicted star ratings for the held-out reviews (unigram+bigram naive Bayes).
y_pred_nb2 = nb2.predict(X_test_dtm_2)

In [80]:
from sklearn.metrics import f1_score
# F1 score for unigram NB. average='weighted' averages the per-class F1
# scores, weighting each class by its number of true instances.
f1_score(y_test, y_pred_nb1, average= 'weighted')


0.43847600823977745

In [81]:
# F1 score for bigram NB.
# Classes that receive no predictions make scikit-learn emit an
# UndefinedMetricWarning. Silence only that category, and only around this
# call: the original global warnings.filterwarnings('ignore') also muted every
# later warning in the session (deprecations, convergence problems, ...).
import warnings
from sklearn.exceptions import UndefinedMetricWarning

with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=UndefinedMetricWarning)
    f1_nb2 = f1_score(y_test, y_pred_nb2, average='weighted')
# Expressions inside a `with` block are not rendered, so echo the score here.
f1_nb2


0.28645948412433225

In [82]:
# Logistic regression trained on the unigram TF-IDF features, with
# class_weight='balanced' passed to the estimator.
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression(class_weight='balanced').fit(X_train_dtm, y_train)
lr1  # fit() returns the estimator itself; echo it as the cell output


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [83]:
# Predicted star ratings for the held-out reviews (unigram logistic regression).
y_pred_lr1 = lr1.predict(X_test_dtm)

In [84]:
# Logistic regression trained on the unigram+bigram TF-IDF features, with the
# same class_weight='balanced' setting as lr1.
lr2 = LogisticRegression(class_weight='balanced').fit(X_train_dtm_2, y_train)
lr2  # fit() returns the estimator itself; echo it as the cell output

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [85]:
# Predicted star ratings for the held-out reviews (unigram+bigram logistic regression).
y_pred_lr2 = lr2.predict(X_test_dtm_2)

In [86]:
# F1 score for unigram LR (weighted average, same metric as used for NB).
f1_score(y_test, y_pred_lr1, average= 'weighted')

0.63806374281639222

In [87]:
# F1 score for bigram LR (weighted average, same metric as used for NB).
f1_score(y_test, y_pred_lr2, average= 'weighted')

0.64862254289943477

In [None]:
# RANDOM FOREST CLASSIFICATION (hyper-parameter grid search)
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20.
# GridSearchCV lives in sklearn.model_selection, the same module this notebook
# already imports train_test_split from.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier

# Candidate values tried exhaustively: 4 * 3 * 3 = 36 combinations.
param_grid = {
    'max_depth': [20, 30, 35, 40],
    "min_samples_leaf": [5, 10, 15],
    "min_samples_split": [5, 10, 15],
}
# Seed the forest so that repeated searches are reproducible; without it the
# selected parameters can change from run to run.
clf = RandomForestClassifier(class_weight="balanced", random_state=5)

# Search object for the unigram features, scored with weighted F1 like the
# other models in this notebook.
grid_obj1 = GridSearchCV(clf, param_grid, scoring="f1_weighted")

In [None]:
# Run the grid search on the unigram training features.
grid_fit1 = grid_obj1.fit(X_train_dtm, y_train)

In [None]:
# Parameter combination selected by the unigram search.
grid_best_1 = grid_fit1.best_params_

In [None]:
# Display the best parameters found for the unigram features.
grid_best_1

In [None]:
# Second search object, for the unigram+bigram features; same estimator,
# parameter grid and scoring as grid_obj1.
grid_obj2 = GridSearchCV(clf, param_grid, scoring="f1_weighted")

In [None]:
# Run the grid search on the unigram+bigram training features.
grid_fit2 = grid_obj2.fit(X_train_dtm_2, y_train)

In [None]:
# Parameter combination selected by the unigram+bigram search.
grid_best_2 = grid_fit2.best_params_