In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# The Colab authentication call runs first; the credential lookup below is
# issued only after it has completed.
auth.authenticate_user()

# Attach the application-default credentials to a PyDrive auth object and
# build the `drive` client that the following cells use for listing/downloading.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [2]:
# Drive query: every non-trashed item whose parent is the root folder.
root_query = {'q': "'root' in parents and trashed=false"}
file_list = drive.ListFile(root_query).GetList()

# Print each item's title with its Drive id so the dataset's id can be looked up.
for file1 in file_list:
    print('title: %s, id: %s' % (file1['title'], file1['id']))


title: yelp_review.csv, id: 1Pb-HyMgtDS7YLKy4fvHTmijQ8e5btxhf
title: assignment2_final.ipynb, id: 1bUcuXQXc7yxQJp3syyV1SKQrXhkyh5Pd
title: assignment2_final.ipynb, id: 1BzTVuxNs89te-CweYhAmmvAsoyJUVKyy
title: assignment2.ipynb, id: 1GgWimMgPEgfs5yK0wPE9ldZTOaDQIx7T
title: assignment2, id: 1GQ42YnoJY06wW6Boe6Md7hmX2mGcsCTv
title: TwitterAPIadnauseam.ipynb, id: 17CEE7esgukbxdksSofGcn0xtZLiCQL9z
title: Colab Notebooks, id: 1ifvtXVHfqG_RStAHTvCKFxq_EIVndFdI
title: Untitled0.ipynb, id: 1eao2LiZo6_CjDu8Fc7NRN4uXuFEX-tQK
title: Pics, id: 0B4Z5tSBhDSLqWnNGc2pCU3loakE
title: DSC_3425.JPG, id: 0B4Z5tSBhDSLqcTA2MWpkMVR4Zjg
title: mobile pics, id: 0B4Z5tSBhDSLqNmdYeWhUNGVmS3c
title: DSC_1395.jpg, id: 0B4Z5tSBhDSLqUHZ2cHRzdXZ2NE0
title: DSC_1393.jpg, id: 0B4Z5tSBhDSLqWnI3WXY2bTJiU3c
title: DSC_1392.jpg, id: 0B4Z5tSBhDSLqbkVzOExfQThMS1E
title: novofest, id: 0B4Z5tSBhDSLqflpZUy1XLWdNY3BsNlM3QkRyOHpMSHNGd0htb2NUb0gxTzhhVmJjZ29rN00


In [3]:
# Build a PyDrive handle for the dataset from its Drive id (taken from the
# listing printed by the previous cell), then echo its title and MIME type.
dataset_ref = {'id': "1Pb-HyMgtDS7YLKy4fvHTmijQ8e5btxhf"}
file2 = drive.CreateFile(dataset_ref)
print('title: %s, mimeType: %s' % (file2['title'], file2['mimeType']))

title: yelp_review.csv, mimeType: text/csv


In [0]:
# Save the Drive file's content locally as 'yelp_review.csv'; the next cell
# reads the dataset from that path.
file2.GetContentFile('yelp_review.csv') 

In [0]:
# Step 0: Receiving and reading the file.
import pandas as pd
# Load the downloaded CSV into a DataFrame; later cells work on a slice of it.
reviews = pd.read_csv('yelp_review.csv')

In [6]:
# Keep only the first 200,000 rows of the dataset for the rest of the analysis.
xyz = reviews[:200000]
# Only the last expression of a cell is rendered, so the former bare
# `xyz.head()` call in the middle of this cell produced nothing and was dropped.
xyz.shape


(200000, 9)

In [7]:
# Keep just the two columns the classifiers need: the review text (feature
# source) and the star rating (label).
relevant_columns = ['text', 'stars']
review = xyz.loc[:, relevant_columns]
review.head()

Unnamed: 0,text,stars
0,Super simple place but amazing nonetheless. It...,5
1,Small unassuming place that changes their menu...,5
2,Lester's is located in a beautiful neighborhoo...,5
3,Love coming here. Yes the place always needs t...,4
4,Had their chocolate almond croissant and it wa...,4


In [8]:
# Number of reviews per star rating, to see how balanced the classes are.
review['stars'].value_counts()

5    86501
4    46788
1    27375
3    22779
2    16557
Name: stars, dtype: int64

In [9]:
# Features (raw review text) and labels (star rating) for the classifiers.
X = review["text"]
y = review.stars
# Only the last expression of a cell is rendered, so the former bare `X.shape`
# line displayed nothing and was dropped; the cell output is unchanged.
y.shape

(200000,)

In [0]:
#STEP 1/2: PREPROCESSING AND FEATURES EXTRACTION.
#STEMMING OF DOC USING NLTK

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

# Shared Snowball stemmer instance used by StemmedTfidfVectorizer below.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it emits.

    The parent class's analyzer runs first; each item it yields is then passed
    through the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping one document to its list of stemmed tokens."""
        base_analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        # NOTE(review): when ngram_range goes beyond unigrams the parent analyzer
        # presumably yields multi-word strings, so stem() would receive the whole
        # n-gram rather than each word -- confirm this is intended.
        def stemmed_tokens(doc):
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


In [0]:

# Preprocessing and feature extraction.
# Features come from a TF-IDF vectorizer, i.e. a count vectorizer combined
# with a TF-IDF transformer.
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

# Unigram vectorizer (default n-gram range) with English stop words removed.
vectorizer_1 = StemmedTfidfVectorizer(stop_words='english')

In [0]:
X_train_dtm = vectorizer_1.fit_transform(X_train)
# "dtm" stands for document-term matrix.
# Fitting the vectorizer and transforming the training text happen in a single step.


In [13]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and new versions. tolist() keeps `tokens` a plain list.
if hasattr(vectorizer_1, 'get_feature_names_out'):
    tokens = vectorizer_1.get_feature_names_out().tolist()
else:
    tokens = vectorizer_1.get_feature_names()
print(len(tokens))
# number of features in unigram

81363


In [0]:
# Transform the test text with the vectorizer already fitted on the training
# set (transform only, no re-fitting).
X_test_dtm = vectorizer_1.transform(X_test)

In [0]:
# Repeat the feature extraction with unigrams and bigrams (ngram_range=(1, 2)).
vectorizer_2 = StemmedTfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)
# Fit on the training text and build its document-term matrix in one step.
X_train_dtm_2 = vectorizer_2.fit_transform(X_train)


In [16]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and new versions. tolist() keeps `tokens_2` a plain
# list, so slicing and printing it later looks the same as before.
if hasattr(vectorizer_2, 'get_feature_names_out'):
    tokens_2 = vectorizer_2.get_feature_names_out().tolist()
else:
    tokens_2 = vectorizer_2.get_feature_names()
print(len(tokens_2))
#number of features in bigram

2849324


In [0]:
# Transform the test text with the bigram vectorizer already fitted on the
# training set (transform only, no re-fitting).
X_test_dtm_2 = vectorizer_2.transform(X_test)

In [18]:
# Peek at a small window of the bigram vocabulary to see what the features look like.
bigram_sample = tokens_2[200000:200059]
print(bigram_sample)

['bag 25', 'bag 30', 'bag 3rd', 'bag 48hr', 'bag 50', 'bag 60', 'bag 600', 'bag 75', 'bag 99', 'bag abercrombi', 'bag absurd', 'bag accessori', 'bag accus', 'bag actu', 'bag ad', 'bag adjust', 'bag adv', 'bag afterward', 'bag ag', 'bag aliant', 'bag alleg', 'bag allegi', 'bag almond', 'bag amaz', 'bag amen', 'bag anxi', 'bag apolog', 'bag appear', 'bag appet', 'bag appl', 'bag approach', 'bag area', 'bag arriv', 'bag ask', 'bag ass', 'bag assassin', 'bag assort', 'bag athletet', 'bag attach', 'bag attempt', 'bag authent', 'bag avail', 'bag avocado', 'bag aw', 'bag away', 'bag awesom', 'bag babi', 'bag bad', 'bag bag', 'bag bagel', 'bag bak', 'bag balaclava', 'bag bar', 'bag barbequ', 'bag bargain', 'bag bas', 'bag bean', 'bag beef', 'bag beer']


In [19]:
# STEP 3: SUPERVISED LEARNING / EVALUATION
# Multinomial Naive Bayes on the unigram features.
from sklearn.naive_bayes import MultinomialNB

# fit() hands back the estimator itself, so ending the cell with `nb1`
# renders the same model summary as before.
nb1 = MultinomialNB().fit(X_train_dtm, y_train)
nb1



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Multinomial Naive Bayes on the unigram+bigram features.
# fit() hands back the estimator itself, so ending the cell with `nb2`
# renders the same model summary as before.
nb2 = MultinomialNB().fit(X_train_dtm_2, y_train)
nb2

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Predicted star ratings for the held-out reviews from the unigram NB model.
y_pred_nb1 = nb1.predict(X_test_dtm)


In [0]:
# Predicted star ratings for the held-out reviews from the bigram NB model.
y_pred_nb2 = nb2.predict(X_test_dtm_2)

In [23]:
from sklearn.metrics import f1_score

# Weighted F1 score of the unigram Naive Bayes model on the test split.
f1_score(y_true=y_test, y_pred=y_pred_nb1, average='weighted')


0.43847600823977745

In [24]:
# F1 score for bigram NB
# Some classes receive no predictions from this model, which makes f1_score
# emit a warning. Warnings are silenced only while this one score is computed
# (instead of for the whole session), so later cells still report their own.
import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    f1_nb2 = f1_score(y_test, y_pred_nb2, average='weighted')

# A bare expression inside the `with` block would not be rendered, so the
# score is stored and shown here as the cell's last expression.
f1_nb2


0.28645948412433225

In [25]:
# Logistic regression on the unigram features.
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator itself, so ending the cell with `lr1`
# renders the same model summary as before.
lr1 = LogisticRegression(class_weight='balanced').fit(X_train_dtm, y_train)
lr1


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [0]:
# Predicted star ratings for the held-out reviews from the unigram LR model.
y_pred_lr1 = lr1.predict(X_test_dtm)

In [27]:
# Logistic regression on the unigram+bigram features.
# fit() hands back the estimator itself, so ending the cell with `lr2`
# renders the same model summary as before.
lr2 = LogisticRegression(class_weight='balanced').fit(X_train_dtm_2, y_train)
lr2

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [0]:
# Predicted star ratings for the held-out reviews from the bigram LR model.
y_pred_lr2 = lr2.predict(X_test_dtm_2)

In [29]:
# Weighted F1 score of the unigram logistic regression model on the test split.
f1_score(y_true=y_test, y_pred=y_pred_lr1, average='weighted')

0.6380637428163922

In [30]:
# Weighted F1 score of the bigram logistic regression model on the test split.
f1_score(y_true=y_test, y_pred=y_pred_lr2, average='weighted')

0.6486225428994348

In [0]:
# RANDOM FOREST CLASSIFICATION, tuned with a grid search.
# GridSearchCV is imported from sklearn.model_selection (the module this
# notebook already uses for train_test_split): the old sklearn.grid_search
# module was deprecated and then removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier

# Candidate hyper-parameter values; 4 x 4 x 4 = 64 combinations in total.
param_grid = {
    'max_depth': [45, 65, 95, 100],
    "min_samples_leaf": [5, 8, 11, 14],
    "min_samples_split": [5, 8, 11, 14],
}
clf = RandomForestClassifier(class_weight="balanced")

# Candidates are ranked by weighted F1, the same metric reported for the
# Naive Bayes and logistic regression models.
grid_obj1 = GridSearchCV(clf, param_grid, scoring="f1_weighted")

In [0]:
# Run the grid search on the unigram TF-IDF training matrix and keep the
# fitted search object for inspection.
grid_fit1 = grid_obj1.fit(X_train_dtm, y_train)

In [0]:
# Hyper-parameter combination reported as best by the fitted grid search.
grid_best_1 = grid_fit1.best_params_