<a href="https://colab.research.google.com/github/giaayutari/bigdata/blob/main/1_Restaurant_Reviews.tsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
import pandas as pd

In [26]:
df = pd.read_csv('1_Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [27]:
from sklearn.model_selection import train_test_split
X = df['Review']
y = df['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [28]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def text_process(document):
    document = re.sub('[^a-zA-Z]', ' ', document)
    document = document.lower()
    document = document.split()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    document = [word for word in document if not word in set(all_stopwords)]
    ps = PorterStemmer()
    document = [ps.stem(word) for word in document]
    return document

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [29]:
rf_param_grid = {
    'bag_of_words__ngram_range': [(1, 1)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    'estimator__criterion': ['gini'],
    'estimator__n_estimators': [100, 300]
}

nb_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    'estimator__alpha': [0.01, 1.0]
}

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

rf_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', RandomForestClassifier())
])

nb_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', MultinomialNB())
])

In [31]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(rf_pipe, rf_param_grid, verbose=2, cv=2)
rf_grid.fit(X_train, y_train)

nb_grid = GridSearchCV(nb_pipe, nb_param_grid, verbose=2, cv=2)
nb_grid.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.5s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.8s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=300; total time=   1.5s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=300; total time=   1.1s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.4s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngr



[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.5s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.4s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.5s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.4s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.5s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.3s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.3s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.3s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.3s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.3s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s


In [32]:
rf_grid.best_params_

{'bag_of_words__max_df': 1.0,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__criterion': 'gini',
 'estimator__n_estimators': 300}

In [33]:
nb_grid.best_params_

{'bag_of_words__max_df': 0.85,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__alpha': 0.01}

In [35]:
rf_y_pred = rf_grid.predict(X_test)
nb_y_pred = nb_grid.predict(X_test)

In [36]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test, rf_y_pred).ravel()
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, rf_y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, rf_y_pred))
print('Precision:', metrics.precision_score(y_test, rf_y_pred))
print('Recall:', metrics.recall_score(y_test, rf_y_pred))
print('F1-Score:', metrics.f1_score(y_test, rf_y_pred))
print(metrics.classification_report(y_test, rf_y_pred)) # Better for multiclass problem

Confusion matrix:
 [[104  30]
 [ 36  80]]
Accuracy: 0.736
Precision: 0.7272727272727273
Recall: 0.6896551724137931
F1-Score: 0.7079646017699116
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       134
           1       0.73      0.69      0.71       116

    accuracy                           0.74       250
   macro avg       0.74      0.73      0.73       250
weighted avg       0.74      0.74      0.74       250



In [37]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test, nb_y_pred).ravel()
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, nb_y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, nb_y_pred))
print('Precision:', metrics.precision_score(y_test, nb_y_pred))
print('Recall:', metrics.recall_score(y_test, nb_y_pred))
print('F1-Score:', metrics.f1_score(y_test, nb_y_pred))
print(metrics.classification_report(y_test, nb_y_pred)) # Better for multiclass problem

Confusion matrix:
 [[89 45]
 [27 89]]
Accuracy: 0.712
Precision: 0.664179104477612
Recall: 0.7672413793103449
F1-Score: 0.7120000000000001
              precision    recall  f1-score   support

           0       0.77      0.66      0.71       134
           1       0.66      0.77      0.71       116

    accuracy                           0.71       250
   macro avg       0.72      0.72      0.71       250
weighted avg       0.72      0.71      0.71       250



In [38]:
rf_grid.best_params_

{'bag_of_words__max_df': 1.0,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__criterion': 'gini',
 'estimator__n_estimators': 300}

In [39]:
rf_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process, max_df=1.0, min_df=0.01, ngram_range=(1,1))),
    ('tf_idf', TfidfTransformer()),
    ('estimator', RandomForestClassifier(n_estimators=300, criterion='gini'))
])

rf_pipe.fit(X_train, y_train)

In [41]:
feature_importance = pd.DataFrame(rf_pipe.steps[2][1].feature_importances_,
                                  rf_pipe.steps[0][1].get_feature_names_out(),
                                  columns=['importance'])
feature_importance.sort_values('importance', ascending=False).head(20)

Unnamed: 0,importance
not,0.072989
great,0.063918
good,0.059811
love,0.027718
amaz,0.026897
delici,0.023849
servic,0.023648
nice,0.022512
friendli,0.021917
place,0.020886
