In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,confusion_matrix,recall_score,f1_score,accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
# importing csv file

amazon_df=pd.read_csv('Amazon_Reviews.csv')
amazon_df

Unnamed: 0,Review,Label
0,Stuning even for the non-gamer: This sound tr...,1
1,The best soundtrack ever to anything.: I'm re...,1
2,Amazing!: This soundtrack is my favorite musi...,1
3,Excellent Soundtrack: I truly like this sound...,1
4,"Remember, Pull Your Jaw Off The Floor After H...",1
...,...,...
194,A Book That Is Worth a Second Look: This book...,1
195,Best game ever: This games makes even amazing...,1
196,Guitar in Absentia: With all due respect to a...,0
197,Stiff and Smells like drying paint: You get w...,0


In [3]:
# assigning the output column to y variable and drop the column from dataframe
y=amazon_df['Label']
amazon_df.drop(columns='Label', inplace=True)

In [4]:
# train_test_split the amazon_df
x_train,x_test,y_train,y_test=train_test_split(amazon_df,y, test_size=0.2, random_state=42)


In [5]:
regexp=RegexpTokenizer(r'\w+') # r'\w+' eliminates the special characters
stopwords_en=stopwords.words('english') # to remove the stopwords from the text
lemmatizer=WordNetLemmatizer()
vectorizer=TfidfVectorizer()  
stopwords_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
vectorizer.decode(x_train['Review'][1])

" The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."

In [7]:
def preprocessing(review):
    tokens=regexp.tokenize(review)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in stopwords_en]
    lemma_tokens=[lemmatizer.lemmatize(pure_token, pos='v') for pure_token in pure_tokens]
    return ' '.join(lemma_tokens)

In [8]:
# applying preprocessing steps on x_train and x_test
x_train['Review']=x_train['Review'].apply(preprocessing)
x_test['Review']=x_test['Review'].apply(preprocessing)

In [9]:
# TF-IDF vectorization
x_train_tfidf=vectorizer.fit_transform(x_train['Review'])
x_test_tfidf=vectorizer.transform(x_test['Review'])

In [10]:
# Logistic Regression
logreg=LogisticRegression()
logreg.fit(x_train_tfidf,y_train)
logreg_pred=logreg.predict(x_test_tfidf)

In [11]:
# Evaluation of Logistic Regression
confusion_matrix(y_test,logreg_pred)
Recall_LR=recall_score(y_test,logreg_pred)
Precision_LR=precision_score(y_test,logreg_pred)
accuracy_score(y_test,logreg_pred)
F1_Score_LR=f1_score(y_test,logreg_pred)

In [12]:
tree=DecisionTreeClassifier()
forest=RandomForestClassifier(random_state=42)
adaboost=AdaBoostClassifier(random_state=42)
gradientbc=GradientBoostingClassifier(random_state=42)
svc=SVC(random_state=42)
multinomial=MultinomialNB()

In [13]:
# Decision Tree Classifier and evaluation
tree.fit(x_train_tfidf,y_train)
tree_pred=tree.predict(x_test_tfidf)
train_score_tree=tree.score(x_train_tfidf,y_train)
test_score_tree=tree.score(x_test_tfidf,y_test)
F1_Score_DT=f1_score(y_test,tree_pred)


In [14]:
Precision_DT=precision_score(y_test,tree_pred)
Recall_DT=recall_score(y_test,tree_pred)

In [15]:
# Random Forest Classifier with GridSearchCV and F1 Score
param_dict={'n_estimators': [15,20,50],
           'max_depth':[4,6,8],
           'min_samples_split':[2,3,4],
           'max_features':['auto','sqrt','log2']}
gcv_forest=GridSearchCV(forest,param_grid=param_dict,cv=5,verbose=5)
gcv_forest.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15, score=0.531, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15, score=0.688, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15, score=0.594, total=   0.0s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15, score=0.594, total=   0.0s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=15, score=0.645, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20, score=0.562, total=   0.0s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20, score=0.656, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20, score=0.562, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20, score=0.625, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=20, score=0.645, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=50, score=0.531, total=   0.1s
[CV] max_depth=4, max_features=auto, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=auto, min_samples_split=2, n_estimators=50, score=0.625, total=   0.1s
[CV] ma

[CV]  max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.531, total=   0.1s
[CV] max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.625, total=   0.1s
[CV] max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.562, total=   0.1s
[CV] max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.688, total=   0.1s
[CV] max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.548, total=   0.1s
[CV] max_depth=4, max_features=sqrt, min_samples_split=3, n_estimators=15 
[CV]  max_depth=4, max_features=sqrt, min_samples_split=3, n_estimators=15, score=0.531, total=   0.0s
[CV] ma

[CV]  max_depth=4, max_features=log2, min_samples_split=2, n_estimators=50, score=0.562, total=   0.1s
[CV] max_depth=4, max_features=log2, min_samples_split=2, n_estimators=50 
[CV]  max_depth=4, max_features=log2, min_samples_split=2, n_estimators=50, score=0.645, total=   0.1s
[CV] max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15 
[CV]  max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15, score=0.531, total=   0.0s
[CV] max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15 
[CV]  max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15, score=0.594, total=   0.0s
[CV] max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15 
[CV]  max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15, score=0.531, total=   0.0s
[CV] max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15 
[CV]  max_depth=4, max_features=log2, min_samples_split=3, n_estimators=15, score=0.594, total=   0.0s
[CV] ma

[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=15, score=0.613, total=   0.1s
[CV] max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20 
[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20, score=0.500, total=   0.1s
[CV] max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20 
[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20, score=0.750, total=   0.1s
[CV] max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20 
[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20, score=0.531, total=   0.0s
[CV] max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20 
[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20, score=0.562, total=   0.1s
[CV] max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20 
[CV]  max_depth=6, max_features=auto, min_samples_split=3, n_estimators=20, score=0.581, total=   0.1s
[CV] ma

[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=20, score=0.581, total=   0.1s
[CV] max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50, score=0.500, total=   0.1s
[CV] max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50, score=0.719, total=   0.1s
[CV] max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50, score=0.625, total=   0.1s
[CV] max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50, score=0.688, total=   0.1s
[CV] max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=sqrt, min_samples_split=3, n_estimators=50, score=0.613, total=   0.1s
[CV] ma

[CV]  max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50, score=0.594, total=   0.2s
[CV] max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50, score=0.562, total=   0.1s
[CV] max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50, score=0.625, total=   0.1s
[CV] max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50 
[CV]  max_depth=6, max_features=log2, min_samples_split=3, n_estimators=50, score=0.645, total=   0.1s
[CV] max_depth=6, max_features=log2, min_samples_split=4, n_estimators=15 
[CV]  max_depth=6, max_features=log2, min_samples_split=4, n_estimators=15, score=0.594, total=   0.0s
[CV] max_depth=6, max_features=log2, min_samples_split=4, n_estimators=15 
[CV]  max_depth=6, max_features=log2, min_samples_split=4, n_estimators=15, score=0.625, total=   0.0s
[CV] ma

[CV]  max_depth=8, max_features=auto, min_samples_split=3, n_estimators=50, score=0.656, total=   0.1s
[CV] max_depth=8, max_features=auto, min_samples_split=3, n_estimators=50 
[CV]  max_depth=8, max_features=auto, min_samples_split=3, n_estimators=50, score=0.581, total=   0.1s
[CV] max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15 
[CV]  max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15, score=0.469, total=   0.0s
[CV] max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15 
[CV]  max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15, score=0.625, total=   0.1s
[CV] max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15 
[CV]  max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15, score=0.562, total=   0.0s
[CV] max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15 
[CV]  max_depth=8, max_features=auto, min_samples_split=4, n_estimators=15, score=0.562, total=   0.1s
[CV] ma

[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=15, score=0.710, total=   0.0s
[CV] max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20, score=0.500, total=   0.1s
[CV] max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20, score=0.781, total=   0.1s
[CV] max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20, score=0.562, total=   0.1s
[CV] max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20, score=0.562, total=   0.0s
[CV] max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=sqrt, min_samples_split=4, n_estimators=20, score=0.645, total=   0.1s
[CV] ma

[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=20, score=0.656, total=   0.1s
[CV] max_depth=8, max_features=log2, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=20, score=0.625, total=   0.0s
[CV] max_depth=8, max_features=log2, min_samples_split=4, n_estimators=20 
[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=20, score=0.581, total=   0.0s
[CV] max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50 
[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50, score=0.531, total=   0.1s
[CV] max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50 
[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50, score=0.625, total=   0.1s
[CV] max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50 
[CV]  max_depth=8, max_features=log2, min_samples_split=4, n_estimators=50, score=0.562, total=   0.1s
[CV] ma

[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:   30.2s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [4, 6, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [15, 20, 50]},
             verbose=5)

In [16]:
for_pred=gcv_forest.predict(x_test_tfidf)

In [17]:
F1_Score_RF=f1_score(y_test,for_pred)
Precision_RF=precision_score(y_test,for_pred)
Recall_RF=recall_score(y_test,for_pred)

In [18]:
# AdaBoost Classifier with GridSearchCV and F1 Score
para_dic={'learning_rate':[0.01,0.1,0.2,0.5],
          'n_estimators':[50,80,20,40]}
gcv_adaboost=GridSearchCV(adaboost,param_grid=para_dic,cv=5,verbose=5)
gcv_adaboost.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] learning_rate=0.01, n_estimators=50 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . learning_rate=0.01, n_estimators=50, score=0.656, total=   0.3s
[CV] learning_rate=0.01, n_estimators=50 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] . learning_rate=0.01, n_estimators=50, score=0.625, total=   0.2s
[CV] learning_rate=0.01, n_estimators=50 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] . learning_rate=0.01, n_estimators=50, score=0.500, total=   0.2s
[CV] learning_rate=0.01, n_estimators=50 .............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s


[CV] . learning_rate=0.01, n_estimators=50, score=0.469, total=   0.2s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV] . learning_rate=0.01, n_estimators=50, score=0.516, total=   0.2s
[CV] learning_rate=0.01, n_estimators=80 .............................
[CV] . learning_rate=0.01, n_estimators=80, score=0.688, total=   0.5s
[CV] learning_rate=0.01, n_estimators=80 .............................
[CV] . learning_rate=0.01, n_estimators=80, score=0.688, total=   0.4s
[CV] learning_rate=0.01, n_estimators=80 .............................
[CV] . learning_rate=0.01, n_estimators=80, score=0.562, total=   0.4s
[CV] learning_rate=0.01, n_estimators=80 .............................
[CV] . learning_rate=0.01, n_estimators=80, score=0.469, total=   0.4s
[CV] learning_rate=0.01, n_estimators=80 .............................
[CV] . learning_rate=0.01, n_estimators=80, score=0.548, total=   0.5s
[CV] learning_rate=0.01, n_estimators=20 .............................
[CV] .

[CV] .. learning_rate=0.5, n_estimators=50, score=0.656, total=   0.2s
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] .. learning_rate=0.5, n_estimators=50, score=0.594, total=   0.2s
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] .. learning_rate=0.5, n_estimators=50, score=0.531, total=   0.3s
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] .. learning_rate=0.5, n_estimators=50, score=0.452, total=   0.3s
[CV] learning_rate=0.5, n_estimators=80 ..............................
[CV] .. learning_rate=0.5, n_estimators=80, score=0.656, total=   0.5s
[CV] learning_rate=0.5, n_estimators=80 ..............................
[CV] .. learning_rate=0.5, n_estimators=80, score=0.656, total=   0.4s
[CV] learning_rate=0.5, n_estimators=80 ..............................
[CV] .. learning_rate=0.5, n_estimators=80, score=0.594, total=   0.4s
[CV] learning_rate=0.5, n_estimators=80 ..............................
[CV] .

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   18.8s finished


GridSearchCV(cv=5, estimator=AdaBoostClassifier(random_state=42),
             param_grid={'learning_rate': [0.01, 0.1, 0.2, 0.5],
                         'n_estimators': [50, 80, 20, 40]},
             verbose=5)

In [19]:
boost_pred=gcv_adaboost.predict(x_test_tfidf)

In [20]:
F1_Score_AdaBoost=f1_score(y_test,boost_pred)
Precision_AdaBoost=precision_score(y_test,boost_pred)
Recall_AdaBoost=recall_score(y_test,boost_pred)

In [21]:
pa_dict={'n_estimators':[10,20,30,25],
        'learning_rate':[0.1,0.02,0.5,1],
        'max_features':['auto','sqrt','log2']}

In [22]:
# GradientBoosting Classifier with GridSearchCV and F1 Score
gcv_gradientbc=GridSearchCV(gradientbc, param_grid=pa_dict, cv=5, verbose=5)
gcv_gradientbc.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] learning_rate=0.1, max_features=auto, n_estimators=10 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=10, score=0.656, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=10 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=10, score=0.750, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=10 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=10, score=0.656, total=   0.1s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s



[CV] learning_rate=0.1, max_features=auto, n_estimators=10 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=10, score=0.594, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=10 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=10, score=0.548, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=20 ...........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s


[CV]  learning_rate=0.1, max_features=auto, n_estimators=20, score=0.688, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=20 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=20, score=0.688, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=20 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=20, score=0.594, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=20 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=20, score=0.531, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=20 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=20, score=0.613, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=30 ...........
[CV]  learning_rate=0.1, max_features=auto, n_estimators=30, score=0.656, total=   0.1s
[CV] learning_rate=0.1, max_features=auto, n_estimators=30 ...........
[CV]  learning_rate=0.1, max_features=auto, n_

[CV]  learning_rate=0.02, max_features=auto, n_estimators=10, score=0.562, total=   0.1s
[CV] learning_rate=0.02, max_features=auto, n_estimators=10 ..........
[CV]  learning_rate=0.02, max_features=auto, n_estimators=10, score=0.531, total=   0.0s
[CV] learning_rate=0.02, max_features=auto, n_estimators=10 ..........
[CV]  learning_rate=0.02, max_features=auto, n_estimators=10, score=0.594, total=   0.0s
[CV] learning_rate=0.02, max_features=auto, n_estimators=10 ..........
[CV]  learning_rate=0.02, max_features=auto, n_estimators=10, score=0.548, total=   0.0s
[CV] learning_rate=0.02, max_features=auto, n_estimators=20 ..........
[CV]  learning_rate=0.02, max_features=auto, n_estimators=20, score=0.656, total=   0.1s
[CV] learning_rate=0.02, max_features=auto, n_estimators=20 ..........
[CV]  learning_rate=0.02, max_features=auto, n_estimators=20, score=0.656, total=   0.1s
[CV] learning_rate=0.02, max_features=auto, n_estimators=20 ..........
[CV]  learning_rate=0.02, max_features=a

[CV]  learning_rate=0.02, max_features=log2, n_estimators=30, score=0.562, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=30 ..........
[CV]  learning_rate=0.02, max_features=log2, n_estimators=30, score=0.548, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=25 ..........
[CV]  learning_rate=0.02, max_features=log2, n_estimators=25, score=0.531, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=25 ..........
[CV]  learning_rate=0.02, max_features=log2, n_estimators=25, score=0.531, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=25 ..........
[CV]  learning_rate=0.02, max_features=log2, n_estimators=25, score=0.531, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=25 ..........
[CV]  learning_rate=0.02, max_features=log2, n_estimators=25, score=0.562, total=   0.0s
[CV] learning_rate=0.02, max_features=log2, n_estimators=25 ..........
[CV]  learning_rate=0.02, max_features=l

[CV]  learning_rate=0.5, max_features=log2, n_estimators=20, score=0.500, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=20 ...........
[CV]  learning_rate=0.5, max_features=log2, n_estimators=20, score=0.594, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=20 ...........
[CV]  learning_rate=0.5, max_features=log2, n_estimators=20, score=0.688, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=20 ...........
[CV]  learning_rate=0.5, max_features=log2, n_estimators=20, score=0.656, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=20 ...........
[CV]  learning_rate=0.5, max_features=log2, n_estimators=20, score=0.581, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=30 ...........
[CV]  learning_rate=0.5, max_features=log2, n_estimators=30, score=0.719, total=   0.0s
[CV] learning_rate=0.5, max_features=log2, n_estimators=30 ...........
[CV]  learning_rate=0.5, max_features=log2, n_

[CV]  learning_rate=1, max_features=log2, n_estimators=10, score=0.645, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=20 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=20, score=0.562, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=20 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=20, score=0.688, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=20 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=20, score=0.594, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=20 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=20, score=0.656, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=20 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=20, score=0.613, total=   0.0s
[CV] learning_rate=1, max_features=log2, n_estimators=30 .............
[CV]  learning_rate=1, max_features=log2, n_estimators=30,

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   12.4s finished


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'learning_rate': [0.1, 0.02, 0.5, 1],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 20, 30, 25]},
             verbose=5)

In [23]:
grad_pred=gcv_gradientbc.predict(x_test_tfidf)

In [24]:
F1_Score_GradientBoost=f1_score(y_test,grad_pred)
Precision_Gradient=precision_score(y_test,grad_pred)
Recall_Gradient=recall_score(y_test,grad_pred)

In [25]:
p_dict={'C':[0.001,0.01,0.1,1,10],
     'gamma':['scale','auto'],
     'kernel':['linear','rbf','sigmoid']}

In [26]:
# Support Vector Machine(SVC) with GridSearchCV and F1 Score
gcv_svm=GridSearchCV(svc, param_grid=p_dict, cv=5,verbose=3)
gcv_svm.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, gamma=scale, kernel=linear .............................
[CV] . C=0.001, gamma=scale, kernel=linear, score=0.531, total=   0.1s
[CV] C=0.001, gamma=scale, kernel=linear .............................
[CV] . C=0.001, gamma=scale, kernel=linear, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=linear .............................
[CV] . C=0.001, gamma=scale, kernel=linear, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=linear .............................
[CV] . C=0.001, gamma=scale, kernel=linear, score=0.562, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=linear .............................
[CV] . C=0.001, gamma=scale, kernel=linear, score=0.548, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=rbf ................................
[CV] .... C=0.001, gamma=scale, kernel=rbf, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=rbf ................................
[CV] .... C=0.0

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .... C=0.001, gamma=scale, kernel=rbf, score=0.562, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=rbf ................................
[CV] .... C=0.001, gamma=scale, kernel=rbf, score=0.548, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=sigmoid ............................
[CV]  C=0.001, gamma=scale, kernel=sigmoid, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=sigmoid ............................
[CV]  C=0.001, gamma=scale, kernel=sigmoid, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=sigmoid ............................
[CV]  C=0.001, gamma=scale, kernel=sigmoid, score=0.531, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=sigmoid ............................
[CV]  C=0.001, gamma=scale, kernel=sigmoid, score=0.562, total=   0.0s
[CV] C=0.001, gamma=scale, kernel=sigmoid ............................
[CV]  C=0.001, gamma=scale, kernel=sigmoid, score=0.548, total=   0.0s
[CV] C=0.001, gamma=auto, kernel=linear ..............................
[CV] .

[CV] .. C=0.1, gamma=scale, kernel=sigmoid, score=0.548, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=linear ................................
[CV] .... C=0.1, gamma=auto, kernel=linear, score=0.531, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=linear ................................
[CV] .... C=0.1, gamma=auto, kernel=linear, score=0.531, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=linear ................................
[CV] .... C=0.1, gamma=auto, kernel=linear, score=0.531, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=linear ................................
[CV] .... C=0.1, gamma=auto, kernel=linear, score=0.562, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=linear ................................
[CV] .... C=0.1, gamma=auto, kernel=linear, score=0.548, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=auto, kernel=rbf, score=0.531, total=   0.0s
[CV] C=0.1, gamma=auto, kernel=rbf ...................................
[CV] .

[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ........ C=10, gamma=auto, kernel=rbf, score=0.531, total=   0.0s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ........ C=10, gamma=auto, kernel=rbf, score=0.531, total=   0.0s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ........ C=10, gamma=auto, kernel=rbf, score=0.531, total=   0.0s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ........ C=10, gamma=auto, kernel=rbf, score=0.562, total=   0.0s
[CV] C=10, gamma=auto, kernel=rbf ....................................
[CV] ........ C=10, gamma=auto, kernel=rbf, score=0.548, total=   0.0s
[CV] C=10, gamma=auto, kernel=sigmoid ................................
[CV] .... C=10, gamma=auto, kernel=sigmoid, score=0.531, total=   0.0s
[CV] C=10, gamma=auto, kernel=sigmoid ................................
[CV] .... C=10, gamma=auto, kernel=sigmoid, score=0.531, total=   0.0s
[CV] C

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    2.3s finished


GridSearchCV(cv=5, estimator=SVC(random_state=42),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             verbose=3)

In [27]:
svc_pred=gcv_svm.predict(x_test_tfidf)
F1_Score_SVC=f1_score(y_test,svc_pred)
Precision_SVC=precision_score(y_test,svc_pred)
Recall_SVC=recall_score(y_test,svc_pred)

In [28]:
multinomial.fit(x_train_tfidf,y_train)
multi_pred=multinomial.predict(x_test_tfidf)

In [29]:
F1_Score_MNB=f1_score(y_test,multi_pred)
Precision_MNB=precision_score(y_test,multi_pred)
Recall_MNB=recall_score(y_test,multi_pred)

In [30]:
# list(dict.keys()) -- list of hyperparameters
# Evaluation Scores 

amazon_review_df=pd.DataFrame({'Model':['Logistic Rgression','Decision Tree','RandomForest','AdaBoost','Gradient Boosting','Support Vector Machine','MultinomialNB'],
                              'HyperParameters':['none','none',list(param_dict.keys()),list(para_dic.keys()),list(pa_dict.keys()),list(p_dict.keys()),'none'],
                              'F1 Score':[F1_Score_LR,F1_Score_DT,F1_Score_RF,F1_Score_AdaBoost,F1_Score_GradientBoost,F1_Score_SVC,F1_Score_MNB],
                              'Precision Score':[Precision_LR,Precision_DT,Precision_RF,Precision_AdaBoost,Precision_Gradient,Precision_SVC,Precision_MNB],
                              'Recall Score':[Recall_LR,Recall_DT,Recall_RF,Recall_AdaBoost,Recall_Gradient,Recall_SVC,Recall_MNB]})
amazon_review_df

Unnamed: 0,Model,HyperParameters,F1 Score,Precision Score,Recall Score
0,Logistic Rgression,none,0.872727,0.8,0.96
1,Decision Tree,none,0.622222,0.7,0.56
2,RandomForest,"[n_estimators, max_depth, min_samples_split, m...",0.877193,0.78125,1.0
3,AdaBoost,"[learning_rate, n_estimators]",0.821429,0.741935,0.92
4,Gradient Boosting,"[n_estimators, learning_rate, max_features]",0.807692,0.777778,0.84
5,Support Vector Machine,"[C, gamma, kernel]",0.884615,0.851852,0.92
6,MultinomialNB,none,0.851852,0.793103,0.92
