In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re,string,unicodedata

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

from tqdm import tqdm
from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from tensorflow.keras.utils import to_categorical
from tensorflow import set_random_seed
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential


from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [4]:
data = pd.read_csv("data/Musical_instruments_reviews.csv")

In [11]:
data["reviewText"].fillna("",inplace = True)

In [12]:
data.drop(columns = ['reviewerID','asin','reviewerName','helpful','unixReviewTime','reviewTime'],inplace = True)

In [13]:
data['review'] = data['reviewText'] + ' ' + data['summary']
data.drop(columns = ['reviewText','summary'],inplace = True)

In [14]:
# Replacing ratings of 1,2,3 with 0 (not good) and 4,5 with 1 (good)
data["overall"] = (data["overall"]>3)*1

In [15]:
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

In [16]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            pos = pos_tag([i.strip()])
            word = lemmatizer.lemmatize(i.strip(),get_simple_pos(pos[0][1]))
            final_text.append(word.lower())
    return " ".join(final_text)

In [18]:
data.text = data.review.apply(lemmatize_words)

  """Entry point for launching an IPython kernel.


In [19]:
X_train,X_test,y_train,y_test = train_test_split(data.review,data.overall,test_size = 0.2 , random_state = 0)

In [20]:
cv = CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train = cv.fit_transform(X_train)
#transformed test reviews
cv_test = cv.transform(X_test)

In [21]:
tv = TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train=tv.fit_transform(X_train)
#transformed test reviews
tv_test=tv.transform(X_test)

In [34]:
mnb_model=MultinomialNB()

mnb_model = mnb_model.fit(cv_train, y_train)

mnb_pred = mnb_model.predict(cv_test)

score = accuracy_score(y_test, mnb_pred)

print("Score: ", score)

Score:  0.8334145153433999


In [35]:
mnb_bow_report = classification_report(y_test, mnb_pred)
print(mnb_bow_report)

              precision    recall  f1-score   support

           0       0.29      0.33      0.31       228
           1       0.91      0.90      0.91      1825

    accuracy                           0.83      2053
   macro avg       0.60      0.61      0.61      2053
weighted avg       0.85      0.83      0.84      2053



In [38]:
mnb_model=MultinomialNB()

mnb_model = mnb_model.fit(tv_train, y_train)

mnb_pred = mnb_model.predict(tv_test)

score = accuracy_score(y_test, mnb_pred)

print("Score: ", score)

Score:  0.8889430102289333


In [39]:
mnb_bow_report = classification_report(y_test, mnb_pred)
print(mnb_bow_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.44      0.50      0.47      2053
weighted avg       0.79      0.89      0.84      2053



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
lr = LogisticRegression(penalty='l2',max_iter=1000,C=1,random_state=0)
lr.fit(cv_train, y_train)

lr_pred = lr.predict(cv_test)

score=accuracy_score(y_test, lr_pred)

print("Score: ", score)

Score:  0.8899171943497322


In [37]:
lr_bow_report = classification_report(y_test, lr_pred)
print(lr_bow_report)

              precision    recall  f1-score   support

           0       1.00      0.01      0.02       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.94      0.50      0.48      2053
weighted avg       0.90      0.89      0.84      2053



In [22]:
xgb = XGBClassifier( n_estimators=1000,max_depth=4,min_child_weight = 1 ,gamma = 0 ,
                    subsample = 0.8, class_weigh ="balanced", colsample_bytree = 0.8, nthread=6,seed= 9)

xgb.fit(cv_train, y_train)
xgb_predict = xgb.predict(cv_test)


In [23]:
accuracy_score(y_test, xgb_predict)

0.8889430102289333

In [24]:
xg_report = classification_report(y_test, xgb_predict)
print(xg_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.44      0.50      0.47      2053
weighted avg       0.79      0.89      0.84      2053



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
xgb = XGBClassifier( n_estimators=1000,max_depth=4,min_child_weight = 1 ,gamma = 0 ,
                    subsample = 0.8, class_weigh ="balanced", colsample_bytree = 0.8, nthread=6,seed= 9)

xgb.fit(tv_train, y_train)
xgb_predict = xgb.predict(tv_test)


In [27]:
accuracy_score(y_test, xgb_predict)

0.8889430102289333

In [28]:
xg_report = classification_report(y_test, xgb_predict)
print(xg_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.44      0.50      0.47      2053
weighted avg       0.79      0.89      0.84      2053



In [32]:
svc = LinearSVC(random_state=7,C=1,class_weight="balanced")

In [33]:
svc.fit(cv_train, y_train)

LinearSVC(C=1, class_weight='balanced', random_state=7)

In [34]:
svc_predict = svc.predict(cv_test)

In [35]:
accuracy_score(y_test, svc_predict)

0.8899171943497322

In [36]:
svc_report = classification_report(y_test, svc_predict)
print(svc_report)

              precision    recall  f1-score   support

           0       1.00      0.01      0.02       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.94      0.50      0.48      2053
weighted avg       0.90      0.89      0.84      2053



In [37]:
svc = LinearSVC(random_state=7,C=1,class_weight="balanced")

In [38]:
svc.fit(tv_train, y_train)

LinearSVC(C=1, class_weight='balanced', random_state=7)

In [39]:
svc_predict = svc.predict(tv_test)

In [40]:
accuracy_score(y_test, svc_predict)

0.8904042864101315

In [41]:
svc_report = classification_report(y_test, svc_predict)
print(svc_report)

              precision    recall  f1-score   support

           0       1.00      0.01      0.03       228
           1       0.89      1.00      0.94      1825

    accuracy                           0.89      2053
   macro avg       0.95      0.51      0.48      2053
weighted avg       0.90      0.89      0.84      2053

