In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


from tqdm import tqdm
from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV 


from tensorflow.keras.utils import to_categorical
from tensorflow import set_random_seed
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential


from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
data = pd.read_csv("data/Musical_instruments_reviews.csv")

In [4]:
data["reviewText"].fillna("",inplace = True)

In [5]:
data.drop(columns = ['reviewerID','asin','reviewerName','helpful','unixReviewTime','reviewTime'],inplace = True)

In [6]:
data['review'] = data['reviewText'] + ' ' + data['summary']
data.drop(columns = ['reviewText','summary'],inplace = True)

In [7]:
# Replacing ratings of 1,2,3 with 0 (not good) and 4,5 with 1 (good)
data["overall"] = (data["overall"]>3)*1

In [8]:
X_train, X_test, y_train ,y_test = train_test_split(data.review, data.overall, test_size = 0.2 , random_state = 42)

In [9]:
def clean_sentences(X):
    reviews = []

    for sent in tqdm(X):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

In [10]:
train_sentences = clean_sentences(X_train)
test_sentences = clean_sentences(X_test)
print(len(train_sentences))
print(len(test_sentences))

100%|██████████| 8208/8208 [00:07<00:00, 1094.42it/s]
100%|██████████| 2053/2053 [00:01<00:00, 1321.54it/s]

8208
2053





In [11]:
unique_words = set()
len_max = 0

for sent in tqdm(train_sentences):
    
    unique_words.update(sent)
    
    if(len_max<len(sent)):
        len_max = len(sent)
        
print(len(list(unique_words)))
print(len_max)

100%|██████████| 8208/8208 [00:00<00:00, 317760.86it/s]

15657
2067





In [12]:
tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))


X_train = tokenizer.texts_to_sequences(train_sentences)
X_test = tokenizer.texts_to_sequences(test_sentences)

X_train = sequence.pad_sequences(X_train, maxlen=len_max)
X_test = sequence.pad_sequences(X_test, maxlen=len_max)

print(X_train.shape,X_test.shape)

(8208, 2067) (2053, 2067)


In [13]:
sm = SMOTE()
X_train,y_train = sm.fit_resample(X_train,y_train)

In [14]:
xgb = XGBClassifier( n_estimators=1000,max_depth=4,min_child_weight = 1 ,gamma = 0 ,
                    subsample = 0.8, colsample_bytree = 0.8, nthread=6,seed= 9,tree_method='gpu_hist', gpu_id=0)

xgb.fit(X_train, y_train)
xgb_predict = xgb.predict(X_test)


In [15]:
accuracy_score(y_test, xgb_predict)

0.8426692644909888

In [16]:
xg_report = classification_report(y_test, xgb_predict)
print(xg_report)

              precision    recall  f1-score   support

           0       0.20      0.11      0.14       244
           1       0.89      0.94      0.91      1809

    accuracy                           0.84      2053
   macro avg       0.54      0.53      0.53      2053
weighted avg       0.81      0.84      0.82      2053



In [17]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [32]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, thread=6,seed= 9,tree_method='gpu_hist', gpu_id=0)

In [33]:
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=10, scoring='roc_auc', cv=4, verbose=3, random_state=1001 )
random_search.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0, score=0.769, total=   4.0s
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s


[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0, score=0.933, total=   4.0s
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.0s remaining:    0.0s


[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0, score=0.945, total=   4.0s
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=2, colsample_bytree=1.0, score=0.936, total=   4.0s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8 
[CV]  subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8, score=0.844, total=   8.1s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8 
[CV]  subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8, score=0.974, total=   8.0s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8 
[CV]  subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8, score=0.979, total=   8.2s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1, colsample_bytree=0.8 
[CV]  subsample=0.6,

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  4.1min finished


RandomizedSearchCV(cv=4,
                   estimator=XGBClassifier(gpu_id=0, learning_rate=0.02,
                                           n_estimators=600, seed=9,
                                           silent=True, thread=6,
                                           tree_method='gpu_hist'),
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],
                                        'gamma': [0, 0.5, 1, 1.5, 2],
                                        'max_depth': [3, 4, 5],
                                        'min_child_weight': [1, 5, 10],
                                        'subsample': [0.6, 0.8, 1.0]},
                   random_state=1001, scoring='roc_auc', verbose=3)

In [34]:
print('\n Best estimator:')
print(random_search.best_estimator_)
print(random_search.best_score_ )
print('\n Best hyperparameters:')
print(random_search.best_params_)



 Best estimator:
XGBClassifier(colsample_bytree=0.6, gpu_id=0, learning_rate=0.02, max_depth=5,
              n_estimators=600, seed=9, silent=True, subsample=0.8, thread=6,
              tree_method='gpu_hist')
0.942999199005799

 Best hyperparameters:
{'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 5, 'gamma': 0, 'colsample_bytree': 0.6}


In [28]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=1000, objective='binary:logistic',subsample =0.6, 
                    min_child_weight=1, max_depth = 5, gamma = 1, colsample_bytree = 0.8,
                    silent=True, thread=6,seed= 9,tree_method='gpu_hist', gpu_id=0)

In [29]:
xgb.fit(X_train, y_train)
xgb_predict = xgb.predict(X_test)

In [30]:
accuracy_score(y_test, xgb_predict)

0.8202630297126157

In [31]:
xg_report = classification_report(y_test, xgb_predict)
print(xg_report)

              precision    recall  f1-score   support

           0       0.18      0.14      0.16       244
           1       0.89      0.91      0.90      1809

    accuracy                           0.82      2053
   macro avg       0.53      0.53      0.53      2053
weighted avg       0.80      0.82      0.81      2053

