# Import Libraries

In [30]:
import numpy as np
import pandas as pd
import spacy
import re
import string
import inflect
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import spacy
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
import pickle
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

# Data Ingestion


In [6]:
data=pd.read_csv(r'../data/reviews_and_sentiment.csv')
df=data.copy()
df.head()


Unnamed: 0.1,Unnamed: 0,Review,Sentiment
0,0,"Nice product Nice product, good quality, but p...",1
1,1,Don't waste your money They didn't supplied Yo...,0
2,2,Did not meet expectations Worst product. Damag...,0
3,5,Mind-blowing purchase Good quality product. De...,1
4,6,Must buy! BEST PURCHASE It is a good quality a...,1


# Text Pre processing

In [7]:
def text_preprocessing(corpus,flag):
    
    # change  of numbers
    p=inflect.engine()
    corpus=re.sub(r'\d+',lambda x: p.number_to_words(x.group(0)),corpus)
    
    # remove special characters
    corpus=re.sub('[^a-zA-Z]',' ',corpus)
    
    #convert to lower case
    corpus=corpus.lower()
    
    # removal of whitespaces
    corpus=' '.join(corpus.split())

    #tokenize
    words=word_tokenize(corpus)
    if flag=="stemming":
    #stemming
        stemmer=SnowballStemmer(language='english')
        return ' '.join(stemmer.stem(word) for word in words if word not in set(nltk.corpus.stopwords.words('english')))
    else:
    #lemmatization
        lemmatizer=WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in set(nltk.corpus.stopwords.words('english')))


#flag is either "stemming" or "lemmatization"


In [9]:
df['Review']=df['Review'].apply(lambda x: text_preprocessing(x,flag="stemming"))

# Text Vectorization

In [10]:
nlp=spacy.load('en_core_web_lg')
df['Vector']=df['Review'].apply(lambda x: nlp(x).vector)
df

Unnamed: 0.1,Unnamed: 0,Review,Sentiment,Vector
0,0,nice product nice product good qualiti price r...,1,"[-0.29536363, 0.5639642, -1.9529978, -0.389275..."
1,1,wast money suppli yonex mavi three hundr fifti...,0,"[0.2179594, 0.46288133, -0.9466747, -0.2688799..."
2,2,meet expect worst product damag shuttlecock pa...,0,"[-0.84949654, 0.2578784, -1.1397167, -0.498824..."
3,5,mind blow purchas good qualiti product deliv t...,1,"[0.44810554, -0.19229555, -0.5889078, 0.121301..."
4,6,must buy best purchas good qualiti durabl aver...,1,"[-0.40196362, 0.3199391, -1.1341728, -0.682335..."
...,...,...,...,...
8008,8495,nice thank deliveri boy servic alway awesom read,1,"[0.20393375, 0.12258506, -1.3418686, -1.853296..."
8009,8496,good choic price even fifti discount price muc...,0,"[-0.08962436, 0.22343633, -1.7200743, 0.363474..."
8010,8497,awesom much price get rs six hundr forti home ...,0,"[-0.33284146, 1.1507919, -1.6580206, -0.524723..."
8011,8499,high cost hii flipkart custom care deliveri bo...,1,"[0.32910556, 1.341987, -1.5979877, 0.45543104,..."


# Split the data

In [13]:
X_train,X_test,y_train,y_test=train_test_split(df['Vector'],df['Sentiment'],test_size=0.2)
print(X_train.shape,X_test.shape)

(6410,) (1603,)


In [14]:
X_train_stack = np.stack(X_train)
X_test_stack = np.stack(X_test)
print(X_train_stack.shape,X_test_stack.shape)

(6410, 300) (1603, 300)


## Use Min max scaler to scale value down to 0 and 1

In [15]:
scaler = MinMaxScaler()
X_train_stack_sc = scaler.fit_transform(X_train_stack)
X_test_stack_sc = scaler.transform(X_test_stack)

In [27]:
with open(r'../models/min_max_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)a

In [28]:
with open(r'../models/min_max_scaler.pkl', 'rb') as file:
    scaler=pickle.load(file)
    y=scaler.transform([X_test_stack[0]])
print(y)

[[0.60732196 0.31169537 0.51970551 0.47556625 0.74542003 0.58331434
  0.6153948  0.52263316 0.32838363 0.33922681 0.65967294 0.6019025
  0.39795461 0.4547091  0.40001814 0.59178362 0.82522888 0.46943475
  0.58969605 0.59258696 0.20753354 0.51580485 0.44940664 0.62774466
  0.45258131 0.61273729 0.58651533 0.49870091 0.29880042 0.58978009
  0.65129421 0.4450124  0.28183532 0.76620028 0.49857197 0.26026361
  0.53276695 0.57287713 0.494191   0.74106469 0.39230223 0.51533461
  0.6396803  0.67921064 0.55945655 0.65315813 0.75029855 0.45969183
  0.59546791 0.39960506 0.67652647 0.49172259 0.50205725 0.3223227
  0.37890002 0.45323628 0.49196414 0.52491295 0.66720128 0.62341275
  0.58787861 0.32139284 0.70861756 0.45270496 0.39516026 0.43346999
  0.44461173 0.39248585 0.45443884 0.48993402 0.47773571 0.55003014
  0.23763355 0.53152862 0.61533783 0.85065164 0.41351143 0.6721741
  0.44179045 0.47356902 0.53602022 0.38992321 0.61146079 0.37336256
  0.40763134 0.58038401 0.41130971 0.53078756 0.585

# Model Building

In [16]:
model = MultinomialNB()
model.fit(X_train_stack_sc, y_train)

In [17]:
y_pred = model.predict(X_test_stack_sc)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
print('Accuracy Score:',accuracy_score(y_test, y_pred))


Accuracy Score: 0.8833437305053026


In [19]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       187
           1       0.88      1.00      0.94      1416

    accuracy                           0.88      1603
   macro avg       0.44      0.50      0.47      1603
weighted avg       0.78      0.88      0.83      1603



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Saving the model

In [20]:
import pickle

# Assuming 'grid' is the trained GridSearchCV model
# Pickle the model to a file
with open(r'../models/naive_bayes_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# TEsting the model

In [24]:
with open(r'../models/naive_bayes_model.pkl', 'rb') as file:
    model = pickle.load(file)
    y=model.predict([X_test_stack_sc[0]])
print(y)

[1]


In [32]:

pipelines = {
    'naive_bayes': Pipeline([
        ('classifier', MultinomialNB())
    ]),
    'decision_tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'logistic_regression': Pipeline([
        ('classifier', LogisticRegression())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'classifier__alpha' : [1, 10]
        }
    ],
    'decision_tree': [
        {
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'logistic_regression': [
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ]
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1', 
                               return_train_score=True,
                               verbose=1
                              )
    
    grid_search.fit(X_train_stack_sc, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test_stack_sc, y_test))

********** naive_bayes **********
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Score on Test Data:  0.9380589599205036
********** decision_tree **********
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Score on Test Data:  0.9377762665759947
********** logistic_regression **********
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Score on Test Data:  0.9028656494231485




In [35]:
grid_search.best_estimator_

In [37]:
best_model = grid_search.best_estimator_
with open(r'../models/logistic_regression.pkl', 'wb') as file:
    pickle.dump(best_model, file)