# Sentiment Analysis of Product Reviews

### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import spacy
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
import time, datetime
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

### Read the Dataset

In [None]:
# Load the preprocessed reviews and discard every row with a missing value.
df = pd.read_csv("preprocessed-dataset.csv")
# dropna() keeps the original row labels, so removed rows leave gaps in the
# index. Reset it to a contiguous 0..n-1 range so that frames derived
# positionally from this one (e.g. the TF-IDF matrix built from df['text'])
# share the same labels; otherwise index-aligned operations such as
# pd.concat(axis=1) pair features with the wrong labels and introduce NaNs.
df = df.dropna(how='any', axis=0).reset_index(drop=True)

## Feature Engineering and Selection

### Create TF-IDF

In [None]:
# Turn the review text into TF-IDF features, with the vocabulary capped at
# 7000 terms.
vectorizer = TfidfVectorizer(max_features=7000)
features = vectorizer.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# Reuse df's index so every feature row carries the same label as the review
# (and sentiment value) it was computed from. A default RangeIndex would drift
# out of step with df whenever df's index has gaps.
tf_idf = pd.DataFrame(features.toarray(), columns=feature_names, index=df.index)

### Splitting Dataset into Train and Test Set
We use an 80:20 split between the training set and the test set.

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf,
    df['sentiment'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Re-attach the labels to the training features so both can be resampled
# together. X_train and y_train come out of the same train_test_split call and
# therefore share row order, but their index labels are not guaranteed to
# match. pd.concat(axis=1) aligns on index labels, so both sides are reset to
# a plain positional index first; this keeps each feature row paired with its
# own label and avoids NaN-filled rows.
yy = pd.DataFrame(y_train).reset_index(drop=True)
train_data = pd.concat([X_train.reset_index(drop=True), yy], axis=1)

## Oversampling the Train Data

In [None]:
# Balance the training set by random oversampling: draw negative rows
# (sentiment == 0) with replacement until there are as many of them as there
# are positive rows (sentiment == 1).
# NOTE(review): this presumes the negative class is the smaller one; confirm
# against the actual class counts.
target_count = train_data['sentiment'].value_counts()
negative_class = train_data[train_data['sentiment'] == 0]
positive_class = train_data[train_data['sentiment'] == 1]

# Seed both random steps (matching the seed used for the train/test split) so
# the resampled training set, and every score computed from it, is
# reproducible across runs.
negative_over = negative_class.sample(
    n=int(target_count.loc[1]),
    replace=True,
    random_state=42,
)
df_train_over = pd.concat([positive_class, negative_over], axis=0)
# Shuffle so the two classes are interleaved rather than stacked in blocks.
df_train_over = shuffle(df_train_over, random_state=42)

In [None]:
# Bar chart of the number of rows per sentiment class in the oversampled
# training set.
counts = df_train_over['sentiment'].value_counts()
class_labels, class_sizes = counts.index, counts.values
plt.bar(class_labels, class_sizes)
plt.title("Train Classes count after Oversampling")
plt.show()

# Final Data for Train-Testing

In [None]:
# Split the oversampled frame back into labels and features: the labels are
# the 'sentiment' column, the features are every column except the last one.
y_train = df_train_over.loc[:, 'sentiment']
X_train = df_train_over.iloc[:, :df_train_over.shape[1] - 1]

## Modeling

In [None]:
# Candidate models, each paired with the parameter grid that GridSearchCV
# searches to find the best value.
#   'mod'   -> an unfitted estimator instance
#   'param' -> mapping of parameter name to the list of values to try
models_with_default_params = [
    {
        'mod': MultinomialNB(),
        'param': {'alpha': [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1, 1.5, 2]},
    },
    {
        'mod': LinearSVC(),
        'param': {'C': [0.1, 1, 10]},
    },
    {
        'mod': KNeighborsClassifier(),
        'param': {'n_neighbors': [1, 2, 3]},
    },
    {
        'mod': XGBClassifier(),
        'param': {'n_estimators': [100]},
    },
]


In [None]:
# Replace any missing feature values with 0 before fitting the models.
# fillna() avoids the np.NaN alias (removed in NumPy 2.0), and rebinding the
# result avoids an in-place write on a frame that was sliced out of another
# DataFrame.
X_train = X_train.fillna(0)

# Without Chi-Square Feature Reduction

In [None]:
# Baseline run: tune and evaluate every candidate model on the full TF-IDF
# feature set (no feature selection).
X_train_vect = X_train
X_test_vect = X_test

for mwdp in models_with_default_params:
    # Cross-validated search over this model's parameter grid. With
    # refit=True the best configuration is retrained on the whole training
    # set once the search finishes.
    # NOTE(review): the training rows were oversampled with replacement before
    # this search, so copies of a row can land on both sides of a CV fold and
    # inflate the validation scores.
    grid_search = GridSearchCV(mwdp['mod'], mwdp['param'], refit=True, verbose=3)
    grid_search.fit(X_train_vect, y_train)

    # Best estimator found by the search; already fitted by the refit step,
    # so it is not trained a second time here.
    model = grid_search.best_estimator_

    print('---------' + 'Model: ' + model.__class__.__name__ + '---------')
    print('Feature Vector Size:', X_train_vect.shape)
    print('Best Model: ', model)

    # refit_time_ holds the seconds spent refitting the best model on the
    # whole training set; report it instead of fitting again just to time it.
    print('TRAIN TIME: ', datetime.timedelta(seconds=grid_search.refit_time_))

    y_pred = model.predict(X_test_vect)

    # Held-out test metrics; precision, recall and F1 are macro-averaged.
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred, average="macro"))
    print('Recall: ', recall_score(y_test, y_pred, average="macro"))
    print('F1 Score: ', f1_score(y_test, y_pred, average="macro"))

# With Chi-Square Feature Reduction

In [None]:
# Feature-selection run: keep the 500 features that score highest on the
# chi-square test against the labels, then tune and evaluate every candidate
# model on that reduced feature set.
X_train_vect = X_train
X_test_vect = X_test

# The selector is fitted on the training data only and then applied unchanged
# to the test data.
chi_selector = SelectKBest(score_func=chi2, k=500)
X_train_vect_chi = chi_selector.fit_transform(X_train_vect, y_train)
X_test_vect_chi = chi_selector.transform(X_test_vect)


for mwdp in models_with_default_params:
    # Cross-validated search over this model's parameter grid. With
    # refit=True the best configuration is retrained on the whole training
    # set once the search finishes.
    grid_search = GridSearchCV(mwdp['mod'], mwdp['param'], refit=True, verbose=3)
    grid_search.fit(X_train_vect_chi, y_train)

    # Best estimator found by the search; already fitted by the refit step,
    # so it is not trained a second time here.
    model = grid_search.best_estimator_

    print('---------' + 'Model: ' + model.__class__.__name__ + '---------')
    print('Feature Vector Size:', X_train_vect_chi.shape)
    print('Best Model: ', model)

    # refit_time_ holds the seconds spent refitting the best model on the
    # whole training set; report it instead of fitting again just to time it.
    print('TRAIN TIME: ', datetime.timedelta(seconds=grid_search.refit_time_))

    y_pred = model.predict(X_test_vect_chi)

    # Held-out test metrics; precision, recall and F1 are macro-averaged.
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred, average="macro"))
    print('Recall: ', recall_score(y_test, y_pred, average="macro"))
    print('F1 Score: ', f1_score(y_test, y_pred, average="macro"))