# Training a Sentiment Analysis Model

## Importing Libraries

In [48]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import train_test_split

In [49]:
# Fetch the NLTK stop-word corpus (nothing is re-downloaded when the local copy
# is already up to date). If the download cannot write to the data directory,
# create it first, e.g.
# /usr/local/share/nltk_data
nltk.download(info_or_id="stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Importing Dataset From S3

In [50]:
# Tab-separated review file, read as headerless: the first column ("Reviews")
# holds the label used as the target below, the second ("Comments") the text.
REVIEWS_URL = 'https://movie-dataset-live.s3.ap-south-1.amazonaws.com/train_and_test_set/reviews.txt'
dataset = pd.read_csv(REVIEWS_URL, sep='\t', names=['Reviews', 'Comments'])

In [51]:
# Preview the first five rows to sanity-check the parsed columns.
dataset.head(n=5)

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [52]:
# (number of rows, number of columns) of the loaded DataFrame.
dataset.shape

(6918, 2)

In [53]:
# Count the missing values in each column.
dataset.isnull().sum()

Reviews     0
Comments    0
dtype: int64

In [54]:
# Stop words are very common words (e.g. "is", "and", "of"); they are collected
# into a set here and handed to the vectorizer later so they can be ignored.
english_stop_words = stopwords.words('english')
stopset = set(english_stop_words)

In [55]:
# Show how many stop words there are, then the set of stop words itself
# (one item per output line, exactly as two separate print calls would).
print(len(stopset), stopset, sep='\n')

179
{'both', 'hasn', 'own', 'will', 'just', 'up', 'did', 'against', 'hers', 'no', 'themselves', 'yourself', 'should', "hadn't", 'couldn', 'haven', 'does', 'himself', "you'll", 'being', "that'll", 'having', "weren't", 'why', 'myself', 'won', 'and', "she's", 'ain', 'is', 'now', "needn't", 'wasn', 'd', 'me', 'to', 'have', 'before', 'are', "haven't", 'a', 'where', 'mightn', 'had', 'how', 'as', 'who', 'each', 'such', 'we', 'only', 'don', "doesn't", 'itself', 'of', 'ours', 'other', 'not', 'with', 'been', 'or', 'her', 'can', "isn't", 'him', 'at', 'these', 'from', 'again', 'theirs', 'was', 'yours', 'out', 'over', 'what', "aren't", "wouldn't", 'shouldn', 'your', 'while', 'you', 'between', 'shan', 'some', 'doesn', 'below', 'were', 'during', 'll', 'when', 'they', 'he', 'any', 'hadn', 'ma', 'so', 'o', 'our', 'aren', 'it', 'my', 'those', 't', 're', "hasn't", "you'd", 'nor', 'through', "you're", 'that', 'down', "it's", 'by', 'for', 'whom', 'this', 'his', 'very', 'on', 'didn', "mightn't", 'if', "won'

## Converting the Text Data to Numerical Data

In [56]:
# TF-IDF turns each comment into a vector of weighted term frequencies.
# `stop_words` is documented as the string 'english', a list, or None; recent
# scikit-learn releases validate the type and reject a set, so the NLTK
# stop-word set is passed as a list (sorted, so the order is deterministic).
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

## Splitting Dataset Into X and Y

In [57]:
# Learn the vocabulary and IDF weights from every comment and encode the
# comments as a TF-IDF matrix; the target is the label column.
X = vectorizer.fit_transform(dataset['Comments'])
y = dataset['Reviews']

In [58]:
# Persist the fitted TF-IDF vectorizer (the vectorizer object, not the
# transformed matrix) so it can be reloaded later.
file_path = "../model/"
filename = 'transformed.pkl'
# A fresh checkout may not contain the output folder yet.
os.makedirs(file_path, exist_ok=True)
# The context manager closes the file even if pickling raises.
with open(os.path.join(file_path, filename), 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

## Splitting Dataset Into Training and Test Set

In [59]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Training the Naive Bayes model on the Training set

In [60]:
# Multinomial Naive Bayes is designed for discrete features such as word
# counts; here it is fitted on the TF-IDF weights of the training split.
classifier = naive_bayes.MultinomialNB()
classifier.fit(X=X_train, y=y_train)

## Checking the Accuracy score on Test Set

In [61]:
# Accuracy (as a percentage) on the 20 % of rows held out from training.
# Rule of thumb used in this notebook: above 85 % is considered good enough.
holdout_predictions = classifier.predict(X_test)
accuracy_score(y_test, holdout_predictions) * 100

97.47109826589595

## Training the Naive Bayes Model on the Full Dataset (X and y)

In [62]:
# Refit a fresh classifier on every row (training and test rows together);
# this replaces the model that was fitted on the training split alone.
classifier = naive_bayes.MultinomialNB()
classifier.fit(X=X, y=y)

In [63]:
# Caution: this classifier was fitted on all of X, which includes the rows in
# X_test, so the figure below is accuracy on data the model has already seen,
# not a held-out estimate.
# Rule of thumb used in this notebook: above 85 % is considered good enough.
seen_predictions = classifier.predict(X_test)
accuracy_score(y_test, seen_predictions) * 100

98.77167630057804

## Saving the Trained Model

In [64]:
# Persist the classifier that was refitted on the full dataset.
file_path = "../model/"
filename = 'trained_model.pkl'
# A fresh checkout may not contain the output folder yet.
os.makedirs(file_path, exist_ok=True)
# The context manager closes the file even if pickling raises.
with open(os.path.join(file_path, filename), 'wb') as model_file:
    pickle.dump(classifier, model_file)