<a href="https://colab.research.google.com/github/dasnikita/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import os 
from collections import Counter
from tqdm.notebook import tqdm
from sklearn import preprocessing
from bs4 import BeautifulSoup
import re 
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import tensorflow as tf 
import joblib
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive (Colab only) at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load the training dataset from the mounted Google Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/Train.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# (rows, columns) of the raw data; `shape` is an attribute, not a method.
data.shape

In [None]:
# Count missing values per column.
data.isna().sum()

We intend to clean the text by:
1) Removing stopwords
2) Dealing with auxiliaries by separating the 'nots'
3) Removing HTML tags (the bs4 library is used for this)
4) Removing special characters and digits
5) Converting the words to lower case

In [None]:
# Words dropped during cleaning. Negations such as "not" and "nor" are deliberately
# absent from this list because they can flip the sentiment of a review.
stopword = set("""
    br the i me my myself we our ours ourselves you you're you've
    you'll you'd your yours yourself yourselves he him his himself
    she she's her hers herself it it's its itself they them their
    theirs themselves what which who whom this that that'll these those
    am is are was were be been being have has had having do does
    did doing a an and but if or because as until while of
    at by for with about against between into through during before after
    above below to from up down in out on off over under again further
    then once here there when where why how all any both each few more
    most other some such only own same so than too very
    s t can will just don don't should should've now d ll m o re
    ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn
    hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn
    mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't
    won won't wouldn wouldn't
""".split())
#dealing with auxiliaries
def seperate_nots(text):
  """Expand English contractions so that negations survive as the word 'not'.

  Matching is case-insensitive and the replacements are lower-case
  ("Can't" -> "can not"). Typographic apostrophes are treated like "'".
  Suffix rules ('re, 've, 'm, 'll, 'd, 's, n't) only fire when they are
  attached to the end of a word, so an opening quote such as 'magnificent'
  is left untouched. As before, every "'s" becomes " is" (possessives included).

  Args:
    text: the string to expand.

  Returns:
    The text with contractions expanded.
  """
  # Curly apostrophes are common in scraped text; normalise them first.
  text = text.replace("\u2019", "'")
  # Order matters: the irregular forms must be handled before the generic n't rule.
  rules = (
    (r"\bcan't\b", "can not"),
    (r"\bshan't\b", "shall not"),
    (r"\bwon't\b", "will not"),
    (r"(?<=\w)n't\b", " not"),
    (r"(?<=\w)'re\b", " are"),
    (r"(?<=\w)'ve\b", " have"),
    (r"(?<=\w)'m\b", " am"),
    (r"(?<=\w)'ll\b", " will"),
    (r"(?<=\w)'d\b", " would"),
    (r"(?<=\w)'s\b", " is"),
  )
  for pattern, replacement in rules:
    text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
  return text

def preprocessing_text(text,stopword,verbose=False):
  """Clean one raw review and return it as a lower-case, space-separated string.

  Steps, in order:
    1. strip HTML markup (tags become spaces so neighbouring words stay apart),
    2. drop URLs,
    3. lower-case and expand contractions via ``seperate_nots`` (this must
       happen while the apostrophes are still present),
    4. drop every token that contains a digit,
    5. replace all remaining non-letter characters with spaces,
    6. remove the words listed in ``stopword``.

  Args:
    text: raw review text; a non-string value (e.g. NaN) yields ''.
    stopword: collection of lower-case words to remove.
    verbose: when True, print the cleaned text and the lengths before and
      after stopword removal.

  Returns:
    The cleaned text.
  """
  if not isinstance(text, str):
    return ""
  # A separator keeps "good<br />movie" from collapsing into "goodmovie".
  text = BeautifulSoup(text, "lxml").get_text(separator=" ")
  text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)
  text = seperate_nots(text.lower())
  text = re.sub(r'\S*\d\S*', ' ', text)
  # Substitute a space (not '') so words joined by punctuation stay separate.
  text = re.sub(r'[^a-z]+', ' ', text).strip()

  words = [word for word in text.split() if word not in stopword]
  new_text = " ".join(words)
  if verbose:
    print(new_text)
    print("Old length: ", len(text))
    print("New length: ", len(new_text))
  return new_text

In [None]:
# Short sample used to try out the cleaning function.
sentence = "a great taste"

In [None]:
# Try the cleaner on the sample sentence (bs4 and re are already imported in the first cell).
preprocessing_text(sentence, stopword)

great taste
Old length:  13
New length:  11


In [None]:
# Run the cleaner over every review, passing the stopword list as the extra positional argument.
data['text'] = data['text'].apply(preprocessing_text, args=(stopword,))

In [None]:
# Save the cleaned data to the same Drive location the next cell reads it back from.
data.to_csv('/content/drive/MyDrive/clean_data.csv',index=False)

In [None]:
# Reload the cleaned dataset from Google Drive; this replaces the in-memory frame.
data = pd.read_csv('/content/drive/MyDrive/clean_data.csv')

In [None]:
# Preview the first rows of the cleaned data.
data.head()

In [None]:
# (rows, columns) of the cleaned data.
data.shape

In [None]:
# Join every review into one string; missing values are skipped (an empty review
# comes back as NaN after a CSV round-trip and would break the join).
text = " ".join(str(review) for review in data.text.dropna())
# Count whitespace-separated words; len(text) would be the number of characters.
print ("There are {} words in the combination of all review.".format(len(text.split())))
plt.figure(figsize=(10,7))
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopword,background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Keep only the reviews labelled 1 (positive).
data_pos = data[data['label'] == 1]
# Missing values are skipped (an empty review comes back as NaN after a CSV round-trip).
text = " ".join(str(review) for review in data_pos.text.dropna())
# Count whitespace-separated words; len(text) would be the number of characters.
print ("There are {} words in the combination of all positive reviews.".format(len(text.split())))
plt.figure(figsize=(10,7))
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopword,background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Keep only the reviews labelled 0 (negative).
data_neg = data[data['label'] == 0]
# Missing values are skipped (an empty review comes back as NaN after a CSV round-trip).
text = " ".join(str(review) for review in data_neg.text.dropna())
# Count whitespace-separated words; len(text) would be the number of characters.
print ("There are {} words in the combination of all negative reviews.".format(len(text.split())))
plt.figure(figsize=(10,7))
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopword,background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# NOTE: despite the variable names this is a bag-of-words CountVectorizer (raw
# unigram counts), not TF-IDF. The names are kept because later cells use them.
tf_idf = CountVectorizer(ngram_range=(1,1))
final_tfidf = tf_idf.fit_transform(data['text'])
# get_feature_names() was removed in scikit-learn 1.2; use the replacement when it exists.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()
print("Printing Some Features: ", list(feature_names[0:10]))
print('The type of count matrix: ', type(final_tfidf))
print('The shape of count matrix: ',final_tfidf.get_shape())
print('The number of unique values: ',final_tfidf.get_shape()[1])

In [None]:
# Feature matrix: the vectorized reviews.
x = final_tfidf
# Targets: the label column as a NumPy array.
y = data['label'].values


In [None]:
# Display the label array.
y

In [None]:
# Hold out 20% of the data as the test set; nothing below is ever fitted on it.
X_trainval,X_test,y_trainval,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
# Carve the validation set out of the remaining data only. Re-using the names
# X_test/y_test here would replace the held-out set with rows the models train on.
X_train,X_cv,y_train,y_cv=train_test_split(X_trainval,y_trainval,test_size=0.2,random_state=42)
print("The shape of X_train: ",X_train.shape)
print("The shape of y_train: ",y_train.shape)
print("The shape of X_cv: ",X_cv.shape)
print("The shape of y_cv: ",y_cv.shape)
print("The shape of X_test: ",X_test.shape)
print("The shape of y_test: ",y_test.shape)

In [None]:
# Decision tree with Gini impurity; a fixed random_state makes the fit repeatable.
clf=DecisionTreeClassifier(criterion='gini',random_state=42).fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
# Share of test reviews whose predicted label matches the true label.
dt_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Decision Tree:", dt_accuracy)

In [None]:
# Per-class metrics for the predictions currently held in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for y_pred (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test,y_pred))

In [None]:
# k-nearest-neighbours classifier with k = 3.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Share of test reviews whose predicted label matches the true label.
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on KNN:", knn_accuracy)

In [None]:
# Per-class metrics for the predictions currently held in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for y_pred (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test,y_pred))

In [None]:
# Forest of 600 depth-2 trees trained on all cores; a fixed random_state makes the fit repeatable.
clf=RandomForestClassifier(n_estimators=600,n_jobs=-1,max_depth=2,random_state=42).fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
# Share of test reviews whose predicted label matches the true label.
rfc_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on RFC:", rfc_accuracy)

In [None]:
# Per-class metrics for the predictions currently held in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for y_pred (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test,y_pred))

In [None]:
# 200 boosting stages of depth-2 trees; a fixed random_state makes the fit repeatable.
gbc=GradientBoostingClassifier(n_estimators=200,learning_rate=0.1,max_depth=2,random_state=42)
gbc.fit(X_train,y_train)
ypred = gbc.predict(X_test)

In [None]:

# Share of test reviews whose predicted label matches the true label.
gbc_accuracy = accuracy_score(y_test, ypred)
print("Accuracy on GradientBoostingClassifier: ", gbc_accuracy)

In [None]:
# Per-class metrics for the gradient-boosting predictions in ypred.
print(classification_report(y_test,ypred))

In [None]:
# Confusion matrix for ypred (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test,ypred))

In [None]:
# Multinomial Naive Bayes on the count features.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
y_pred = MNB.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Naive_Bayes:", nb_accuracy)

In [None]:
# Per-class metrics for the predictions currently held in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for y_pred (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test,y_pred))

In [None]:
# Logistic regression whose regularisation strength is picked by 10-fold
# cross-validation on accuracy.
clf = LogisticRegressionCV(
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    max_iter=500,
)
y_pred = clf.fit(X_train, y_train).predict(X_test)


In [None]:
# Inspect the settings of the cross-validated model (LogisticRegression is
# already imported in the first cell).
params = clf.get_params()
params

In [None]:
# Plain L2-regularised logistic regression with the lbfgs solver.
clf2 = LogisticRegression(
    dual=False,
    max_iter=500,
    n_jobs=-1,
    penalty='l2',
    solver='lbfgs',
    tol=0.0001,
    verbose=1,
)
y_pred = clf2.fit(X_train, y_train).predict(X_test)

In [None]:
# Share of test reviews whose predicted label matches the true label.
lr_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Logistic Regression: ", lr_accuracy)

In [None]:
import pickle
# Use a context manager so the file is flushed and closed once the model is written.
with open('/content/LR movie new.pkl','wb') as model_file:
    pickle.dump(clf2,model_file)

In [None]:
# Per-class metrics for the predictions currently held in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
# Keep the confusion matrix for plotting (rows: true labels, columns: predicted labels).
cm=confusion_matrix(y_test,y_pred)

In [None]:
# Draw the 2x2 confusion matrix as a heat map with each cell annotated.
classnames = ['Negative','Positive']
tick_marks = np.arange(len(classnames))
S = [['TN',"FP"],['FN','TP']]

plt.clf()
plt.imshow(cm,interpolation='nearest',cmap='coolwarm')
plt.ylabel('True Label')
plt.xlabel('Predicted label')
plt.title('Positive or Negative Sentiment- Confusion matrix')
plt.xticks(tick_marks,classnames,rotation=45)
plt.yticks(tick_marks,classnames)
# Row index i is the true class and column index j the predicted class,
# so the text goes at x=j, y=i.
for i, row_tags in enumerate(S):
    for j, tag in enumerate(row_tags):
        plt.text(j,i,tag+" = "+str(cm[i][j]))
plt.show()

In [None]:
import pickle
# Persist the classifier and the fitted vectorizer to Drive; the context managers
# make sure each file is flushed and closed after writing.
with open('/content/drive/MyDrive/LR movie.pkl','wb') as model_file:
    pickle.dump(clf,model_file)
with open('/content/drive/MyDrive/tf_idf movie.pkl','wb') as vectorizer_file:
    pickle.dump(tf_idf,vectorizer_file)

In [None]:
import dill
# Serialise the classifier with dill into the Colab working directory.
wd = "/content/"
with open(os.path.join(wd, "filename.obj"), "wb") as f:
    dill.dump(clf, f)


In [None]:
# Show the learned coefficients of the fitted model in clf.
clf.coef_

In [None]:
# A hand-written negative review used to spot-check a trained model.
# NOTE(review): the text is vectorized as-is, without the cleaning applied to the
# training reviews -- confirm this is intended.
sample_reviews = ["I cannot phrase it better, so I will quote Rex Reed who called Inception's storyline prattling drivel. A friend claimed Inception is a thinking person's movie, but a thinking person will realize that it is only masquerading as a thinking person's movie. At bottom, intellectually, there is no there there. Add to this that someone clearly believed the film needed to be pumped up with overwrought drama to qualify it as a summer blockbuster. I couldn't wait for it to end, and when it did, the intrusiveness of the loud, schlocky music over the closing credits seemed to crystallize all the incipient negative feelings I had been having throughout the movie. I hope that this director will go back to doing smaller films that do not stretch his concepts beyond what they can support."]
a = tf_idf.transform(sample_reviews).toarray()

In [None]:
# Predict the label of the vectorized sample review with the Naive Bayes model.
MNB.predict(a)

In [None]:
# Scratch check: the type of a single element taken from a NumPy array
# (numpy is already imported as np in the first cell).
m = np.asarray([2])
type(m[0])

Conclusion: After implementing various models, we conclude that the following three give the best results:
1)Logistic Regression--> Accuracy_Score : 88.59%
2)Multinomial Naive Bayes--> Accuracy_Score: 85.73%