<a href="https://colab.research.google.com/github/debolina201/SentimentAnalysis/blob/master/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1: Loading and exploring the dataset

In [8]:
import pandas as pd
# Spreadsheet holding the reviews; path exactly as used by the original run.
REVIEWS_XLSX = '/content/drive/My Drive/Datasets/ReviewsFileName.xlsx'

# Read the spreadsheet into a DataFrame and show its first five rows.
data = pd.read_excel(REVIEWS_XLSX)
data.head(n=5)

Unnamed: 0,Review,Sentiment
0,the rock is destined to be the 21st century's ...,0
1,"the gorgeously elaborate continuation of "" the...",0
2,effective but too-tepid biopic\n,0
3,if you sometimes like to go to the movies to h...,0
4,"emerges as something rare , an issue movie tha...",0


In [9]:
# First ten entries of the Review column.
data['Review'].head(10)

0    the rock is destined to be the 21st century's ...
1    the gorgeously elaborate continuation of " the...
2                     effective but too-tepid biopic\n
3    if you sometimes like to go to the movies to h...
4    emerges as something rare , an issue movie tha...
5    the film provides some great insight into the ...
6    offers that rare combination of entertainment ...
7    perhaps no picture ever made has more literall...
8    steers turns in a snappy screenplay that curls...
9    take care of my cat offers a refreshingly diff...
Name: Review, dtype: object

In [10]:
# Show one complete review (row label 3500) without pandas truncating it.
print(data.loc[3500, 'Review'])

the filmmakers wisely decided to let crocodile hunter steve irwin do what he does best , and fashion a story around him . 



# 2: Data Preprocessing

* Lowercasing
* Removal of non-alphabetical characters
* Tokenization
* Stop-word removal
* Stemming

In [11]:
import nltk
import re
import string
# Fetch the NLTK resources listed by the original notebook. These calls used to
# be disabled inside a string literal, so a fresh runtime had no tokenizer models
# or stop-word list and the cells below failed with LookupError.
# nltk.download leaves packages that are already installed and current untouched.
# NOTE(review): recent NLTK releases load 'punkt_tab' rather than 'punkt' for
# word_tokenize, and nothing visible here reads 'wordnet' - confirm against the
# installed version and trim this list accordingly.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource, quiet=True)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Punctuation characters checked by the token filter further down.
punc = string.punctuation
# English stop words from NLTK's stopwords corpus.
stop = stopwords.words('english')

In [12]:
# Normalise the review text before tokenization:
#   1. lowercase everything;
#   2. remove every character that is not a-z or whitespace.
# The pattern is a raw string (so `\s` is not an invalid string escape) and
# regex=True is passed explicitly: pandas 2.0+ treats the pattern as a literal
# substring by default, which would leave the text unchanged.
data['Review'] = (
    data['Review']
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
)

In [13]:
# Turn each review string into a list of word tokens.
data['Review'] = data['Review'].map(word_tokenize)
# Spot-check the tokenized form of one review.
print(data.loc[3500, 'Review'])

['the', 'filmmakers', 'wisely', 'decided', 'to', 'let', 'crocodile', 'hunter', 'steve', 'irwin', 'do', 'what', 'he', 'does', 'best', 'and', 'fashion', 'a', 'story', 'around', 'him']


In [14]:
stemmer = PorterStemmer()

# Snapshot of the stop-word list as a set. Membership is tested once per token,
# and scanning the original list every time is needlessly slow.
_stop_set = frozenset(stop)

def clean(df):
  """Drop stop words and punctuation tokens, then stem what is left.

  Args:
    df: iterable of string tokens for a single review. Despite the name this
      is not a DataFrame; the name is kept so existing calls keep working.

  Returns:
    A new list holding the Porter stem of every surviving token, in input order.
  """
  # `word not in punc` is a substring test against string.punctuation; it is
  # kept exactly as before.
  return [
      stemmer.stem(word)
      for word in df
      if word not in _stop_set and word not in punc
  ]

data['Review'] = data['Review'].apply(clean)

In [15]:
# Same review as before, now filtered and stemmed.
print(data.loc[3500, 'Review'])

['filmmak', 'wise', 'decid', 'let', 'crocodil', 'hunter', 'steve', 'irwin', 'best', 'fashion', 'stori', 'around']


In [16]:
# First three rows after cleaning.
data.head(n=3)

Unnamed: 0,Review,Sentiment
0,"[rock, destin, st, centuri, new, conan, he, go...",0
1,"[gorgeous, elabor, continu, lord, ring, trilog...",0
2,"[effect, tootepid, biopic]",0


# 3: Analysis of Data (here, word frequency distribution analysis)

In [17]:
from nltk.probability import FreqDist

# Count every stemmed token across all reviews in one pass.
fdist = FreqDist(token for tokens in data['Review'] for token in tokens)

In [18]:
# The 50 most frequent stems as (stem, count) pairs.
top_n = 50
vocab = fdist.most_common(top_n)
print(vocab)

[('film', 1804), ('movi', 1544), ('like', 805), ('one', 763), ('make', 611), ('stori', 536), ('charact', 483), ('time', 466), ('comedi', 392), ('good', 389), ('even', 388), ('much', 386), ('work', 368), ('perform', 359), ('feel', 337), ('way', 336), ('get', 310), ('littl', 302), ('love', 296), ('look', 294), ('funni', 285), ('director', 274), ('enough', 267), ('never', 262), ('take', 249), ('may', 245), ('come', 242), ('us', 241), ('seem', 240), ('thing', 240), ('bad', 238), ('best', 237), ('doesnt', 232), ('life', 232), ('entertain', 231), ('see', 231), ('end', 231), ('watch', 226), ('would', 222), ('well', 220), ('plot', 212), ('there', 211), ('new', 207), ('year', 204), ('go', 201), ('someth', 201), ('interest', 200), ('could', 199), ('realli', 197), ('audienc', 196)]


In [19]:
# Keep only the stems from the (stem, count) pairs, preserving their order.
lst = [stem for stem, _count in vocab]

In [20]:
# Save the most frequent stems to disk, one per line.
with open('nlargest.txt', 'w') as out_file:
    out_file.writelines("%s\n" % item for item in lst)

# 4: Feature preparation

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Preparing data to create TF-IDF features: each token list is joined back into
# a single space-separated string per review.
# Rows that are already strings are left as they are, so re-running this cell
# does not break them up into single characters.
merged = [
    ' '.join(doc) if isinstance(doc, list) else doc
    for doc in data['Review']
]

data['Review'] = merged
data.head(10)

Unnamed: 0,Review,Sentiment
0,rock destin st centuri new conan he go make sp...,0
1,gorgeous elabor continu lord ring trilog huge ...,0
2,effect tootepid biopic,0
3,sometim like go movi fun wasabi good place start,0
4,emerg someth rare issu movi that honest keenli...,0
5,film provid great insight neurot mindset comic...,0
6,offer rare combin entertain educ,0
7,perhap pictur ever made liter show road hell p...,0
8,steer turn snappi screenplay curl edg clever w...,0
9,take care cat offer refreshingli differ slice ...,0


In [24]:
# TF-IDF vectorizer over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the cleaned review strings and build the document-term matrix.
tfidf_matrix = vectorizer.fit_transform(data['Review'])

In [25]:
print(tfidf_matrix.shape)
# Densify only the first few rows for a preview. Calling .toarray() on the whole
# matrix allocates rows x columns values at once (several GB for roughly
# 10k x 100k entries), which can exhaust the runtime's memory just to print a
# truncated view.
print(tfidf_matrix[:5].toarray())

(10662, 102003)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# 5: Training

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): tfidf_matrix was fitted on every review before this split, so the
# vocabulary and IDF weights have already seen the held-out rows - consider
# splitting the raw text first and fitting the vectorizer on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    data['Sentiment'],
    test_size=0.33,
    random_state=42,
)

In [28]:
# Non-zero entries of the first training row, followed by the training-set shape.
print(X_train[0], X_train.shape, sep='\n')

  (0, 55201)	0.20860710848171762
  (0, 13529)	0.20860710848171762
  (0, 26821)	0.20860710848171762
  (0, 46838)	0.20860710848171762
  (0, 38675)	0.20860710848171762
  (0, 18898)	0.20860710848171762
  (0, 62815)	0.20860710848171762
  (0, 3354)	0.20860710848171762
  (0, 58484)	0.20860710848171762
  (0, 5336)	0.20860710848171762
  (0, 74380)	0.20860710848171762
  (0, 44173)	0.20860710848171762
  (0, 96420)	0.20860710848171762
  (0, 26820)	0.20860710848171762
  (0, 3353)	0.20860710848171762
  (0, 74379)	0.20860710848171762
  (0, 44166)	0.17356623626397244
  (0, 96413)	0.16201333534313217
  (0, 55175)	0.14338367178671627
  (0, 3067)	0.199779274284321
  (0, 99810)	0.12998977743204018
  (0, 3062)	0.14692205310450412
  (0, 5255)	0.10911847868627512
  (0, 18888)	0.1511492407703165
  (0, 62810)	0.1398725226401414
  (0, 46837)	0.199779274284321
  (0, 38672)	0.17149113177825626
  (0, 58472)	0.09431106119946413
  (0, 13490)	0.1259712698115124
(7143, 102003)


In [29]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Predicted sentiment labels for the held-out rows.
preds = clf.predict(X=X_test)

In [31]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels against predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=preds)

array([[1366,  412],
       [ 396, 1345]])

In [32]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

0.7703893151463483