# Sentiment Analysis on IMDB Reviews

Sentiment analysis (or opinion mining) is a natural language processing technique 
used to determine whether data is positive, negative or neutral. 
Sentiment analysis is often performed on textual data to 
help businesses monitor brand and product sentiment in customer feedback, and understand customer needs.

### Importing libraries

In [None]:
import numpy as np
import pandas as pd

### Importing dataset

In [None]:
# Load the IMDB reviews CSV from the working directory into a DataFrame.
imbd_data = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [None]:
# Preview the first few rows to confirm the file loaded as expected.
imbd_data.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
imbd_data.shape

In [None]:
# Count the reviews per sentiment label to check the class balance.
imbd_data['sentiment'].value_counts()

### NLP 

#### Text Preprocessing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Fetch the NLTK resources this notebook relies on; resources that are already
# installed and up to date are not downloaded again.
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
# The preprocessing loop calls stopwords.words('english'), which raises
# LookupError unless the stopwords corpus has been downloaded first.
nltk.download('stopwords')

In [None]:
# Look at the raw columns again before separating features from labels.
imbd_data.head()

In [None]:
# Feature frame: every column except the 'sentiment' label.
x = imbd_data.drop(columns='sentiment')

In [None]:
# Encode the text labels in place as integers: positive -> 1, negative -> 0.
imbd_data['sentiment'] = imbd_data['sentiment'].replace(
    to_replace={'positive': 1, 'negative': 0}
)

In [None]:
# Target vector: the 'sentiment' column of the DataFrame.
y = imbd_data.loc[:, 'sentiment']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance shared by the preprocessing loop.
stemm = PorterStemmer()

In [None]:
from datetime import datetime

# Record and print the wall-clock time before preprocessing starts, so the
# running time can be judged against the timestamp printed afterwards.
now = datetime.now()
print("now =", now)

In [None]:
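# Preprocessing steps applied to each review:
# 1. Replace every non-letter character with a space
# 2. Convert to lowercase
# 3. Tokenize by splitting on whitespace
# 4. Remove English stopwords
# 5. Stem the remaining tokens (Porter stemmer)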

In [None]:
# Build the cleaned corpus: one normalised, space-joined string per review.
#
# The stop-word set and the regex are created once, before the loop. Rebuilding
# set(stopwords.words('english')) for every token, as was done previously,
# dominates the running time on a dataset of this size.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the review texts directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
for raw_review in imbd_data['review']:
    # Keep letters only, lowercase, then split on whitespace.
    tokens = non_letters.sub(' ', raw_review).lower().split()
    # Drop stop words, stem what remains, and re-join into a single string.
    review = ' '.join(
        stemm.stem(token) for token in tokens if token not in english_stopwords
    )
    corpus.append(review)

In [None]:
# Record and print the wall-clock time once preprocessing has finished
# (the ending time, to compare with the timestamp printed before the loop).
now = datetime.now()
print("now =", now)

In [None]:
#pip install wordcloud

#### Importing Wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud over the whole cleaned corpus.
# - `stopwords` expects a collection of words; passing the string `review`
#   made WordCloud treat each single character as a stop word. Stop words were
#   already removed during preprocessing, so the library default is used.
# - `review` only holds the last review left over from the preprocessing loop;
#   joining `corpus` makes the cloud reflect every review.
wordcloud = WordCloud().generate(' '.join(corpus))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Render the word cloud image with bilinear smoothing and no axis ticks.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Save before show(): the figure is written to disk while it is still current.
plt.savefig('wordcloud11.png')
plt.show()

### Vectorization

In [None]:
# Bag-of-words with CountVectorizer: keep only the 1000 most frequent terms
# across the corpus as features.
bow = CountVectorizer(max_features=1000)
# Learn the vocabulary and count terms per review, then densify to an array.
term_counts = bow.fit_transform(corpus)
x = term_counts.toarray()

In [None]:
# Target vector: the 'sentiment' column, aligned row-for-row with x.
y = imbd_data.loc[:, 'sentiment']

### Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset: 70% of the rows for training, 30% held out for testing.
# random_state is fixed so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [None]:
# First ten vocabulary terms learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
bow.get_feature_names_out()[:10]

In [None]:
# Show the vectorizer's full configuration (max_features was set to 1000 above).
bow.get_params()

In [None]:
# Vectorized training data as a DataFrame: one row per training review, one
# column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
count_data = pd.DataFrame(xtrain, columns=bow.get_feature_names_out())
count_data

### Naive bayes

### MultinomialNB Algorithm

In [None]:
from sklearn import metrics

In [None]:
# Multinomial Naive Bayes: the Naive Bayes variant for count-valued features
# such as bag-of-words term counts. Created with default settings.
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()

In [None]:
# Fit on the training split, then predict labels for the held-out test split.
# fit() returns the estimator itself, so predict() can be chained onto it.
pred = nb.fit(xtrain, ytrain).predict(xtest)
# Fraction of test reviews whose predicted label matches the true label.
score = metrics.accuracy_score(ytest, pred)

In [None]:
# Display the accuracy of the Naive Bayes model on the test split.
score