# Sentimental Analysis of IMDB Movie reviews.

## 1.Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print('Done !')

Done !


In [2]:
data = pd.read_csv('MovieReviewTrainingDatabase.csv')

## 2. Dataset preparation

In [3]:
data.head()

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


In [4]:
print(data.shape)

(25000, 2)


In [5]:
X = data['review']
y = data['sentiment']

##### Train-Validataion split

In [6]:
from sklearn.model_selection import train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(data['review'], data['sentiment'])

##### Converting categorical data to nuerical

In [7]:
encoding = {
    'Positive' : 1,
    'Negative' : 0,
}

train_y = [encoding[s] for s in train_y]
valid_y = [encoding[s] for s in valid_y]

y = [encoding[s] for s in y]

## 3. Feature Engineering

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data['review'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(data['review'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

## 3. Model Building

In [9]:
# Naive Bayes on Word Level TF IDF Vectors
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

multinomial_word = MultinomialNB()
multinomial_word.fit(xtrain_tfidf, train_y)
predictions_word = multinomial_word.predict(xvalid_tfidf)

accuracy = metrics.accuracy_score(predictions_word, valid_y)

print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.85888


In [10]:
# Naive Bayes on Ngram Level TF IDF Vectors

multinomial_ngram = MultinomialNB()
multinomial_ngram.fit(xtrain_tfidf_ngram, train_y)
predictions_ngrams = multinomial_word.predict(xvalid_tfidf_ngram)

accuracy = metrics.accuracy_score(predictions_ngrams, valid_y)

print("NB, N-Gram Vectors: ", accuracy)

NB, N-Gram Vectors:  0.47952


## 4. Training the complete traing data using Tf-Idf, Naive Bayes on word level

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data['review'])
Xtrain_tfidf =  tfidf_vect.transform(X)

model = MultinomialNB()

model.fit(Xtrain_tfidf, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 5. Prepairing the test data

In [12]:
predictions_word

array([0, 0, 1, ..., 0, 1, 0])

In [13]:
test = pd.read_csv('Test.csv')

test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [14]:
test.shape

(10000, 1)

## 6. Testing

In [15]:
xtest_tfidf =  tfidf_vect.transform(test['review'])

In [16]:
y_pred = model.predict(xtest_tfidf)

In [17]:
y_pred

array([0, 0, 0, ..., 1, 1, 0])

In [18]:
decoding = {
    0 : 'neg',
    1 : 'pos'
}

y_pred = [decoding[y] for y in y_pred]

In [19]:
id = np.arange(10000)

In [20]:
output = pd.DataFrame({
    'Id' : id,
    'label' : y_pred,
})

##### Saving Sol to a csv file

In [21]:
output.to_csv('Predictions.csv', index = False)

# On submission I got a score of 85%