# Movie rating Prediction using NLP and Naive Bayes

In [1]:
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Load the labelled training set and the unlabelled test reviews

train = pd.read_csv('./train.csv')
X_train, y_train = train['review'], train['label']
# The test frame is flattened into a 1-D array of review strings
X_test = pd.read_csv('./test.csv').values.reshape(-1)

print(X_train.shape, y_train.shape, X_test.shape)


# Encode the class labels as integers; `le` is kept so predictions can be
# decoded back to the original labels with `le.inverse_transform`

le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
print(X_train[1], y_train[1])

(40000,) (40000,) (10000,)
http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules. 1


In [3]:
# Custom tokenizer

# Named `word_tokenizer` rather than `re` so it does not shadow the standard
# library's `re` module (a later `import re` would otherwise clobber it).
word_tokenizer = RegexpTokenizer('[a-zA-Z]+')
ps = PorterStemmer()
sw = set(stopwords.words('english'))


def myTokenizer(text):
    """Turn a raw review into a list of stemmed, stopword-free word tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML ``<br />`` line breaks.

    Returns
    -------
    list of str
        Lower-cased alphabetic tokens with English stopwords removed,
        each reduced to its Porter stem.
    """
    # Replace every `<br />` (single or doubled) with a space; a lone tag
    # would otherwise survive tokenization as the spurious token 'br'.
    text = text.replace('<br />', ' ')
    words = word_tokenizer.tokenize(text.lower())
    # Stopwords are filtered before stemming, so the check uses surface forms.
    return [ps.stem(w) for w in words if w not in sw]

In [4]:
# Sanity-check the tokenizer on one training review
sample_tokens = myTokenizer(X_train[1])
print(sample_tokens)

['http', 'video', 'googl', 'com', 'videoplay', 'docid', 'hl', 'en', 'distribut', 'tri', 'opt', 'mass', 'appeal', 'want', 'best', 'possibl', 'view', 'rang', 'forgo', 'profit', 'continu', 'manual', 'labor', 'job', 'gladli', 'entertain', 'work', 'view', 'texa', 'tale', 'pleas', 'write', 'like', 'like', 'alex', 'like', 'stuie', 'texa', 'texa', 'tale', 'write', 'opinion', 'rule']


In [5]:
# Vectorization and cleaning of reviews

# token_pattern=None: the custom tokenizer replaces the default regex, and
# leaving the default in place makes scikit-learn warn that it is ignored.
tfidf = TfidfVectorizer(tokenizer=myTokenizer, token_pattern=None,
                        max_features=30000, ngram_range=(1, 2))

# NOTE: X_train / X_test are rebound here from raw text to TF-IDF feature
# matrices, so this cell cannot be re-run without re-running the load cell.
# The raw training reviews remain available as train['review'].
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

print(X_train.shape)

(40000, 30000)


In [6]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features

clf = MultinomialNB(alpha=1e-2).fit(X_train, y_train)

# Accuracy on the training data itself (not a held-out estimate)

train_accuracy = clf.score(X_train, y_train)
print(train_accuracy)

0.907075


In [8]:
# Predict on the test set and decode the integer classes back to the
# original labels

y_pred = le.inverse_transform(clf.predict(X_test))

# Write the predictions to CSV: an 'Id' index column plus the 'label' column
df = pd.DataFrame({'label': y_pred})
df.to_csv('./y_pred.csv', index_label=['Id'])

In [None]:
# Optimizing parameters using GridSearchCV

# token_pattern=None silences the "token_pattern will not be used" warning
# that would otherwise be raised on every fit with the custom tokenizer.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=None)),
    ('clf', MultinomialNB()),
])
parameters = {
    'tfidf__tokenizer': [myTokenizer],
    'tfidf__use_idf': [True, False],
    'tfidf__max_features': [20000, 40000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# The pipeline does its own vectorization, so it must be fit on raw review
# text. `X_train` was rebound to a TF-IDF matrix by the vectorization cell,
# so the original text column is taken from `train` instead.
gs_clf = gs_clf.fit(train['review'], y_train)

In [None]:
# Report the best cross-validated score, then the parameter combination
# that achieved it (one per line)

print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')