# Movie rating Prediction using NLP and Naive Bayes

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv').values.reshape((-1,))
X_train = train[:]['review']
Y_train = train[:]['label']

In [3]:
y_le = LabelEncoder()
Y_train = y_le.fit_transform(Y_train)

In [4]:
def myTokenizer(text):
    re = RegexpTokenizer('[a-zA-Z]+')
    ps = PorterStemmer()
    sw = set(stopwords.words('english'))
    
    text = text.replace('<br /><br />', ' ')
    words = re.tokenize(text.lower())
    useful_words = [ps.stem(w) for w in words if w not in sw]
    return useful_words

In [5]:
tfidf = TfidfVectorizer(tokenizer=myTokenizer, use_idf=False, max_features=40000, ngram_range=(1,2))
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print(X_train.shape)

(40000, 40000)


In [7]:
clf = MultinomialNB(alpha=1e-2)
clf.fit(X_train, Y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [8]:
print(clf.score(X_train, Y_train))

0.90695


In [9]:
pred = clf.predict(X_test)
pred = y_le.inverse_transform(pred)
df = pd.DataFrame(pred, columns=['label'])
df.to_csv('pred.csv', index_label=['Id'])

In [None]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB()) ])
parameters = {'tfidf__tokenizer': [myTokenizer], 'tfidf__use_idf': [False], 'tfidf__max_features': [20000, 40000],
               'tfidf__ngram_range': [(1, 1), (1, 2)], 'clf__alpha': (1e-2, 1e-3)}

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, Y_train)

In [None]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)