In [11]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import os
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

In [2]:
positive = "/Users/gaurav/Desktop/Kaggle/imdb/imdb-movie-reviews-dataset/aclImdb/train/pos/"
negative = "/Users/gaurav/Desktop/Kaggle/imdb/imdb-movie-reviews-dataset/aclImdb/train/neg/"

test_positive = "/Users/gaurav/Desktop/Kaggle/imdb/imdb-movie-reviews-dataset/aclImdb/test/pos/"
test_negative = "/Users/gaurav/Desktop/Kaggle/imdb/imdb-movie-reviews-dataset/aclImdb/test/neg/"

In [3]:
def content(dir, filename):
    path=dir+filename
#     print (path)
    doc = file(path, 'r')
    rating = filename.split("_")[1].split('.')[0]
    text = doc.read()
    return text, rating

In [4]:
def get_reviews(dir, dic):
    for x in os.listdir(dir):
        text_rating = content(dir, x)
#         print (rev, rating)
        dic.append(text_rating)


In [5]:
def rate(x):
    if int(x[1])>5:
        return 1
    else:
        return 0

In [6]:
def clean_data(text):
    stop = set(stopwords.words("english"))
    text = re.sub("<[a-z0-9]+>", " ", text)
    text = re.sub("[0-9]+", "NUMBER", text)
    text = re.sub("[.,#_%$&\"\'\/\(\):!]", "", text)
    words = text.split()
    words_filtered = [x for x in words if x not in stop]
    text = " ".join(words_filtered)
    return text

In [7]:
review = []
get_reviews(positive, review)
get_reviews(negative, review)

In [8]:
test_review = []
get_reviews(test_positive, test_review)
get_reviews(test_negative, test_review)

In [9]:
df = pd.DataFrame(review, columns=["review", "rating"])
df_test = pd.DataFrame(test_review, columns=["review", "rating"])

In [10]:
df["class"] = df.apply(rate, axis=1)
df_test["class"] = df_test.apply(rate, axis=1)

In [11]:
df['review_cleaned'] = df['review'].apply(lambda x: clean_data(x))
df_test['review_cleaned'] = df_test['review'].apply(lambda x: clean_data(x))

In [13]:
df.to_csv("imdb/train.csv", index=False)
df_test.to_csv("imdb/test.csv", index=False)

In [7]:
df = pd.read_csv("imdb/train.csv")
df_test = pd.read_csv("imdb/test.csv")

In [47]:
def classify(vect, clf, params, df, df_test):
    bow = vect.fit_transform(df['review_cleaned'].tolist())
    bow_test = vect.transform(df_test['review_cleaned'].tolist())
    print("Number of features: " + str(len(vect.get_feature_names())))
    model = GridSearchCV(clf, params, cv=2)
    model.fit(bow, df['class'])
    print("Best estimator: \n")
    print(model.best_estimator_)
    preds = model.predict(bow_test)
    score = accuracy_score(df_test['class'], preds)
    return model, preds, score

In [15]:
def classify2(vect, clf, df, df_test):
    bow = vect.fit_transform(df['review_cleaned'].tolist())
    bow_test = vect.transform(df_test['review_cleaned'].tolist())
    print("Number of features: " + str(len(vect.get_feature_names())))
#     model = GridSearchCV(clf, params, cv=2)
    clf.fit(bow, df['class'])
    preds = clf.predict(bow_test)
    score = accuracy_score(df_test['class'], preds)
    return clf, preds, score

In [4]:
%%time
vect = CountVectorizer(min_df=5, max_features=10000)
tc = tree.DecisionTreeClassifier()
parameters = {'max_features':(1000, 5000, 10000)}
tc, preds, score = classify(vect, tc, parameters, df, df_test)
print "Accuracy: " + str(score)

Number of features: 10000
Best estimator: 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=10000, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy: 0.71348
Wall time: 4min 18s


In [7]:
%%time
vect = CountVectorizer(min_df=5, max_features=10000, ngram_range=(1,2))
tc = tree.DecisionTreeClassifier()
parameters = {'max_features':(1000, 5000, 9999)}
tc2, preds, score = classify(vect, tc, parameters, df, df_test)
print "Accuracy: " + str(score)

Number of features: 10000
Best estimator: 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=5000, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy: 0.71208
Wall time: 4min 8s


In [8]:
%%time
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2))
tc = tree.DecisionTreeClassifier()
parameters = {'max_features':(1000, 5000, 9999)}
tc3, preds, score = classify(vect, tc, parameters, df, df_test)
print "Accuracy: " + str(score)

Number of features: 102215
Best estimator: 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=9999, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy: 0.717
Wall time: 2min 47s


In [9]:
%%time
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2), max_features=10000)
rfc = RandomForestClassifier(n_estimators=200)
parameters = {'n_estimators':(50, 100, 200)}
rf, preds, score = classify(vect, rfc, parameters, df, df_test)
print "Accuracy: " + str(score)

Number of features: 10000
Best estimator: 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy: 0.85352
Wall time: 17min


In [12]:
%%time
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2), max_features=10000)
lr2 = LogisticRegression()
parameters = {}
lr2, preds, score = classify(vect, lr2, parameters, df, df_test)
print "Accuracy: " + str(score)

Number of features: 10000
Best estimator: 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.8872
Wall time: 1min 32s


In [16]:
%%time
vect = CountVectorizer(min_df=5, ngram_range=(1,2), max_features=10000)
lr = LogisticRegression()
lr, preds, score = classify2(vect, lr, df, df_test)
print "Accuracy: " + str(score)

Number of features: 10000
Accuracy: 0.85832
Wall time: 1min 38s


In [36]:
def get_coef(word, vect, model):
    i=0
    index=-1
    for x in vect.get_feature_names():
        if x==word:
            index = i
        i=i+1
#     print("Word not found in features")
    if index != -1:
        print("Weight of " + word + ": " + str(model.coef_[0][index]))

In [42]:
get_coef("excellent", vect, lr)
get_coef("worst", vect, lr)
get_coef("bad", vect, lr)

Weight of excellent: 1.1407312792412023
Weight of worst: -1.9396562892559814
Weight of bad: -0.7057011326587759


In [49]:
%%time
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2), max_features=10000)
bow = vect.fit_transform(df['review_cleaned'].tolist())
bow_test = vect.transform(df_test['review_cleaned'].tolist())

Wall time: 1min 28s


In [50]:
%%time
nb = GaussianNB()
nb.fit(bow.toarray(), df['class'])
preds = nb.predict(bow_test.toarray())
score = accuracy_score(df_test['class'], preds)
print "Accuracy: " + str(score)

Accuracy: 0.80532
Wall time: 21min 38s
