In [52]:
from __future__ import print_function

import numpy as np
import pandas as pd

from glob import glob
import os.path
from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

## Load dataset

In [53]:
# Extract the review texts from all files that match the pattern

def extract_reviews_data(file_pattern):
    data = []
    for filename in glob(file_pattern):
        with open(filename, "rb") as f:
            review = f.read().decode("utf-8")
            doc_id = filename.split("/")[-1].split(".")[0]
            data.append({
                "review": review
            })
    return data

In [54]:
# Extract the training data with pos and neg reviews
train_pos_path = os.path.expanduser("~/Desktop/DL4NLT/aclImdb/train/pos/*.txt")
train_neg_path = os.path.expanduser("~/Desktop/DL4NLT/aclImdb/train/neg/*.txt")

train_pos_data = extract_reviews_data(train_pos_path)
train_neg_data = extract_reviews_data(train_neg_path)

In [55]:
# Extract the test data with pos and neg reviews
test_pos_path = os.path.expanduser("~/Desktop/DL4NLT/aclImdb/test/pos/*.txt")
test_neg_path = os.path.expanduser("~/Desktop/DL4NLT/aclImdb/test/neg/*.txt")

test_pos_data = extract_reviews_data(test_pos_path)
test_neg_data = extract_reviews_data(test_neg_path)

In [56]:
# Create a train.tsv file. positive reviews are labeled as 1 and negative as 0. shuffle data before save to file
train_pos_df = pd.DataFrame(train_pos_data)
train_pos_df.head()

train_pos_df["sentiment"] = 1
train_neg_df = pd.DataFrame(train_neg_data)
train_neg_df["sentiment"] = 0

train_df = pd.concat([train_pos_df, train_neg_df], axis=0)
train_df = shuffle(train_df)
train_df["doc_id"] = np.arange(len(train_df))

train_df[["doc_id", "sentiment", "review"]].to_csv("train.tsv", sep="\t")

In [57]:
# Create the test.tsv file. positive reviews are labeled as 1 and negative as 0. shuffle data before save to file
test_pos_df = pd.DataFrame(test_pos_data)
test_pos_df["sentiment"] = 1
test_neg_df = pd.DataFrame(test_neg_data)
test_neg_df["sentiment"] = 0

test_df = pd.concat([test_pos_df, test_neg_df], axis=0)

test_df = shuffle(test_df)
test_df["doc_id"] = np.arange(len(test_df))

test_df[["doc_id", "review"]].to_csv("test.tsv", sep="\t", index=False)

In [58]:
train_df = pd.read_csv("train.tsv", sep="\t")

## Split train - test data

In [63]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df["review"], train_df["sentiment"], test_size=0.2)

In [64]:
print("Training Data: {}, Validation: {}".format(len(X_train), len(X_valid)))

Training Data: 20000, Validation: 5000


## Vectorize data

In [65]:
vect = CountVectorizer(max_features=5000, binary=True, stop_words="english")

In [66]:
# Fit training data
vect.fit(X_train)


CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [67]:
# transform training and validation data
X_train_vect = vect.transform(X_train)
X_valid_vect = vect.transform(X_valid)

## Model 1 - Logistic Regression

In [68]:
model_1 = LogisticRegression()
model_1.fit(X_train_vect, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [69]:
# Training Accuracy
print("Training Accuracy: {:.3f}".format(model_1.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_1.score(X_valid_vect, y_valid)))

Training Accuracy: 0.964
Validation Accuracy: 0.858


## Model 2 - Naive Bayes

In [70]:
model_2 = MultinomialNB()
model_2.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [71]:
# Training Accuracy
print("Training Accuracy: {:.3f}".format(model_2.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_2.score(X_valid_vect, y_valid)))

Training Accuracy: 0.864
Validation Accuracy: 0.858


## Random Forest

In [72]:
model_3 = RandomForestClassifier(min_samples_leaf=3, n_estimators=25, n_jobs=-1)
model_3.fit(X_train_vect, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
# Training Accuracy
print("Training Accuracy: {:.3f}".format(model_3.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_3.score(X_valid_vect, y_valid)))

Training Accuracy: 0.948
Validation Accuracy: 0.831


## Model 4 - Gradient Boosted Trees

In [75]:
model_4 = RandomForestClassifier(min_samples_leaf=3, n_estimators=25, n_jobs=-1)
model_4.fit(X_train_vect, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [76]:
# Training Accuracy
print("Training Accuracy: {:.3f}".format(model_4.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_4.score(X_valid_vect, y_valid)))


Training Accuracy: 0.950
Validation Accuracy: 0.831


## Ensemble model

In [77]:
classifiers = [("Logistic Regression", model_1), 
               ("Naive Bayes", model_2), 
               ("Random Forest", model_3), 
               ("Gradient Boosted", model_4)]

In [78]:
classifiers

[('Logistic Regression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('Naive Bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
 ('Random Forest',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False)),
 ('Gradient Boosted',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes

In [79]:
ensemble_model = VotingClassifier(classifiers, n_jobs=-1)

In [80]:
ensemble_model.fit(X_train_vect, y_train)

VotingClassifier(estimators=[('Logistic Regression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [81]:
# Accuracy on the ensembled Model
print("Training Accuracy: {:.3f}".format(ensemble_model.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(ensemble_model.score(X_valid_vect, y_valid)))

  if diff:


Training Accuracy: 0.951
Validation Accuracy: 0.857


  if diff:


## Test the ensembled model 

In [82]:
test_df = pd.read_csv("test.tsv", sep="\t")

In [83]:
X_test = test_df.review
X_test_vect = vect.transform(X_test)

In [84]:
y_test_pred = ensemble_model.predict(X_test_vect)

  if diff:


In [85]:
df = pd.DataFrame({
    "doc_id": test_df.doc_id,
    "sentiment": y_test_pred
})

In [86]:
df.to_csv("ensemlbe_pred.csv", index=False)