## TFIDF with preprocessing over an Ensemble of Models
An ensemble of:
* Logistic Model with SAGA solver
* Logistic Model with liblinear solver
* LinearSVC Model

This ensemble of model gave best mean squared error. Even better cost obtained using expected value of prediction by taking probability of predictions of Logistic model.

In [1]:
import nltk
import nltk.sentiment
import numpy as np
import json
import pickle
import timeit
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import re

In [2]:
f = open("../stopwords_list.dat")
stopwords = set([word.strip().lower() for word in f.readlines()])
f.close()

In [3]:
devfile = "../../A1_Data/dev.json"
trainfile = "../../A1_Data/train.json"

### Train Model

In [4]:
feature_vector = pickle.load(open("neg_tfidf_train_vector.pickle","rb"))

In [5]:
train_labels = []
f = open(trainfile,"r")
while(True):
    s = f.readline()
    if (s==""):
        break
    json_dump = json.loads(s)    
    train_labels.append(json_dump["ratings"])
f.close()

In [6]:
from sklearn.linear_model import LogisticRegression
model_log = LogisticRegression(penalty="l2", multi_class='multinomial', solver="saga", max_iter=2000, n_jobs=2)

In [7]:
start = timeit.default_timer()
print(model_log.fit(feature_vector, train_labels))
print(timeit.default_timer()-start)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='multinomial',
          n_jobs=2, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)
245.9304527430795


In [8]:
pickle.dump(model_log,open("neg_logist_model.pickle", "wb"))

In [9]:
from sklearn.svm import LinearSVC
model_svc = LinearSVC(penalty="l2", loss="squared_hinge", dual=False, tol=0.0001, C=1.0, multi_class="ovr",
                 fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None,
                  max_iter=1000)

In [10]:
start = timeit.default_timer()
print(model_svc.fit(feature_vector, train_labels))
print("Train time:",timeit.default_timer()-start)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Train time: 660.7536866678856


In [11]:
pickle.dump(model_svc,open("neg_svc_model.pickle", "wb"))

In [12]:
from sklearn.linear_model import LogisticRegression
model_liblin = LogisticRegression(penalty="l2", multi_class='ovr', solver="liblinear", max_iter=2000, n_jobs=2)

In [13]:
start = timeit.default_timer()
print(model_liblin.fit(feature_vector, train_labels))
print("Train time:",timeit.default_timer()-start)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Train time: 492.73601657757536


In [14]:
pickle.dump(model_liblin,open("neg_liblin_model.pickle", "wb"))

### Validation phase

In [16]:
dev_feature_vector = pickle.load(open("neg_tfidf_dev_vector.pickle","rb"))

In [18]:
dev_labels = []
dev_data = []
f = open(devfile,"r")
while(True):
    s = f.readline()
    if (s==""):
        break
    json_dump = json.loads(s)    
    dev_labels.append(json_dump["ratings"])
    dev_data.append(json_dump["review"])
f.close()

In [22]:
y_pred = (model_log.predict(dev_feature_vector) + model_svc.predict(dev_feature_vector)
          + model_liblin.predict(dev_feature_vector))/3
diff = y_pred - np.array(dev_labels)
cost = np.sum(diff*diff)
print("Cost: \t\t",cost)

Cost: 		 93589.44444444445
