In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
import h5py
from collections import Counter
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import RandomizedSearchCV

stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
np.set_printoptions(threshold=np.nan)

  from ._conv import register_converters as _register_converters


In [2]:
# Load data
IMDB_train = pd.read_csv('./IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[:][1]
IMDB_valid = pd.read_csv('./IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[:][1]
IMDB_test = pd.read_csv('./IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[:][1]
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print("Data loaded.")

Data loaded.


In [3]:
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
IMDB_train = pd.concat(frames)
IMDB_train_y = pd.concat(frames_y)

# Preprocessing

In [4]:
def preprocessing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        #clean = re.compile('<.*?>')
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data

In [5]:
def rm_numbers(data):
    new_data = []
    #i = 0
    for sentence in (data):
        new_sentence = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", sentence)
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data



In [6]:
IMDB_train = preprocessing(IMDB_train)
IMDB_test = preprocessing(IMDB_test)

In [66]:
IMDB_train_rm_num = rm_numbers(IMDB_train)
IMDB_test_rm_num = rm_numbers(IMDB_test)

# Bag of n-gram 

In [7]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [66]:

unigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words='english', max_features =30000)
bigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(2, 2), stop_words='english', max_features =30000)
trigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(3, 3), stop_words='english', max_features =30000)



In [67]:
train_unigram = unigram.fit_transform(IMDB_train).toarray()
test_unigram = unigram.transform(IMDB_test).toarray()


In [69]:
train_bigram = bigram.fit_transform(IMDB_train).toarray()
test_bigram = bigram.transform(IMDB_test).toarray()

In [70]:
train_trigram = trigram.fit_transform(IMDB_train).toarray()
test_trigram = trigram.transform(IMDB_test).toarray()

In [8]:

unigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words=None, max_features =30000)
bigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(2, 2), stop_words=None, max_features =30000)
trigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(3, 3), stop_words=None, max_features =30000)



In [None]:
train_unigram_w_sw = unigram_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_w_sw = unigram_w_sw.transform(IMDB_test).toarray()

In [23]:
train_bigram_w_sw = bigram_w_sw.fit_transform(IMDB_train).toarray()
test_bigram_w_sw = bigram_w_sw.transform(IMDB_test).toarray()

In [27]:
train_trigram_w_sw = trigram_w_sw.fit_transform(IMDB_train).toarray()
test_trigram_w_sw = trigram_w_sw.transform(IMDB_test).toarray()

In [9]:

unibigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 2), stop_words=None, max_features =30000)
allgram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 3), stop_words=None, max_features =30000)
allgram_wo_lm = CountVectorizer(analyzer='word', ngram_range=(1, 3), stop_words=None, max_features =30000)



In [10]:
train_unibigram_w_sw = unibigram_w_sw.fit_transform(IMDB_train).toarray()
test_unibigram_w_sw = unibigram_w_sw.transform(IMDB_test).toarray()

In [15]:
train_allgram_w_sw = allgram_w_sw.fit_transform(IMDB_train).toarray()
test_allgram_w_sw = allgram_w_sw.transform(IMDB_test).toarray()

In [10]:
train_allgram_wo_lm = allgram_wo_lm.fit_transform(IMDB_train).toarray()
test_allgram_wo_lm = allgram_wo_lm.transform(IMDB_test).toarray()


In [67]:
train_allgram_rm_num = allgram_w_sw.fit_transform(IMDB_train_rm_num).toarray()
test_allgram_rm_num = allgram_w_sw.transform(IMDB_test_rm_num).toarray()

# Naive Bayes

In [11]:
def Naive_Bayes_B(train_data, train_label, valid_data, valid_label, test_data, test_label):
    tuned_parameters = [{'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]
    test_valid_fold = np.r_[ np.full(train_label.shape[0], -1),np.ones(valid_label.shape[0])]
    ps = PredefinedSplit(test_valid_fold)

    clf = BernoulliNB()
    #clf.fit(np.r_[train_data,valid_data], np.r_[train_label,valid_label])
    clf = GridSearchCV(clf, tuned_parameters, refit=True, scoring='accuracy', cv=ps, return_train_score=True)
    clf.fit(np.r_[train_data,valid_data], np.r_[train_label,valid_label])
    #y_pred = clf.predict(yelp_train_x)

    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = BernoulliNB(alpha = best_param['alpha'])
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_valid = clf.predict(valid_data)
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_valid= f1_score(valid_label, y_pred_valid, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (valid): ', f1_valid)
    print('f1 (test): ', f1_test)

In [20]:
Naive_Bayes_B(train_unigram[:15000],IMDB_train_y[:15000],train_unigram[15000:],IMDB_train_y[15000:],test_unigram,IMDB_test_y)

train_scores: [0.92406667 0.92293333 0.92053333 0.91553333 0.90793333 0.89      ]
valid_scores: [0.8052 0.8135 0.8261 0.8345 0.8395 0.8393]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8395
f1 (train):  0.9079333333333335
f1 (valid):  0.8395000000000001
f1 (test):  0.80388


In [21]:
Naive_Bayes_B(train_unigram_w_sw[:15000],IMDB_train_y[:15000],train_unigram_w_sw[15000:],IMDB_train_y[15000:],test_unigram_w_sw,IMDB_test_y)

train_scores: [0.92706667 0.92586667 0.92333333 0.91866667 0.91033333 0.8942    ]
valid_scores: [0.8145 0.8247 0.8335 0.8404 0.8468 0.8453]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8468
f1 (train):  0.9103333333333333
f1 (valid):  0.8468
f1 (test):  0.81612


In [13]:
Naive_Bayes_B(train_bigram[:15000],IMDB_train_y[:15000],train_bigram[15000:],IMDB_train_y[15000:],test_bigram,IMDB_test_y)

train_scores: [0.94873333 0.94846667 0.948      0.9466     0.94293333 0.92993333]
valid_scores: [0.8051 0.8084 0.8132 0.823  0.831  0.832 ]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1}
best_estimator BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.832
f1 (train):  0.9299333333333333
f1 (valid):  0.832
f1 (test):  0.8191999999999999


In [14]:
Naive_Bayes_B(train_bigram_w_sw[:15000],IMDB_train_y[:15000],train_bigram_w_sw[15000:],IMDB_train_y[15000:],test_bigram_w_sw,IMDB_test_y)

train_scores: [0.9276     0.9274     0.92713333 0.92633333 0.92473333 0.92193333]
valid_scores: [0.8591 0.8605 0.862  0.8618 0.861  0.8609]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.001}
best_estimator BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.862
f1 (train):  0.9271333333333334
f1 (valid):  0.8619999999999999
f1 (test):  0.86016


In [None]:
Naive_Bayes_B(train_trigram[:15000],IMDB_train_y[:15000],train_trigram[15000:],IMDB_train_y[15000:],test_trigram,IMDB_test_y)

train_scores: [0.90593333 0.90593333 0.90593333 0.90566667 0.90446667 0.88766667]
valid_scores: [0.7081 0.7082 0.7083 0.7091 0.7149 0.7164]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1}
best_estimator BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.7164
f1 (train):  0.8876666666666667
f1 (valid):  0.7164
f1 (test):  0.69028


In [12]:
Naive_Bayes_B(train_trigram_w_sw[:15000],IMDB_train_y[:15000],train_trigram_w_sw[15000:],IMDB_train_y[15000:],test_trigram_w_sw,IMDB_test_y)

train_scores: [0.9348     0.93466667 0.9344     0.93393333 0.93233333 0.9254    ]
valid_scores: [0.8208 0.8239 0.8279 0.8317 0.835  0.8362]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1}
best_estimator BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8362
f1 (train):  0.9254
f1 (valid):  0.8362
f1 (test):  0.83884


In [11]:
def Naive_Bayes(train_data, train_label, test_data, test_label, cv):

    tuned_parameters = [{'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]
    clf = BernoulliNB()
    clf = GridSearchCV(clf, tuned_parameters, refit=True, scoring='accuracy', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)

    
    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = BernoulliNB(alpha = best_param['alpha'])
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [16]:
Naive_Bayes(train_unigram,IMDB_train_y,test_unigram,IMDB_test_y, 5)

train_scores: [0.9143  0.91329 0.91148 0.90837 0.90272 0.89093]
valid_scores: [0.81928 0.8262  0.83324 0.8404  0.84524 0.84388]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.84524
f1 (train):  0.89436
f1 (test):  0.81292


In [17]:
Naive_Bayes(train_unigram_w_sw,IMDB_train_y,test_unigram_w_sw,IMDB_test_y, 5)

train_scores: [0.91624 0.91499 0.91327 0.90961 0.90376 0.89206]
valid_scores: [0.8272  0.83476 0.842   0.84776 0.85156 0.8514 ]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.85156
f1 (train):  0.89508
f1 (test):  0.82412


In [16]:
Naive_Bayes(train_bigram,np.asarray(IMDB_train_y),test_bigram,np.asarray(IMDB_test_y), 5)

train_scores: [0.9325  0.93231 0.93187 0.93081 0.9283  0.91866]
valid_scores: [0.82392 0.82712 0.83108 0.83652 0.84228 0.84408]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1}
best_estimator BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.84408
f1 (train):  0.91036
f1 (test):  0.82776


In [17]:
Naive_Bayes(train_bigram_w_sw,IMDB_train_y,test_bigram_w_sw,IMDB_test_y, 5)

train_scores: [0.91403 0.91394 0.9137  0.91334 0.9127  0.9103 ]
valid_scores: [0.86568 0.866   0.86628 0.86648 0.86684 0.86528]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.86684
f1 (train):  0.905
f1 (test):  0.86616


In [14]:
Naive_Bayes(train_trigram,np.asarray(IMDB_train_y),test_trigram,np.asarray(IMDB_test_y), 5)

train_scores: [0.89638 0.89637 0.89634 0.89544 0.89209 0.87817]
valid_scores: [0.71588 0.71592 0.71628 0.71792 0.72204 0.72952]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1}
best_estimator BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.72952
f1 (train):  0.86708
f1 (test):  0.7022


In [15]:
Naive_Bayes(train_trigram_w_sw,IMDB_train_y,test_trigram_w_sw,IMDB_test_y, 5)

train_scores: [0.91979 0.91971 0.91947 0.91901 0.91776 0.91342]
valid_scores: [0.84052 0.84256 0.84444 0.8458  0.84752 0.84584]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.84752
f1 (train):  0.9089599999999999
f1 (test):  0.85092


In [71]:
Naive_Bayes(train_unibigram_w_sw,IMDB_train_y,test_unibigram_w_sw,IMDB_test_y, 5)

train_scores: [0.89824 0.89815 0.8981  0.89793 0.89762 0.89592]
valid_scores: [0.86864 0.86896 0.86908 0.86892 0.86872 0.86756]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.001}
best_estimator BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.86908
f1 (train):  0.89244
f1 (test):  0.86308


In [18]:
Naive_Bayes(train_allgram_w_sw,IMDB_train_y,test_allgram_w_sw,IMDB_test_y, 5)

train_scores: [0.89585 0.89582 0.89578 0.89558 0.89525 0.89398]
valid_scores: [0.8696  0.86956 0.8696  0.86932 0.86932 0.8678 ]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1e-05}
best_estimator BernoulliNB(alpha=1e-05, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8696
f1 (train):  0.8902
f1 (test):  0.8664


In [12]:
Naive_Bayes(train_allgram_wo_lm,IMDB_train_y,test_allgram_wo_lm,IMDB_test_y, 5)

train_scores: [0.89686 0.89679 0.89667 0.89656 0.89604 0.89454]
valid_scores: [0.86912 0.86888 0.8688  0.86856 0.86832 0.86724]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1e-05}
best_estimator BernoulliNB(alpha=1e-05, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.86912
f1 (train):  0.8918
f1 (test):  0.86688


In [68]:
Naive_Bayes(train_allgram_rm_num,IMDB_train_y,test_allgram_rm_num,IMDB_test_y, 5)

train_scores: [0.89468 0.89466 0.89459 0.89444 0.89411 0.89301]
valid_scores: [0.86748 0.86748 0.86748 0.86732 0.86724 0.86612]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 1e-05}
best_estimator BernoulliNB(alpha=1e-05, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.86748
f1 (train):  0.8894
f1 (test):  0.865


# NBSVM

In [12]:

import numpy as np

from scipy.sparse import spmatrix, coo_matrix

from sklearn.base import BaseEstimator
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin
from sklearn.svm import LinearSVC

class NBSVM(BaseEstimator, LinearClassifierMixin, SparseCoefMixin):

    def __init__(self, alpha=1, C=1, beta=0.25, fit_intercept=False):
        self.alpha = alpha
        self.C = C
        self.beta = beta
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        if len(self.classes_) == 2:
            coef_, intercept_ = self._fit_binary(X, y)
            self.coef_ = coef_
            self.intercept_ = intercept_
        else:
            coef_, intercept_ = zip(*[
                self._fit_binary(X, y == class_)
                for class_ in self.classes_
            ])
            self.coef_ = np.concatenate(coef_)
            self.intercept_ = np.array(intercept_).flatten()
        return self

    def _fit_binary(self, X, y):
        p = np.asarray(self.alpha + X[y == 1].sum(axis=0)).flatten()
        q = np.asarray(self.alpha + X[y == 0].sum(axis=0)).flatten()
        r = np.log(p/np.abs(p).sum()) - np.log(q/np.abs(q).sum())
        b = np.log((y == 1).sum()) - np.log((y == 0).sum())

        if isinstance(X, spmatrix):
            indices = np.arange(len(r))
            r_sparse = coo_matrix(
                (r, (indices, indices)),
                shape=(len(r), len(r))
            )
            X_scaled = X * r_sparse
        else:
            X_scaled = X * r

        lsvc = LinearSVC(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=10000
        ).fit(X_scaled, y)

        mean_mag =  np.abs(lsvc.coef_).mean()

        coef_ = (1 - self.beta) * mean_mag * r + self.beta * (r * lsvc.coef_)

        intercept_ = (1 - self.beta) * mean_mag * b + self.beta * lsvc.intercept_

        return coef_, intercept_

In [13]:
def NB_SVM(train_data, train_label, test_data, test_label, cv):

    tuned_parameters_C = [{'C': [0.01, 0.1, 1.0, 2.0]}]
    
    clf = NBSVM()
    clf = GridSearchCV(clf, tuned_parameters_C, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)
    best_param_c = clf.best_params_ 
    print('best_param', best_param_c)
    
    tuned_parameters_beta = [{'beta': [0.25, .5, .75]}]
    clf = NBSVM(C=best_param_c['C'])
    clf = GridSearchCV(clf, tuned_parameters_beta, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)
    best_param_b = clf.best_params_ 
    print('best_param', best_param_b)
    
    tuned_parameters_alpha = [{'alpha': [0.001, 0.01, 0.1, 1]}]
    clf = NBSVM(C=best_param_c['C'], beta=best_param_b['beta'])
    clf = GridSearchCV(clf, tuned_parameters_alpha, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    
    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = NBSVM(C=best_param_c['C'], beta=best_param_b['beta'], alpha=best_param['alpha'])
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [58]:
NB_SVM(train_unigram,np.asarray(IMDB_train_y),test_unigram,np.asarray(IMDB_test_y), 5)

best_param {'C': 0.01}
best_param {'beta': 0.5}
best_param {'alpha': 1}
train_scores: [0.96432 0.96313 0.96018 0.95212]
valid_scores: [0.88444 0.88804 0.89212 0.89316]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=1, beta=0.5, fit_intercept=False)
best_score 0.89316
f1 (train):  0.94796
f1 (test):  0.87652


In [13]:
NB_SVM(train_unigram_w_sw,IMDB_train_y,test_unigram_w_sw,IMDB_test_y, 5)

best_param {'C': 0.01}
best_param {'beta': 0.5}
best_param {'alpha': 1}
train_scores: [0.9651  0.96364 0.96071 0.95233]
valid_scores: [0.88916 0.89296 0.89616 0.89784]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=1, beta=0.5, fit_intercept=False)
best_score 0.89784
f1 (train):  0.94912
f1 (test):  0.8836


In [24]:
NB_SVM(train_bigram_w_sw,IMDB_train_y,test_bigram_w_sw,IMDB_test_y, 5)

best_param {'C': 0.01}
best_param {'beta': 0.5}
best_param {'alpha': 1}
train_scores: [0.96277 0.96262 0.96223 0.95953]
valid_scores: [0.89136 0.89156 0.89196 0.89228]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=1, beta=0.5, fit_intercept=False)
best_score 0.89228
f1 (train):  0.95484
f1 (test):  0.8904800000000002


In [28]:
NB_SVM(train_trigram_w_sw,IMDB_train_y,test_trigram_w_sw,IMDB_test_y, 5)

best_param {'C': 0.01}
best_param {'beta': 0.25}
best_param {'alpha': 0.1}
train_scores: [0.94452 0.94405 0.94293 0.93719]
valid_scores: [0.8518  0.85408 0.85596 0.85536]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=0.1, beta=0.25, fit_intercept=False)
best_score 0.85596
f1 (train):  0.9326
f1 (test):  0.85784


In [14]:
NB_SVM(train_unibigram_w_sw,IMDB_train_y,test_unibigram_w_sw,IMDB_test_y, 5)

best_param {'C': 0.01}
best_param {'beta': 0.75}
best_param {'alpha': 0.1}
train_scores: [0.97696 0.97692 0.97661 0.97479]
valid_scores: [0.90648 0.907   0.90704 0.90672]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=0.1, beta=0.75, fit_intercept=False)
best_score 0.90704
f1 (train):  0.97248
f1 (test):  0.90344


In [16]:
NB_SVM(train_allgram_w_sw,IMDB_train_y,test_allgram_w_sw,IMDB_test_y, 5)

best_param {'C': 0.01}
best_param {'beta': 0.75}
best_param {'alpha': 0.1}
train_scores: [0.97608 0.97603 0.9759  0.97439]
valid_scores: [0.90548 0.90564 0.90588 0.9058 ]
params: [{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_estimator NBSVM(C=0.01, alpha=0.1, beta=0.75, fit_intercept=False)
best_score 0.90588
f1 (train):  0.97084
f1 (test):  0.90476
