In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import RandomizedSearchCV
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
np.set_printoptions(threshold=np.nan)

In [2]:
# Load data
IMDB_train = pd.read_csv('./IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[:][1]
IMDB_valid = pd.read_csv('./IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[:][1]
IMDB_test = pd.read_csv('./IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[:][1]
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print("Data loaded.")

Data loaded.


In [3]:
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
IMDB_train = pd.concat(frames)
IMDB_train_y = pd.concat(frames_y)

# Preprocessing

In [4]:
def preprocessing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data

In [5]:
def rm_numbers(data):
    new_data = []
    #i = 0
    for sentence in (data):
        new_sentence = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", sentence)
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data



In [6]:
IMDB_train = preprocessing(IMDB_train)
IMDB_test = preprocessing(IMDB_test)

In [57]:
IMDB_train_rm_num = rm_numbers(IMDB_train)
IMDB_test_rm_num = rm_numbers(IMDB_test)

# Bag of n-gram 

In [7]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [10]:

unigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words='english', max_features =30000)
unigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words=None, max_features =30000)

In [11]:
train_unigram = unigram.fit_transform(IMDB_train).toarray()
test_unigram = unigram.transform(IMDB_test).toarray()


In [12]:
train_unigram_w_sw = unigram_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_w_sw = unigram_w_sw.transform(IMDB_test).toarray()


In [8]:
unigram_tfid_w_sw = TfidfVectorizer(tokenizer=LemmaTokenizer(),analyzer='word', stop_words=None, ngram_range=(1, 1), max_features =30000)

In [9]:
train_unigram_tfid_w_sw = unigram_tfid_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_tfid_w_sw = unigram_tfid_w_sw.transform(IMDB_test).toarray()

# Random & Majority Classifiers as Baseline

In [50]:
def random_classifier(label):
    pred = np.random.randint(0,2, size=len(label))
    f1= f1_score(label, pred, average='micro')
    return f1
            

In [51]:
IMDB_train_rc_f1 = random_classifier(IMDB_train_y)
print(IMDB_train_rc_f1)

IMDB_test_rc_f1 = random_classifier(IMDB_test_y)
print(IMDB_test_rc_f1)

0.5042
0.504


In [52]:
def takeSecond(elem):
    return elem[1]

def majority_classifier_train(label):
    frequency = list(Counter(label).items())
    frequency = sorted(frequency, key=takeSecond, reverse=True)
    majority_class = frequency[0][0]
    pred = np.full(len(label), majority_class)
    f1 = f1_score(label, pred, average='micro')
    return f1, majority_class


def majority_classifier_test(label, majority_class):
    pred = np.full(len(label), majority_class)
    f1 = f1_score(label, pred, average='micro')
    return f1
    

In [53]:
IMDB_train_mc_f1,IMDB_train_mc_class = majority_classifier_train(IMDB_train_y)
print(IMDB_train_mc_f1,IMDB_train_mc_class)

IMDB_test_mc_f1 = majority_classifier_test(IMDB_test_y, IMDB_train_mc_class)
print(IMDB_test_mc_f1)

0.5 1
0.5


# Naive Bayes Classifier

In [11]:
def Naive_Bayes_Bernoulli(train_data, train_label, test_data, test_label, cv):

    tuned_parameters = [{'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]
    clf = BernoulliNB()
    clf = GridSearchCV(clf, tuned_parameters, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)

    
    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = BernoulliNB(alpha = best_param['alpha'])
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [16]:
Naive_Bayes_Bernoulli(train_unigram,np.asarray(IMDB_train_y),test_unigram,np.asarray(IMDB_test_y), 5)

train_scores: [0.9143  0.91329 0.91148 0.90837 0.90272 0.89093]
valid_scores: [0.81928 0.8262  0.83324 0.8404  0.84524 0.84388]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.84524
f1 (train):  0.89436
f1 (test):  0.81292


In [17]:
Naive_Bayes_Bernoulli(train_unigram_w_sw,IMDB_train_y,test_unigram_w_sw,IMDB_test_y, 5)

train_scores: [0.91624 0.91499 0.91327 0.90961 0.90376 0.89206]
valid_scores: [0.8272  0.83476 0.842   0.84776 0.85156 0.8514 ]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.85156
f1 (train):  0.89508
f1 (test):  0.82412


In [15]:
def Naive_Bayes_Gaussian(train_data, train_label, test_data, test_label, cv):

    tuned_parameters = [{'priors': [[0.5,0.5],[0.9,0.1],[0.1,0.9]]}]
    clf = GaussianNB()
    clf = GridSearchCV(clf, tuned_parameters, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)

    
    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = GaussianNB(priors = best_param['priors'])
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [17]:
Naive_Bayes_Gaussian(train_unigram_tfid_w_sw,IMDB_train_y,test_unigram_tfid_w_sw,IMDB_test_y, 5)

train_scores: [0.91785 0.91783 0.91787]
valid_scores: [0.69936 0.69936 0.69932]
params: [{'priors': [0.5, 0.5]}, {'priors': [0.9, 0.1]}, {'priors': [0.1, 0.9]}]
best_param {'priors': [0.5, 0.5]}
best_estimator GaussianNB(priors=[0.5, 0.5])
best_score 0.69936
f1 (train):  0.90784
f1 (test):  0.61956


# Support Vector Machine Classifier

In [16]:
def SVM(train_data, train_label, test_data, test_label, cv):

    tuned_parameters = [{'C': [0.01, 0.1, 1.0, 2.0, 3.0], 'tol': [0.0001, 0.001, 0.01, 0.1, 1]}]
    
    clf = LinearSVC(dual=False)
    clf = GridSearchCV(clf, tuned_parameters, refit=True, scoring='f1_micro', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)

    
    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_ 
    print('best_param', best_param)
    best_estimator = clf.best_estimator_  
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    clf = LinearSVC(C=best_param['C'], tol=best_param['tol'], dual=False)
    clf.fit(train_data, train_label)
    
    y_pred_train = clf.predict(train_data)    
    y_pred_test = clf.predict(test_data)
    f1_train= f1_score(train_label, y_pred_train, average='micro')
    f1_test= f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [14]:
SVM(train_unigram,IMDB_train_y,test_unigram,IMDB_test_y, 5)

train_scores: [0.97209 0.97213 0.97222 0.97255 0.92488 0.9983  0.99831 0.99825 0.98903
 0.9369  1.      1.      0.99996 0.98953 0.94069 1.      1.      0.99998
 0.98849 0.94157 1.      1.      0.99999 0.98971 0.94156]
valid_scores: [0.88008 0.88016 0.87992 0.87968 0.86488 0.8662  0.8666  0.86612 0.86636
 0.85948 0.85132 0.8524  0.85204 0.86232 0.85996 0.8492  0.85032 0.85128
 0.86176 0.85988 0.84876 0.85004 0.8508  0.86152 0.86048]
params: [{'C': 0.01, 'tol': 0.0001}, {'C': 0.01, 'tol': 0.001}, {'C': 0.01, 'tol': 0.01}, {'C': 0.01, 'tol': 0.1}, {'C': 0.01, 'tol': 1}, {'C': 0.1, 'tol': 0.0001}, {'C': 0.1, 'tol': 0.001}, {'C': 0.1, 'tol': 0.01}, {'C': 0.1, 'tol': 0.1}, {'C': 0.1, 'tol': 1}, {'C': 1.0, 'tol': 0.0001}, {'C': 1.0, 'tol': 0.001}, {'C': 1.0, 'tol': 0.01}, {'C': 1.0, 'tol': 0.1}, {'C': 1.0, 'tol': 1}, {'C': 2.0, 'tol': 0.0001}, {'C': 2.0, 'tol': 0.001}, {'C': 2.0, 'tol': 0.01}, {'C': 2.0, 'tol': 0.1}, {'C': 2.0, 'tol': 1}, {'C': 3.0, 'tol': 0.0001}, {'C': 3.0, 'tol': 0.001}, {

In [15]:
SVM(train_unigram_w_sw,IMDB_train_y,test_unigram_w_sw,IMDB_test_y, 5)

train_scores: [0.97504 0.97502 0.97461 0.96893 0.91224 0.99876 0.99861 0.99806 0.98175
 0.91703 1.      0.99997 0.99979 0.98957 0.91708 1.      1.      0.9999
 0.98647 0.91708 1.      1.      0.99989 0.98642 0.91713]
valid_scores: [0.8842  0.88424 0.88408 0.88264 0.86996 0.86924 0.86932 0.87012 0.87328
 0.86876 0.85768 0.85908 0.86204 0.86808 0.86872 0.8568  0.85896 0.86208
 0.87236 0.86868 0.8564  0.85776 0.86044 0.87436 0.86848]
params: [{'C': 0.01, 'tol': 0.0001}, {'C': 0.01, 'tol': 0.001}, {'C': 0.01, 'tol': 0.01}, {'C': 0.01, 'tol': 0.1}, {'C': 0.01, 'tol': 1}, {'C': 0.1, 'tol': 0.0001}, {'C': 0.1, 'tol': 0.001}, {'C': 0.1, 'tol': 0.01}, {'C': 0.1, 'tol': 0.1}, {'C': 0.1, 'tol': 1}, {'C': 1.0, 'tol': 0.0001}, {'C': 1.0, 'tol': 0.001}, {'C': 1.0, 'tol': 0.01}, {'C': 1.0, 'tol': 0.1}, {'C': 1.0, 'tol': 1}, {'C': 2.0, 'tol': 0.0001}, {'C': 2.0, 'tol': 0.001}, {'C': 2.0, 'tol': 0.01}, {'C': 2.0, 'tol': 0.1}, {'C': 2.0, 'tol': 1}, {'C': 3.0, 'tol': 0.0001}, {'C': 3.0, 'tol': 0.001}, {'

In [18]:
SVM(train_unigram_tfid_w_sw,IMDB_train_y,test_unigram_tfid_w_sw,IMDB_test_y, 5)

train_scores: [0.86128 0.86126 0.86105 0.86105 0.847   0.93103 0.93108 0.93045 0.93019
 0.91735 0.98781 0.98781 0.9878  0.98774 0.96572 0.99602 0.99602 0.99607
 0.99615 0.97323 0.9985  0.99849 0.99849 0.99759 0.97386]
valid_scores: [0.84408 0.84408 0.84436 0.84436 0.83536 0.88648 0.88644 0.8866  0.88768
 0.88464 0.89012 0.89008 0.89004 0.88944 0.88772 0.88624 0.88636 0.8864
 0.88476 0.88308 0.88264 0.88264 0.88136 0.87896 0.88096]
params: [{'C': 0.01, 'tol': 0.0001}, {'C': 0.01, 'tol': 0.001}, {'C': 0.01, 'tol': 0.01}, {'C': 0.01, 'tol': 0.1}, {'C': 0.01, 'tol': 1}, {'C': 0.1, 'tol': 0.0001}, {'C': 0.1, 'tol': 0.001}, {'C': 0.1, 'tol': 0.01}, {'C': 0.1, 'tol': 0.1}, {'C': 0.1, 'tol': 1}, {'C': 1.0, 'tol': 0.0001}, {'C': 1.0, 'tol': 0.001}, {'C': 1.0, 'tol': 0.01}, {'C': 1.0, 'tol': 0.1}, {'C': 1.0, 'tol': 1}, {'C': 2.0, 'tol': 0.0001}, {'C': 2.0, 'tol': 0.001}, {'C': 2.0, 'tol': 0.01}, {'C': 2.0, 'tol': 0.1}, {'C': 2.0, 'tol': 1}, {'C': 3.0, 'tol': 0.0001}, {'C': 3.0, 'tol': 0.001}, {'