In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
import h5py
from collections import Counter
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import RandomizedSearchCV

import sys

# Shared NLP helpers. In the visible notebook they are referenced only by the
# disabled (string-literal) `stem_lem` / `rm_stopwords` code further down.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the nltk corpus reader imported
# above to a plain list of English stop words.
stopwords = stopwords.words('english')
# Print arrays in full, without summarisation. Newer NumPy releases reject
# threshold=np.nan with a ValueError and recommend sys.maxsize instead.
np.set_printoptions(threshold=sys.maxsize)

  from ._conv import register_converters as _register_converters


In [2]:
# Load data
# Each split is read as a headerless, tab-separated file. Column 1 is kept as
# the label vector; column 0 is the text later consumed by `preprocessing`.
IMDB_train = pd.read_csv('./IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[1]
IMDB_valid = pd.read_csv('./IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[1]
IMDB_test = pd.read_csv('./IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[1]

print("Data loaded.")

Data loaded.


In [3]:
# Merge the validation split into the training split; rows and labels are
# concatenated in the same order so they stay aligned.
# NOTE(review): this cell rebinds IMDB_train / IMDB_train_y, so running it a
# second time would append the validation rows again.
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
# ignore_index=True gives the merged objects a fresh 0..n-1 row index. Without
# it both inputs keep their own 0-based index and the result has duplicate labels.
IMDB_train = pd.concat(frames, ignore_index=True)
IMDB_train_y = pd.concat(frames_y, ignore_index=True)

# Preprocessing

In [4]:
def preprocessing(data, drop_empty=False):
    """Clean the raw review text stored under key ``0`` of ``data``.

    Each review has HTML-style tags (``<...>``) stripped, every character that
    is neither a word character nor whitespace removed, and is lower-cased.

    Args:
        data: object whose ``data[0]`` yields the review strings (in this
            notebook, a DataFrame read with ``header=None``).
        drop_empty: when True, reviews that are empty after cleaning are
            omitted from the result. The default keeps them, so the returned
            list always has one entry per input row and stays aligned with
            label vectors that were extracted separately.

    Returns:
        list of cleaned strings, in input order.
    """
    # Compile once instead of re-parsing the patterns for every review.
    tag_pattern = re.compile('<.*?>')
    punctuation_pattern = re.compile(r'[^\w\s]')

    new_data = []
    for sentence in data[0]:
        cleaned = tag_pattern.sub('', sentence)         # remove HTML tags
        cleaned = punctuation_pattern.sub('', cleaned)  # remove punctuation
        cleaned = cleaned.lower()                       # convert to lower case
        if cleaned or not drop_empty:
            new_data.append(cleaned)
    return new_data

In [5]:
"""
def rm_stopwords(data):
    new_data = []
    for sent in data:
        new_data.append([w for w in sent if w not in stopwords])
    return new_data
"""

'\ndef rm_stopwords(data):\n    new_data = []\n    for sent in data:\n        new_data.append([w for w in sent if w not in stopwords])\n    return new_data\n'

In [6]:
"""
<<<<IGNORE THESE FOR NOW>>>>

def tokenize(data):
    new_data = []
    for sentence in (data):
        new_sentence = nltk.word_tokenize(sentence)
        new_data.append(new_sentence)
    return new_data        

def stem_lem(data):
    new_data = []
    for sent in data:
        this_sent = []
        for w in test3:
            w = stemmer.stem(w)
            w = lemmatizer.lemmatize(w)
            this_sent.append(stemmer.stem(w))
        new_data.append(this_sent)
    return new_data
"""

'\n<<<<IGNORE THESE FOR NOW>>>>\n\ndef tokenize(data):\n    new_data = []\n    for sentence in (data):\n        new_sentence = nltk.word_tokenize(sentence)\n        new_data.append(new_sentence)\n    return new_data        \n\ndef stem_lem(data):\n    new_data = []\n    for sent in data:\n        this_sent = []\n        for w in test3:\n            w = stemmer.stem(w)\n            w = lemmatizer.lemmatize(w)\n            this_sent.append(stemmer.stem(w))\n        new_data.append(this_sent)\n    return new_data\n'

In [5]:
# Replace the raw DataFrames with lists of cleaned review strings.
# NOTE(review): the same names are rebound from DataFrame to list, so this cell
# should not be re-run without first re-running the load and merge cells.
IMDB_train = preprocessing(IMDB_train)
#IMDB_valid = preprocessing(IMDB_valid)
IMDB_test = preprocessing(IMDB_test)

In [8]:
#IMDB_train_tok = tokenize(IMDB_train)
#IMDB_valid_tok = tokenize(IMDB_valid)
#IMDB_test_tok = tokenize(IMDB_test)

In [9]:
#IMDB_train_sl = stem_lem(IMDB_train_tok)
#IMDB_valid_sl = stem_lem(IMDB_valid_tok)
#IMDB_test_sl = stem_lem(IMDB_test_tok)

In [10]:
#IMDB_train_stop = rm_stopwords(IMDB_train_sl)
#IMDB_valid_stop = rm_stopwords(IMDB_valid_sl)
#IMDB_test_stop = rm_stopwords(IMDB_test_sl)

# Bag of n-grams

In [6]:
class LemmaTokenizer(object):
    """Callable tokenizer that splits text into words and lemmatizes each one.

    Instances are meant to be passed as the ``tokenizer`` argument of a
    ``CountVectorizer``.
    """

    def __init__(self):
        # One WordNet lemmatizer per tokenizer instance, reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens found in the string ``articles``."""
        tokens = word_tokenize(articles)
        return list(map(self.wnl.lemmatize, tokens))

In [12]:

# One count vectorizer per n-gram order (1, 2, 3). Each uses lemmatized tokens,
# the built-in English stop-word list, and keeps at most 30000 features.
unigram, bigram, trigram = (
    CountVectorizer(
        tokenizer=LemmaTokenizer(),
        analyzer='word',
        ngram_range=(order, order),
        stop_words='english',
        max_features=30000,
    )
    for order in (1, 2, 3)
)



In [13]:
# Learn the unigram vocabulary from the training text only, then encode the
# test text with that same vocabulary. .toarray() converts the sparse count
# matrices into dense NumPy arrays, which can be memory-hungry at this width.
train_unigram = unigram.fit_transform(IMDB_train).toarray()
test_unigram = unigram.transform(IMDB_test).toarray()


In [14]:
# Same procedure for bigrams: fit the vocabulary on the training text, reuse it
# for the test text, and densify both count matrices.
train_bigram = bigram.fit_transform(IMDB_train).toarray()
test_bigram = bigram.transform(IMDB_test).toarray()

In [44]:
# Same procedure for trigrams: fit the vocabulary on the training text, reuse
# it for the test text, and densify both count matrices.
train_trigram = trigram.fit_transform(IMDB_train).toarray()
test_trigram = trigram.transform(IMDB_test).toarray()

In [7]:

# Same three vectorizers as before, but with stop_words=None so that stop words
# are kept in the vocabulary ("_w_sw" = with stop words).
unigram_w_sw, bigram_w_sw, trigram_w_sw = (
    CountVectorizer(
        tokenizer=LemmaTokenizer(),
        analyzer='word',
        ngram_range=(order, order),
        stop_words=None,
        max_features=30000,
    )
    for order in (1, 2, 3)
)



In [8]:
# Unigram counts with stop words kept: vocabulary is fitted on the training
# text and reused for the test text; both results are converted to dense arrays.
train_unigram_w_sw = unigram_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_w_sw = unigram_w_sw.transform(IMDB_test).toarray()


In [None]:
# Bigram counts with stop words kept: vocabulary is fitted on the training
# text and reused for the test text; both results are converted to dense arrays.
train_bigram_w_sw = bigram_w_sw.fit_transform(IMDB_train).toarray()
test_bigram_w_sw = bigram_w_sw.transform(IMDB_test).toarray()

In [None]:
# Trigram counts with stop words kept: vocabulary is fitted on the training
# text and reused for the test text; both results are converted to dense arrays.
train_trigram_w_sw = trigram_w_sw.fit_transform(IMDB_train).toarray()
test_trigram_w_sw = trigram_w_sw.transform(IMDB_test).toarray()

In [19]:
def Naive_Bayes_B(train_data, train_label, valid_data, valid_label, test_data, test_label):
    """Tune BernoulliNB's ``alpha`` on a fixed train/validation split and print scores.

    A grid search over six ``alpha`` values is run on the stacked
    train + validation rows with a ``PredefinedSplit``, so every candidate is
    fitted on the training rows and scored (accuracy) on the validation rows.
    A fresh BernoulliNB with the best ``alpha`` is then fitted on the training
    rows only, and micro-averaged F1 is printed for train, validation and test.

    Args:
        train_data, valid_data, test_data: feature matrices; train and
            validation are stacked row-wise with ``np.r_``.
        train_label, valid_label, test_label: label vectors exposing
            ``.shape[0]``.

    Returns:
        None. All results are printed.
    """
    alpha_grid = [{'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]

    # Fold id -1 keeps a row out of every test fold (the training rows); the
    # validation rows all share fold id 1 and form the single test fold.
    n_train = train_label.shape[0]
    n_valid = valid_label.shape[0]
    fold_ids = np.r_[np.full(n_train, -1), np.ones(n_valid)]
    fixed_split = PredefinedSplit(fold_ids)

    search = GridSearchCV(
        BernoulliNB(),
        alpha_grid,
        refit=True,
        scoring='accuracy',
        cv=fixed_split,
        return_train_score=True,
    )
    search.fit(np.r_[train_data, valid_data], np.r_[train_label, valid_label])

    # Report the grid-search outcome, one labelled line per item.
    results = search.cv_results_
    report = (
        ('train_scores:', results['mean_train_score']),
        ('valid_scores:', results['mean_test_score']),
        ('params:', results['params']),
        ('best_param', search.best_params_),
        ('best_estimator', search.best_estimator_),
        ('best_score', search.best_score_),
    )
    for caption, value in report:
        print(caption, value)

    # Final model: best alpha, fitted on the training rows only.
    model = BernoulliNB(alpha=search.best_params_['alpha'])
    model.fit(train_data, train_label)

    splits = (
        ('f1 (train): ', train_data, train_label),
        ('f1 (valid): ', valid_data, valid_label),
        ('f1 (test): ', test_data, test_label),
    )
    predictions = [model.predict(features) for _, features, _ in splits]
    scores = [
        f1_score(labels, predicted, average='micro')
        for (_, _, labels), predicted in zip(splits, predictions)
    ]
    for (caption, _, _), score in zip(splits, scores):
        print(caption, score)

In [20]:
# Unigram features with stop words removed. Rows [:15000] of the merged
# training data are passed as the training part and rows [15000:] as the
# validation part.
Naive_Bayes_B(train_unigram[:15000],IMDB_train_y[:15000],train_unigram[15000:],IMDB_train_y[15000:],test_unigram,IMDB_test_y)

train_scores: [0.92406667 0.92293333 0.92053333 0.91553333 0.90793333 0.89      ]
valid_scores: [0.8052 0.8135 0.8261 0.8345 0.8395 0.8393]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8395
f1 (train):  0.9079333333333335
f1 (valid):  0.8395000000000001
f1 (test):  0.80388


In [21]:
# Same experiment on the unigram features that keep stop words, using the same
# 15000-row boundary between the training and validation parts.
Naive_Bayes_B(train_unigram_w_sw[:15000],IMDB_train_y[:15000],train_unigram_w_sw[15000:],IMDB_train_y[15000:],test_unigram_w_sw,IMDB_test_y)

train_scores: [0.92706667 0.92586667 0.92333333 0.91866667 0.91033333 0.8942    ]
valid_scores: [0.8145 0.8247 0.8335 0.8404 0.8468 0.8453]
params: [{'alpha': 1e-05}, {'alpha': 0.0001}, {'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]
best_param {'alpha': 0.1}
best_estimator BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)
best_score 0.8468
f1 (train):  0.9103333333333333
f1 (valid):  0.8468
f1 (test):  0.81612


In [14]:
def Naive_Bayes(train_data, train_label, test_data, test_label, cv):
    """Tune BernoulliNB's ``alpha`` by cross-validation and print train/test scores.

    A grid search over six ``alpha`` values is run on the training data with
    accuracy as the selection metric. The estimator refitted by the search on
    the full training data is then used to print micro-averaged F1 for the
    training and test sets.

    Args:
        train_data: training feature matrix.
        train_label: training labels.
        test_data: test feature matrix.
        test_label: test labels.
        cv: cross-validation specification, passed unchanged to
            ``GridSearchCV`` (e.g. an int number of folds or a splitter).

    Returns:
        None. All results are printed.
    """
    tuned_parameters = [{'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]
    clf = GridSearchCV(BernoulliNB(), tuned_parameters, refit=True, scoring='accuracy', cv=cv, return_train_score=True)
    clf.fit(train_data, train_label)

    train_scores = clf.cv_results_['mean_train_score']
    print('train_scores:',train_scores)
    test_scores = clf.cv_results_['mean_test_score']
    print('valid_scores:',test_scores)
    params = clf.cv_results_['params']
    print('params:', params)
    best_param = clf.best_params_
    print('best_param', best_param)
    best_estimator = clf.best_estimator_
    print('best_estimator', best_estimator)
    best_score = clf.best_score_
    print('best_score', best_score)

    # With refit=True the search has already fitted `best_estimator` on all of
    # (train_data, train_label) using the best alpha, so fitting a second,
    # identical BernoulliNB on the same data is unnecessary.
    y_pred_train = best_estimator.predict(train_data)
    y_pred_test = best_estimator.predict(test_data)
    f1_train = f1_score(train_label, y_pred_train, average='micro')
    f1_test = f1_score(test_label, y_pred_test, average='micro')
    print('f1 (train): ', f1_train)
    print('f1 (test): ', f1_test)

In [None]:
# Cross-validated search (cv=5) on the unigram features with stop words
# removed; the label Series are converted to NumPy arrays before being passed.
Naive_Bayes(train_unigram,np.asarray(IMDB_train_y),test_unigram,np.asarray(IMDB_test_y), 5)

In [None]:
# Cross-validated search (cv=5) on the unigram features that keep stop words.
# NOTE(review): unlike the previous call, the labels are passed as pandas
# Series rather than via np.asarray - confirm whether this difference is intended.
Naive_Bayes(train_unigram_w_sw,IMDB_train_y,test_unigram_w_sw,IMDB_test_y, 5)