In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
np.set_printoptions(threshold=np.nan)

In [2]:
# Load data
IMDB_train = pd.read_csv('./Dataset/Input/IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[:][1]
IMDB_valid = pd.read_csv('./Dataset/Input/IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[:][1]
IMDB_test = pd.read_csv('./Dataset/Input/IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[:][1]
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print("Data loaded.")

Data loaded.


In [3]:
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
IMDB_train = pd.concat(frames)
IMDB_train_y = pd.concat(frames_y)

# Preprocessing

In [7]:
def preprocessing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        #clean = re.compile('<.*?>')
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data

In [10]:
IMDB_train = preprocessing(IMDB_train)
IMDB_test = preprocessing(IMDB_test)

# Bag of n-gram 

In [14]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [15]:
def rm_sents(data, target):
    new_data = []
    new_target = []
    for i in range(0,len(data)):
        if len(list(set(data[i]))) != 1:
            new_data.append(data[i])
            new_target.append(target[i])
    return new_data, new_target

In [None]:
unigram = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words='english', 
                          max_features =30000)
bigram = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 2), stop_words='english', 
                         max_features =30000)
trigram = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 3), stop_words='english', 
                          max_features =30000)

In [None]:
train_unigram = unigram.fit_transform(IMDB_train).toarray()
test_unigram = unigram.transform(IMDB_test).toarray()

In [None]:
train_bigram = bigram.fit_transform(IMDB_train).toarray()
test_bigram = bigram.transform(IMDB_test).toarray()

In [None]:
train_trigram = trigram.fit_transform(IMDB_train).toarray()
test_trigram = trigram.transform(IMDB_test).toarray()

In [17]:
unigram_w_sw = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words=None, 
                               max_features =30000, binary=True)
bigram_w_sw = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 2), stop_words=None, 
                              max_features =30000, binary=True)
trigram_w_sw = TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 3), stop_words=None, 
                               max_features =30000, binary=True)

In [None]:
train_unigram_w_sw = unigram_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_w_sw = unigram_w_sw.transform(IMDB_test).toarray()

In [18]:
train_bigram_w_sw = bigram_w_sw.fit_transform(IMDB_train).toarray()
test_bigram_w_sw = bigram_w_sw.transform(IMDB_test).toarray()

In [None]:
train_trigram_w_sw = trigram_w_sw.fit_transform(IMDB_train).toarray()
test_trigram_w_sw = trigram_w_sw.transform(IMDB_test).toarray()

# Linear SVM

In [19]:
def run_linearsvm(imdb_linearsvm_clf, train_input, train_output, test_input, test_output):
    imdb_linearsvm_clf = imdb_linearsvm_clf.fit(train_input, train_output)
    imdb_linearsvm_best_params = imdb_linearsvm_clf.best_params_
    print(imdb_linearsvm_best_params)
    # make classifier with best parameters found
    imdb_linearsvm_clf = LinearSVC(tol=imdb_linearsvm_best_params['tol'],
                                    C=imdb_linearsvm_best_params['C'])
    
    imdb_linearsvm_clf = imdb_linearsvm_clf.fit(train_input, train_output)
    
    # make predictions
    imdb_linearsvm_train_pred = imdb_linearsvm_clf.predict(train_input)
    imdb_linearsvm_test_pred = imdb_linearsvm_clf.predict(test_input)

    # calculate accuracy
    imdb_linearsvm_train_accuracy = accuracy_score(train_output, imdb_linearsvm_train_pred)
    imdb_linearsvm_test_accuracy = accuracy_score(test_output, imdb_linearsvm_test_pred)
    
    return [imdb_linearsvm_train_accuracy, imdb_linearsvm_test_accuracy, imdb_linearsvm_best_params]

In [20]:
def init_svm_clf():
    imdb_linearsvm_clf = LinearSVC(max_iter=35000)
    list_C = np.random.uniform(low=1, high=20, size=20)
    list_tol = np.random.uniform(low=10**(-6), high=10**(-1), size=20)
    # parameter grid to check against
    # for hyperparameter tuning
    tuned_parameters = {'C': list_C, 'tol': list_tol}
    imdb_linearsvm_clf = RandomizedSearchCV(imdb_linearsvm_clf, tuned_parameters, scoring='accuracy', cv=3, verbose=2, 
                                            n_iter=20)
    return imdb_linearsvm_clf

## Without stop words 

### Unigram

In [None]:
unigram_results = run_linearsvm(init_svm_clf(), train_unigram, IMDB_train_y, test_unigram, IMDB_test_y)

In [None]:
print("Linear SVM Train Unigram Accuracy:", unigram_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Unigram Accuracy:", unigram_results[1])
print("Best Linear SVM (Unigram) Parameters:", unigram_results[2])

### Bigram

In [None]:
bigram_results = run_linearsvm(init_svm_clf(), train_bigram, IMDB_train_y, test_bigram, IMDB_test_y)

In [None]:
print("Linear SVM Train Bigram Accuracy:", bigram_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Bigram Accuracy:", bigram_results[1])
print("Best Linear SVM (Bigram) Parameters:", bigram_results[2])

### Trigram

In [None]:
trigram_results = run_linearsvm(init_svm_clf(), train_trigram, IMDB_train_y, test_trigram, IMDB_test_y)

In [None]:
print("Linear SVM Train Trigram Accuracy:", trigram_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Trigram Accuracy:", trigram_results[1])
print("Best Linear SVM (Trigram) Parameters:", trigram_results[2])

## With stop words

### Unigram

In [None]:
unigram_w_sw_results = run_linearsvm(init_svm_clf(), train_unigram_w_sw, IMDB_train_y, 
                                     test_unigram_w_sw, IMDB_test_y)

In [None]:
print("Linear SVM Train Unigram w/ SW Accuracy:", unigram_w_sw_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Unigram w/ SW Accuracy:", unigram_w_sw_results[1])
print("Best Linear SVM (Unigram) w/ SW Parameters:", unigram_w_sw_results[2])

### Bigram

In [21]:
bigram_w_sw_results = run_linearsvm(init_svm_clf(), train_bigram_w_sw, IMDB_train_y, 
                                     test_bigram_w_sw, IMDB_test_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] tol=0.0371214185109454, C=3.6720450795498776 ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... tol=0.0371214185109454, C=3.6720450795498776, total=   6.9s
[CV] tol=0.0371214185109454, C=3.6720450795498776 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.9s remaining:    0.0s


[CV] ..... tol=0.0371214185109454, C=3.6720450795498776, total=   6.7s
[CV] tol=0.0371214185109454, C=3.6720450795498776 ....................
[CV] ..... tol=0.0371214185109454, C=3.6720450795498776, total=   6.6s
[CV] tol=0.024469273314805407, C=12.579006590268905 ..................
[CV] ... tol=0.024469273314805407, C=12.579006590268905, total=   7.1s
[CV] tol=0.024469273314805407, C=12.579006590268905 ..................
[CV] ... tol=0.024469273314805407, C=12.579006590268905, total=   7.4s
[CV] tol=0.024469273314805407, C=12.579006590268905 ..................
[CV] ... tol=0.024469273314805407, C=12.579006590268905, total=   7.3s
[CV] tol=0.003815835177003577, C=5.272170288918806 ...................
[CV] .... tol=0.003815835177003577, C=5.272170288918806, total=   7.0s
[CV] tol=0.003815835177003577, C=5.272170288918806 ...................
[CV] .... tol=0.003815835177003577, C=5.272170288918806, total=   7.2s
[CV] tol=0.003815835177003577, C=5.272170288918806 ...................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  8.0min finished


{'tol': 0.0020192831769063778, 'C': 1.8194125754321022}


In [22]:
print("Linear SVM Train Bigram w/ SW Accuracy:", bigram_w_sw_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Bigram w/ SW Accuracy:", bigram_w_sw_results[1])
print("Best Linear SVM (Bigram) w/ SW Parameters:", bigram_w_sw_results[2])

Linear SVM Train Bigram w/ SW Accuracy: 0.99944
Linear SVM Test Bigram w/ SW Accuracy: 0.88864
Best Linear SVM (Bigram) w/ SW Parameters: {'tol': 0.0020192831769063778, 'C': 1.8194125754321022}


### Trigram

In [None]:
trigram_w_sw_results = run_linearsvm(init_svm_clf(), train_trigram_w_sw, IMDB_train_y, 
                                     test_trigram_w_sw, IMDB_test_y)

In [None]:
print("Linear SVM Train Trigram w/ SW Accuracy:", trigram_w_sw_results[0])
# print("IMDB Linear SVM Valid F1 Score:", imdb_linearsvm_valid_f1)
print("Linear SVM Test Trigram w/ SW Accuracy:", trigram_w_sw_results[1])
print("Best Linear SVM (Trigram) w/ SW Parameters:", trigram_w_sw_results[2])