In [10]:
'''Dan Yunheum Seol
260677676
Collaborated with Aanika Rahman, Ramsha Ijaz
Got advice and help from Chloé Pierret, Peter Quinn
'''
# import essential libraries

import random
import string
import numpy as np
import pandas as pd
import operator as op
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.dummy import DummyClassifier
# ...
from sklearn.model_selection import GridSearchCV, PredefinedSplit, ParameterGrid
from sklearn.metrics import f1_score, accuracy_score

# for classifiers
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import ast
from collections import Counter
from tqdm import tqdm

# Every split file is read the same way: tab-separated, '\n'-terminated,
# no header row, columns named 'review' and 'label'.
_READ_OPTS = dict(sep='\t', lineterminator='\n', header=None, names=['review', 'label'])

# Yelp splits
yelp_tr = pd.read_csv("hwk3_datasets/yelp-train.txt", **_READ_OPTS)
yelp_te = pd.read_csv("hwk3_datasets/yelp-test.txt", **_READ_OPTS)
yelp_va = pd.read_csv("hwk3_datasets/yelp-valid.txt", **_READ_OPTS)

# IMDB splits
imdb_tr = pd.read_csv("hwk3_datasets/IMDB-train.txt", **_READ_OPTS)
imdb_te = pd.read_csv("hwk3_datasets/IMDB-test.txt", **_READ_OPTS)
imdb_va = pd.read_csv("hwk3_datasets/IMDB-valid.txt", **_READ_OPTS)

# Lookup table: corpus name -> split name -> DataFrame
hw3_datasets = {
    'Yelp': {
        'train': yelp_tr,
        'valid': yelp_va,
        'test': yelp_te,
    },
    'IMDB': {
        'train': imdb_tr,
        'valid': imdb_va,
        'test': imdb_te,
    },
}
# Pre-processing, applied in place to every split of every corpus:
#   1. lower-case the review text,
#   2. replace the HTML double line break '<br /><br />' with a single space,
#   3. drop every character that is neither a word character nor whitespace.
# `regex=` is passed explicitly: the pandas default changed from True to False
# in pandas 2.0, which would otherwise leave the punctuation untouched.
for dataset in hw3_datasets.values():
    for df in dataset.values():
        df['review'] = (
            df['review']
            .str.lower()
            .str.replace('<br /><br />', ' ', regex=False)   # literal match
            .str.replace(r'[^\w\s]', '', regex=True)          # raw string: no invalid '\w' escape
        )

# vocab[corpus] maps word -> column index for the (up to) 10,000 most frequent
# words of that corpus' training reviews, stop words excluded.
vocab = {}
# Words with little semantic value (such as "the") that are left out of the
# vocabulary; per the original author's note this is NLTK's stop words list.
stops = {
    'the', 'a', 'i', 'me', 'youre', 'not', 'my', 'myself', 'we', 'our', 'ours',
    'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
    'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
    'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'most', 'other', 'some', 'such', 'nor', 'only', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
}
for group_name, group in hw3_datasets.items():
    # Flatten the whitespace-tokenised training reviews into one list of words.
    list_all_words = []
    for sentence in group['train']['review'].str.split().tolist():
        list_all_words.extend(sentence)
    # (word, count) pairs, most frequent first, stop words filtered out.
    list_freq_words = Counter(
        filter(lambda token: token not in stops, list_all_words)
    ).most_common(10000)
    # The rank in the frequency list becomes the feature column index.
    vocab[group_name] = {
        token: column for column, (token, _count) in enumerate(list_freq_words)
    }
# Bag-of-words features for IMDB over the fixed vocabulary built above.
# binary=False keeps raw term counts (this is NOT a 0/1 one-hot encoding).
# `max_features` is omitted because CountVectorizer ignores it whenever an
# explicit `vocabulary` is supplied; the vocabulary already holds at most
# 10,000 entries.
# NOTE(review): the vocabulary was built with str.split(), whereas
# CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, so single-character vocabulary entries yield all-zero columns.
# Confirm whether that is intended.
vtzrIMDB = CountVectorizer(binary=False, vocabulary=vocab['IMDB'])

# Module-level aliases used by the evaluation helpers below.
train = hw3_datasets['IMDB']['train']
test = hw3_datasets['IMDB']['test']
val = hw3_datasets['IMDB']['valid']

# Count matrices for each IMDB split, one column per vocabulary word.
train_vectors = vtzrIMDB.fit_transform(train['review'])
test_vectors = vtzrIMDB.transform(test['review'])
val_vectors = vtzrIMDB.transform(val['review'])

In [6]:
# Scale every row of the count matrices with the L1 norm. The same Normalizer
# instance is applied to the train, test and validation vectors.
normalizer = Normalizer(norm='l1')

train_vectors_norm, test_vectors_norm, val_vectors_norm = (
    normalizer.transform(count_matrix)
    for count_matrix in (train_vectors, test_vectors, val_vectors)
)

# Evaluation helpers: each one (re)fits the given classifier on the normalised
# training vectors and returns accuracy or macro-F1 on the training, validation or test set.
def acc_csfier(csfier, vec, label):
    """Fit ``csfier`` on the training data and return its accuracy on ``vec``.

    The classifier is always (re)fitted on the module-level
    ``train_vectors_norm`` / ``train['label']`` before predicting.

    Parameters
    ----------
    csfier : estimator exposing ``fit`` and ``predict``.
    vec : feature matrix to predict on; must provide ``.toarray()`` for the
        dense fallback.
    label : true labels matching the rows of ``vec``.

    Returns
    -------
    The result of ``accuracy_score(label, predictions)``.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        yhat = csfier.predict(vec)
    except TypeError:
        # Dense-only classifiers such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense arrays. Any other error is left to propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        yhat = csfier.predict(vec.toarray())

    return accuracy_score(label, yhat)

def f1_va_csf(csfier):
    """Fit ``csfier`` on the training data and return its macro-F1 on the validation set.

    Uses the module-level ``train_vectors_norm`` / ``train['label']`` for
    fitting and ``val_vectors_norm`` / ``val['label']`` for scoring.

    Parameters
    ----------
    csfier : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    The result of ``f1_score(..., average='macro')`` on the validation set.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        va_y = csfier.predict(val_vectors_norm)
    except TypeError:
        # Dense-only classifiers such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense arrays. Any other error is left to propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        va_y = csfier.predict(val_vectors_norm.toarray())
    va_f = f1_score(val['label'], va_y, average='macro')
    return va_f
def f1_te_csf(csfier):
    """Fit ``csfier`` on the training data and return its macro-F1 on the test set.

    Uses the module-level ``train_vectors_norm`` / ``train['label']`` for
    fitting and ``test_vectors_norm`` / ``test['label']`` for scoring.

    Parameters
    ----------
    csfier : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    The result of ``f1_score(..., average='macro')`` on the test set.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        te_y = csfier.predict(test_vectors_norm)
    except TypeError:
        # Dense-only classifiers such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense arrays. Any other error is left to propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        te_y = csfier.predict(test_vectors_norm.toarray())
    te_f = f1_score(test['label'], te_y, average='macro')
    return te_f
def f1_tr_csf(csfier):
    """Fit ``csfier`` on the training data and return its macro-F1 on that same training set.

    Uses the module-level ``train_vectors_norm`` / ``train['label']`` for both
    fitting and scoring.

    Parameters
    ----------
    csfier : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    The result of ``f1_score(..., average='macro')`` on the training set.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        tr_y = csfier.predict(train_vectors_norm)
    except TypeError:
        # Dense-only classifiers such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense arrays. Any other error is left to propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        tr_y = csfier.predict(train_vectors_norm.toarray())
    tr_f = f1_score(train['label'], tr_y, average='macro')
    return tr_f
    

In [11]:
# Hyper-parameter grids, one per classifier.

# GaussianNB: var_smoothing is the only hyper-parameter searched here.
bayes_params = ParameterGrid({
    'var_smoothing': [1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11],
})

# Decision tree: fixed random_state, searched over criterion / depth / split size.
tree_params = ParameterGrid({
    'random_state': [329],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 100, 1000],
    'min_samples_split': [2, 5, 10],
})

# Linear SVM: fixed random_state, searched over loss function and C.
svm_params = ParameterGrid({
    'random_state': [329],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.0],
})

# (classifier class, parameter grid) pairs consumed by the tuning loop.
classifiers = [
    (GaussianNB, bayes_params),
    (DecisionTreeClassifier, tree_params),
    (svm.LinearSVC, svm_params),
]

def tune_hyper(classifier, param_grid):
    """Pick the parameter combination with the highest validation macro-F1.

    Every combination in ``param_grid`` is used to instantiate ``classifier``,
    which is then scored with ``f1_va_csf``. Progress and the winning
    combination are printed.

    Parameters
    ----------
    classifier : callable (typically an estimator class) accepting the grid's
        entries as keyword arguments.
    param_grid : iterable of dicts, one per parameter combination.

    Returns
    -------
    A new, unfitted ``classifier`` instance built with the best parameters.

    Raises
    ------
    ValueError
        If no combination yields a comparable score, e.g. ``param_grid`` is empty.
    """
    # Start below any reachable score so a grid whose scores are all 0 still
    # selects a combination instead of leaving best_params as None.
    best_score = float('-inf')  # f1 score on validation
    best_params = None
    for params in param_grid:
        print(f"Attempt with : {params}")
        score = f1_va_csf(classifier(**params))
        print(f"F1 Score for the Validation set would be : {score}\n")
        if score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        # Without this guard, classifier(**None) fails with an opaque TypeError.
        raise ValueError(
            "tune_hyper: no parameter combination produced a score (is param_grid empty?)"
        )

    print(f"Optimal parameters for Validation is : {best_params}")
    print(f"F1 Score on Validation set given optimal parameters is: {best_score}\n")
    return classifier(**best_params)

In [None]:
# For every (classifier, grid) pair: tune on the validation set, then report
# the macro-F1 of the selected configuration on the test and training sets.
for classifier, param_grid in classifiers:
    print(classifier)
    best_classifier = tune_hyper(classifier, param_grid)

    test_f1 = f1_te_csf(best_classifier)
    print(f"Here is our f1 score for the test set with optimal parameters: {test_f1}\n")

    train_f1 = f1_tr_csf(best_classifier)
    print(f"Here is our f1 score for the training set given optimal parameters: {train_f1}\n")

    # Optional accuracy reporting (disabled):
    # print(f"Here is our accuracy score for the validation set with optimal parameters: {acc_csfier(best_classifier, val_vectors_norm, val['label'])}\n")
    # print(f"Here is our accuracy score for the test set with optimal parameters: {acc_csfier(best_classifier, test_vectors_norm, test['label'])}\n")
    # print(f"Here is our accuracy score for the training set with optimal parameters: {acc_csfier(best_classifier, train_vectors_norm, train['label'])}\n")
   

<class 'sklearn.naive_bayes.GaussianNB'>
Attempt with : {'var_smoothing': 1e-06}
F1 Score for the Validation set would be : 0.7626912950829565

Attempt with : {'var_smoothing': 1e-07}
F1 Score for the Validation set would be : 0.7613036643875809

Attempt with : {'var_smoothing': 1e-08}
F1 Score for the Validation set would be : 0.7604153271477583

Attempt with : {'var_smoothing': 1e-09}
F1 Score for the Validation set would be : 0.75872737051261

Attempt with : {'var_smoothing': 1e-10}
F1 Score for the Validation set would be : 0.7576988919615479

Attempt with : {'var_smoothing': 1e-11}
F1 Score for the Validation set would be : 0.7571574576688758

Optimal parameters for Validation is : {'var_smoothing': 1e-06}
F1 Score on Validation set given optimal parameters is: 0.7626912950829565

