# Yelp Frequency Bag-of-Words

In [1]:
'''Dan Yunheum Seol
260677676
Collaborated with Aanika Rahman, Ramsha Ijaz
Got advice and help from Chloé Pierret, Peter Quinn
'''
# import essential libraries

import random
import string
import numpy as np
import pandas as pd
import operator as op
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score #f1_score(y_true, y_pred)

# ...
from sklearn.model_selection import GridSearchCV, PredefinedSplit, ParameterGrid
# for classifiers
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import ast
from collections import Counter
import matplotlib.pyplot as plt
# Every dataset file stores one example per '\n'-terminated line. The review
# text and its rating are tab-separated; the rating is the last character of
# each example and is parsed here as the 'label' column.
_READ_OPTS = dict(sep='\t', lineterminator='\n', header=None, names=['review', 'label'])

yelp_tr = pd.read_csv("hwk3_datasets/yelp-train.txt", **_READ_OPTS)
yelp_te = pd.read_csv("hwk3_datasets/yelp-test.txt", **_READ_OPTS)
yelp_va = pd.read_csv("hwk3_datasets/yelp-valid.txt", **_READ_OPTS)
imdb_tr = pd.read_csv("hwk3_datasets/IMDB-train.txt", **_READ_OPTS)
imdb_te = pd.read_csv("hwk3_datasets/IMDB-test.txt", **_READ_OPTS)
imdb_va = pd.read_csv("hwk3_datasets/IMDB-valid.txt", **_READ_OPTS)

# Lookup table: dataset name -> split name -> DataFrame with 'review'/'label'.
hw3_datasets = {
    'Yelp': dict(train=yelp_tr, valid=yelp_va, test=yelp_te),
    'IMDB': dict(train=imdb_tr, valid=imdb_va, test=imdb_te),
}
# Pre-processing, applied in place to every split of every dataset:
#   1. lower-case the review text;
#   2. turn the HTML line-break pair '<br /><br />' into a single space;
#   3. delete every character that is neither a word character nor whitespace
#      (i.e. strip punctuation).
# regex= is passed explicitly because the default of Series.str.replace changed
# from True to False in pandas 2.0; relying on the default would make step 3
# search for the literal text of the pattern and leave punctuation untouched.
for dataset in hw3_datasets.values():
    for df in dataset.values():
        reviews = df['review'].str.lower()
        reviews = reviews.str.replace('<br /><br />', ' ', regex=False)
        df['review'] = reviews.str.replace(r'[^\w\s]', '', regex=True)
# vocab maps dataset name -> {word: column index}. For each dataset the
# vocabulary is the 10000 most frequent training-set words that are not stop
# words, indexed by frequency rank (0 = most frequent).
vocab = {}
# Stop words: words with little semantic value (such as "the") that are kept
# out of the vocabulary. Taken from NLTK's stop-word list per the original note.
stops = {'the','a','i','me', 'youre', 'not', 'my', 'myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once', 'there','when','where','why','how','all','any','both','each','most','other','some','such','nor','only','so','than','too','very','s','t','can','will','just','don','should','now'}
for group_name, group in hw3_datasets.items():
    # Flatten the whitespace-tokenised training reviews into one word list.
    list_all_words = []
    for sentence in group['train']['review'].str.split().tolist():
        list_all_words.extend(sentence)
    # Count everything that is not a stop word and keep the top 10000.
    kept_words = (word for word in list_all_words if word not in stops)
    list_freq_words = Counter(kept_words).most_common(10000)
    # most_common yields (word, count) pairs; the rank becomes the index.
    vocab[group_name] = {word: i for i, (word, _count) in enumerate(list_freq_words)}
print("The Vocabulary has been created")
# Frequency bag-of-words for Yelp: with binary=False each review becomes a
# vector of raw term counts over the fixed Yelp vocabulary (not a 0/1 encoding).
# NOTE: per the scikit-learn docs, max_features is ignored when an explicit
# vocabulary is supplied; it is kept only to document the intended size.
yelp_splits = hw3_datasets['Yelp']
train, test, val = (yelp_splits[part] for part in ('train', 'test', 'valid'))

vtzrYelp = CountVectorizer(vocabulary=vocab['Yelp'], max_features=10000, binary=False)
train_vectors = vtzrYelp.fit_transform(train['review'])
test_vectors = vtzrYelp.transform(test['review'])
val_vectors = vtzrYelp.transform(val['review'])

# Inspection leftovers: look at the container type of the result, then make a
# dense 2-D copy of the training matrix.
type(train_vectors)
X = train_vectors.toarray()
# Open question from the original author: could the dense array be turned back
# into a sparse matrix with sp.sparse.csr_matrix(train_vectors.toarray())?

The Vocabulary has been created


In [9]:
# Abandoned alternative, kept for reference: read the stored per-word counts
# and divide each vector entry by them.
#Yelp_vocab_stored = pd.read_csv("hwk3_datasets/submission/Yelp-vocab.txt", sep='\t', lineterminator='\n', header=None, names=['word', 'index', 'count'])
#Yelp_counts = Yelp_vocab_stored['count']

# Rescale every review vector (row) to unit L1 norm, which turns the raw term
# counts into relative frequencies. The same normalizer is applied to all
# three splits.
normalizer = Normalizer(norm='l1')
train_vectors_norm, test_vectors_norm, val_vectors_norm = (
    normalizer.transform(vectors)
    for vectors in (train_vectors, test_vectors, val_vectors)
)

#test methods
#methods to get accuracy / f1 score for training, valid, and test set
def acc_csfier(csfier, vec, label):
    """Fit ``csfier`` on the normalised training set and score its accuracy.

    The classifier is always (re)fitted on the module-level
    ``train_vectors_norm`` / ``train['label']`` before predicting.

    Args:
        csfier: classifier object exposing ``fit`` and ``predict``.
        vec: feature matrix to predict on; must offer ``.toarray()`` because
            the dense fallback calls it.
        label: true labels corresponding to the rows of ``vec``.

    Returns:
        The value of ``accuracy_score(label, predictions)``.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        yhat = csfier.predict(vec)
    except TypeError:
        # Dense-only estimators such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense copies. Any other error (bad parameters,
        # interrupts, ...) now propagates instead of being silently retried.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        yhat = csfier.predict(vec.toarray())

    return accuracy_score(label, yhat)

def f1_va_csf(csfier):
    """Fit ``csfier`` on the training set and return its validation macro-F1.

    Args:
        csfier: classifier object exposing ``fit`` and ``predict``; it is
            fitted in place on ``train_vectors_norm`` / ``train['label']``.

    Returns:
        Macro-averaged F1 score of the predictions on ``val_vectors_norm``
        against ``val['label']``.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        va_y = csfier.predict(val_vectors_norm)
    except TypeError:
        # Dense-only estimators such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense copies. Other errors propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        va_y = csfier.predict(val_vectors_norm.toarray())
    return f1_score(val['label'], va_y, average='macro')
def f1_te_csf(csfier):
    """Fit ``csfier`` on the training set and return its test-set macro-F1.

    Args:
        csfier: classifier object exposing ``fit`` and ``predict``; it is
            fitted in place on ``train_vectors_norm`` / ``train['label']``.

    Returns:
        Macro-averaged F1 score of the predictions on ``test_vectors_norm``
        against ``test['label']``.
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        te_y = csfier.predict(test_vectors_norm)
    except TypeError:
        # Dense-only estimators such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense copies. Other errors propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        te_y = csfier.predict(test_vectors_norm.toarray())
    return f1_score(test['label'], te_y, average='macro')
def f1_tr_csf(csfier):
    """Fit ``csfier`` on the training set and return its training macro-F1.

    Args:
        csfier: classifier object exposing ``fit`` and ``predict``; it is
            fitted in place on ``train_vectors_norm`` / ``train['label']``.

    Returns:
        Macro-averaged F1 score of the predictions on ``train_vectors_norm``
        against ``train['label']`` (i.e. the score on the data it was fit on).
    """
    try:
        csfier.fit(train_vectors_norm, train['label'])
        tr_y = csfier.predict(train_vectors_norm)
    except TypeError:
        # Dense-only estimators such as GaussianNB reject sparse matrices with
        # a TypeError; retry on dense copies. Other errors propagate.
        csfier.fit(train_vectors_norm.toarray(), train['label'])
        tr_y = csfier.predict(train_vectors_norm.toarray())
    return f1_score(train['label'], tr_y, average='macro')
    
       

In [5]:
# Hyper-parameter search spaces, one ParameterGrid per classifier family.
# var_smoothing is the only GaussianNB setting tuned here.
bayes_params = ParameterGrid(dict(
    var_smoothing=[1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11],
))
# random_state is a single-value entry, so every candidate uses seed 329.
tree_params = ParameterGrid(dict(
    random_state=[329],
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 100, 1000],
    min_samples_split=[2, 5, 10],
))
svm_params = ParameterGrid(dict(
    random_state=[329],
    loss=['hinge', 'squared_hinge'],
    C=[0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.0],
))

# (classifier class, grid) pairs; the class is instantiated with **params
# for every point of its grid.
classifiers = [
    (GaussianNB, bayes_params),
    (DecisionTreeClassifier, tree_params),
    (svm.LinearSVC, svm_params),
]

def tune_hyper(classifier, param_grid):
    """Pick the parameter setting with the best validation macro-F1.

    Every setting in ``param_grid`` is used to build ``classifier(**params)``,
    which is scored with ``f1_va_csf``; progress is printed along the way.

    Args:
        classifier: classifier class (or factory) called as
            ``classifier(**params)``.
        param_grid: iterable of parameter dicts, e.g. a ParameterGrid.

    Returns:
        A new, unfitted ``classifier(**best_params)`` instance, where
        ``best_params`` is the first setting that reached the highest score.

    Raises:
        ValueError: if ``param_grid`` yields no settings at all.
    """
    # Start below any attainable score so that a grid whose settings all score
    # 0.0 still selects one of them instead of leaving best_params as None.
    best_score = float('-inf')  # f1 score on validation
    best_params = None
    for params in param_grid:
        print(f"Attempt with : {params}")
        score = f1_va_csf(classifier(**params))
        print(f"F1 Score for the Validation set would be : {score}\n")
        if score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        raise ValueError("param_grid is empty: there is no parameter setting to evaluate")

    print(f"Optimal parameters for Validation is : {best_params}")
    print(f"F1 Score on Validation set given optimal parameters is: {best_score}\n")
    return classifier(**best_params)

In [11]:

# For each classifier family: tune on the validation set, then report the
# macro-F1 of the selected configuration on the test and training sets.
for classifier, param_grid in classifiers:
    print(classifier)
    best_classifier = tune_hyper(classifier, param_grid)
    test_f1 = f1_te_csf(best_classifier)
    print(f"Here is our f1 score for the test set with optimal parameters: {test_f1}\n")
    train_f1 = f1_tr_csf(best_classifier)
    print(f"Here is our f1 score for the training set given optimal parameters: {train_f1}\n")
    # Optional accuracy reports (disabled). NOTE: all three messages say
    # "validation set", but the 2nd and 3rd score the test and training sets.
    # print(f"Here is our accuracy score for the validation set with optimal parameters: {acc_csfier(best_classifier, val_vectors_norm, val['label'])}\n")
    # print(f"Here is our accuracy score for the validation set with optimal parameters: {acc_csfier(best_classifier, test_vectors_norm, test['label'])}\n")
    # print(f"Here is our accuracy score for the validation set with optimal parameters: {acc_csfier(best_classifier, train_vectors_norm, train['label'])}\n")
     

<class 'sklearn.naive_bayes.GaussianNB'>
Attempt with : {'var_smoothing': 1e-06}
F1 Score for the Validation set would be : 0.2484847678818137

Attempt with : {'var_smoothing': 1e-07}
F1 Score for the Validation set would be : 0.24854996538217194

Attempt with : {'var_smoothing': 1e-08}
F1 Score for the Validation set would be : 0.24676684774170804

Attempt with : {'var_smoothing': 1e-09}
F1 Score for the Validation set would be : 0.24446290139087007

Attempt with : {'var_smoothing': 1e-10}
F1 Score for the Validation set would be : 0.2421773477741977

Attempt with : {'var_smoothing': 1e-11}
F1 Score for the Validation set would be : 0.2427082688239141

Optimal parameters for Validation is : {'var_smoothing': 1e-07}
F1 Score on Validation set given optimal parameters is: 0.24854996538217194

Here is our f1 score for the test set with optimal parameters: 0.24618440104792644

Here is our f1 score for the training set given optimal parameters: 0.8045798422374391

<class 'sklearn.tree.tree



F1 Score for the Validation set would be : 0.3808329725966012

Attempt with : {'C': 1.0, 'loss': 'squared_hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.2959857514522948

Attempt with : {'C': 0.5, 'loss': 'hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.379023001876849

Attempt with : {'C': 0.5, 'loss': 'squared_hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.22860047268242392

Attempt with : {'C': 2.0, 'loss': 'hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.3893413234353727

Attempt with : {'C': 2.0, 'loss': 'squared_hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.3587891972619519

Attempt with : {'C': 5.0, 'loss': 'hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.3967061457705372

Attempt with : {'C': 5.0, 'loss': 'squared_hinge', 'random_state': 329}
F1 Score for the Validation set would be : 0.4381163765092354

Optimal paramete

In [None]:
# Disabled experiments: each triple-quoted block below is a bare string
# literal, so none of the code inside it is executed.
#
# 1) Earlier GaussianNB-only tuning loop. Unlike the active helpers it fits and
#    predicts on the un-normalised train_vectors / val_vectors.
'''
def optimize_GaussianNB(vec_smoothing):
    best_score = 0
    best_params = -1
    for i in range(len(vec_smoothing)):
        print(f"Trying:{vec_smoothing[i]}")
        csfier = GaussianNB(priors=None, var_smoothing=vec_smoothing[i])
        csfier.fit(train_vectors.toarray(), train['label'])
        val_y = csfier.predict(val_vectors.toarray())
        val_f1 = f1_score(val['label'],val_y, average='macro')
        print(f"F1 Score Validation: {val_f1}\n")
        if  val_f1>best_score:
            best_score= val_f1
            best_params=vec_smoothing[i]
    print(f"Best params for Validation: {best_params}")
    print(f"Best F1 Score on Validation: {best_score}\n")
    return best_params
    '''
# 2) Earlier driver loop. optimize_parameters, test_classifier and
#    test_classifier_train are not defined in the visible part of this file.
'''
for pair in classifiers: # cycle through the classifiers and parameters
    classifier = pair[0]
    param_grid = pair[1]
    print(classifier)    
    best_classifier = optimize_parameters(classifier,param_grid)  
    print(f"Test score for best params: {test_classifier(best_classifier)}\n")
    print(f"Train score for best params: {test_classifier_train(best_classifier)}\n")
'''
# 3) Tuning var_smoothing for GaussianNB (disabled): selects the best value
#    with optimize_GaussianNB, then reports macro-F1 on the test and training sets.
'''
vec_smt = [1e-7, 1e-8, 1e-9, 1e-10, 1e-11]
best_GNB = optimize_GaussianNB(vec_smt)
GNB = GaussianNB(priors=None, var_smoothing = best_GNB)
GNB.fit(train_vectors.toarray(), train['label'])
GNB_hat = GNB.predict(test_vectors.toarray())
GNB_f1_te = f1_score(test['label'],GNB_hat, average='macro')
GNB_hat_tr = GNB.predict(train_vectors.toarray())
GNB_f1_tr = f1_score(train['label'],GNB_hat_tr, average='macro')
print(f"Test score for best params: {GNB_f1_te}\n")
print(f"Train score for best params: {GNB_f1_tr}\n")

'''