In [125]:
# COMP 551 Assignment 3
# Name: Charles Lagace
# Student ID: 260698807
# This notebook was written with Python 3.6

import numpy as np
import random
import re
import scipy.sparse as ss

In [98]:
# Question 1: preprocess and convert the YELP dataset

# train set
with open("hwk3_datasets/yelp-train.txt", "r") as myfile:
    yelp_train = myfile.readlines()
    
# validation set    
with open("hwk3_datasets/yelp-valid.txt", "r") as myfile:
    yelp_valid = myfile.readlines()
    
# test set
with open("hwk3_datasets/yelp-test.txt", "r") as myfile:
    yelp_test = myfile.readlines()
    
yelp_ytrain = [line.split("\t")[-1][0] for line in yelp_train]
yelp_yvalid = [line.split("\t")[-1][0] for line in yelp_valid]
yelp_ytest = [line.split("\t")[-1][0] for line in yelp_test]

# preprocess the data
yelp_xtrain_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in yelp_train]
yelp_xvalid_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in yelp_valid]
yelp_xtest_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in yelp_test]

# get unique words
words = [text.split(" ") for text in yelp_xtrain_str]
unique_words = set([word for text in words for word in text])
vocabulary = dict([(word, i) for i, word in enumerate(sorted(unique_words))])

#count word occurrences
word_counts = np.zeros(len(unique_words), dtype=np.int32)
for text in words:     
    # vocabulary[0] = '' (blank character) so we exclude it
    index_map = [vocabulary[word] for word in text if vocabulary[word]]
    word_counts += np.bincount(np.array(index_map), minlength=len(unique_words))
    
# get 10000 most frequent words
frequent_words = sorted(zip(word_counts, sorted(unique_words), range(len(unique_words))))
frequent_words.reverse()
frequent_words = frequent_words[:10000]
vocabulary = dict([(word[1], i) for i, word in enumerate(frequent_words)])

#save the vocabulary
with open("my_datasets/yelp-vocab.txt", "w") as myfile:
    for word in frequent_words:
        myfile.write("{} {} {}\n".format(word[1], vocabulary[word[1]], word[0]))

# convert input to int representation
yelp_xtrain = [[vocabulary[word] for word in text if word in vocabulary] for text in words]

valid_words = [text.split(" ") for text in yelp_xvalid_str]
yelp_xvalid = [[vocabulary[word] for word in text if word in vocabulary] for text in valid_words]

test_words = [text.split(" ") for text in yelp_xtest_str]
yelp_xtest = [[vocabulary[word] for word in text if word in vocabulary] for text in test_words]

# save training set
with open("my_datasets/yelp-train.txt", "w") as myfile:
    for i, text in enumerate(yelp_xtrain):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), yelp_ytrain[i]))

# save validation set
with open("my_datasets/yelp-valid.txt", "w") as myfile:
    for i, text in enumerate(yelp_xvalid):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), yelp_yvalid[i]))

# save test set
with open("my_datasets/yelp-test.txt", "w") as myfile:
    for i, text in enumerate(yelp_xtest):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), yelp_ytest[i]))
        
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(yelp_xtrain_str, vocabulary=vocabulary)

# generate the one_hot representation
yelp_xtrain = vectorizer.fit_transform(yelp_xtrain_str)
yelp_xvalid = vectorizer.transform(yelp_xvalid_str)
yelp_xtest = vectorizer.transform(yelp_xtest_str)

# generate the BBoW representation
yelp_xtrain_bbow = yelp_xtrain.sign()
yelp_xvalid_bbow = yelp_xvalid.sign()
yelp_xtest_bbow = yelp_xtest.sign()

# generate the FBoW representation
yelp_xtrain_fbow = ss.lil_matrix(yelp_xtrain.shape)
yelp_xvalid_fbow = ss.lil_matrix(yelp_xvalid.shape)
yelp_xtest_fbow = ss.lil_matrix(yelp_xtest.shape)

for row in range(yelp_xtrain.shape[0]):
    yelp_xtrain_fbow[row] = yelp_xtrain[row] / yelp_xtrain[row].sum()
for row in range(yelp_xvalid.shape[0]):
    yelp_xvalid_fbow[row] = yelp_xvalid[row] / yelp_xvalid[row].sum()
for row in range(yelp_xtest.shape[0]):
    yelp_xtest_fbow[row] = yelp_xtest[row] / yelp_xtest[row].sum()

yelp_xtrain_fbow = yelp_xtrain_fbow.tocsr()
yelp_xvalid_fbow = yelp_xvalid_fbow.tocsr()
yelp_xtest_fbow = yelp_xtest_fbow.tocsr()

['5', '5', '5', '3', '2', '2', '3', '1', '3', '5', '5', '5', '4', '4', '1', '2', '2', '3', '5', '2']
[(45289, 'the', 30739), (30384, 'and', 939), (24943, 'a', 1), (24105, 'i', 14907), (21039, 'to', 31321), (14411, 'of', 20804), (13707, 'was', 33417), (12071, 'is', 15741), (11542, 'it', 15798), (10615, 'for', 11637), (10183, 'in', 15121), (8244, 'that', 30691), (7557, 'my', 19840), (7537, 'with', 34133), (7279, 'but', 4150), (7155, 'you', 34635), (7010, 'this', 30977), (6398, 'they', 30899), (6343, 'on', 21029), (5982, 'have', 13826), (5763, 'we', 33564), (5237, 'not', 20524), (5011, 'had', 13506), (5010, 'are', 1327), (4573, 'place', 22933), (4533, 'good', 12820), (4484, 'so', 28148), (4482, 'at', 1632), (4194, 'were', 33769), (4144, 'food', 11533), (3900, 'be', 2456), (3877, 'as', 1500), (3546, 'there', 30839), (3508, 'great', 13123), (3425, 'like', 17294), (3179, 'if', 14967), (3170, 'its', 15906), (3144, 'me', 18512), (3134, 'all', 636), (3114, 'just', 16326), (3068, 'very', 32994),

  return self.astype(np.float_)._mul_scalar(1./other)


  (0, 0)	0.0591715976331
  (0, 1)	0.0236686390533
  (0, 4)	0.0295857988166
  (0, 5)	0.00591715976331
  (0, 6)	0.00591715976331
  (0, 7)	0.0118343195266
  (0, 8)	0.0118343195266
  (0, 10)	0.0118343195266
  (0, 11)	0.0236686390533
  (0, 12)	0.0177514792899
  (0, 13)	0.00591715976331
  (0, 14)	0.0118343195266
  (0, 15)	0.0118343195266
  (0, 16)	0.00591715976331
  (0, 17)	0.0177514792899
  (0, 20)	0.00591715976331
  (0, 22)	0.00591715976331
  (0, 24)	0.0177514792899
  (0, 26)	0.00591715976331
  (0, 27)	0.00591715976331
  (0, 30)	0.00591715976331
  (0, 31)	0.00591715976331
  (0, 32)	0.00591715976331
  (0, 33)	0.00591715976331
  (0, 36)	0.00591715976331
  :	:
  (6999, 201)	0.0151515151515
  (6999, 212)	0.0151515151515
  (6999, 217)	0.0151515151515
  (6999, 224)	0.0151515151515
  (6999, 239)	0.0151515151515
  (6999, 244)	0.0151515151515
  (6999, 274)	0.0151515151515
  (6999, 303)	0.0151515151515
  (6999, 451)	0.0151515151515
  (6999, 479)	0.0151515151515
  (6999, 530)	0.030303030303
  (6999, 

In [99]:
#Preprocess and convert the IMDB dataset

# train set
with open("hwk3_datasets/IMDB-train.txt", "r") as myfile:
    imdb_train = myfile.readlines()
    
# validation set    
with open("hwk3_datasets/IMDB-valid.txt", "r") as myfile:
    imdb_valid = myfile.readlines()
    
# test set
with open("hwk3_datasets/IMDB-test.txt", "r") as myfile:
    imdb_test = myfile.readlines()
    
imdb_ytrain = [line.split("\t")[-1][0] for line in imdb_train]
imdb_yvalid = [line.split("\t")[-1][0] for line in imdb_valid]
imdb_ytest = [line.split("\t")[-1][0] for line in imdb_test]

# preprocess the data
imdb_xtrain_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in imdb_train]
imdb_xvalid_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in imdb_valid]
imdb_xtest_str = [re.sub(r'[^a-zA-Z ]', "", line).lower() for line in imdb_test]

# get unique words
words = [text.split(" ") for text in imdb_xtrain_str]
unique_words = set([word for text in words for word in text])
vocabulary = dict([(word, i) for i, word in enumerate(sorted(unique_words))])

#count word occurrences
word_counts = np.zeros(len(unique_words), dtype=np.int32)
for text in words:     
    # vocabulary[0] = '' (blank character) so we exclude it
    index_map = [vocabulary[word] for word in text if vocabulary[word]]
    word_counts += np.bincount(np.array(index_map), minlength=len(unique_words))
    
# get 10000 most frequent words
frequent_words = sorted(zip(word_counts, sorted(unique_words), range(len(unique_words))))
frequent_words.reverse()
frequent_words = frequent_words[:10000]
vocabulary = dict([(word[1], i) for i, word in enumerate(frequent_words)])

#save the vocabulary
with open("my_datasets/IMDB-vocab.txt", "w") as myfile:
    for word in frequent_words:
        myfile.write("{} {} {}\n".format(word[1], vocabulary[word[1]], word[0]))

# convert input to int representation
imdb_xtrain = [[vocabulary[word] for word in text if word in vocabulary] for text in words]

valid_words = [text.split(" ") for text in imdb_xvalid_str]
imdb_xvalid = [[vocabulary[word] for word in text if word in vocabulary] for text in valid_words]

test_words = [text.split(" ") for text in imdb_xtest_str]
imdb_xtest = [[vocabulary[word] for word in text if word in vocabulary] for text in test_words]

# save training set
with open("my_datasets/IMDB-train.txt", "w") as myfile:
    for i, text in enumerate(imdb_xtrain):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), imdb_ytrain[i]))

# save validation set
with open("my_datasets/IMDB-valid.txt", "w") as myfile:
    for i, text in enumerate(imdb_xvalid):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), imdb_yvalid[i]))

# save test set
with open("my_datasets/IMDB-test.txt", "w") as myfile:
    for i, text in enumerate(imdb_xtest):
        text = [str(i) for i in text]
        myfile.write("{}\t{}\n".format(" ".join(text), imdb_ytest[i]))
        
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(imdb_xtrain_str, vocabulary=vocabulary)

# generate the one_hot representation
imdb_xtrain = vectorizer.fit_transform(imdb_xtrain_str)
imdb_xvalid = vectorizer.transform(imdb_xvalid_str)
imdb_xtest = vectorizer.transform(imdb_xtest_str)

# generate the BBoW representation
imdb_xtrain_bbow = imdb_xtrain.sign()
imdb_xvalid_bbow = imdb_xvalid.sign()
imdb_xtest_bbow = imdb_xtest.sign()
print(imdb_xtrain_bbow)
print(imdb_xtrain_bbow.max(), imdb_xtrain_bbow.mean())

# generate the FBoW representation
imdb_xtrain_fbow = ss.lil_matrix(imdb_xtrain.shape)
imdb_xvalid_fbow = ss.lil_matrix(imdb_xvalid.shape)
imdb_xtest_fbow = ss.lil_matrix(imdb_xtest.shape)

for row in range(imdb_xtrain.shape[0]):
    imdb_xtrain_fbow[row] = imdb_xtrain[row] / imdb_xtrain[row].sum()
for row in range(imdb_xvalid.shape[0]):
    imdb_xvalid_fbow[row] = imdb_xvalid[row] / imdb_xvalid[row].sum()
for row in range(imdb_xtest.shape[0]):
    imdb_xtest_fbow[row] = imdb_xtest[row] / imdb_xtest[row].sum()

imdb_xtrain_fbow = imdb_xtrain_fbow.tocsr()
imdb_xvalid_fbow = imdb_xvalid_fbow.tocsr()
imdb_xtest_fbow = imdb_xtest_fbow.tocsr()

['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
[(200007, 'the', 79197), (97128, 'a', 1), (97037, 'and', 2718), (86860, 'of', 55290), (81098, 'to', 80436), (64239, 'is', 40560), (55429, 'in', 38831), (46365, 'it', 40714), (45792, 'i', 38064), (45229, 'this', 79608), (41499, 'that', 79145), (34674, 'br', 9343), (28650, 'was', 86225), (27707, 'as', 4242), (26311, 'for', 29457), (26267, 'with', 87856), (25003, 'movie', 51764), (24999, 'but', 10738), (22526, 'film', 28204), (19990, 'on', 55728), (18129, 'not', 54483), (17664, 'his', 36444), (17658, 'you', 89167), (17469, 'are', 3841), (16692, 'have', 35105), (16014, 'be', 6427), (15909, 'he', 35221), (15230, 'one', 55775), (14972, 'its', 40826), (14017, 'all', 1928), (13886, 'at', 4559), (13507, 'by', 10855), (12858, 'an', 2633), (12405, 'they', 79434), (12235, 'who', 87348), (12104, 'from', 30375), (11845, 'so', 73258), (11692, 'like', 45380), (10801, 'her', 35770), (10592, 'just', 4209

In [190]:
# Question 2: Yelp dataset, binary bag-of-words
# a) dummy classifiers

# train uniform classifier
from sklearn.dummy import DummyClassifier
uniform_classifier = DummyClassifier(strategy="uniform")
uniform_classifier.fit(yelp_xtrain_bbow, yelp_ytrain)
y_pred = uniform_classifier.predict(yelp_xtest_bbow)

from sklearn import metrics
uniform_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

# train majority-class classifier
majority_classifier = DummyClassifier(strategy="most_frequent")
majority_classifier.fit(yelp_xtrain_bbow, yelp_ytrain)
y_pred = majority_classifier.predict(yelp_xtest_bbow)

majority_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("F1-score of uniform classifier: {}".format(uniform_f1))
print("F1-score of majority-class classifier: {}".format(majority_f1))

F1-score of uniform classifier: 0.1945
F1-score of majority-class classifier: 0.351


In [191]:
# Yelp dataset, binary bag-of-words
# Train Naive Bayes

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.naive_bayes import BernoulliNB

parameters = [{'alpha': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]}]

# do a predefined train / test split
test_fold = []
for i in range(yelp_xtrain_bbow.shape[0]):
    test_fold.append(-1)
for i in range(yelp_xvalid_bbow.shape[0]):
    test_fold.append(random.randint(0, 4))

# fit the classifier
clf = GridSearchCV(BernoulliNB(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
y_combined = np.concatenate((yelp_ytrain, yelp_yvalid))
x_combined = ss.vstack([yelp_xtrain_bbow, yelp_xvalid_bbow])
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(yelp_xtest_bbow)
bayes_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(bayes_f1))

Train scores: [ 0.7481534   0.74507653  0.73864067  0.73223012  0.72402541  0.70910221
  0.69192267  0.66987177  0.62856417  0.59115338]
Train scores std: [ 0.00140592  0.00174038  0.0014971   0.00154089  0.00140623  0.00214883
  0.00171123  0.00172716  0.00221024  0.00113853]
Test scores: [ 0.41   0.419  0.419  0.417  0.42   0.415  0.403  0.406  0.398  0.389]
Test scores std: [ 0.02134289  0.02111393  0.01882846  0.01063698  0.01744049  0.02133364
  0.01624278  0.01788632  0.02449725  0.02907265]
The best hyperparameters are: {'alpha': 0.02}
The best fit F1-score on the test set is: 0.44


In [183]:
# Yelp dataset, binary bag-of-words
# Train Decision Tree

from sklearn.tree import DecisionTreeClassifier

# min_impurity_decrease is used to control high-bias vs high-variance trade-off
parameters = [{'criterion': ['gini', 'entropy'],
             'min_impurity_decrease': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]}]

# fit the classifier
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(yelp_xtest_bbow)
decision_tree_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(decision_tree_f1))

Train scores: [ 0.42620775  0.395152    0.39512633  0.36142068  0.35292311  0.35292311
  0.35292311  0.62874282  0.44909434  0.40789675  0.39050827  0.36902495
  0.35292311  0.35292311]
Train scores std: [ 0.00308289  0.00083546  0.00086424  0.01728205  0.00075626  0.00075626
  0.00075626  0.00352712  0.00534925  0.00249553  0.00519677  0.00066196
  0.00075626  0.00075626]
Test scores: [ 0.409  0.383  0.385  0.359  0.356  0.356  0.356  0.391  0.411  0.394
  0.379  0.363  0.356  0.356]
Test scores std: [ 0.04100082  0.03365397  0.03563238  0.02740243  0.02911196  0.02911196
  0.02911196  0.02375802  0.01382766  0.01905018  0.03704381  0.02575423
  0.02911196  0.02911196]
The best hyperparameters are: {'criterion': 'entropy', 'min_impurity_decrease': 0.002}
The best fit F1-score on the test set is: 0.394


In [193]:
# Yelp dataset, binary bag-of-words
# Train Linear SVM

from sklearn.svm import LinearSVC

# C is used to control high-bias vs high-variance trade-off
parameters = [{'loss': ['hinge', 'squared_hinge'],
               'C': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]}]

# fit the classifier
clf = GridSearchCV(LinearSVC(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(yelp_xtest_bbow)
svm_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(svm_f1))

Train scores: [ 0.60428346  0.64061531  0.63397593  0.70058953  0.68623068  0.77687172
  0.73546191  0.83166658  0.78658997  0.88205034  0.85256394  0.93720476
  0.89879479  0.96599986  0.9372307   0.98120516  0.97243544  0.9922821
  0.98569245  0.99630765]
Train scores std: [ 0.00206513  0.00100087  0.00262079  0.00120149  0.00179079  0.00059332
  0.00081596  0.00125983  0.00089597  0.00164922  0.00109523  0.00066938
  0.00094661  0.00098238  0.00121008  0.00046132  0.00094179  0.00029555
  0.00037991  0.00020575]
Test scores: [ 0.463  0.492  0.466  0.504  0.494  0.512  0.499  0.499  0.492  0.499
  0.489  0.482  0.464  0.479  0.47   0.469  0.46   0.455  0.455  0.447]
Test scores std: [ 0.02815831  0.04301725  0.03278227  0.03835453  0.04212933  0.03407927
  0.03400716  0.03404094  0.02984264  0.03654571  0.03346512  0.03480736
  0.03559444  0.04051644  0.03268884  0.03796158  0.04690814  0.04192693
  0.04778922  0.05143806]
The best hyperparameters are: {'C': 0.005, 'loss': 'squared_h

In [204]:
# Question 3: Yelp dataset, frequency bag-of-words
# Train Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

# do a predefined train / test split
test_fold = []
for i in range(yelp_xtrain_fbow.shape[0]):
    test_fold.append(-1)
for i in range(yelp_xvalid_fbow.shape[0]):
    test_fold.append(random.randint(0, 4))

# fit the classifier
clf = GaussianNB()
y_combined = np.concatenate((yelp_ytrain, yelp_yvalid))
x_combined = ss.vstack([yelp_xtrain_fbow, yelp_xvalid_fbow])
clf.fit(x_combined.toarray(), y_combined)

# report performance on training set
y_pred = clf.predict(x_combined.toarray())
bayes_f1 = metrics.f1_score(y_combined, y_pred, average="micro")
print("The F1-score on the training set is: {}".format(bayes_f1))

# report performance on test set
y_pred = clf.predict(yelp_xtest_fbow.toarray())
bayes_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")
print("The F1-score on the test set is: {}".format(bayes_f1))

The F1-score on the training set is: 0.7765
The F1-score on the test set is: 0.3005


In [187]:
# Yelp dataset, frequency bag-of-words
# Train Decision Tree

# min_impurity_decrease is used to control high-bias vs high-variance trade-off
parameters = [{'criterion': ['gini', 'entropy'],
             'min_impurity_decrease': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]}]

# fit the classifier
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(yelp_xtest_fbow)
decision_tree_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(decision_tree_f1))

Train scores: [ 0.44795051  0.40240889  0.39489833  0.36164899  0.35292321  0.35292321
  0.35292321  0.7312312   0.49133189  0.41169294  0.38874509  0.37443968
  0.35292321  0.35292321]
Train scores std: [ 0.00791185  0.00507084  0.00154818  0.01801259  0.00069439  0.00069439
  0.00069439  0.00276578  0.00496119  0.00143198  0.00361696  0.00714941
  0.00069439  0.00069439]
Test scores: [ 0.387  0.366  0.372  0.352  0.356  0.356  0.356  0.362  0.403  0.393
  0.366  0.36   0.356  0.356]
Test scores std: [ 0.05705417  0.06199634  0.05876946  0.03344202  0.02685426  0.02685426
  0.02685426  0.02155527  0.0552321   0.05902373  0.05228011  0.03242069
  0.02685426  0.02685426]
The best hyperparameters are: {'criterion': 'entropy', 'min_impurity_decrease': 0.002}
The best fit F1-score on the test set is: 0.4095


In [195]:
# Yelp dataset, frequency bag-of-words
# Train Linear SVM

from sklearn.svm import LinearSVC

# C is used to control high-bias vs high-variance trade-off
parameters = [{'loss': ['hinge', 'squared_hinge'],
               'C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]}]

# fit the classifier
clf = GridSearchCV(LinearSVC(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(yelp_xtest_fbow)
svm_f1 = metrics.f1_score(yelp_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(svm_f1))

Train scores: [ 0.5738723   0.44746185  0.57487171  0.46018046  0.57487228  0.48271878
  0.57912811  0.51033339  0.58694862  0.54864193  0.60779461  0.61687151
  0.63923116  0.67574368  0.67584601  0.73387138  0.7303333   0.82002633
  0.78028438  0.88097619]
Train scores std: [ 0.00196195  0.00065744  0.00196161  0.00142629  0.00119252  0.00090376
  0.001428    0.00030308  0.00121358  0.00155073  0.00127823  0.00067387
  0.00092876  0.00067246  0.00124423  0.00096137  0.00080249  0.00168954
  0.00358855  0.00218453]
Test scores: [ 0.45   0.41   0.449  0.418  0.453  0.435  0.453  0.455  0.463  0.475
  0.473  0.499  0.483  0.509  0.492  0.517  0.508  0.503  0.504  0.501]
Test scores std: [ 0.0359449   0.0196809   0.03796373  0.02124244  0.03902096  0.025079
  0.03591722  0.0155712   0.03460027  0.03241022  0.03499769  0.03785903
  0.03307779  0.03769556  0.03333647  0.04425225  0.02544262  0.04212892
  0.03677481  0.03172663]
The best hyperparameters are: {'C': 20, 'loss': 'squared_hinge

In [205]:
# Question 4: IMDB dataset, binary bag-of-words
# a) dummy classifiers

# train uniform classifier
from sklearn.dummy import DummyClassifier
uniform_classifier = DummyClassifier(strategy="uniform")
uniform_classifier.fit(imdb_xtrain_bbow, imdb_ytrain)
y_pred = uniform_classifier.predict(imdb_xtest_bbow)

from sklearn import metrics
uniform_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("F1-score of uniform classifier: {}".format(uniform_f1))

F1-score of uniform classifier: 0.49708


In [206]:
# IMDB dataset, binary bag-of-words
# Train Naive Bayes

parameters = [{'alpha': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]}]

# do a predefined train / test split
test_fold = []
for i in range(imdb_xtrain_bbow.shape[0]):
    test_fold.append(-1)
for i in range(imdb_xvalid_bbow.shape[0]):
    test_fold.append(random.randint(0, 4))

# fit the classifier
clf = GridSearchCV(BernoulliNB(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
y_combined = np.concatenate((imdb_ytrain, imdb_yvalid))
x_combined = ss.vstack([imdb_xtrain_bbow, imdb_xvalid_bbow])
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(imdb_xtest_bbow)
bayes_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(bayes_f1))

Train scores: [ 0.86753832  0.867547    0.867547    0.86753828  0.86749476  0.86741654
  0.8673296   0.86714715  0.86679924  0.86640811]
Train scores std: [ 0.00082522  0.00083382  0.00083382  0.00083008  0.00083585  0.00080603
  0.00080433  0.00082977  0.00096465  0.00082998]
Test scores: [ 0.8474  0.8474  0.8475  0.8475  0.8476  0.8478  0.8478  0.8479  0.8479
  0.8476]
Test scores std: [ 0.00141028  0.00141028  0.00149705  0.00149705  0.00150337  0.00159753
  0.00159753  0.00196671  0.00227846  0.00192217]
The best hyperparameters are: {'alpha': 0.2}
The best fit F1-score on the test set is: 0.83816


In [207]:
# IMDB dataset, binary bag-of-words
# Train Decision Tree

# min_impurity_decrease is used to control high-bias vs high-variance trade-off
parameters = [{'criterion': ['gini', 'entropy'],
             'min_impurity_decrease': [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01]}]

# fit the classifier
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(imdb_xtest_bbow)
decision_tree_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(decision_tree_f1))

Train scores: [ 0.9427477   0.81764488  0.7537348   0.73155813  0.70456981  0.69245192
  0.67120855  0.99990436  0.96539377  0.77720432  0.747359    0.72037097
  0.69546     0.68399096]
Train scores std: [  1.25271809e-03   2.56355585e-03   4.54922039e-03   4.43828786e-03
   3.78577808e-03   8.02113907e-04   5.00822579e-04   7.97229732e-05
   2.32554844e-03   3.20661401e-03   2.14627583e-03   4.35681115e-03
   1.72814004e-03   5.59964126e-04]
Test scores: [ 0.7249  0.7372  0.7353  0.7263  0.7037  0.6933  0.6666  0.7003  0.7001
  0.7344  0.7345  0.7156  0.6959  0.6796]
Test scores std: [ 0.01102256  0.00970658  0.00520866  0.01060235  0.00723002  0.00915777
  0.00571048  0.0054334   0.00195401  0.00746795  0.00434344  0.00897345
  0.01119086  0.00640535]
The best hyperparameters are: {'criterion': 'gini', 'min_impurity_decrease': 0.0002}
The best fit F1-score on the test set is: 0.74436


In [208]:
# IMDB dataset, binary bag-of-words
# Train Linear SVM

# C is used to control high-bias vs high-variance trade-off
parameters = [{'loss': ['hinge', 'squared_hinge'],
               'C': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]}]

# fit the classifier
clf = GridSearchCV(LinearSVC(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(imdb_xtest_bbow)
svm_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(svm_f1))

Train scores: [ 0.87284399  0.90300027  0.8888353   0.91904461  0.90931392  0.93893942
  0.92458293  0.95401712  0.9393558   0.96681748  0.95847832  0.98175635
  0.96995618  0.99117422  0.98222575  0.99632208  0.99361769  0.99933926
  0.99835676  0.9999044 ]
Train scores std: [  8.37052957e-04   2.68752643e-04   3.18718559e-04   6.92577658e-04
   6.38119348e-04   6.34401625e-04   4.31577880e-04   3.12777600e-04
   5.37759467e-04   3.58404465e-04   5.90461104e-04   1.72073003e-04
   2.74835489e-04   4.38098647e-04   3.43866716e-04   2.51245357e-04
   2.70150515e-04   9.97193618e-05   1.78678721e-04   5.06218734e-05]
Test scores: [ 0.8594  0.8768  0.8686  0.8802  0.8765  0.8812  0.8801  0.8809  0.8795
  0.878   0.8739  0.8684  0.8659  0.8606  0.8584  0.8546  0.8453  0.8427
  0.8368  0.8384]
Test scores std: [ 0.00339663  0.00366913  0.002673    0.00503717  0.00510219  0.00542678
  0.00464403  0.0071572   0.00674953  0.00751895  0.00655     0.0058722
  0.00731048  0.00465675  0.00443778  

In [209]:
# Question 5: IMDB dataset, frequency bag-of-words
# Train Gaussian Naive Bayes

# do a predefined train / test split
test_fold = []
for i in range(imdb_xtrain_fbow.shape[0]):
    test_fold.append(-1)
for i in range(imdb_xvalid_fbow.shape[0]):
    test_fold.append(random.randint(0, 4))

# fit the classifier
clf = GaussianNB()
y_combined = np.concatenate((imdb_ytrain, imdb_yvalid))
x_combined = ss.vstack([imdb_xtrain_fbow, imdb_xvalid_fbow])
clf.fit(x_combined.toarray(), y_combined)

# report performance on training set
y_pred = clf.predict(x_combined.toarray())
bayes_f1 = metrics.f1_score(y_combined, y_pred, average="micro")
print("The F1-score on the training set is: {}".format(bayes_f1))

# report performance on test set
y_pred = clf.predict(imdb_xtest_fbow.toarray())
bayes_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")
print("The F1-score on the test set is: {}".format(bayes_f1))

The F1-score on the training set is: 0.83968
The F1-score on the test set is: 0.70228


In [201]:
# IMDB dataset, frequency bag-of-words
# Train Decision Tree

# min_impurity_decrease is used to control high-bias vs high-variance trade-off
parameters = [{'criterion': ['gini', 'entropy'],
             'min_impurity_decrease': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]}]

# fit the classifier
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(imdb_xtest_fbow)
decision_tree_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(decision_tree_f1))

Train scores: [ 1.          0.97052976  0.73725114  0.71372319  0.6991826   0.67189519
  0.65062576  0.50056523  0.50056523  1.          1.          0.75349368
  0.7192095   0.70334021  0.69169802  0.67173859  0.61112986  0.50056523]
Train scores std: [ 0.          0.00058223  0.0034856   0.00280748  0.0006958   0.00074189
  0.00059684  0.00018284  0.00018284  0.          0.          0.00200512
  0.00217656  0.00323749  0.00621709  0.00092861  0.00146292  0.00018284]
Test scores: [ 0.6972  0.7007  0.7274  0.7122  0.6979  0.668   0.6469  0.4935  0.4935
  0.6943  0.6948  0.7318  0.7154  0.7018  0.6844  0.6674  0.6097  0.4935]
Test scores std: [ 0.00629169  0.00849532  0.00795495  0.00535621  0.00641723  0.00756066
  0.00507871  0.00210801  0.00210801  0.00923982  0.00589442  0.00549294
  0.00408596  0.00851077  0.01237812  0.00686857  0.00642467  0.00210801]
The best hyperparameters are: {'criterion': 'entropy', 'min_impurity_decrease': 0.001}
The best fit F1-score on the test set is: 0.

In [202]:
# IMDB dataset, frequency bag-of-words
# Train Linear SVM

from sklearn.svm import LinearSVC

# C is used to control high-bias vs high-variance trade-off
parameters = [{'loss': ['hinge', 'squared_hinge'],
               'C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]}]

# fit the classifier
clf = GridSearchCV(LinearSVC(), parameters, cv=PredefinedSplit(test_fold=test_fold),
                  scoring='f1_micro', return_train_score=True)
clf.fit(x_combined, y_combined)

# report grid search scores
train_scores = clf.cv_results_['mean_train_score']
train_std = clf.cv_results_['std_train_score']
test_scores = clf.cv_results_['mean_test_score']
test_std = clf.cv_results_['std_test_score']
print('Train scores:', train_scores)
print('Train scores std:', train_std)
print('Test scores:', test_scores)
print('Test scores std:', test_std)

# report best performance
y_pred = clf.predict(imdb_xtest_fbow)
svm_f1 = metrics.f1_score(imdb_ytest, y_pred, average="micro")

print("The best hyperparameters are: {}".format(clf.best_params_))
print("The best fit F1-score on the test set is: {}".format(svm_f1))

Train scores: [ 0.57381184  0.72179063  0.6716076   0.75093857  0.7064346   0.79216547
  0.74249546  0.82106988  0.78001789  0.84631301  0.82685282  0.87431273
  0.85342635  0.89194703  0.8756002   0.9070261   0.89815669  0.92610425
  0.91287793  0.94167821]
Train scores std: [ 0.03494757  0.00094499  0.00321189  0.00053611  0.00067426  0.00045316
  0.00062641  0.0005215   0.00073729  0.00068485  0.00064122  0.00073769
  0.0005108   0.00085407  0.00031874  0.00099155  0.00062471  0.00079673
  0.00065617  0.00062793]
Test scores: [ 0.5626  0.7137  0.6691  0.7413  0.7015  0.7802  0.7331  0.8102  0.7684
  0.8313  0.8162  0.8544  0.8385  0.8675  0.8565  0.8752  0.8718  0.8806
  0.8784  0.8824]
Test scores std: [ 0.0384001   0.00883982  0.01005636  0.0083421   0.00896203  0.00929538
  0.00729715  0.00795961  0.00999425  0.00854818  0.00737194  0.00397708
  0.00769575  0.00311109  0.00495367  0.00600125  0.00484408  0.00310185
  0.00617804  0.00214637]
The best hyperparameters are: {'C': 100