# INF8245E - Assignment 3
### Author : Guillaume Jones
### Date : 2021-11-14

### 1. Conversion to bag-of-words

In [9]:
# Preprocessing of text
import csv
import re
import numpy as np

def openTextDataset(filename):
    # Opens a .csv text file as np array for x and y
    with open(filename, encoding="utf8") as file:
        dataset_reader = csv.reader(file)
        dataset = np.array(list(dataset_reader)[1:])
        return dataset[:, 1], dataset[:, 0]

def preprocessWords(text_dataset):
    new_dataset = []
    for text in text_dataset:
        # Replaces ' and - with an empty character, as these are words 
        # like "2-d" and "don't" that need to be kept together
        new_text = re.sub(r"('|-)+", '', text)

        # Replaces other punctuation with a space, as they are characters such as , ; . () 
        # that should separate words. For example, "ears.medication" needs to become "ears medication"
        new_text = re.sub(r"(_|\W)+", ' ', new_text)

        # Sends to lowercase and splits into list of words
        new_dataset.append(new_text.lower().split(' ')[:-1])
    return new_dataset

train_x_raw, train_y = openTextDataset('medical_dataset/train.csv')
valid_x_raw, valid_y = openTextDataset('medical_dataset/valid.csv')
test_x_raw, test_y = openTextDataset('medical_dataset/test.csv')

train_x_prep = preprocessWords(train_x_raw.tolist())
valid_x_prep = preprocessWords(valid_x_raw.tolist())
test_x_prep = preprocessWords(test_x_raw.tolist())


In [10]:
from collections import Counter

# Sorted list of most common words
n_most_common = 10000
most_common_words = dict(Counter([x for example in train_x_prep for x in example]).most_common(n_most_common))
most_common_words_ids = dict(zip(most_common_words.keys(), range(1, n_most_common)))

# Save the vocabulary in required format
mcw_array = np.array(list(most_common_words.items()))
mcw_array = np.insert(mcw_array, 1, np.arange(1, n_most_common + 1), axis = 1)
np.savetxt('medical_text-vocab.txt', mcw_array, fmt='%s', delimiter='\t')

In [11]:
# Convert texts to word ids
def convertToIds(text_dataset, id_dict):
    new_dataset = []
    for example in text_dataset:
        new_example = []
        for word in example:
            if word in id_dict:
                new_example.append(id_dict[word])
        new_dataset.append(new_example)
    return new_dataset

train_x_ids = convertToIds(train_x_prep, most_common_words_ids)
valid_x_ids = convertToIds(valid_x_prep, most_common_words_ids)
test_x_ids = convertToIds(test_x_prep, most_common_words_ids)

# Save a dataset of ids in required format
for (ids, y, name) in [(train_x_ids, train_y, 'train'), (valid_x_ids, valid_y, 'valid'), (test_x_ids, test_y, 'test')]:
    id_strings = [' '.join(map(str, example)) for example in ids]
    complete_id_dataset = np.column_stack((id_strings, y))
    np.savetxt(f'medical_text-{name}.txt', complete_id_dataset, fmt='%s', delimiter='\t')

In [12]:
# Binary and frequency BoW generation

def generateBOW(id_dataset, n_features):
    # Create empty representation of final bag of words 
    bow_dataset = np.zeros((len(id_dataset), n_features))
    
    for (index, example) in enumerate(id_dataset):
        for id in example:
            bow_dataset[index, id - 1] += 1

    bow_binary = np.where(bow_dataset > 0, 1, 0).astype(float)

    # Divides each example by sum of BOW for example
    bow_sums = bow_dataset.sum(axis=1)
    bow_sums[np.where(bow_sums == 0)] = 1 # Prevent divisions by 0
    bow_frequency = bow_dataset / bow_sums[:, np.newaxis]

    return bow_binary, bow_frequency
    
train_x_binary, train_x_freq = generateBOW(train_x_ids, n_most_common)
valid_x_binary, valid_x_freq = generateBOW(valid_x_ids, n_most_common)
test_x_binary, test_x_freq = generateBOW(test_x_ids, n_most_common)
complete_dataset_binary = [(train_x_binary, train_y, '\tTrain'), (valid_x_binary, valid_y, '\tValid'), (test_x_binary, test_y, '\tTest')]
complete_dataset_freq = [(train_x_freq, train_y, '\tTrain'), (valid_x_freq, valid_y, '\tValid'), (test_x_freq, test_y, '\tTest')]

## 2. Binary bag-of-words

### 2.a. Baseline

In [13]:
import random
from scipy.stats import mode
from sklearn.metrics import f1_score

def printMacroF1Score(y_pred, y_true, title):
    f1 = f1_score(list(y_pred), list(y_true), average='macro')
    print(title + f' - F1: {f1:.3f}')

def evaluateClassifierF1(classifier, complete_dataset, classifier_name):
    print(f'{classifier_name} classifier performance:')
    for (x, y, name) in complete_dataset:
        printMacroF1Score(y, classifier.predict(x), f'{name}')

class RandomClassifier:
    def __init__(self, y_train):
        self.y_classes = list(set(y_train))

    def predict(self, x_test):
        return np.array(random.choices(self.y_classes, k=len(x_test)))

class MajorityClassifier:
    def __init__(self, y_train):
        self.majority_class = mode(y_train)[0][0]

    def predict(self, x_test):
        return np.array([self.majority_class] * len(x_test))

random_classifier = RandomClassifier(train_y)
majority_classifier = MajorityClassifier(train_y)

evaluateClassifierF1(random_classifier, complete_dataset_binary, 'Random')
evaluateClassifierF1(majority_classifier, complete_dataset_binary, 'Majority')

Random classifier performance:
	Train - F1: 0.244
	Valid - F1: 0.262
	Test - F1: 0.246
Majority classifier performance:
	Train - F1: 0.121
	Valid - F1: 0.124
	Test - F1: 0.142


### 2.b. Hyperparameter tuning

In [14]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import matplotlib.pyplot as plt

def showGridSearchScores(grid_search, classifier_name):
    print(f'\n{classifier_name} grid search:')
    for mean, params in zip(grid_search.cv_results_['mean_test_score'], 
                            grid_search.cv_results_['params']):
        print(f'{params} : {mean:.3f}')

    print(f'Best parameters : {grid_search.best_params_}')
    return grid_search.best_params_

def plotParameterScores(grid_search, parameter_to_plot, classifier_name):
    y = []
    x = []
    for mean, params in zip(grid_search.cv_results_['mean_test_score'], 
                            grid_search.cv_results_['params']):
        x.append(params[parameter_to_plot])
        y.append(mean)
    plt.figure(figsize=(10,5))
    plt.scatter(x, y)
    plt.ylabel('F1 score')
    plt.xlabel(parameter_to_plot)
    plt.title(classifier_name)
    plt.show()

# Global GridSearch parameters
scoring = 'f1_macro'
fold = np.repeat([-1, 0], [len(train_x_binary), len(valid_x_binary)])

# Must combine train and valid sets to use PredefinedSplit
train_valid_x_binary = np.concatenate((train_x_binary, valid_x_binary))
train_valid_x_freq = np.concatenate((train_x_freq, valid_x_freq))
train_valid_y = np.concatenate((train_y, valid_y))
cv = PredefinedSplit(fold)

In [None]:
# Bernoulli Naive Bayes grid search
from sklearn.naive_bayes import BernoulliNB

param_grid_bnb = [
    {   'alpha': [0.1, 0.2, 0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8] }
]
grid_search_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_bnb.fit(train_valid_x_binary, train_valid_y);

In [8]:
# Decision tree grid search
from sklearn.tree import DecisionTreeClassifier

param_grid_dt = [
    {   'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, 25, 30, 35, 40, 60, 80, 100],
        'min_samples_leaf': [1, 2, 4, 8] },
    {   'criterion': ['gini', 'entropy'],
        'max_depth': [100],
        'min_samples_leaf': [1],
        'ccp_alpha': [0.001, 0.002, 0.003, 0.004, 0.006, 0.01] }
]
grid_search_bdt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_bdt.fit(train_valid_x_binary, train_valid_y);

In [10]:
# Logistic regression grid search
from sklearn.linear_model import LogisticRegression

param_grid_lr = [
    {   'C': [0.1, 1, 10, 100, 1000, 10000],
        'solver' : ['newton-cg'] }
]
grid_search_blr = GridSearchCV(LogisticRegression(), param_grid_lr, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_blr.fit(train_valid_x_binary, train_valid_y);

In [10]:
# Linear SVM grid search
from sklearn.svm import LinearSVC

param_grid_svc = [
    {   'C': [0.01, 0.1, 0.5, 1, 5, 10, 100],
        'loss': ['hinge', 'squared_hinge'],
        'max_iter': [100000] }
]
grid_search_bsvc = GridSearchCV(LinearSVC(), param_grid_svc, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_bsvc.fit(train_valid_x_binary, train_valid_y);

### 2.c. / 2.d. Hyperparameter performance on valid set and final F1-scores

In [11]:
# Bernoulli Naive Bayes parameter display
best_params_bnb = showGridSearchScores(grid_search_bnb, 'Bernoulli NB (binary)')

classifier_bnb_final = BernoulliNB(**best_params_bnb).fit(train_x_binary, train_y)
evaluateClassifierF1(classifier_bnb_final, complete_dataset_binary, 'Bernoulli NB')

# Decision Tree parameter display
best_params_bdt = showGridSearchScores(grid_search_bdt, 'Decision Tree (binary)')

classifier_bdt_final = DecisionTreeClassifier(**best_params_bdt).fit(train_x_binary, train_y)
evaluateClassifierF1(classifier_bdt_final, complete_dataset_binary, 'Decision tree')


# Logistic regression parameter display
best_params_blr = showGridSearchScores(grid_search_blr, 'Logistic Regression (binary)')

classifier_blr_final = LogisticRegression(**best_params_blr).fit(train_x_binary, train_y)
evaluateClassifierF1(classifier_blr_final, complete_dataset_binary, 'Logistic regression')


# Linear SVM parameter display
best_params_bsvc = showGridSearchScores(grid_search_bsvc, 'Linear SVM (binary)')

classifier_bsvc_final = LinearSVC(**best_params_bsvc).fit(train_x_binary, train_y)
evaluateClassifierF1(classifier_bsvc_final, complete_dataset_binary, 'Linear SVM')


Bernoulli NB (binary) grid search:
{'alpha': 0.1} : 0.450
{'alpha': 0.2} : 0.451
{'alpha': 0.5} : 0.457
{'alpha': 1} : 0.458
{'alpha': 1.5} : 0.460
{'alpha': 2} : 0.466
{'alpha': 2.5} : 0.464
{'alpha': 3} : 0.456
{'alpha': 4} : 0.439
{'alpha': 5} : 0.440
{'alpha': 6} : 0.436
{'alpha': 7} : 0.427
{'alpha': 8} : 0.422
Best parameters : {'alpha': 2}
Bernoulli NB classifier performance:
	Train - F1: 0.524
	Valid - F1: 0.466
	Test - F1: 0.457

Decision Tree (binary) grid search:
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1} : 0.677
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2} : 0.693
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4} : 0.694
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 8} : 0.698
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1} : 0.721
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 2} : 0.721
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4} : 0.730
{'criterion': 'gini', 'max_dep

## 3. Frequency bag-of-words

### 3.a. Hyperparameter tuning

In [7]:
# Gaussian Naive Bayes grid search
from sklearn.naive_bayes import GaussianNB

param_grid_gnb = [
    { 'var_smoothing': [1E-11, 1E-10, 1E-9, 1E-8] }
]
grid_search_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_gnb.fit(train_valid_x_freq, train_valid_y);

In [12]:
# Decision tree grid search, frequency
from sklearn.tree import DecisionTreeClassifier

param_grid_fdt = [
    {   'criterion': ['gini', 'entropy'],
        'max_depth': [10, 15, 20, 25, 30, 40, 80],
        'min_samples_leaf': [1, 2, 4, 8] },
    {   'criterion': ['gini', 'entropy'],
        'max_depth': [100],
        'ccp_alpha': [0.001, 0.002, 0.003, 0.004, 0.006, 0.01] }
]
grid_search_fdt = GridSearchCV(DecisionTreeClassifier(), param_grid_fdt, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_fdt.fit(train_valid_x_freq, train_valid_y);

In [8]:
# Logistic regression grid search, frequency
from sklearn.linear_model import LogisticRegression

param_grid_flr = [
    {   'C': [1, 10, 100, 1000, 1E4, 1E5, 1E6, 1E7, 1E8],
        'max_iter': [1000],
        'solver' : ['liblinear'] }
]
grid_search_flr = GridSearchCV(LogisticRegression(), param_grid_flr, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_flr.fit(train_valid_x_freq, train_valid_y);

In [11]:
# Linear SVM grid search, frequency
from sklearn.svm import LinearSVC

param_grid_fsvc = [
    {   'C': [1, 10, 100, 1000, 1E4, 1E5, 1E6],
        'loss': ['hinge', 'squared_hinge'],
        'max_iter': [100000],
        'tol': [0.01] }
]
grid_search_fsvc = GridSearchCV(LinearSVC(), param_grid_fsvc, scoring=scoring, cv=cv, n_jobs=4, refit=False)
grid_search_fsvc.fit(train_valid_x_freq, train_valid_y);

### 3.b. / 3.c. Hyperparameter performance on valid set and final F1-scores

In [31]:
# # Gaussian Naive Bayes parameter display
best_params_gnb = showGridSearchScores(grid_search_gnb, 'Gaussian NB (frequency)')

classifier_gnb_final = GaussianNB(**best_params_gnb).fit(train_x_freq, train_y)
evaluateClassifierF1(classifier_gnb_final, complete_dataset_freq, 'GNB')

# Decision Tree (frequency) parameter display
best_params_fdt = showGridSearchScores(grid_search_fdt, 'Decision tree (frequency)')

classifier_fdt_final = DecisionTreeClassifier(**best_params_fdt).fit(train_x_freq, train_y)
evaluateClassifierF1(classifier_fdt_final, complete_dataset_freq, 'Decision tree')

# Logistic regression (frequency) parameter display
best_params_flr = showGridSearchScores(grid_search_flr, 'Logistic regression (frequency)')

classifier_flr_final = LogisticRegression(**best_params_flr).fit(train_x_freq, train_y)
evaluateClassifierF1(classifier_flr_final, complete_dataset_freq, 'Logistic regression')

# Linear SVM (frequency) parameter display
best_params_fsvc = showGridSearchScores(grid_search_fsvc, 'Linear SVM (frequency)')

classifier_fsvc_final = LinearSVC(**best_params_fsvc).fit(train_x_freq, train_y)
evaluateClassifierF1(classifier_fsvc_final, complete_dataset_freq, 'Linear SVM')


Gaussian NB (frequency) grid search:
{'var_smoothing': 1e-11} : 0.361
{'var_smoothing': 1e-10} : 0.372
{'var_smoothing': 1e-09} : 0.368
{'var_smoothing': 1e-08} : 0.365
Best parameters : {'var_smoothing': 1e-10}
GNB classifier performance:
	Train - F1: 0.679
	Valid - F1: 0.372
	Test - F1: 0.352

Decision tree (frequency) grid search:
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1} : 0.702
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2} : 0.704
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4} : 0.702
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 8} : 0.709
{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1} : 0.717
{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 2} : 0.723
{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 4} : 0.721
{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 8} : 0.724
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1} : 0.759
{'criterion': 'gini', 'max_depth'