# ML441 Assignment 4
## Charles de Kock - 26023830

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import time
from tqdm import tqdm
import json
from contextlib import redirect_stdout
from autorank import autorank, plot_stats
import scikit_posthocs as sp
import io
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
import hashlib
from collections import defaultdict
import itertools

In [3]:
seed = 40
np.random.seed(seed)

## Auxillary Functions

In [4]:
def load_json_file(filename):
    with open(filename, 'r') as file:
        return json.load(file)
    
class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        try:
            return super().default(obj)
        except TypeError:
            pass
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        elif hasattr(obj, '__str__'):
            return str(obj)
        else:
            return f"<non-serializable: {type(obj).__name__}>"

def save_json(data, path, indent=4):
    with open(path, "w") as f:
        json.dump(data, f, indent=indent, cls=CustomJSONEncoder)

def hash_dict(d):
    dict_str = json.dumps(d, sort_keys=True)
    return hashlib.sha256(dict_str.encode('utf-8')).hexdigest()

def avg_results(data, n_folds=10):
    out = {}
    for key in list(data.keys()):
        accs = []
        f1s = []
        for i in range(n_folds):
            temp = data[key]['results'][f'fold_{i}']
            accs.append(temp['acc'])
            f1s.append(temp['f1'])
        out[key] =  {
            'mean_acc': float(np.mean(accs)),
            'std_acc': float(np.std(accs)),
            'mean_f1': float(np.mean(f1s)),
            'std_f1': float(np.std(f1s))
        }
    return out

def sort_results(full_data, key, show=False, top_n=10):
    avg_data = avg_results(full_data)
    out = sorted(avg_data.keys(), key=lambda k: avg_data[k][f'mean_{key}'], reverse=True)
    if show == True:
        for r in out[:top_n]:
            print(f"ID: {r} Parameters: {full_data[r]['params']} -> {key}: {avg_data[r]['mean_' + key]} ± {avg_data[r]['std_' + key]}")
    return out

## Load and Preprocess data

In [4]:
def preprocess(df):
    y = df['diagnosis'].map({'B': 0, 'M': 1})
    X = df.drop(columns=['diagnosis', 'gender', 'id'])
    return X, y

In [6]:
data = pd.read_csv('../data/breastCancer.csv', delimiter='\t', na_values='?')
X, y = preprocess(data)

## Cross validation folds

In [7]:
def generate_folds(X, y, n_folds = 10):
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    folds = []
    splits = kf.split(X, y)
    for train_index, val_index in splits:
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        folds.append({'Xt': X_train, 'Xv': X_val, 'yt': y_train, 'yv': y_val})
    return folds

In [8]:
folds = generate_folds(X, y, 10)

## Random Forest

In [14]:
def eval_rf(fold, params):
    model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        class_weight=params['class_weight'],
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(fold['Xt'], fold['yt'])
    y_pred = model.predict(fold['Xv'])
    f1 = f1_score(fold['yv'], y_pred, average='binary')
    acc = accuracy_score(fold['yv'], y_pred)
    
    return {'acc': acc, 'f1': f1}

def create_param_dicts(**kwargs):
    keys = list(kwargs.keys())
    values = list(kwargs.values())
    
    param_dicts = []
    for combination in itertools.product(*values):
        param_dict = dict(zip(keys, combination))
        param_dicts.append(param_dict)
    
    return param_dicts

def run_experiment(param_dicts, folds):
    out = {}
    for p in tqdm(param_dicts):
        entry = {}
        entry['params'] = p
        data = {}
        for i in range(len(folds)):
            data[f'fold_{i}'] = eval_rf(folds[i], p)
        entry['results'] = data
        out[hash_dict(p)] = entry
    return out

rf_param_dicts = create_param_dicts(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 10, 20, 40],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.5],
    class_weight=['balanced', None],
    criterion=['gini', 'entropy']
)

In [15]:
rf_results = run_experiment(rf_param_dicts, folds)
save_json(rf_results, '../../results/rf/rf_results.json')

  0%|          | 0/864 [00:00<?, ?it/s]

100%|██████████| 864/864 [1:32:03<00:00,  6.39s/it]


In [51]:
save_json(rf_results, '../../results/rf/rf_results.json')

In [6]:
rf_results = load_json_file('../../results/rf/rf_results.json')

In [7]:
best_config_keys = sort_results(rf_results, 'acc', show=True)

ID: 088b164bcfb804169f292b098628e2a8db6e4923e64646a6a0e81ba6acc223bc Parameters: {'n_estimators': 500, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9666040100250626 ± 0.014581776929294759
ID: 872be7e21374c7637fa0d6660119bd5457ed101eb1867b17d4078fb484220694 Parameters: {'n_estimators': 500, 'max_depth': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9666040100250626 ± 0.014581776929294759
ID: ec7a246568b4eccd6933f6393a9acf09d420f32020e876f234aff635c7d019fa Parameters: {'n_estimators': 500, 'max_depth': 20, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9666040100250626 ± 0.014581776929294759
ID: d301ed1aca94a6e06ef4fe70eedf0d5b062830536ad0c460a0026075a693f01a Parameters: {'n_estimators': 500, 'max_depth': 40, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'crit

## Latex tables

In [1]:
def results_to_table_primer(result, top_n=5, metric='acc'):
    top_keys = sort_results(result, metric, show=False)[:top_n]
    avg_result = avg_results(result)
    to_output = []
    for k in top_keys:
        temp = {}
        temp['params'] = result[k]['params']
        temp['mean_acc'] = avg_result[k]['mean_acc']
        temp['std_acc'] = avg_result[k]['std_acc']
        temp['mean_f1'] = avg_result[k]['mean_f1']
        temp['std_f1'] = avg_result[k]['std_acc']
        to_output.append(temp)
    return to_output

def results_to_table(result, top_n=5, metric='acc', title='Top performing KNN models', label='tab:results', decimals=4, width='\\textwidth'):
    def esc(s):
        return str(s).replace("_", "\\_")
    data = results_to_table_primer(result, top_n=top_n, metric=metric)
    latex = "\\begin{table}[H]\n"
    latex += "    \\centering\n"
    latex += f"    \\caption{{{title}}}\n"
    latex += f"    \\label{{{label}}}\n"
    latex += f"    \\begin{{tabularx}}{{{width}}}{{X r r}}\n"
    latex += "        \\toprule\n"
    latex += "        \\textbf{Parameters} & \\textbf{Accuracy} & \\textbf{F1-score} \\\\\n"
    latex += "        \\midrule\n"
    for entry in data:
        params = ", ".join([f"{esc(k)}={esc(v)}" for k, v in entry["params"].items()])
        params = f"\\{{ \\texttt{{{params}}} \\}}"
        acc = f"{entry['mean_acc']:.{decimals}f} ± {entry['std_acc']:.{decimals}f}"
        f1 = f"{entry['mean_f1']:.{decimals}f} ± {entry['std_f1']:.{decimals}f}"
        latex += f"        {params} & {acc} & {f1} \\\\\n"
    latex += "        \\bottomrule\n"
    latex += "    \\end{tabularx}\n"
    latex += "\\end{table}"
    return latex

In [8]:
print(results_to_table(rf_results, title='Top performing RF models', label='tab:rf'))

\begin{table}[H]
    \centering
    \caption{Top performing RF models}
    \label{tab:rf}
    \begin{tabularx}{\textwidth}{X r r}
        \toprule
        \textbf{Parameters} & \textbf{Accuracy} & \textbf{F1-score} \\
        \midrule
        \{ \texttt{n\_estimators=500, max\_depth=None, min\_samples\_leaf=1, max\_features=sqrt, class\_weight=balanced, criterion=gini} \} & 0.9666 ± 0.0146 & 0.9549 ± 0.0146 \\
        \{ \texttt{n\_estimators=500, max\_depth=10, min\_samples\_leaf=1, max\_features=sqrt, class\_weight=balanced, criterion=gini} \} & 0.9666 ± 0.0146 & 0.9549 ± 0.0146 \\
        \{ \texttt{n\_estimators=500, max\_depth=20, min\_samples\_leaf=1, max\_features=sqrt, class\_weight=balanced, criterion=gini} \} & 0.9666 ± 0.0146 & 0.9549 ± 0.0146 \\
        \{ \texttt{n\_estimators=500, max\_depth=40, min\_samples\_leaf=1, max\_features=sqrt, class\_weight=balanced, criterion=gini} \} & 0.9666 ± 0.0146 & 0.9549 ± 0.0146 \\
        \{ \texttt{n\_estimators=200, max\_depth=None, 