# ML441 Assignment 4
## Charles de Kock - 26023830

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import time
from tqdm import tqdm
import json
from contextlib import redirect_stdout
from autorank import autorank, plot_stats
import scikit_posthocs as sp
import io
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
import hashlib
from collections import defaultdict
import itertools

In [3]:
# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
# NOTE(review): generate_folds and eval_rf pass random_state=42 rather than this seed.
seed = 40
np.random.seed(seed)

## Auxiliary Functions

In [4]:
def load_json_file(filename):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which would mis-decode non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
    
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder with best-effort fallbacks for objects ``json`` cannot handle.

    ``default`` is only consulted for objects the standard encoder does not
    already support, so native types (dict, list, str, int, float, bool,
    None) are unaffected.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Fallback order:
            1. NumPy scalars become the matching Python scalar.
            2. NumPy arrays become (nested) lists.
            3. Objects with a ``__dict__`` are encoded as that dict.
            4. Anything else is encoded as ``str(obj)``.

        Args:
            obj: The object the standard encoder could not serialise.

        Returns:
            A value the encoder can process further.
        """
        if isinstance(obj, np.generic):
            value = obj.item()
            # Some NumPy scalar types have no Python equivalent and ``item()``
            # hands back a NumPy scalar again; only return real conversions so
            # the encoder is never asked to encode the same kind of object twice.
            if not isinstance(value, np.generic):
                return value
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        # Every Python object defines ``__str__``, so this is the final fallback.
        return str(obj)

def save_json(data, path, indent=4):
    """Serialise ``data`` to ``path`` as JSON using ``CustomJSONEncoder``.

    Args:
        data: Object to serialise.
        path: Destination file; it is opened in write mode, so existing
            contents are replaced.
        indent: Indentation passed to the encoder for pretty-printing.
    """
    encoder = CustomJSONEncoder(indent=indent)
    with open(path, "w") as handle:
        # Stream the encoded chunks straight into the file.
        for chunk in encoder.iterencode(data):
            handle.write(chunk)

def hash_dict(d):
    """Return a SHA-256 hex digest identifying the contents of ``d``.

    The dict is serialised to JSON with sorted keys first, so two dicts with
    the same items produce the same digest regardless of insertion order.

    Args:
        d: A JSON-serialisable dict.

    Returns:
        The hexadecimal SHA-256 digest as a string.
    """
    canonical_bytes = json.dumps(d, sort_keys=True).encode('utf-8')
    digest = hashlib.sha256()
    digest.update(canonical_bytes)
    return digest.hexdigest()

def avg_results(data, n_folds=10):
    """Summarise per-fold scores of each configuration as mean and std.

    Args:
        data: Mapping of configuration id to a record whose ``'results'``
            entry holds ``'fold_0'`` .. ``'fold_{n_folds - 1}'``, each with
            ``'acc'`` and ``'f1'`` values.
        n_folds: Number of fold entries to read per configuration.

    Returns:
        Mapping of configuration id to a dict with ``mean_acc``, ``std_acc``,
        ``mean_f1`` and ``std_f1`` as plain Python floats.
    """
    summary = {}
    for config_id, record in data.items():
        fold_scores = [record['results'][f'fold_{idx}'] for idx in range(n_folds)]
        acc_values = [score['acc'] for score in fold_scores]
        f1_values = [score['f1'] for score in fold_scores]
        summary[config_id] = {
            'mean_acc': float(np.mean(acc_values)),
            'std_acc': float(np.std(acc_values)),
            'mean_f1': float(np.mean(f1_values)),
            'std_f1': float(np.std(f1_values)),
        }
    return summary

def sort_results(full_data, key, show=False, top_n=10, n_folds=10):
    """Rank configuration ids by their mean score, best first.

    Args:
        full_data: Mapping of configuration id to a record with ``'params'``
            and per-fold ``'results'`` (the structure ``avg_results`` reads).
        key: Metric suffix to rank by, e.g. ``'acc'`` or ``'f1'``; the
            ranking uses ``mean_<key>``.
        show: When truthy, print the ``top_n`` best configurations.
        top_n: How many configurations to print when ``show`` is set.
        n_folds: Number of folds stored per configuration; forwarded to
            ``avg_results`` so results from other fold counts can be ranked.

    Returns:
        List of all configuration ids sorted by descending mean score.
    """
    avg_data = avg_results(full_data, n_folds=n_folds)
    metric = f'mean_{key}'
    out = sorted(avg_data, key=lambda k: avg_data[k][metric], reverse=True)
    if show:
        for r in out[:top_n]:
            print(f"ID: {r} Parameters: {full_data[r]['params']} -> {key}: {avg_data[r]['mean_' + key]} ± {avg_data[r]['std_' + key]}")
    return out

## Load and Preprocess data

In [45]:
def preprocess(df):
    """Split the raw frame into a feature frame and an encoded target.

    Args:
        df: DataFrame containing at least the ``diagnosis``, ``gender`` and
            ``id`` columns.

    Returns:
        Tuple ``(X, y)``: ``X`` is ``df`` without the ``diagnosis``,
        ``gender`` and ``id`` columns; ``y`` is the ``diagnosis`` column with
        ``'B'`` mapped to 0 and ``'M'`` mapped to 1.
    """
    label_codes = {'B': 0, 'M': 1}
    excluded_columns = ['diagnosis', 'gender', 'id']
    target = df['diagnosis'].map(label_codes)
    features = df.drop(columns=excluded_columns)
    return features, target

In [46]:
# Read the tab-separated dataset; '?' cells are parsed as missing values (NaN).
data = pd.read_csv('breastCancer.csv', delimiter='\t', na_values='?')
# Split into the feature frame X and the 0/1-encoded diagnosis target y.
X, y = preprocess(data)

## Cross-validation folds

In [47]:
def generate_folds(X, y, n_folds=10, random_state=42):
    """Create shuffled, stratified train/validation splits of ``X`` and ``y``.

    Args:
        X: Feature table; rows are selected positionally via ``.iloc``.
        y: Target values aligned with ``X``; also indexed via ``.iloc`` and
            used for stratification.
        n_folds: Number of splits to generate.
        random_state: Seed for the shuffling done by ``StratifiedKFold``.
            The default of 42 reproduces the previously hard-coded splits.

    Returns:
        List with one dict per fold holding ``'Xt'``/``'yt'`` (training
        part) and ``'Xv'``/``'yv'`` (validation part).
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    return [
        {
            'Xt': X.iloc[train_index],
            'Xv': X.iloc[val_index],
            'yt': y.iloc[train_index],
            'yv': y.iloc[val_index],
        }
        for train_index, val_index in splitter.split(X, y)
    ]

In [48]:
# Build the 10 stratified train/validation splits that run_experiment evaluates each configuration on.
folds = generate_folds(X, y, 10)

## Random Forest

In [7]:
def eval_rf(fold, params):
    """Train a random forest on one fold and score it on the validation part.

    Args:
        fold: Dict with ``'Xt'``/``'yt'`` (training data) and ``'Xv'``/``'yv'``
            (validation data).
        params: Dict providing ``n_estimators``, ``criterion``, ``max_depth``,
            ``min_samples_leaf``, ``max_features`` and ``class_weight``.

    Returns:
        Dict with the validation accuracy under ``'acc'`` and the binary F1
        score under ``'f1'``.
    """
    tuned_names = (
        'n_estimators',
        'criterion',
        'max_depth',
        'min_samples_leaf',
        'max_features',
        'class_weight',
    )
    # Only the tuned hyper-parameters are taken from ``params``; the seed and
    # parallelism are fixed for every configuration.
    forest_kwargs = {name: params[name] for name in tuned_names}
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **forest_kwargs)

    forest.fit(fold['Xt'], fold['yt'])
    predictions = forest.predict(fold['Xv'])
    truth = fold['yv']

    return {
        'acc': accuracy_score(truth, predictions),
        'f1': f1_score(truth, predictions, average='binary'),
    }

def create_param_dicts(**kwargs):
    """Expand per-parameter value lists into every parameter combination.

    Args:
        **kwargs: Parameter name mapped to an iterable of candidate values.

    Returns:
        List of dicts, one per element of the Cartesian product of the
        candidate values, in ``itertools.product`` order (the last parameter
        varies fastest).
    """
    names = tuple(kwargs)
    return [
        dict(zip(names, combo))
        for combo in itertools.product(*kwargs.values())
    ]

def run_experiment(param_dicts, folds):
    """Evaluate every parameter configuration on every fold.

    Args:
        param_dicts: Iterable of hyper-parameter dicts accepted by ``eval_rf``.
        folds: Sequence of fold dicts accepted by ``eval_rf``.

    Returns:
        Dict keyed by ``hash_dict(params)``; each value holds the
        configuration under ``'params'`` and, under ``'results'``, the
        ``eval_rf`` output for each fold keyed ``'fold_<index>'``.
    """
    experiment = {}
    for params in tqdm(param_dicts):
        fold_scores = {
            f'fold_{index}': eval_rf(fold, params)
            for index, fold in enumerate(folds)
        }
        experiment[hash_dict(params)] = {'params': params, 'results': fold_scores}
    return experiment

# Random forest hyper-parameter grid: create_param_dicts expands the value lists
# below into their Cartesian product (6 * 4 * 3 * 3 * 2 * 2 = 864 combinations).
rf_param_dicts = create_param_dicts(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 10, 20, 40],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.5],
    class_weight=['balanced', None],
    criterion=['gini', 'entropy']
)

In [50]:
# Run the grid search: one forest is trained per configuration per fold.
rf_results = run_experiment(rf_param_dicts, folds)

100%|██████████| 540/540 [47:57<00:00,  5.33s/it]


In [51]:
# Write the per-fold results of every configuration to disk as JSON.
save_json(rf_results, '../../results/rf/no_preprocessing_results.json')

In [5]:
# Load the saved results from the same JSON file that save_json writes to.
rf_results = load_json_file('../../results/rf/no_preprocessing_results.json')

In [6]:
# Rank configuration ids by mean accuracy (best first) and print the top 10.
best_config_keys = sort_results(rf_results, 'acc', show=True)

ID: b6d7e1674aeaaf67ec977afb53e5d88186ceca668721f38657b93e81c9ac9cc7 Parameters: {'n_estimators': 400, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9630952380952381 ± 0.01831128961161411
ID: a92261b8679df734cdf3326e44310bc465fd2ac812f6fe565562347b8671a7e8 Parameters: {'n_estimators': 400, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': 'log2', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9630952380952381 ± 0.01831128961161411
ID: e22a6549663f8b37c7ec464d1419734178fd18902275bef520a65d884d510155 Parameters: {'n_estimators': 400, 'max_depth': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'gini'} -> acc: 0.9630952380952381 ± 0.01831128961161411
ID: 372041f3f0141cee5fdc346a05bde5c54bc8a1f550027f7b88459935e23145dd Parameters: {'n_estimators': 400, 'max_depth': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'class_weight': 'balanced', 'crite