# ML441 Assignment 4
## Charles de Kock - 26023830

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import time
from tqdm import tqdm
import json
from contextlib import redirect_stdout
from autorank import autorank, plot_stats
import scikit_posthocs as sp
import io
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
import hashlib
from collections import defaultdict
import itertools
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
seed = 40
np.random.seed(seed)

## Auxillary Functions

In [7]:
def load_json_file(filename):
    with open(filename, 'r') as file:
        return json.load(file)
    
class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        try:
            return super().default(obj)
        except TypeError:
            pass
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        elif hasattr(obj, '__str__'):
            return str(obj)
        else:
            return f"<non-serializable: {type(obj).__name__}>"

def save_json(data, path, indent=4):
    with open(path, "w") as f:
        json.dump(data, f, indent=indent, cls=CustomJSONEncoder)

def hash_dict(d):
    dict_str = json.dumps(d, sort_keys=True)
    return hashlib.sha256(dict_str.encode('utf-8')).hexdigest()

def avg_results(data, n_folds=10):
    out = {}
    for key in list(data.keys()):
        accs = []
        f1s = []
        for i in range(n_folds):
            temp = data[key]['results'][f'fold_{i}']
            accs.append(temp['acc'])
            f1s.append(temp['f1'])
        out[key] =  {
            'mean_acc': float(np.mean(accs)),
            'std_acc': float(np.std(accs)),
            'mean_f1': float(np.mean(f1s)),
            'std_f1': float(np.std(f1s))
        }
    return out

def sort_results(full_data, key, show=False, top_n=10):
    avg_data = avg_results(full_data)
    out = sorted(avg_data.keys(), key=lambda k: avg_data[k][f'mean_{key}'], reverse=True)
    if show == True:
        for r in out[:top_n]:
            print(f"ID: {r} Parameters: {full_data[r]['params']} -> {key}: {avg_data[r]['mean_' + key]} ± {avg_data[r]['std_' + key]}")
    return out

def handle_outliers_iqr(df, threshold=2):
    df_clean = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        df_clean[col] = np.clip(df[col], lower_bound, upper_bound)
    return df_clean

## Load and Preprocess data

In [8]:
def preprocess(df):
    y = df['diagnosis'].map({'B': 0, 'M': 1})
    X = df.drop(columns=['diagnosis', 'gender', 'id', 'perimeter_mean', 'perimeter_se', 'perimeter_worst', 'area_mean', 'area_se', 'area_worst'])
    X = handle_outliers_iqr(X)
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)
    X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
    return X, y

In [10]:
data = pd.read_csv('../data/breastCancer.csv', delimiter='\t', na_values='?')
X, y = preprocess(data)

## Cross validation folds

In [11]:
def generate_folds(X, y, n_folds = 10):
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    folds = []
    splits = kf.split(X, y)
    for train_index, val_index in splits:
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        folds.append({'Xt': X_train, 'Xv': X_val, 'yt': y_train, 'yv': y_val})
    return folds

In [12]:
folds = generate_folds(X, y, 10)

## LR

In [20]:
def eval_lr(fold, params):
    model = LogisticRegression(
        penalty=params['penalty'],
        C=params['C'],
        solver=params['solver'],
        max_iter=params['max_iter'],
        class_weight=params['class_weight'],
        fit_intercept=params['fit_intercept'],
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(fold['Xt'], fold['yt'])
    y_pred = model.predict(fold['Xv'])
    f1 = f1_score(fold['yv'], y_pred, average='binary')
    acc = accuracy_score(fold['yv'], y_pred)
    
    return {'acc': acc, 'f1': f1}

def create_param_dicts(**kwargs):
    keys = list(kwargs.keys())
    values = list(kwargs.values())
    
    param_dicts = []
    for combination in itertools.product(*values):
        param_dict = dict(zip(keys, combination))
        
        # Compatibility filtering
        penalty = param_dict['penalty']
        solver = param_dict['solver']

        # Invalid combinations to skip
        if solver == 'lbfgs' and penalty not in ['l2', 'none']:
            continue
        if solver == 'saga' and penalty not in ['l1', 'l2', 'elasticnet']:
            continue
        if solver == 'liblinear' and penalty not in ['l1', 'l2']:
            continue
        if solver == 'newton-cg' and penalty not in ['l2', 'none']:
            continue
        if solver == 'sag' and penalty not in ['l2', 'none']:
            continue
        
        param_dicts.append(param_dict)
    
    return param_dicts

def run_experiment_lr(param_dicts, folds):
    out = {}
    for p in tqdm(param_dicts):
        entry = {}
        entry['params'] = p
        data = {}
        for i in range(len(folds)):
            data[f'fold_{i}'] = eval_lr(folds[i], p)
        entry['results'] = data
        out[hash_dict(p)] = entry
    return out

lr_param_dicts = create_param_dicts(
    penalty=['l1', 'l2', None],
    solver=['lbfgs', 'saga', 'liblinear'],
    C=[0.01, 0.1, 1, 10, 100],
    class_weight=['balanced', None],
    fit_intercept=[True, False],
    max_iter=[200, 500, 1000]
)

In [21]:
lr_results = run_experiment_lr(lr_param_dicts, folds)

100%|██████████| 300/300 [03:34<00:00,  1.40it/s]


In [23]:
save_json(lr_results, '../../results/hetero/lr_results.json')

In [None]:
lr_results = load_json_file('../../results/hetero/lr_results.json')

In [24]:
best_config_keys = sort_results(lr_results, 'acc', show=True)

ID: 742b2aa0a59fc529151f4695d0eb79fe5bdeaae12be92e4f504d0705a754e1f2 Parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 100, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 1000} -> acc: 0.9595551378446114 ± 0.020889413280441495
ID: 534d5a0882b30ad2a1a101f1a71d6a5f3980fc3bc5cfead616dbf974697e0507 Parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 100, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 500} -> acc: 0.9577694235588972 ± 0.022626249382872148
ID: 8b2cf4d4021c8801bf0a60a1817c6efa66b21cdfb6529693d654138a1d22afc0 Parameters: {'penalty': 'l2', 'solver': 'liblinear', 'C': 100, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 200} -> acc: 0.9577694235588972 ± 0.022626249382872148
ID: b24bd949615fe7636729a912e62c75e016e7b957320aa94078d4c839181d9896 Parameters: {'penalty': 'l2', 'solver': 'liblinear', 'C': 100, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 500} -> acc: 0.9577694235588972 ± 0.022626249382872148
ID: 750

## SVC

In [39]:
def eval_dt(fold, params):
    model = DecisionTreeClassifier(
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        class_weight=params['class_weight'],
        random_state=42
    )
    
    model.fit(fold['Xt'], fold['yt'])
    y_pred = model.predict(fold['Xv'])
    f1 = f1_score(fold['yv'], y_pred, average='binary')
    acc = accuracy_score(fold['yv'], y_pred)
    
    return {'acc': acc, 'f1': f1}

def create_param_dicts(**kwargs):
    keys = list(kwargs.keys())
    values = list(kwargs.values())
    
    param_dicts = []
    for combination in itertools.product(*values):
        param_dict = dict(zip(keys, combination))
        param_dicts.append(param_dict)
    
    return param_dicts

def run_experiment_dt(param_dicts, folds):
    out = {}
    for p in tqdm(param_dicts):
        entry = {}
        entry['params'] = p
        data = {}
        for i in range(len(folds)):
            data[f'fold_{i}'] = eval_dt(folds[i], p)
        entry['results'] = data
        out[hash_dict(p)] = entry
    return out

# Define Decision Tree hyperparameter grid
dt_param_dicts = create_param_dicts(
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
    class_weight=[None, 'balanced']
)

In [40]:
dt_results = run_experiment_dt(dt_param_dicts, folds)

100%|██████████| 810/810 [01:30<00:00,  8.98it/s]


In [41]:
save_json(dt_results, '../../results/hetero/dt_results.json')

In [None]:
dt_results = load_json_file('../../results/hetero/dt_results.json')

In [42]:
best_config_keys = sort_results(dt_results, 'acc', show=True)

ID: baa80c373866d67f92828a5888f7d2adc7337fa5d3b22f087237aa3606f01210 Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'class_weight': 'balanced'} -> acc: 0.9471804511278196 ± 0.02396082057458112
ID: f1cd5f5421b969ab607ec0228c95fd652742f83a940c8dd99da3fd92b38f6b53 Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'class_weight': 'balanced'} -> acc: 0.9471804511278196 ± 0.02396082057458112
ID: 9a18da355ed0988a9b68883cabb8543804073654e5231ca5c42b283a8e4bc6d0 Parameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'class_weight': 'balanced'} -> acc: 0.9471804511278196 ± 0.02396082057458112
ID: 42595413cd107cb78b04334698c2127b40e35fbc40ad1d8cf25bfc330d1da672 Parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'cl

## KNN

In [29]:
def eval_knn(fold, params):
    model = KNeighborsClassifier(
        n_neighbors=params['n_neighbors'],
        weights=params['weights'],
        algorithm=params['algorithm'],
        p=params['p'],
        metric='minkowski',
        n_jobs=-1
    )
    
    model.fit(fold['Xt'], fold['yt'])
    y_pred = model.predict(fold['Xv'])
    f1 = f1_score(fold['yv'], y_pred, average='binary')
    acc = accuracy_score(fold['yv'], y_pred)
    
    return {'acc': acc, 'f1': f1}

def create_param_dicts(**kwargs):
    keys = list(kwargs.keys())
    values = list(kwargs.values())
    
    param_dicts = []
    for combination in itertools.product(*values):
        param_dict = dict(zip(keys, combination))
        param_dicts.append(param_dict)
    
    return param_dicts

def run_experiment_knn(param_dicts, folds):
    out = {}
    for p in tqdm(param_dicts):
        entry = {}
        entry['params'] = p
        data = {}
        for i in range(len(folds)):
            data[f'fold_{i}'] = eval_knn(folds[i], p)
        entry['results'] = data
        out[hash_dict(p)] = entry
    return out

# Define KNN hyperparameter grid
knn_param_dicts = create_param_dicts(
    n_neighbors=[3, 5, 7, 9, 11, 15, 31, 51],
    weights=['uniform', 'distance'],        
    algorithm=['brute'],  
    p=[1, 2, 3]                              
)


In [31]:
knn_results = run_experiment_knn(knn_param_dicts, folds)

100%|██████████| 48/48 [00:05<00:00,  9.04it/s]


In [32]:
save_json(knn_results, '../../results/hetero/knn_results.json')

In [None]:
knn_results = load_json_file('../../results/hetero/knn_results.json')

In [33]:
best_config_keys = sort_results(knn_results, 'acc', show=True)

ID: 9bd2fe001a173200956ed51a37c44c9ac8cf3fab687971a4c39e4e5eabc480c3 Parameters: {'n_neighbors': 31, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1} -> acc: 0.9384711779448622 ± 0.02860026904771743
ID: d7642812767e936bed7016c498e251eb4fd98b0e3dedfd61ca175967238e4a43 Parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1} -> acc: 0.9349624060150378 ± 0.023633766279937168
ID: d4c19b5daf05a0a4dbea1ba76f60af40ff401ef7fcc0bca1665130b725f8d5df Parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'p': 1} -> acc: 0.9349624060150378 ± 0.026108805279635883
ID: b6b8076e917c868ba885b720fbcad510f239ab40c273d7b1f2cb734dbdab1477 Parameters: {'n_neighbors': 31, 'weights': 'distance', 'algorithm': 'brute', 'p': 1} -> acc: 0.9332080200501254 ± 0.02697031829822565
ID: 4ecd3ba5c7cd2a437e0bbd07baea897134f58238dec7548bd576cefe94d32af0 Parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1} -> acc: 0.9332080200501253 ± 0.02580

## Stacking

In [None]:
def eval_stack(fold, params):
    model = KNeighborsClassifier(
        n_neighbors=params['n_neighbors'],
        weights=params['weights'],
        algorithm=params['algorithm'],
        p=params['p'],
        metric='minkowski',
        n_jobs=-1
    )
    
    model.fit(fold['Xt'], fold['yt'])
    y_pred = model.predict(fold['Xv'])
    f1 = f1_score(fold['yv'], y_pred, average='binary')
    acc = accuracy_score(fold['yv'], y_pred)
    
    return {'acc': acc, 'f1': f1}

def create_param_dicts(**kwargs):
    keys = list(kwargs.keys())
    values = list(kwargs.values())
    
    param_dicts = []
    for combination in itertools.product(*values):
        param_dict = dict(zip(keys, combination))
        param_dicts.append(param_dict)
    
    return param_dicts

def run_experiment_knn(param_dicts, folds):
    out = {}
    for p in tqdm(param_dicts):
        entry = {}
        entry['params'] = p
        data = {}
        for i in range(len(folds)):
            data[f'fold_{i}'] = eval_knn(folds[i], p)
        entry['results'] = data
        out[hash_dict(p)] = entry
    return out

# Define KNN hyperparameter grid
knn_param_dicts = create_param_dicts(
    n_neighbors=[3, 5, 7, 9, 11, 15, 31, 51],
    weights=['uniform', 'distance'],        
    algorithm=['brute'],  
    p=[1, 2, 3]                              
)
