# Charles de Kock - 26023830 - ML441 Assignment 2
## Decision Trees

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.metrics import f1_score, matthews_corrcoef
import json
import itertools
from sklearn.tree import DecisionTreeClassifier
from collections import defaultdict

In [None]:
# Load the raw dataset from the parent of the current working directory.
# '?' entries in the CSV are parsed as missing values (NaN).
# os.path.join replaces the manual '/' concatenation so the separator is
# handled by the standard library.
data_og = pd.read_csv(
    os.path.join(os.path.dirname(os.getcwd()), 'forestCover.csv'),
    na_values='?',
)

In [None]:
# Seed NumPy's global random number generator so that code drawing from it
# behaves repeatably across runs.
np.random.seed(42)

### Preprocessing

In [None]:
def init_dt_preprocessing(df: pd.DataFrame):
    """Drop unused columns and median-impute missing values.

    Args:
        df: Raw data frame; must contain the 'Inclination' and
            'Observation_ID' columns, which are removed.

    Returns:
        A new DataFrame with the remaining columns (same names and order)
        in which missing values have been replaced by each column's median.
        The frame is rebuilt from the imputer output, so it carries a fresh
        default index.
    """
    reduced = df.drop(columns=['Inclination', 'Observation_ID'])

    # NOTE: a 'positive'/'negative' -> 1/0 mapping of 'Soil_Type1' used to be
    # applied at this point and is currently disabled.

    median_imputer = SimpleImputer(strategy="median")
    filled_values = median_imputer.fit_transform(reduced)

    return pd.DataFrame(filled_values, columns=reduced.columns)

In [None]:
# Apply the initial preprocessing (column drop + median imputation) to the raw data.
data_clean = init_dt_preprocessing(data_og)

In [None]:
def balancing_transform(df: pd.DataFrame, target, strategy_over, strategy_under, r=None):
    """Rebalance the classes of ``df`` by under-sampling, then SMOTE over-sampling.

    Args:
        df: Data frame holding the feature columns and the ``target`` column.
        target: Name of the label column.
        strategy_over: ``sampling_strategy`` passed to ``SMOTE``.
        strategy_under: ``sampling_strategy`` passed to ``RandomUnderSampler``.
        r: Optional seed. When it is an integer the under-sampler is seeded
            with ``r`` and SMOTE with ``r + 1``; any other value (including
            the default ``None``) is passed to both samplers unchanged.

    Returns:
        A new DataFrame with the resampled feature columns followed by the
        resampled ``target`` column.
    """
    X = df.drop(columns=[target])
    y = df[target]

    under = RandomUnderSampler(sampling_strategy=strategy_under, random_state=r)
    X_under, y_under = under.fit_resample(X, y)

    # The original unconditional ``r + 1`` raised TypeError for the default
    # ``r=None``; only offset the seed when it really is an integer.
    smote_seed = r + 1 if isinstance(r, (int, np.integer)) else r
    smote = SMOTE(sampling_strategy=strategy_over, random_state=smote_seed)
    # NOTE: SMOTETomek was considered as an alternative over-sampler here but
    # takes long to run.
    X_res, y_res = smote.fit_resample(X_under, y_under)

    df_balanced = pd.DataFrame(X_res, columns=X.columns)
    df_balanced[target] = y_res

    return df_balanced

def balancing_transform_lean(X, y, strategy_over, strategy_under, r=None):
    """Under-sample, then SMOTE over-sample, a feature/label pair.

    Args:
        X: Feature data.
        y: Labels aligned with ``X``.
        strategy_over: ``sampling_strategy`` passed to ``SMOTE``.
        strategy_under: ``sampling_strategy`` passed to ``RandomUnderSampler``.
        r: ``random_state`` handed to both samplers (default ``None``).

    Returns:
        Tuple ``(X_res, y_res)`` with the resampled features and labels.
    """
    # Step 1: shrink the classes named in the under-sampling strategy.
    reducer = RandomUnderSampler(sampling_strategy=strategy_under, random_state=r)
    X_reduced, y_reduced = reducer.fit_resample(X, y)

    # Step 2: synthesise samples for the classes named in the over-sampling strategy.
    grower = SMOTE(sampling_strategy=strategy_over, random_state=r)
    X_balanced, y_balanced = grower.fit_resample(X_reduced, y_reduced)

    return X_balanced, y_balanced

In [None]:
# Split the cleaned data into the feature matrix and the 'Cover_Type' label column.
X_clean = data_clean.drop(columns=['Cover_Type'])
y_clean = data_clean['Cover_Type']

### Cross-validation fold setup

In [None]:
# Build 10 stratified, shuffled train/validation splits of the cleaned data.
# NOTE(review): random_state=None, so the shuffle presumably draws from
# NumPy's global random state (seeded earlier in this notebook) — confirm.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)

# One dict per fold: 'Xt'/'yt' hold the training split, 'Xv'/'yv' the validation split.
folds = []

for train_index, val_index in skf.split(X_clean, y_clean):
    X_train, X_val = X_clean.iloc[train_index], X_clean.iloc[val_index]
    y_train, y_val = y_clean.iloc[train_index], y_clean.iloc[val_index]
    folds.append({'Xt': X_train, 'Xv': X_val, 'yt': y_train, 'yv': y_val})


In [None]:
def oversample_fold(fold, strategies):
    """Return a copy of ``fold`` whose training split has been rebalanced.

    Args:
        fold: Dict with keys 'Xt', 'yt' (training data) and 'Xv', 'yv'
            (validation data).
        strategies: Indexable pair where element 0 is the under-sampling
            strategy and element 1 the over-sampling strategy.

    Returns:
        A new dict with the same four keys; 'Xt'/'yt' hold the resampled
        training data while 'Xv'/'yv' are passed through untouched.
    """
    train_X, train_y = fold['Xt'], fold['yt']
    over_spec, under_spec = strategies[1], strategies[0]

    balanced_X, balanced_y = balancing_transform_lean(train_X, train_y, over_spec, under_spec)

    return {
        'Xt': balanced_X,
        'Xv': fold['Xv'],
        'yt': balanced_y,
        'yv': fold['yv'],
    }

In [None]:
# Resampling specs keyed by class label: element 0 is the under-sampling
# spec and element 1 the over-sampling spec (the order oversample_fold reads).
strat_A = [{1: 50000, 2: 50000}, {3: 35000, 4: 20000, 5: 20000, 6: 30000, 7: 30000}]

# Rebalance only the training part of each fold, replacing the fold in place;
# the validation part is carried over unchanged.
for i in tqdm(range(len(folds))):
    folds[i] = oversample_fold(folds[i], strategies=strat_A)

### Tuning

In [None]:
def eval_model(fold, fold_nr, param_id, params):
    """Train one decision tree on a fold and score it on the validation split.

    Args:
        fold: Dict with training data under 'Xt'/'yt' and validation data
            under 'Xv'/'yv'.
        fold_nr: Identifier of the fold, copied into the result.
        param_id: Identifier of the parameter set, copied into the result.
        params: Dict with the keys 'criterion', 'max_depth',
            'min_samples_leaf' and 'class_weight'.

    Returns:
        Dict with 'fold_nr', 'param_id', 'params' and the validation scores
        'mcc', 'f1_weigthed' and 'f1_macro'.
    """
    tree = DecisionTreeClassifier(
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        min_samples_leaf=params['min_samples_leaf'],
        class_weight=params['class_weight'],
    )
    tree.fit(fold['Xt'], fold['yt'])

    predicted = tree.predict(fold['Xv'])
    truth = fold['yv']

    weighted_f1 = f1_score(truth, predicted, average='weighted')
    macro_f1 = f1_score(truth, predicted, average='macro')
    matthews = matthews_corrcoef(truth, predicted)

    # The key 'f1_weigthed' (sic) is kept as-is because the aggregation code
    # in this file reads it under that exact spelling.
    return {
        'fold_nr': fold_nr,
        'param_id': param_id,
        'params': params,
        'mcc': matthews,
        'f1_weigthed': weighted_f1,
        'f1_macro': macro_f1,
    }

In [None]:
def generate_param_dicts(criteria, depths, min_samples_leaf, weight):
    """Expand four option lists into every parameter combination.

    Args:
        criteria: Candidate values for 'criterion'.
        depths: Candidate values for 'max_depth'.
        min_samples_leaf: Candidate values for 'min_samples_leaf'.
        weight: Candidate values for 'class_weight'.

    Returns:
        List of dicts, one per element of the Cartesian product, in
        ``itertools.product`` order (the last argument varies fastest).
    """
    names = ('criterion', 'max_depth', 'min_samples_leaf', 'class_weight')
    combinations = itertools.product(criteria, depths, min_samples_leaf, weight)
    return [dict(zip(names, combo)) for combo in combinations]

In [None]:
def grid_search(param_grid, folds):
    """Evaluate every parameter set on every fold.

    Args:
        param_grid: Sequence of parameter dicts; the position of each dict
            is used as its ``param_id``.
        folds: Sequence of fold dicts; the position of each fold is used as
            its ``fold_nr``.

    Returns:
        Flat list of the ``eval_model`` results, ordered by parameter set
        and then by fold.
    """
    out = []
    # tqdm wraps the parameter sets so the progress bar advances once per
    # parameter set, after all of its folds have been evaluated.
    for param_id, params in enumerate(tqdm(param_grid)):
        for fold_nr, fold in enumerate(folds):
            out.append(eval_model(fold, fold_nr, param_id, params))
    return out

In [None]:
def save_results(results, file):
    """Write ``results`` to ``file`` as JSON Lines (one JSON object per line).

    NumPy scalars and arrays nested inside the records are converted to
    their built-in Python equivalents, so parameter values taken from a
    NumPy array (e.g. integer depths) no longer make ``json.dumps`` fail.

    Args:
        results: Iterable of JSON-serialisable records (typically dicts).
        file: Path of the output file; it is overwritten.

    Raises:
        TypeError: If a record contains a value that is neither natively
            JSON-serialisable nor a NumPy scalar/array.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot encode on its own.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    with open(file, "w") as f:
        for d in results:
            f.write(json.dumps(d, default=_to_builtin) + "\n")

def load_results(file):
    """Read a JSON Lines file back into a list of records.

    Args:
        file: Path of a file holding one JSON document per line.

    Returns:
        List with one decoded object per line, in file order.
    """
    with open(file, "r") as handle:
        return [json.loads(row) for row in handle]

In [None]:
# Preview the 12 candidate tree depths: evenly spaced values from 20 to 50,
# with the fractional part dropped by the integer cast.
np.linspace(20, 50, 12).astype(int)

In [None]:
# Full grid: 2 criteria x 12 depths x 2 min_samples_leaf values x 2 class
# weightings = 96 parameter combinations.
param_grid = generate_param_dicts(['gini', 'entropy'], np.linspace(20, 50, 12).astype(int), [1, 0.0002], [None, 'balanced'])

In [None]:
# Evaluate every parameter combination on every fold, then persist the
# per-fold scores as JSON lines.
results_fine = grid_search(param_grid, folds)
save_results(results_fine, "eval_fine_dt.txt")

In [None]:
# The depths in the grid come from a NumPy array, so each 'max_depth' is a
# NumPy integer; cast it to a built-in int (presumably so the records hold
# plain Python values for JSON output).
for result in results_fine:
    result['params']['max_depth'] = int(result['params']['max_depth'])

In [None]:
# Write a second copy of the per-fold results to a backup file.
save_results(results_fine, "eval_fine_dt_backup.txt")

### Evaluation

In [None]:
def avg_results(data):
    """Aggregate per-fold scores into one summary per parameter set.

    Args:
        data: Iterable of result dicts, each with the keys 'param_id',
            'params', 'mcc', 'f1_weigthed' and 'f1_macro'.

    Returns:
        List of summary dicts, one per distinct 'param_id' in order of
        first appearance. Each holds 'param_id', the 'params' of the first
        record of the group, and the NumPy mean and (population) standard
        deviation of every metric under 'mean_<metric>' / 'std_<metric>'.
    """
    # Bucket the records by parameter id, keeping first-seen order.
    by_param = {}
    for record in data:
        by_param.setdefault(record['param_id'], []).append(record)

    def _summarise(pid, records):
        mcc = [rec['mcc'] for rec in records]
        f1_weighted = [rec['f1_weigthed'] for rec in records]
        f1_macro = [rec['f1_macro'] for rec in records]
        return {
            'param_id': pid,
            'params': records[0]['params'],
            'mean_mcc': np.mean(mcc),
            'std_mcc': np.std(mcc),
            'mean_f1_weigthed': np.mean(f1_weighted),
            'std_f1_weigthed': np.std(f1_weighted),
            'mean_f1_macro': np.mean(f1_macro),
            'std_f1_macro': np.std(f1_macro),
        }

    return [_summarise(pid, records) for pid, records in by_param.items()]

In [None]:
def sort_results(results, key, show=False):
    """Sort summaries by the mean of one metric, best (highest) first.

    Args:
        results: Iterable of summary dicts containing 'mean_<key>' and,
            when ``show`` is used, 'std_<key>', 'param_id' and 'params'.
        key: Metric name without prefix, e.g. 'mcc'.
        show: When truthy, print one line per summary in sorted order.

    Returns:
        A new list with the summaries in descending 'mean_<key>' order;
        the input is not modified and ties keep their input order.
    """
    mean_key = 'mean_' + key
    std_key = 'std_' + key

    out = sorted(results, key=lambda entry: entry[mean_key], reverse=True)
    # Plain truthiness instead of ``show == True`` so any truthy flag works.
    if show:
        for r in out:
            print(f"ID: {r['param_id']} Parameters: {r['params']} -> {key}: {r[mean_key]} ± {r[std_key]}")
    return out

In [None]:
# Average the per-fold scores once and reuse the summaries for all three
# rankings (the averaging was previously recomputed for every metric).
# sort_results returns a new list each time, so the rankings are independent
# lists that reference the same summary dicts.
avg_fine_evals = avg_results(results_fine)
sorted_fine_evals_mcc = sort_results(avg_fine_evals, 'mcc', show=False)
sorted_fine_evals_f1w = sort_results(avg_fine_evals, 'f1_weigthed', show=False)
sorted_fine_evals_f1m = sort_results(avg_fine_evals, 'f1_macro', show=False)

In [None]:
def combine_evals(eval1, eval2, eval3):
    """Add up the 1-based rank each parameter set has in three rankings.

    Args:
        eval1: Ranked sequence of dicts carrying a 'param_id' key.
        eval2: Second ranking, same format.
        eval3: Third ranking, same format.

    Returns:
        ``defaultdict(int)`` mapping ``str(param_id)`` to the sum of that
        parameter set's positions (first place counts 1) over the rankings
        in which it appears.
    """
    rank_sums = defaultdict(int)
    for ranking in (eval1, eval2, eval3):
        for position, entry in enumerate(ranking, start=1):
            rank_sums[str(entry['param_id'])] += position
    return rank_sums

In [None]:
# Sum each parameter set's rank position across the three metric rankings,
# then list the sets from the lowest (best) to the highest combined rank.
combined_ranks = combine_evals(sorted_fine_evals_mcc, sorted_fine_evals_f1w, sorted_fine_evals_f1m)
sorted(combined_ranks.items(), key=lambda item: item[1], reverse=False)

In [None]:
def find_ID(results, id):
    """Return every record whose 'param_id' equals ``id``.

    Args:
        results: Iterable of dicts.
        id: Value compared (with ``==``) against each record's 'param_id';
            records without that key are treated as having ``None``.

    Returns:
        List of the matching records in their original order.
    """
    matches = []
    for record in results:
        if record.get('param_id') == id:
            matches.append(record)
    return matches

In [None]:
# Collect the per-fold results of the parameter set chosen as best.
# NOTE(review): the grid built in this notebook has 96 combinations, i.e.
# param_id values 0-95, so id 183 would match nothing and leave `best`
# empty — presumably a leftover from a different grid; confirm the id.
best = find_ID(results_fine, 183)

In [None]:
# Display the selected per-fold records as the cell output.
best