# Charles de Kock - 26023830 - ML441 Assignment 2
## KNN

In [28]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.metrics import f1_score, matthews_corrcoef
import json
import itertools
from collections import defaultdict

In [4]:
# Load the raw data set from the parent of the current working directory.
# '?' entries in the CSV are read as missing values.
data_og = pd.read_csv(
    os.path.join(os.path.dirname(os.getcwd()), 'forestCover.csv'),
    na_values='?',
)

### Preprocess

In [5]:
def init_knn_preprocessing(df: pd.DataFrame):
    """Clean the raw forest-cover frame for a distance-based (KNN) model.

    Steps, in order:
      1. Drop the 'Aspect', 'Inclination', 'Water_Level' and
         'Observation_ID' columns.
      2. Encode 'Soil_Type1' ('positive' / 'negative') as 1 / 0.
      3. Median-impute every remaining column.
      4. Drop each row that holds an extreme outlier in any continuous
         column, i.e. a value outside [Q1 - 3 * IQR, Q3 + 3 * IQR].
      5. Robust-scale the continuous columns.

    The frame passed in is not modified; a new frame with a fresh
    0..n-1 index is returned.

    NOTE(review): the imputer and the scaler are fitted on the whole frame
    passed in, and the imputer sees every remaining column (including a
    label column if one is present). If the result is split for
    cross-validation afterwards, validation rows contribute to those
    fitted statistics -- confirm this is acceptable.
    """
    df = df.drop(columns=['Aspect', 'Inclination', 'Water_Level', 'Observation_ID'])
    df['Soil_Type1'] = df['Soil_Type1'].map({'positive': 1, 'negative': 0}).astype(np.int8)

    # Median imputation; a KNN imputer was considered but judged too expensive.
    imputer = SimpleImputer(strategy="median")
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    numeric_cols = [
        'Elevation', 'Facet', 'Slope', 'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
        'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]

    def extreme_outlier_mask(series, factor=1.5):
        """Flag values beyond twice the usual `factor` * IQR fences."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        reach = 2 * factor * (q3 - q1)
        return (series < q1 - reach) | (series > q3 + reach)

    # A row is removed as soon as one continuous column is extreme.
    # Every fence is computed on the full (pre-drop) column.
    is_extreme = pd.Series(False, index=df.index)
    for col in numeric_cols:
        is_extreme |= extreme_outlier_mask(df[col])

    df = df.loc[~is_extreme].reset_index(drop=True)

    scaler = RobustScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [6]:
# Apply the KNN preprocessing pipeline to the raw data.
data_clean = init_knn_preprocessing(data_og)

In [7]:
def balancing_transform(df: pd.DataFrame, target, strategy_over, strategy_under, r=None):
    """Rebalance the classes of *df* by undersampling, then SMOTE oversampling.

    Args:
        df: Frame holding the features and the label column.
        target: Name of the label column in *df*.
        strategy_over: `sampling_strategy` passed to SMOTE.
        strategy_under: `sampling_strategy` passed to RandomUnderSampler.
        r: Seed for the undersampler. When it is an integer, SMOTE is seeded
            with ``r + 1``; any other value (including the default ``None``)
            is passed to SMOTE unchanged.

    Returns:
        A new frame with the resampled features and the label column
        stored under *target*.
    """
    X = df.drop(columns=[target])
    y = df[target]

    under = RandomUnderSampler(sampling_strategy=strategy_under, random_state=r)
    X_under, y_under = under.fit_resample(X, y)

    # `r + 1` is only defined for integer seeds; leaving other values as-is
    # keeps the default call (r=None) from raising TypeError.
    smote_seed = r + 1 if isinstance(r, (int, np.integer)) else r
    smote = SMOTE(sampling_strategy=strategy_over, random_state=smote_seed)
    X_res, y_res = smote.fit_resample(X_under, y_under)

    df_balanced = pd.DataFrame(X_res, columns=X.columns)
    df_balanced[target] = y_res

    return df_balanced

def balancing_transform_lean(X, y, strategy_over, strategy_under, r=None):
    """Undersample, then SMOTE-oversample, an already separated X / y pair.

    Both samplers receive the same `random_state` *r*. *strategy_under* goes
    to RandomUnderSampler and *strategy_over* to SMOTE.

    Returns:
        The resampled ``(X, y)`` pair.
    """
    reduced_X, reduced_y = RandomUnderSampler(
        sampling_strategy=strategy_under,
        random_state=r,
    ).fit_resample(X, y)

    balanced_X, balanced_y = SMOTE(
        sampling_strategy=strategy_over,
        random_state=r,
    ).fit_resample(reduced_X, reduced_y)

    return balanced_X, balanced_y

In [8]:
# Separate the label column ('Cover_Type') from the feature columns.
y_clean = data_clean['Cover_Type']
X_clean = data_clean.drop(columns='Cover_Type')

### Cross-validation fold setup

In [9]:
# Stratified 10-fold split. The shuffle is not seeded (random_state=None).
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)

# One dict per fold: training features/labels (Xt, yt) and
# validation features/labels (Xv, yv).
folds = []

for train_index, val_index in skf.split(X_clean, y_clean):
    X_train = X_clean.iloc[train_index]
    y_train = y_clean.iloc[train_index]
    X_val = X_clean.iloc[val_index]
    y_val = y_clean.iloc[val_index]
    folds.append(dict(Xt=X_train, Xv=X_val, yt=y_train, yv=y_val))


In [10]:
def oversample_fold(fold, strategies):
    """Return a copy of *fold* whose training split has been rebalanced.

    *strategies* is indexed as ``[undersampling strategy, oversampling
    strategy]``. The validation split ('Xv', 'yv') is passed through
    unchanged.
    """
    under_strategy = strategies[0]
    over_strategy = strategies[1]
    balanced_X, balanced_y = balancing_transform_lean(
        fold['Xt'], fold['yt'], over_strategy, under_strategy
    )
    return {
        'Xt': balanced_X,
        'Xv': fold['Xv'],
        'yt': balanced_y,
        'yv': fold['yv'],
    }

In [11]:
# Per-class sample counts: [undersampling strategy, oversampling strategy].
strat_A = [{1: 50000, 2: 50000}, {3: 35000, 4: 20000, 5: 20000, 6: 30000, 7: 30000}]

# Replace every fold with its rebalanced version, in place.
for i, fold in enumerate(tqdm(folds)):
    folds[i] = oversample_fold(fold, strategies=strat_A)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


### Tuning

In [12]:
def eval_model(fold, fold_nr, param_id, params):
    """Fit a KNN classifier on one fold and score it on that fold's validation split.

    Args:
        fold: Dict with 'Xt' / 'yt' (training) and 'Xv' / 'yv' (validation).
        fold_nr: Fold index, copied into the result.
        param_id: Parameter-set index, copied into the result.
        params: Dict with the keys 'k', 'weights', 'algo' and 'metric'.

    Returns:
        Dict with 'fold_nr', 'param_id', 'params', 'mcc', 'f1_weigthed'
        and 'f1_macro' (the key spelling is kept as-is because other code
        reads it).
    """
    knn = KNeighborsClassifier(
        n_neighbors=params['k'],
        weights=params['weights'],
        algorithm=params['algo'],
        metric=params['metric'],
        n_jobs=-1,
    )
    knn.fit(fold['Xt'], fold['yt'])

    truth = fold['yv']
    predicted = knn.predict(fold['Xv'])

    return {
        'fold_nr': fold_nr,
        'param_id': param_id,
        'params': params,
        'mcc': matthews_corrcoef(truth, predicted),
        'f1_weigthed': f1_score(truth, predicted, average='weighted'),
        'f1_macro': f1_score(truth, predicted, average='macro'),
    }

In [13]:
def generate_param_dicts(k_values, weights_options, algo='brute', metric='euclidean'):
    """Build one KNN parameter dict per (k, weights) combination.

    Args:
        k_values: Iterable of neighbour counts.
        weights_options: Iterable of weighting schemes.
        algo: Value stored under 'algo' in every dict (default 'brute').
        metric: Value stored under 'metric' in every dict (default
            'euclidean').

    Returns:
        A list of independent dicts with the keys 'k', 'weights', 'algo'
        and 'metric'. *k_values* varies slowest.
    """
    return [
        {'k': k, 'weights': weights, 'algo': algo, 'metric': metric}
        for k, weights in itertools.product(k_values, weights_options)
    ]

In [17]:
# Coarse grid: k = 5, 10, ..., 45 crossed with both weighting schemes.
param_grid = generate_param_dicts(list(range(5, 50, 5)), ['uniform', 'distance'])

In [14]:
def grid_search(param_grid, folds):
    """Evaluate every parameter set on every fold.

    Args:
        param_grid: Sequence of parameter dicts; the position of a dict is
            used as its 'param_id'.
        folds: Sequence of fold dicts; the position of a fold is used as its
            'fold_nr'.

    Returns:
        A flat list with one `eval_model` result per (parameter set, fold)
        pair, ordered by parameter set and then by fold. Progress is printed
        after each evaluation.
    """
    out = []
    for param_id, params in enumerate(param_grid):
        for fold_nr, fold in enumerate(folds):
            out.append(eval_model(fold, fold_nr, param_id, params))
            print("Params:", param_id, '\nFold:', fold_nr, '\n')
    return out

In [15]:
def save_results(results, file):
    """Write each item of *results* as one JSON document per line to *file*.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file, "w") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in results)

def load_results(file):
    """Read a file holding one JSON document per line.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped rather than raising a decode error.

    Returns:
        A list with the decoded object of every non-blank line, in file order.
    """
    with open(file, "r") as f:
        return [json.loads(line) for line in f if line.strip()]

In [22]:
# Evaluate the current parameter grid on all folds and persist the per-fold scores.
results_coarse = grid_search(param_grid, folds)
save_results(results_coarse, "eval_coarse.txt")

Params: 0 
Fold: 0 

Params: 0 
Fold: 1 

Params: 0 
Fold: 2 

Params: 0 
Fold: 3 

Params: 0 
Fold: 4 

Params: 1 
Fold: 0 

Params: 1 
Fold: 1 

Params: 1 
Fold: 2 

Params: 1 
Fold: 3 

Params: 1 
Fold: 4 

Params: 2 
Fold: 0 

Params: 2 
Fold: 1 

Params: 2 
Fold: 2 

Params: 2 
Fold: 3 

Params: 2 
Fold: 4 

Params: 3 
Fold: 0 

Params: 3 
Fold: 1 

Params: 3 
Fold: 2 

Params: 3 
Fold: 3 

Params: 3 
Fold: 4 

Params: 4 
Fold: 0 

Params: 4 
Fold: 1 

Params: 4 
Fold: 2 

Params: 4 
Fold: 3 

Params: 4 
Fold: 4 

Params: 5 
Fold: 0 

Params: 5 
Fold: 1 

Params: 5 
Fold: 2 

Params: 5 
Fold: 3 

Params: 5 
Fold: 4 

Params: 6 
Fold: 0 

Params: 6 
Fold: 1 

Params: 6 
Fold: 2 

Params: 6 
Fold: 3 

Params: 6 
Fold: 4 

Params: 7 
Fold: 0 

Params: 7 
Fold: 1 

Params: 7 
Fold: 2 

Params: 7 
Fold: 3 

Params: 7 
Fold: 4 

Params: 8 
Fold: 0 

Params: 8 
Fold: 1 

Params: 8 
Fold: 2 

Params: 8 
Fold: 3 

Params: 8 
Fold: 4 

Params: 9 
Fold: 0 

Params: 9 
Fold: 1 

Params: 9 
Fo

In [16]:
# Fine grid: k = 1 .. 9 crossed with both weighting schemes.
param_grid = generate_param_dicts(list(range(1, 10)), ['uniform', 'distance'])

In [17]:
# Evaluate the current parameter grid on all folds and persist the per-fold scores.
results_fine = grid_search(param_grid, folds)
save_results(results_fine, "eval_fine.txt")

Params: 0 
Fold: 0 

Params: 0 
Fold: 1 

Params: 0 
Fold: 2 

Params: 0 
Fold: 3 

Params: 0 
Fold: 4 

Params: 0 
Fold: 5 

Params: 0 
Fold: 6 

Params: 0 
Fold: 7 

Params: 0 
Fold: 8 

Params: 0 
Fold: 9 

Params: 1 
Fold: 0 

Params: 1 
Fold: 1 

Params: 1 
Fold: 2 

Params: 1 
Fold: 3 

Params: 1 
Fold: 4 

Params: 1 
Fold: 5 

Params: 1 
Fold: 6 

Params: 1 
Fold: 7 

Params: 1 
Fold: 8 

Params: 1 
Fold: 9 

Params: 2 
Fold: 0 

Params: 2 
Fold: 1 

Params: 2 
Fold: 2 

Params: 2 
Fold: 3 

Params: 2 
Fold: 4 

Params: 2 
Fold: 5 

Params: 2 
Fold: 6 

Params: 2 
Fold: 7 

Params: 2 
Fold: 8 

Params: 2 
Fold: 9 

Params: 3 
Fold: 0 

Params: 3 
Fold: 1 

Params: 3 
Fold: 2 

Params: 3 
Fold: 3 

Params: 3 
Fold: 4 

Params: 3 
Fold: 5 

Params: 3 
Fold: 6 

Params: 3 
Fold: 7 

Params: 3 
Fold: 8 

Params: 3 
Fold: 9 

Params: 4 
Fold: 0 

Params: 4 
Fold: 1 

Params: 4 
Fold: 2 

Params: 4 
Fold: 3 

Params: 4 
Fold: 4 

Params: 4 
Fold: 5 

Params: 4 
Fold: 6 

Params: 4 
Fo

In [52]:
# One-off check on fold 0 only: parameter set 4 of the current grid with the
# metric swapped to Manhattan distance (param_id -1 marks it as off-grid).
test = {**param_grid[4], 'metric': 'manhattan'}

test_out = eval_model(folds[0], 0, -1, test)

In [53]:
# Show the record returned by the one-off evaluation.
test_out

{'fold_nr': 0,
 'param_id': -1,
 'params': {'k': 3,
  'weights': 'uniform',
  'algo': 'brute',
  'metric': 'manhattan'},
 'mcc': 0.8191983105753851,
 'f1_weigthed': 0.8857828450241529,
 'f1_macro': 0.8290460750733443}

### Evaluation

In [29]:
def avg_results(data):
    """Aggregate per-fold score records into one summary per parameter set.

    Records are grouped by 'param_id' in order of first appearance. For each
    of the scores 'mcc', 'f1_weigthed' and 'f1_macro', the summary holds
    ``mean_<score>`` (np.mean) and ``std_<score>`` (np.std). 'params' is
    taken from the first record of the group.

    Returns:
        A list of summary dicts, one per distinct 'param_id'.
    """
    by_param = defaultdict(list)
    for record in data:
        by_param[record['param_id']].append(record)

    summaries = []
    for pid, records in by_param.items():
        entry = {'param_id': pid, 'params': records[0]['params']}
        for metric in ('mcc', 'f1_weigthed', 'f1_macro'):
            values = [record[metric] for record in records]
            entry['mean_' + metric] = np.mean(values)
            entry['std_' + metric] = np.std(values)
        summaries.append(entry)

    return summaries

In [37]:
def sort_results(results, key, show=False):
    """Return *results* sorted by ``mean_<key>``, best (largest) first.

    Args:
        results: Summary dicts holding 'param_id', 'params',
            ``mean_<key>`` and ``std_<key>``.
        key: Score name without the 'mean_' / 'std_' prefix, e.g. 'mcc'.
        show: When truthy, print one line per summary in ranked order.

    Returns:
        A new sorted list; *results* itself is left untouched. Ties keep
        their original relative order.
    """
    mean_key = 'mean_' + key
    std_key = 'std_' + key
    out = sorted(results, key=lambda summary: summary[mean_key], reverse=True)
    if show:
        for r in out:
            print(f"ID: {r['param_id']} Parameters: {r['params']} -> {key}: {r[mean_key]} ± {r[std_key]}")
    return out

In [55]:
# Average the per-fold scores once, then rank the same summaries by each metric.
avg_fine_evals = avg_results(results_fine)
sorted_fine_evals_mcc = sort_results(avg_fine_evals, 'mcc', show=False)
sorted_fine_evals_f1w = sort_results(avg_fine_evals, 'f1_weigthed', show=False)
sorted_fine_evals_f1m = sort_results(avg_fine_evals, 'f1_macro', show=False)

In [56]:
def combine_evals(eval1, eval2, eval3):
    """Sum each parameter set's 1-based position across three ranked lists.

    Every argument is a list of dicts carrying 'param_id', already ordered
    best-first. A lower total means a better combined rank.

    Returns:
        A ``defaultdict(int)`` mapping ``str(param_id)`` to the summed
        positions.
    """
    rank_sums = defaultdict(int)
    for ranking in (eval1, eval2, eval3):
        for position, summary in enumerate(ranking, start=1):
            rank_sums[str(summary['param_id'])] += position
    return rank_sums

In [57]:
# Sum the three per-metric rankings and list parameter IDs by ascending
# total (lowest total = best combined rank).
combined_ranks = combine_evals(sorted_fine_evals_mcc, sorted_fine_evals_f1w, sorted_fine_evals_f1m)
sorted(combined_ranks.items(), key=lambda pair: pair[1])

[('0', 3),
 ('1', 6),
 ('3', 9),
 ('7', 13),
 ('5', 16),
 ('2', 19),
 ('4', 21),
 ('11', 25),
 ('9', 26),
 ('6', 29),
 ('13', 34),
 ('15', 37),
 ('8', 38),
 ('10', 41),
 ('17', 43),
 ('12', 49),
 ('14', 50),
 ('16', 54)]

In [58]:
def find_ID(results, id):
    """Return the first dict in *results* whose 'param_id' equals *id*.

    Dicts without a 'param_id' key are treated as having ``None`` there.
    Returns ``None`` when nothing matches.
    """
    for record in results:
        if record.get('param_id') == id:
            return record
    return None

In [65]:
# Fetch the record for parameter set 5.
# NOTE(review): results_fine is the raw grid_search output (one record per
# parameter set and fold) and find_ID returns only the first match, so `best`
# holds a single fold's scores, not the cross-validated mean. Look the ID up
# in avg_results(results_fine) if the averaged summary is what is wanted.
# NOTE(review): the ID is hard-coded; confirm it is the intended pick from
# the combined ranking.
best = find_ID(results_fine, 5)

In [66]:
# Show the selected record.
best

{'fold_nr': 0,
 'param_id': 5,
 'params': {'k': 3,
  'weights': 'distance',
  'algo': 'brute',
  'metric': 'euclidean'},
 'mcc': 0.8189688361542544,
 'f1_weigthed': 0.885709223759754,
 'f1_macro': 0.8307832353487162}