# Chapter 02 - Supervised learning - Challenge

This challenge was proposed in the GE ML - CDS and is available as a [Kaggle competition](https://www.kaggle.com/competitions/ge-ml-cds-knn/overview).


# Setup

## Library import

In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

plotly.offline.init_notebook_mode(connected=True)

# Options for pandas
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Local library import

## Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [5]:
# Seed passed to every stochastic step (row sampling, train/test split,
# randomized search) so results are reproducible.
RANDOM_STATE = 42
# Backward-compatible alias: the original constant name was misspelled and the
# remaining cells of the notebook still reference it.
RANDON_STATE = RANDOM_STATE
# Folder (relative to the notebook) that the CSV inputs are read from and the
# prediction files are written to.
DATA_FOLDER = '../data/'

## Data import
We retrieve all the required data for the analysis.

In [9]:
# Load the labelled training set, using the `id` column as the index.
train_path = DATA_FOLDER + 'train.csv'
df = pd.read_csv(train_path, index_col='id')

# Preview five rows; the fixed seed keeps the preview stable across runs.
df.sample(n=5, random_state=RANDON_STATE)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
557,0.221154,0.323529,0.144578,0.015337,0.118694,0.04878,0.296984,0.113553,0.545455,0.325581,0.387097,1
4119,0.259615,0.313725,0.186747,0.044479,0.109792,0.069686,0.320186,0.060343,0.390909,0.104651,0.645161,0
3772,0.240385,0.156863,0.174699,0.20092,0.077151,0.1777,0.290023,0.165028,0.409091,0.186047,0.419355,1
2510,0.230769,0.196078,0.271084,0.105828,0.106825,0.15331,0.450116,0.166185,0.490909,0.27907,0.193548,1
4182,0.355769,0.294118,0.337349,0.139571,0.136499,0.045296,0.37355,0.172354,0.372727,0.5,0.306452,1


In [58]:
# Load the unlabelled competition test set, using the `id` column as the index.
test_path = DATA_FOLDER + 'test.csv'
df_test = pd.read_csv(test_path, index_col='id')

# Preview five rows; the fixed seed keeps the preview stable across runs.
df_test.sample(n=5, random_state=RANDON_STATE)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4716,0.163462,0.230392,0.228916,0.030675,0.071217,0.027875,0.139211,0.073067,0.363636,0.430233,0.451613
934,0.230769,0.196078,0.259036,0.190184,0.089021,0.216028,0.519722,0.199345,0.327273,0.186047,0.145161
2003,0.346154,0.215686,0.13253,0.071319,0.130564,0.108014,0.396752,0.105842,0.372727,0.267442,0.580645
1304,0.403846,0.5,0.427711,0.256902,0.086053,0.205575,0.508121,0.248506,0.209091,0.616279,0.209677
1138,0.519231,0.196078,0.277108,0.039877,0.145401,0.12892,0.287703,0.17139,0.381818,0.418605,0.241935


### Data evaluation

#### Training dataset

In [11]:
# Print column dtypes, non-null counts and memory usage of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3428 entries, 3009 to 3115
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      3428 non-null   float64
 1   x2      3428 non-null   float64
 2   x3      3428 non-null   float64
 3   x4      3428 non-null   float64
 4   x5      3428 non-null   float64
 5   x6      3428 non-null   float64
 6   x7      3428 non-null   float64
 7   x8      3428 non-null   float64
 8   x9      3428 non-null   float64
 9   x10     3428 non-null   float64
 10  x11     3428 non-null   float64
 11  target  3428 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 348.2 KB


In [12]:
# Summary statistics of the training frame, transposed to one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,3428.0,0.294153,0.081188,0.0,0.240385,0.288462,0.336538,0.769231
x2,3428.0,0.193643,0.096141,0.0,0.127451,0.176471,0.235294,0.906863
x3,3428.0,0.20115,0.073212,0.0,0.162651,0.186747,0.23494,1.0
x4,3428.0,0.088768,0.077514,0.0,0.016871,0.070552,0.142638,1.0
x5,3428.0,0.109377,0.066601,0.008902,0.080119,0.10089,0.121662,1.0
x6,3428.0,0.115222,0.057253,0.0,0.073171,0.111498,0.15331,0.449477
x7,3428.0,0.299515,0.097947,0.0,0.227378,0.290023,0.366589,0.777262
x8,3428.0,0.133386,0.057208,0.0,0.090418,0.128976,0.172209,1.0
x9,3428.0,0.42299,0.13702,0.0,0.327273,0.409091,0.502273,1.0
x10,3428.0,0.313089,0.13246,0.0,0.22093,0.302326,0.383721,1.0


In [119]:
# Number of training rows per class of the label column.
df.loc[:, 'target'].value_counts()

0    1890
1    1538
Name: target, dtype: int64

#### Testing dataset

In [117]:
# Print column dtypes, non-null counts and memory usage of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470 entries, 4325 to 3707
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      1470 non-null   float64
 1   x2      1470 non-null   float64
 2   x3      1470 non-null   float64
 3   x4      1470 non-null   float64
 4   x5      1470 non-null   float64
 5   x6      1470 non-null   float64
 6   x7      1470 non-null   float64
 7   x8      1470 non-null   float64
 8   x9      1470 non-null   float64
 9   x10     1470 non-null   float64
 10  x11     1470 non-null   float64
dtypes: float64(11)
memory usage: 137.8 KB


In [118]:
# Summary statistics of the test frame, transposed to one row per column.
df_test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,1470.0,0.292743,0.081052,0.038462,0.240385,0.288462,0.346154,1.0
x2,1470.0,0.196012,0.104812,0.0,0.127451,0.176471,0.235294,1.0
x3,1470.0,0.201717,0.072203,0.0,0.162651,0.192771,0.23494,0.60241
x4,1470.0,0.08896,0.078464,0.001534,0.016871,0.069018,0.141104,0.47546
x5,1470.0,0.10851,0.060518,0.0,0.080119,0.10089,0.121662,0.617211
x6,1470.0,0.118001,0.063668,0.003484,0.076655,0.111498,0.15331,1.0
x7,1470.0,0.301599,0.100136,0.034803,0.232019,0.292343,0.366589,1.0
x8,1470.0,0.133299,0.058726,0.005591,0.086563,0.127048,0.176403,0.447079
x9,1470.0,0.43201,0.137703,0.063636,0.336364,0.418182,0.518182,0.936364
x10,1470.0,0.315377,0.133304,0.034884,0.22093,0.290698,0.383721,0.883721


**Conclusion**

There are no null values in the training dataset. Besides, all 11 (eleven) features share the same scale, from 0 to 1.

# Data processing
Put here the core of the notebook. Feel free to further split this section into subsections.

In [74]:
# Separate the feature matrix from the label column (copies, so the loaded
# frame is left untouched).
y = df['target'].copy()
X = df.drop(columns=['target']).copy()

# Hold out 10% of the labelled rows for local evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.1,
    random_state=RANDON_STATE,
)

# Report the shape of each split, one per line.
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
print("\n".join(f"{name}'s shape: {part.shape}" for name, part in splits.items()))

X_train's shape: (3085, 11)
X_test's shape: (343, 11)
y_train's shape: (3085,)
y_test's shape: (343,)


# Modeling

## 1st experiment: default hyperparameters

In [75]:
# Baseline: k-nearest-neighbours classifier with default hyperparameters
# (only `n_jobs` is set), fitted on the training split.
model = KNeighborsClassifier(n_jobs=-1)

model.fit(X=X_train, y=y_train)

In [76]:
# Evaluate the fitted model on the hold-out split.
y_pred = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.63      0.72      0.67       180
           1       0.63      0.53      0.58       163

    accuracy                           0.63       343
   macro avg       0.63      0.62      0.62       343
weighted avg       0.63      0.63      0.63       343



### Predicting testing dataset

In [77]:
# Predict the competition test set and write the predictions, keyed by the
# test frame's index, to a CSV file.
y_df_test = model.predict(df_test)
submission = pd.DataFrame({'target': y_df_test}, index=df_test.index)
submission.to_csv(DATA_FOLDER + 'output_1st_experiment.csv', sep=',')

## 2nd experiment: randomized grid search

In [78]:
# Randomized search over the neighbourhood size and the vote weighting.
model = KNeighborsClassifier(n_jobs=-1)

# Candidate values: even k from 4 to 48, with both weighting schemes.
params = dict(
    n_neighbors=list(range(4, 50, 2)),
    weights=['uniform', 'distance'],
)

# 20 sampled configurations, each scored by 10-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=20,
    cv=10,
    scoring='accuracy',
    random_state=RANDON_STATE,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

print(f"Best score (accuracy): {random_search.best_score_}")
print(f"Best params: {random_search.best_params_}")

Best score (accuracy): 0.6836369940738873
Best params: {'weights': 'distance', 'n_neighbors': 16}


In [79]:
# Rebuild the classifier with the best configuration found by the randomized
# search and fit it on the training split.
best_params = random_search.best_params_
model = KNeighborsClassifier(n_jobs=-1, **best_params)

model.fit(X=X_train, y=y_train)

In [80]:
# Evaluate the tuned model on the hold-out split.
y_pred = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.78      0.73       180
           1       0.72      0.61      0.66       163

    accuracy                           0.70       343
   macro avg       0.71      0.70      0.70       343
weighted avg       0.70      0.70      0.70       343



### Predicting testing dataset

In [81]:
# Predict the competition test set and write the predictions, keyed by the
# test frame's index, to a CSV file.
y_df_test = model.predict(df_test)
submission = pd.DataFrame({'target': y_df_test}, index=df_test.index)
submission.to_csv(DATA_FOLDER + 'output_2nd_experiment.csv', sep=',')

## 3rd experiment: grid search

In [113]:
# Exhaustive grid search over neighbourhood size, vote weighting and distance
# metric.
model = KNeighborsClassifier(n_jobs=-1)

params = {
    'n_neighbors': np.arange(4, 50, 2).tolist(),
    'weights': ['uniform', 'distance'],
    'metric': ['manhattan', 'euclidean', 'cosine']
}

# Candidates are ranked by 5-fold cross-validated ROC AUC (not accuracy), so
# `best_score_` below is a ROC AUC value.
grid_search = GridSearchCV(
    model, param_grid=params, scoring='roc_auc', cv=5, n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Label the score with the metric actually used for selection; the previous
# label said "accuracy" although the search is scored with 'roc_auc'.
print(f"""Best score (ROC AUC): {grid_search.best_score_}
Best params: {grid_search.best_params_}""")

Best score (accuracy): 0.7684933545986178
Best params: {'metric': 'manhattan', 'n_neighbors': 24, 'weights': 'distance'}


In [114]:
# Rebuild the classifier with the best configuration found by the grid search
# and fit it on the training split.
best_params = grid_search.best_params_
model = KNeighborsClassifier(n_jobs=-1, **best_params)

model.fit(X=X_train, y=y_train)

In [115]:
# Evaluate the tuned model on the hold-out split.
y_pred = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.81      0.74       180
           1       0.74      0.60      0.66       163

    accuracy                           0.71       343
   macro avg       0.71      0.70      0.70       343
weighted avg       0.71      0.71      0.70       343



### Predicting testing dataset

In [116]:
# Predict the competition test set and write the predictions, keyed by the
# test frame's index, to a CSV file.
y_df_test = model.predict(df_test)
submission = pd.DataFrame({'target': y_df_test}, index=df_test.index)
submission.to_csv(DATA_FOLDER + 'output_3rd_experiment.csv', sep=',')

## 4th experiment: Evaluating the decision threshold

# References
We report here relevant references:
1. 