In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory, then print one path per line
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/students-drop-out-prediction/train.csv
/kaggle/input/students-drop-out-prediction/test.csv


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the competition's train and test CSVs from the read-only Kaggle input directory
train_df, test_df = map(
    pd.read_csv,
    (
        "../input/students-drop-out-prediction/train.csv",
        "../input/students-drop-out-prediction/test.csv",
    ),
)

In [3]:
# Columns that are kept as integers and must NOT be standardised.
# NOTE: "label" is the 3-class target (values 0/1/2), not a binary feature; it is listed
# here only so that it is left unscaled and cast back to int together with the flags.
binary_col_names = ["v_1", "v_26", "v_11", "v_14", "v_30", "v_28", "v_9", "v_27", "label"]

# Every remaining column except the row identifier gets scaled.
# Use an order-preserving comprehension rather than set arithmetic: iteration order of a
# set of strings changes between interpreter runs (hash randomisation), which would make
# the column order of the transformed frame differ on every kernel restart.
_unscaled_cols = {*binary_col_names, "id"}
non_binary_col_names = [col for col in train_df.columns if col not in _unscaled_cols]

In [5]:
# Preprocessing step:
#   * "id" is dropped (we do not need it as a feature);
#   * non-binary features are standardised with StandardScaler;
#   * all remaining columns (binary flags and "label") pass through unchanged.
# NOTE(review): the scaler is fitted on the whole of train_df, i.e. before the
# train/validation split performed in a later cell, so validation rows contribute to the
# scaling statistics. Fit on the training portion only if a leakage-free estimate is needed.
transform_pipeline = ColumnTransformer(
    [
        ("drop_id", "drop", ["id"]),
        ("scale_non_binary", StandardScaler(), non_binary_col_names),
    ],
    remainder="passthrough",
)
transform_train_data = transform_pipeline.fit_transform(train_df)

# get_feature_names_out() yields names of the form "<transformer>__<column>".
# Split on the FIRST separator only, so a column whose own name contains "__" keeps
# its full name instead of being truncated.
feature_names = [
    name.split("__", 1)[1] for name in transform_pipeline.get_feature_names_out()
]
transform_train_df = pd.DataFrame(data=transform_train_data, columns=feature_names)

# Cast the pass-through flag/label columns back to integers after the transform.
transform_train_df = transform_train_df.astype({col: "int" for col in binary_col_names})

In [6]:
# Inspect the transformed frame: scaled features first, then the integer flag columns and "label"
transform_train_df

Unnamed: 0,v_5,v_4,v_38,v_36,v_7,v_33,v_13,v_31,v_15,v_18,...,v_19,v_1,v_11,v_14,v_26,v_27,v_30,v_9,v_28,label
0,-0.651208,-0.984845,0.937116,1.392576,0.440906,-0.346177,-0.497793,-0.777790,-0.236435,-0.544365,...,0.116531,1,1,0,0,1,0,0,0,1
1,1.345729,-0.762679,1.217297,0.722680,0.733830,-0.039075,2.300727,-0.650992,0.151760,2.808169,...,0.914999,1,0,1,1,0,1,1,0,2
2,-0.651208,1.236812,0.656935,0.521711,-0.379281,-0.346177,-0.497793,-0.143800,-0.624630,-0.544365,...,0.116531,1,1,0,0,1,1,0,0,1
3,0.014438,0.481449,0.656935,0.923649,-0.379281,-0.039075,-0.497793,-0.777790,0.539956,-0.683228,...,0.382687,1,1,0,0,1,0,1,0,1
4,2.233257,-1.740208,-0.407754,-0.081196,-0.437866,3.339046,5.410194,2.138565,2.480932,0.963283,...,4.641181,1,0,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,0.680083,0.059334,-1.080189,1.057628,-1.404516,-0.346177,-0.497793,-0.650992,0.539956,-0.683228,...,-0.149625,1,1,0,0,1,1,0,0,2
3792,0.901965,0.681398,1.777660,-0.550123,0.001520,1.189333,1.989780,-0.270598,3.257323,-0.564203,...,0.914999,1,0,1,1,1,1,1,0,2
3793,2.011375,-0.895979,-1.080189,-0.885071,-0.730790,0.575129,0.435047,-0.270598,0.928151,-0.584040,...,0.382687,1,0,1,1,1,1,1,0,0
3794,0.680083,0.503665,0.937116,0.655690,2.579252,0.575129,-0.497793,0.109796,2.869128,0.943446,...,1.979622,1,0,1,1,0,1,1,0,1


In [None]:
# Separate the target from the features WITHOUT mutating transform_train_df.
# Using .pop("label") here would remove the column in place, so running this cell a
# second time would raise KeyError and the frame displayed earlier would silently change.
y_train_com = transform_train_df["label"]
X_train_com = transform_train_df.drop(columns="label")

# Hold out 30% of the rows for validation; random_state fixes the shuffle for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_com, y_train_com, test_size=0.3, random_state=1
)

In [12]:
# Baseline: KNN with scikit-learn's default hyper-parameters, scored on the validation split
model = KNeighborsClassifier().fit(X_train, y_train)
preds = model.predict(X_valid)
print(classification_report(y_true=y_valid, y_pred=preds))

              precision    recall  f1-score   support

           0       0.67      0.56      0.61       375
           1       0.62      0.88      0.73       540
           2       0.46      0.12      0.19       224

    accuracy                           0.62      1139
   macro avg       0.58      0.52      0.51      1139
weighted avg       0.60      0.62      0.58      1139



In [13]:
# Neighbours vote with weight 1/distance instead of uniformly
model = KNeighborsClassifier(weights="distance")
preds = model.fit(X_train, y_train).predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.72      0.51      0.60       375
           1       0.63      0.89      0.74       540
           2       0.40      0.19      0.26       224

    accuracy                           0.63      1139
   macro avg       0.58      0.53      0.53      1139
weighted avg       0.61      0.63      0.60      1139



In [16]:
# Distance-weighted voting combined with the Manhattan (p=1) Minkowski metric
model = KNeighborsClassifier(
    p=1,
    weights="distance",
)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
print(classification_report(y_true=y_valid, y_pred=preds))

              precision    recall  f1-score   support

           0       0.76      0.56      0.64       375
           1       0.66      0.93      0.77       540
           2       0.49      0.23      0.31       224

    accuracy                           0.67      1139
   macro avg       0.64      0.57      0.58      1139
weighted avg       0.66      0.67      0.64      1139



In [21]:
# Same configuration as before (Manhattan metric, inverse-distance weights),
# but the neighbour search is forced to use a BallTree
model = KNeighborsClassifier(
    algorithm="ball_tree",
    p=1,
    weights="distance",
).fit(X_train, y_train)
preds = model.predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.76      0.56      0.64       375
           1       0.66      0.93      0.77       540
           2       0.49      0.23      0.31       224

    accuracy                           0.67      1139
   macro avg       0.64      0.57      0.58      1139
weighted avg       0.66      0.67      0.64      1139



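### Changing the search algorithm has no effect on the predictions: `algorithm` only controls how the nearest neighbours are looked up, not which neighbours are found
### Next, we perform hyperparameter tuning for the `leaf_size` parameter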

In [22]:
# Grid-search `leaf_size` for the Manhattan, distance-weighted KNN:
# 5-fold cross-validation on the complete training data, ranked by macro-averaged F1,
# with the best candidate refitted on all of it afterwards.
model = KNeighborsClassifier(weights="distance", p=1)
param_grid = {"leaf_size": [5, 10, 30, 50, 100, 200]}
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=5,
    refit=True,
)
search.fit(X_train_com, y_train_com)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(p=1, weights='distance'),
             param_grid={'leaf_size': [5, 10, 30, 50, 100, 200]},
             scoring='f1_macro')

In [23]:
# Parameter setting that achieved the best mean cross-validated score in the grid search
search.best_params_

{'leaf_size': 5}

In [24]:
# Mean cross-validated macro-F1 (scoring="f1_macro", cv=5) of the best candidate
search.best_score_

0.579310097333041

In [None]:
# Show the per-candidate CV results as a table instead of dumping the raw dict of arrays.
# Score columns show whether `leaf_size` changes model quality; timing columns show its
# effect on fit/query speed.
pd.DataFrame(search.cv_results_)[
    [
        "param_leaf_size",
        "mean_test_score",
        "std_test_score",
        "rank_test_score",
        "mean_fit_time",
        "mean_score_time",
    ]
]

#### Changing the `leaf_size` parameter has no effect on the predictions; it only affects the speed and memory use of the neighbour search
##### For more details, see: https://stackoverflow.com/questions/65003877/understanding-leafsize-in-scipy-spatial-kdtree

#### The best score obtained with the KNN classifier on the validation split is
#### Accuracy = 0.67
#### Macro-average F1 score = 0.58
#### Parameters used: Manhattan distance (`p=1`), `algorithm="auto"`, weights = inverse of distance