# Load dependencies
---

In [1]:
! pip install gokinjo scikit-learn

Collecting gokinjo
  Downloading gokinjo-0.1.0-py3-none-any.whl (10 kB)
Installing collected packages: gokinjo
Successfully installed gokinjo-0.1.0


In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV

from gokinjo import knn_kfold_extract
from gokinjo import knn_extract

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt

# imbalanced
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [3]:
# Competition files; train/test use the `id` column as their index.
train = pd.read_csv(
    '../input/porto-seguro-data-challenge/train.csv',
    index_col='id',
)
test = pd.read_csv(
    '../input/porto-seguro-data-challenge/test.csv',
    index_col='id',
)
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')

# Group variable names (metadata column 0) by their declared type (metadata column 1).
# The first and last metadata rows are skipped
# (presumably the `id` and target rows — TODO confirm against metadata.csv).
meta_features = meta.iloc[1:-1]
feature_names = meta_features.iloc[:, 0]
feature_types = meta_features.iloc[:, 1]

cat_nom = list(feature_names[feature_types == "Qualitativo nominal"])
cat_ord = list(feature_names[feature_types == "Qualitativo ordinal"])
num_dis = list(feature_names[feature_types == "Quantitativo discreto"])
num_con = list(feature_names[feature_types == "Quantitativo continua"])

In [4]:
# Feature columns in a fixed order: nominal, ordinal, discrete, continuous.
feature_cols = cat_nom + cat_ord + num_dis + num_con

X_test = test[feature_cols]
X = train[feature_cols]
y = train["y"]

# Cross-validation setup: K stratified folds, repeated N_REPEAT times, seeded.
K = 5
N_REPEAT = 2
SEED = 314
kf = RepeatedStratifiedKFold(
    n_splits=K,
    n_repeats=N_REPEAT,
    random_state=SEED,
)

In [5]:
def get_threshold(y_true, y_pred):
    """Return the decision threshold that maximises F1 for the given scores.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled
    positive (1) when its score is strictly greater than the threshold.

    Args:
        y_true: Ground-truth binary labels, one per sample.
        y_pred: Predicted scores, one per sample (any 1-D array-like).

    Returns:
        The candidate threshold with the highest F1 score. On ties the lowest
        such threshold is returned, because ``argmax`` picks the first maximum.
    """
    thresholds = np.arange(0.0, 1.0, 0.01)
    # Convert once so each candidate needs a single vectorised comparison
    # instead of a Python-level loop over every prediction.
    scores = np.asarray(y_pred)
    f1s = np.array([
        f1_score(y_true, (scores > thresh).astype(int))
        for thresh in thresholds
    ])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred, **kwargs):
    """F1 score of ``y_pred`` after binarising it at its F1-optimal threshold.

    The threshold is chosen by ``get_threshold`` on the same ``y_true`` /
    ``y_pred`` pair, so the value returned is the best F1 reachable over the
    candidate thresholds for these predictions.

    Args:
        y_true: Ground-truth binary labels, one per sample.
        y_pred: Predicted scores, one per sample (any array-like).
        **kwargs: Accepted and ignored.

    Returns:
        The F1 score computed on the thresholded predictions.
    """
    # Coerce first: a plain list does not support `list > float`, so without
    # this the thresholding below fails for list input.
    scores = np.asarray(y_pred)

    max_f1_threshold = get_threshold(y_true, scores)

    labels = np.where(scores > max_f1_threshold, 1, 0)
    return f1_score(y_true, labels)

# KNeighborsClassifier 
---

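Find the best `k` (number of neighbours) with a cross-validated grid search. The search cells below are commented out and `best_k` is set by hand.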

In [6]:
#k_range = list(range(1, 9))
#param_grid = dict(n_neighbors=k_range)

In [7]:
#knn = KNeighborsClassifier(n_jobs=-1)
#grid = GridSearchCV(knn, param_grid, cv=kf, scoring='roc_auc', n_jobs=-1)
#grid.fit(X, y);

In [8]:
#grid_mean_scores = grid.cv_results_['mean_test_score']

In [9]:
## plot the value of K for KNN (x-axis) versus the cross-validated AUC (y-axis)
#plt.plot(k_range, grid_mean_scores)
#plt.xlabel('Value of K for KNN')
#plt.ylabel('Cross-Validated AUC')

In [10]:
#best_k = k_range[np.argmax(grid_mean_scores)]
# The grid search is commented out, so the number of neighbours is fixed by hand
# (presumably the value that search selected — TODO confirm).
best_k=1
#print("Best k: ", best_k)

# Export
---

In [11]:
# Convert to NumPy because gokinjo expects np arrays.
# np.asarray (rather than .to_numpy()) keeps this cell safe to re-run: an input
# that is already an ndarray is returned as-is instead of raising AttributeError.
X = np.asarray(X)
y = np.asarray(y)
X_test = np.asarray(X_test)


In [12]:
# KNN features for the training rows via gokinjo's k-fold extractor, using
# best_k neighbours and normalize='standard'.
KNN_feat_train = knn_kfold_extract(
    X,
    y,
    k=best_k,
    normalize='standard',
)
print("KNN features for training set, shape: ", KNN_feat_train.shape)

KNN features for training set, shape:  (14123, 2)


In [13]:
# KNN features for the test rows, computed against the full labelled training
# set with the same k and normalisation as the training features.
KNN_feat_test = knn_extract(
    X,
    y,
    X_test,
    k=best_k,
    normalize='standard',
)
print("KNN features for test set, shape: ", np.shape(KNN_feat_test))

KNN features for test set, shape:  (21183, 2)


In [14]:
# Wrap both feature matrices in DataFrames that share the same generated
# column names (knn0, knn1, ...), sized from the training matrix.
knn_columns = [f"knn{i}" for i in range(KNN_feat_train.shape[1])]
KNN_feat_train = pd.DataFrame(KNN_feat_train, columns=knn_columns)
KNN_feat_test = pd.DataFrame(KNN_feat_test, columns=knn_columns)

In [15]:
# Persist the KNN features to CSV (without the index) — they are
# computationally expensive to recompute.
for knn_frame, csv_path in (
    (KNN_feat_train, 'knn_feat_train.csv'),
    (KNN_feat_test, 'knn_feat_test.csv'),
):
    knn_frame.to_csv(csv_path, index=False)

In [16]:
# Append the KNN feature columns to the right of the original feature matrices.
# NOTE: X and X_test are overwritten, so re-running this cell appends the
# KNN columns a second time.
X = np.concatenate((X, KNN_feat_train), axis=1)
X_test = np.concatenate((X_test, KNN_feat_test), axis=1)
print("Train set, shape: ", np.shape(X))
print("Test set, shape: ", np.shape(X_test))

Train set, shape:  (14123, 70)
Test set, shape:  (21183, 70)


# References

- https://github.com/momijiame/gokinjo
- https://www.kaggle.com/melanie7744/tps6-boost-your-score-with-knn-features