In [8]:
import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


### Load data

In [9]:
df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
#df = pd.read_csv('data/train.csv')

df.drop(columns=['Id'], inplace=True)

display(df.head())
display(df.shape)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


(617, 57)

### Preprocessing

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline

target = df['Class']
data = df.drop(columns=['Class'])

categorical_features = ['EJ']
numeric_features = [col for col in data.columns if col != 'EJ']

preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), numeric_features),
    (make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder()), categorical_features)
)

transformed_data = preprocessor.fit_transform(data)

In [11]:
pd.DataFrame(transformed_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.035806,-0.250869,-0.940094,-0.41026,-0.655511,-0.948991,0.531241,-0.815752,0.0,1.0
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.060566,0.113218,-1.14507,-0.41026,0.687893,-0.238862,-0.509218,1.304239,1.0,0.0
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.051023,0.596934,1.637944,-0.29921,-0.05185,-0.351743,-0.424754,-0.808323,0.0,1.0
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,-0.060566,-0.105234,-0.219883,-0.342195,-0.650833,0.858232,1.101332,-0.812311,0.0,1.0
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.896815,-0.230064,-0.432313,0.09992,-0.318309,1.409422,-0.395228,-0.818054,0.0,1.0


### Dimensionality Reduction using Linear Discrimination Analysis

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

reducer = LinearDiscriminantAnalysis()
reduced_data = reducer.fit_transform(transformed_data, target)

In [13]:
pd.DataFrame(reduced_data).head()

Unnamed: 0,0
0,1.55084
1,-1.503885
2,0.168198
3,-0.163155
4,1.839952


### Create the KNN model


In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (
    GridSearchCV,
    cross_validate
)

pipeline = make_pipeline(preprocessor, reducer, KNeighborsClassifier())
pipeline.fit(data, target)
cv_scores = cross_validate(pipeline, data, target, scoring=['accuracy', 'precision', 'recall', 'f1'])

print(f'Accuracy: {cv_scores["test_accuracy"].mean()}')
print(f'Precision: {cv_scores["test_precision"].mean()}')
print(f'Recall: {cv_scores["test_recall"].mean()}')
print(f'F1: {cv_scores["test_f1"].mean()}')
display(pd.DataFrame(cv_scores))

Accuracy: 0.8864935746131654
Precision: 0.7077595950660965
Recall: 0.5922077922077922
F1: 0.6432343858322305


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.123488,0.03968,0.879032,0.705882,0.545455,0.615385
1,0.19835,0.084087,0.927419,0.809524,0.772727,0.790698
2,0.125901,0.054418,0.845528,0.578947,0.5,0.536585
3,0.192117,0.090978,0.869919,0.666667,0.47619,0.555556
4,0.152239,0.044912,0.910569,0.777778,0.666667,0.717949


### Hyperparameter Optimization


In [15]:
param_grid = {"kneighborsclassifier__n_neighbors": (1, 2, 3, 5, 7, 11, 13, 17, 19, 23)}

classifier = GridSearchCV(pipeline, param_grid=param_grid, scoring='f1')
classifier.fit(data, target)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'kneighborsclassifier__n_neighbors': (1, ...)}"
,scoring,'f1'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,solver,'svd'
,shrinkage,
,priors,
,n_components,
,store_covariance,False
,tol,0.0001
,covariance_estimator,

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
classifier.best_params_

{'kneighborsclassifier__n_neighbors': 5}

### Export the predictions for the logistic regression model

In [17]:
#test_df = pd.read_csv('data/test.csv')
test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

# Prepare test features
test_x = test_df.drop(columns=['Id'])

model = make_pipeline(preprocessor, reducer, KNeighborsClassifier(n_neighbors=classifier.best_params_['kneighborsclassifier__n_neighbors']))
model.fit(data, target)
prediction = model.predict_proba(test_x)
submission = pd.DataFrame(prediction)
submission['Id'] = test_df['Id'].values
submission.rename(columns={0: "class_0", 1: "class_1"}, inplace=True)

submission.to_csv('submission.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/icr-identify-age-related-conditions/test.csv'