In [1]:
import numpy as np 
import pandas as pd 


import os
# Print the full path of every file found under the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a
# no-op when the notebook runs outside Kaggle.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        input_path = os.path.join(root, name)
        print(input_path)


### Load data

In [2]:
# Read the training table from the local copy of the data.
# On Kaggle the same file lives at
# '/kaggle/input/icr-identify-age-related-conditions/train.csv'.
# The 'Id' column is dropped right away so that `df` only holds the
# feature columns and the 'Class' label.
df = pd.read_csv('data/train.csv').drop(columns=['Id'])

# Show the first rows followed by the (rows, columns) shape.
display(df.head(), df.shape)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


(617, 57)

### Preprocessing

In [3]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline

# Split the frame into predictors (`data`) and the label column (`target`).
data, target = df.drop(columns=['Class']), df['Class']

# 'EJ' is handled as a categorical column; every other column is handled as numeric.
categorical_features = ['EJ']
numeric_features = [name for name in data.columns if name not in categorical_features]

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_steps = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
categorical_steps = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

# The numeric block comes first, so the one-hot columns end up last in the output.
preprocessor = make_column_transformer(
    (numeric_steps, numeric_features),
    (categorical_steps, categorical_features),
)

# Fit on the full training frame to inspect the transformed features.
transformed_data = preprocessor.fit_transform(data)

In [4]:
# Preview the first rows of the preprocessed feature matrix
# (columns are positional because the transformer output carries no names here).
transformed_preview = pd.DataFrame(transformed_data)
transformed_preview.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.035806,-0.250869,-0.940094,-0.41026,-0.655511,-0.948991,0.531241,-0.815752,0.0,1.0
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.060566,0.113218,-1.14507,-0.41026,0.687893,-0.238862,-0.509218,1.304239,1.0,0.0
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.051023,0.596934,1.637944,-0.29921,-0.05185,-0.351743,-0.424754,-0.808323,0.0,1.0
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,-0.060566,-0.105234,-0.219883,-0.342195,-0.650833,0.858232,1.101332,-0.812311,0.0,1.0
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.896815,-0.230064,-0.432313,0.09992,-0.318309,1.409422,-0.395228,-0.818054,0.0,1.0


### Create the Gaussian Process Classification model


In [7]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import (
    cross_validate
)

# Chain the preprocessing step with a Gaussian Process classifier using default settings.
pipeline = make_pipeline(preprocessor, GaussianProcessClassifier())

# Fit on the complete training set; the export cell calls predict_proba on this fitted object.
pipeline.fit(data, target)

# Cross-validated scores for the same pipeline, using the default CV splitter.
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
cv_scores = cross_validate(pipeline, data, target, scoring=scoring_metrics)
cv_scores

{'fit_time': array([0.0810039 , 0.07587528, 0.07423878, 0.08115888, 0.08021855]),
 'score_time': array([0.01824117, 0.01918411, 0.01871848, 0.01965952, 0.01997423]),
 'test_accuracy': array([0.88709677, 0.87096774, 0.87804878, 0.82113821, 0.90243902]),
 'test_precision': array([0.83333333, 0.6875    , 0.81818182, 0.45454545, 0.84615385]),
 'test_recall': array([0.45454545, 0.5       , 0.40909091, 0.23809524, 0.52380952]),
 'test_f1': array([0.58823529, 0.57894737, 0.54545455, 0.3125    , 0.64705882])}

### Export the predictions for the Gaussian Process Classification model

In [6]:
# Read the test table from the local copy of the data.
# On Kaggle the same file lives at
# '/kaggle/input/icr-identify-age-related-conditions/test.csv'.
test_df = pd.read_csv('data/test.csv')

# Prepare test features: same columns as training, without the identifier.
test_x = test_df.drop(columns=['Id'])

# Class probabilities from the fitted `pipeline` defined in the model cell.
# Run the notebook top to bottom so this uses the model shown above.
prediction = pipeline.predict_proba(test_x)

# One probability column per class, followed by the matching row identifier.
submission = (
    pd.DataFrame(prediction)
    .assign(Id=test_df['Id'].values)
    .rename(columns={0: "class_0", 1: "class_1"})
)

submission.to_csv('submission.csv', index=False)