In [31]:
import pandas as pd
import numpy as np
import datetime

from catboost import CatBoostClassifier, Pool

In [16]:
def preprocess(csvfile):
    df = pd.read_csv(csvfile)
    labels = df['hospital_death'].tolist()
    df = df.fillna(-999)
    df = df.drop(['encounter_id', 'patient_id', 'hospital_death', 'readmission_status'], axis=1)
    return df, labels

### Load data

In [17]:
train_features, train_labels = preprocess('training_v2.csv')
test_features, _ = preprocess('unlabeled.csv')

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.1, random_state=42)

### Explore data

In [19]:
X_train.head()

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
19209,55,54.0,27.681661,0,Caucasian,F,170.0,Emergency Department,Accident & Emergency,376,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic
35140,70,42.0,32.456371,0,Caucasian,M,182.5,Emergency Department,Accident & Emergency,464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
61458,39,57.0,-999.0,0,Caucasian,M,190.5,Emergency Department,Accident & Emergency,616,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic
68992,37,66.0,29.049732,0,Other/Unknown,F,167.6,-999,Accident & Emergency,685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
47556,21,59.0,33.6726,0,Caucasian,F,170.18,Direct Admit,Other ICU,513,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Genitourinary,Renal/Genitourinary


In [20]:
cat_index = np.where(X_train.dtypes != float)[0]

In [21]:
cat_index

array([  0,   3,   4,   5,   7,   8,   9,  10,  11,  17, 180, 181])

In [None]:
# todo: find cols which are only 0 and 1, add to the list of categorical variables

### Train model

In [22]:
model = CatBoostClassifier(eval_metric='Accuracy', 
                           cat_features=cat_index,
                           use_best_model=True, 
                           random_seed=42)

In [23]:
model.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.124023
0:	learn: 0.9204395	test: 0.9192106	best: 0.9192106 (0)	total: 251ms	remaining: 4m 11s
1:	learn: 0.9242316	test: 0.9213912	best: 0.9213912 (1)	total: 395ms	remaining: 3m 17s
2:	learn: 0.9252735	test: 0.9247710	best: 0.9247710 (2)	total: 534ms	remaining: 2m 57s
3:	learn: 0.9261579	test: 0.9244440	best: 0.9247710 (2)	total: 657ms	remaining: 2m 43s
4:	learn: 0.9260731	test: 0.9241169	best: 0.9247710 (2)	total: 796ms	remaining: 2m 38s
5:	learn: 0.9266304	test: 0.9233537	best: 0.9247710 (2)	total: 916ms	remaining: 2m 31s
6:	learn: 0.9266667	test: 0.9232447	best: 0.9247710 (2)	total: 1.04s	remaining: 2m 27s
7:	learn: 0.9271392	test: 0.9248801	best: 0.9248801 (7)	total: 1.16s	remaining: 2m 23s
8:	learn: 0.9274542	test: 0.9252072	best: 0.9252072 (8)	total: 1.29s	remaining: 2m 22s
9:	learn: 0.9273573	test: 0.9249891	best: 0.9252072 (8)	total: 1.42s	remaining: 2m 20s
10:	learn: 0.9276844	test: 0.9255342	best: 0.9255342 (10)	total: 1.55s	remaining: 2m 19s
11:	learn: 

<catboost.core.CatBoostClassifier at 0x11e703940>

### Make predictions

In [29]:
def make_submission(preds):
    df = pd.read_csv('solution_template.csv', usecols=['encounter_id'])
    df['hospital_death'] = preds
    timestamp = '{:%y%m%d_%H%M}'.format(datetime.datetime.now())
    df.to_csv(f'solution_{timestamp}.csv', index=False)

In [25]:
preds = model.predict_proba(test_features)[:,1]

In [32]:
make_submission(preds)

https://www.kaggle.com/c/widsdatathon2020/submit