### Import Libraries

In [1]:
import pandas as pd
from catboost import CatBoostClassifier
import pickle

### Import Data

In [2]:
train = pd.read_csv('../data/readmissions_train.csv')


### Preprocessing

In [3]:
def find_diabetes_text(txt):
    """Return 1 if ``txt`` mentions 'diabetes' (case-insensitive), else 0.

    Parameters
    ----------
    txt : object
        Value to search, normally a string. Anything that cannot be
        lower-cased and searched as text (e.g. ``None`` or a float NaN)
        counts as "no match".

    Returns
    -------
    int
        1 when 'diabetes' occurs in ``txt.lower()``, otherwise 0.
    """
    try:
        return 1 if 'diabetes' in txt.lower() else 0
    except (AttributeError, TypeError):
        # Non-text input has no usable .lower(); report "no match" explicitly.
        # (A bare `0` expression here, without `return`, would yield None.)
        return 0

# Flag rows whose `diag_1_desc` text contains 'diabetes' (any letter case)
train['diabetes'] = train['diag_1_desc'].apply(find_diabetes_text)

# Categorical (object-dtype) columns: replace nulls with the label 'unknown'.
# The column list is fixed before any column is reassigned.
object_cols = [name for name, dtype in train.dtypes.items() if dtype == object]
for name in object_cols:
    train[name] = train[name].fillna('unknown')

# Whatever nulls remain (the non-object columns) become 0
train = train.fillna(0)

### Split Data into X,y

In [4]:
# Features are every column except the target and the three diagnosis description columns
non_feature_cols = ['readmitted', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc']
X = train.drop(columns=non_feature_cols)

# Target column
y = train['readmitted']

### Initiate Modeling

In [5]:
# Column positions (not names) of every object-dtype feature; passed to
# CatBoost below as its categorical features.
cat_features = [pos for pos, dtype in enumerate(X.dtypes) if dtype == object]

# NOTE(review): 2 iterations of depth-2 trees at learning_rate=1 looks like a
# deliberately tiny demo configuration — confirm before relying on its accuracy.
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
)

# Fit on the full training frame; kept as the cell's last expression so the
# fitted model's repr is displayed.
model.fit(X, y, cat_features=cat_features)


0:	learn: 0.6453099	total: 57.4ms	remaining: 57.4ms
1:	learn: 0.6333235	total: 66.1ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fa84b5e0490>

### Save Model

In [6]:
# Serialize the fitted model. The `with` block closes the file handle (and so
# flushes the pickle to disk) even if dumping raises, instead of leaving an
# anonymous open file to be cleaned up whenever the kernel gets around to it.
with open('custom_model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### Print list of categorical features (needed to impute correctly)

In [30]:
# Names (rather than positions) of the object-dtype feature columns
cat_feature_names = [
    col_name
    for col_name, dtype in X.dtypes.items()
    if dtype == object
]
print(cat_feature_names)

['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide.metformin', 'glipizide.metformin', 'glimepiride.pioglitazone', 'metformin.rosiglitazone', 'metformin.pioglitazone', 'change', 'diabetesMed']


### Verify custom model integrity with DRUM

In [36]:
# Shell out to the `drum` CLI's `validation` command: code is read from
# ./custom_model, input rows from ../data/readmissions_test.csv, and the task is
# declared as binary with class labels 'yes' (positive) and 'no' (negative).
# NOTE(review): needs `drum` on the kernel's PATH and custom_model/model.pkl
# already written by the save cell — confirm on a fresh Restart & Run All.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label yes --negative-class-label no

          yes        no
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
          yes        no
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
          yes        no
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
          yes        no
0    0.539075  0.460925
1    0.539075  0