In [1]:
import pandas as pd
import numpy as np
import time

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from ml_metrics import rmsle

# Suppress pandas' chained-assignment (SettingWithCopy) notifications for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Read both splits; every test row is tagged with the sentinel readmitted = -1
# so the two splits can be told apart after they are stacked.
train = pd.read_hdf('../input/diabetic_train.h5')
test = pd.read_hdf('../input/diabetic_test.h5').assign(readmitted=-1)

# Single combined frame: the feature engineering below runs once over both splits.
df = pd.concat([train, test], axis=0)

# Feature Engineering

In [3]:
def present_with_higher_id(x):
    """Flag rows whose patient also appears in `df` with a larger `id`.

    Reads the module-level `df` and `global_patients_frequent`.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide the keys 'patient_nbr' and 'id'.

    Returns
    -------
    int
        1 if the patient is listed in `global_patients_frequent` and the
        largest `id` recorded for that patient in `df` exceeds this row's
        `id`; otherwise 0.
    """
    patient = x['patient_nbr']

    # `in` on a pandas Series tests the index labels, i.e. the patient numbers.
    if patient not in global_patients_frequent:
        return 0

    highest_id = df.loc[df['patient_nbr'] == patient, 'id'].max()
    return 1 if highest_id > x['id'] else 0

# Drug columns that feed the "important drugs" aggregate features.
drug_keys_important = [
    'metformin', 'repaglinide', 'nateglinide', 'glimepiride', 'glipizide',
    'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin',
]

# Medical specialities that are kept under their own name.
top_specialities = [
    'notSet', 'InternalMedicine', 'Emergency/Trauma',
    'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
]

# Row counts per patient number, restricted to patients that occur more than
# once, computed for the train split, the test split and the combined frame.
train_patients_frequent = train['patient_nbr'].value_counts().loc[lambda counts: counts > 1]
test_patients_frequent = test['patient_nbr'].value_counts().loc[lambda counts: counts > 1]
global_patients_frequent = df['patient_nbr'].value_counts().loc[lambda counts: counts > 1]

# The combined-frame counts as a one-column DataFrame.
gpdf = global_patients_frequent.to_frame()

# Index-aligned sum of the per-split counts. Its index is the union of both
# indexes; the value is NaN for a patient that is repeated in only one split.
full_frequent = train_patients_frequent.add(test_patients_frequent)

#############################################

def _question_mark_to_not_set(value):
    """Replace the '?' placeholder with the label 'notSet'."""
    return 'notSet' if value == '?' else value


def _after_dash(label):
    """Return the text after the first '-' up to the next ')'.

    Presumably the upper bound of a bracket label such as '[70-80)' --
    TODO confirm the raw label format.
    """
    return label.split('-')[1].split(')')[0]


# Integer codes are assigned in order of first appearance (pd.factorize).
df['race'] = df['race'].map(_question_mark_to_not_set)
df['race_cat'] = pd.factorize(df['race'])[0]

df['gender_cat'] = df['gender'].map(lambda gender: int(gender == 'Male'))

df['age'] = pd.to_numeric(df['age'].map(_after_dash))

# A '?' weight becomes 0.
df['weight'] = pd.to_numeric(
    df['weight'].map(lambda label: 0 if label == '?' else _after_dash(label))
)

df['payer_code_cat'] = pd.factorize(df['payer_code'])[0]

df['readmission_not_possible'] = df['discharge_disposition_id'].map(lambda code: int(code == 11))

# Specialities outside `top_specialities` are pooled as 'Other' before the
# one-hot encoding; the full-resolution column is factorized separately below.
df['medical_specialty'] = df['medical_specialty'].map(_question_mark_to_not_set)
df['med_top'] = df['medical_specialty'].where(
    df['medical_specialty'].isin(top_specialities), 'Other'
)
df = pd.concat([df, pd.get_dummies(df[['med_top']], drop_first=True)], axis=1)

for source in ('medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult'):
    df[source + '_cat'] = pd.factorize(df[source])[0]

def _not_zero_or_one(code):
    """Return 0 when the drug code equals 0 or 1, otherwise 1."""
    return 0 if (code == 0 or code == 1) else 1


def _not_zero(code):
    """Return 0 when the drug code equals 0, otherwise 1."""
    return 0 if code == 0 else 1


# Number of important drugs whose code is neither 0 nor 1
# (presumably "dose changed" -- TODO confirm the encoding of the drug columns).
df['important_drugs_change'] = sum(
    df[drug].map(_not_zero_or_one) for drug in drug_keys_important
)

# Number of important drugs whose code is not 0.
df['important_drugs_count'] = sum(
    df[drug].map(_not_zero) for drug in drug_keys_important
)

# Binary flags: any truthy value becomes 1, any falsy value becomes 0.
df['change'] = pd.to_numeric(df['change'].map(lambda x: 1 if x else 0))
df['diabetesMed'] = pd.to_numeric(df['diabetesMed'].map(lambda x: 1 if x else 0))

# Total number of recorded visits of all three kinds.
df['sum_visits'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']

# 1 when the patient is repeated within the train split or within the test split.
# Only the index (patient numbers) of `full_frequent` matters here.
df['is_frequent'] = df['patient_nbr'].isin(full_frequent.index).astype('int64')

# 1 when the patient is repeated anywhere in the combined frame.
df['is_global_frequent'] = df['patient_nbr'].isin(global_patients_frequent.index).astype('int64')

# Number of rows the patient has in the combined frame (1 for non-repeated patients).
# The lookup goes through the counts Series itself, so it does not depend on the
# column name produced by `value_counts().to_frame()`, which is 'count' rather
# than 'patient_nbr' on pandas >= 2.0.
df['global_frequency'] = (
    df['patient_nbr'].map(global_patients_frequent).fillna(1).astype('int64')
)

# 1 when the same patient also appears in a row with a larger `id`.
df['present_with_higher_id'] = df.apply(present_with_higher_id, axis=1)

#
# cleanup
#
# Raw text columns that now have an encoded counterpart, plus the drug columns
# that are not used as individual features. Some names are listed twice and
# some may be absent from the frame, so only the distinct names that actually
# exist are dropped.
columns_to_remove = [
    'race', 'gender', 'payer_code', 'med_top',
    'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult',
    'payer_code_cat', 'nateglinide', 'glimepiride', 'pioglitazone', 'rosiglitazone',
    'chlorpropamide', 'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
    'metformin', 'repaglinide', 'nateglinide', 'glimepiride',
    'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone',
]
present_columns = [name for name in dict.fromkeys(columns_to_remove) if name in df]
df = df.drop(columns=present_columns)

In [4]:
# Split the engineered frame back into its two parts using the -1 sentinel.
# Explicit copies make both frames independent of `df`, so that assigning new
# columns to them later is a plain write and not a write into a filtered selection.
train = df[df.readmitted != -1].copy()
test = df[df.readmitted == -1].copy()
train.shape, test.shape

((33051, 42), (33170, 42))

# Prepare features

In [5]:
# Columns that must never be used as model inputs (target and identifiers).
black_list = ['readmitted', 'id', 'encounter_id', 'patient_nbr']

# `np.bool` was removed from NumPy (1.24); the dtype name 'bool' selects the same columns.
bool_features = train.select_dtypes(include=['bool']).columns.values.tolist()

# NOTE: this is a substring test, so besides the '*_cat' columns it also matches
# 'num_medications' (medi-"cat"-ions). That is why that column is listed next to
# the encoded columns in the printed feature list. Kept as is so the feature
# order, and with it the reported scores, stay unchanged.
cat_feats = [feat for feat in train.columns if 'cat' in feat]

numeric_features = train.select_dtypes(include=[np.float64, np.int64, np.int16, np.int8, np.uint8]).columns.values
numeric_features = [feat for feat in numeric_features if feat not in (black_list + cat_feats)]

# Final column order: boolean, then numeric, then 'cat' columns, minus the black list.
feats = bool_features + numeric_features + cat_feats
feats = [feat for feat in feats if feat not in black_list]

X = train[feats].values
y = train['readmitted'].values

print("Selected features: ", feats)

Selected features:  ['age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'change', 'diabetesMed', 'readmission_not_possible', 'med_top_Emergency/Trauma', 'med_top_Family/GeneralPractice', 'med_top_InternalMedicine', 'med_top_Other', 'med_top_Surgery-General', 'med_top_notSet', 'important_drugs_change', 'important_drugs_count', 'sum_visits', 'is_frequent', 'is_global_frequent', 'global_frequency', 'present_with_higher_id', 'num_medications', 'race_cat', 'gender_cat', 'medical_specialty_cat', 'diag_1_cat', 'diag_2_cat', 'diag_3_cat', 'max_glu_serum_cat', 'A1Cresult_cat']


### Cross validation

In [6]:
# 3-fold shuffled split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=3, shuffle=True, random_state=2018)

# Keyword arguments passed to every XGBClassifier in this notebook.
xgb_params = dict(
    n_jobs=8,
    max_depth=4,
    n_estimators=600,
    learning_rate=0.035,
    random_state=2019,
)

# Per-fold scores, keyed by '<model label>_RMSLE'.
scores = {'x1_RMSLE': []}

def perform_scoring(m, y_true, y_pred):
    """Score one fold with RMSLE, print it and record it in `scores`.

    Parameters
    ----------
    m : str
        Model label; the score is stored under the key ``m + '_RMSLE'``.
    y_true : array-like
        Ground-truth targets of the evaluated rows.
    y_pred : array-like
        Predictions for the same rows, in the same order.

    Side effects
    ------------
    Prints the score and appends it to the module-level `scores` dict
    (creating the list for a label that has not been seen before).
    """
    # Score against the targets that were passed in rather than against the
    # notebook globals `y` / `test_idx`, so the result is always the one for
    # the arguments of this call.
    score = rmsle(y_true, y_pred)
    print(m + " RMSLE: ", score)
    scores.setdefault(m + '_RMSLE', []).append(score)
    
# Train and score one model per fold; fold numbers are reported starting at 1.
for fold, (train_idx, test_idx) in enumerate(cv.split(X), start=1):
    print("fold: ", fold)

    # first model
    model = XGBClassifier(**xgb_params)
    model.fit(X[train_idx], y[train_idx])
    perform_scoring('x1', y[test_idx], model.predict(X[test_idx]))

# Mean and standard deviation of the fold scores for every recorded metric.
for metric, fold_scores in scores.items():
    print(metric, np.mean(fold_scores), np.std(fold_scores))

fold:  1
x1 RMSLE:  1.2644571790473973
fold:  2
x1 RMSLE:  1.2871874082916193
fold:  3
x1 RMSLE:  1.2287877939120235
x1_RMSLE 1.26014412708368 0.024035814289689532


#### Expected cross-validation result (mean and standard deviation over the 3 folds): x1_RMSLE 1.26014412708368 0.024035814289689532

# Training of the final model

In [7]:
#
# take the full dataset
#

XX = train[ feats ].values
yy = train[ 'readmitted' ].values
Xt = test[feats].values

#
# prepare the final model
#

model = XGBClassifier(**xgb_params)
%time model.fit(XX, yy)
y_pred = model.predict(Xt)

CPU times: user 43.9 s, sys: 192 ms, total: 44.1 s
Wall time: 5.53 s


In [8]:
# Replace the -1 placeholder with the model's predictions and show the class balance.
test = test.assign(readmitted=y_pred)
test.readmitted.value_counts()

0      29350
100     3820
Name: readmitted, dtype: int64

In [9]:
# additional adjustment
test['readmitted'] = test.apply(lambda x: 100 if x['present_with_higher_id'] == 1 else x['readmitted'], axis=1)
test['readmitted'] = test.apply(lambda x: 0 if x['readmission_not_possible'] == 1 else x['readmitted'], axis=1)

In [10]:
# Class balance of the predictions after the adjustment.
test.readmitted.value_counts()

0      29348
100     3822
Name: readmitted, dtype: int64

# Save to the submission file

In [13]:
# Write only the row id and the predicted target, without the DataFrame index.
test.to_csv('../output/predictions.csv', columns=['id', 'readmitted'], index=False)