In [2]:
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import re

from scipy import stats
from scipy.stats import skew
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

pd.set_option('display.max_columns',99)
pd.set_option('display.max_rows',300)

In [3]:
train_orig = pd.read_csv('dataset_diabetes/train_dataset.csv')
test_orig = pd.read_csv('dataset_diabetes/test_dataset.csv')
icd9_orig = pd.read_csv('dataset_diabetes/icd9.csv')
admin_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', nrows = 8)
discharge_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', skiprows = 10, nrows = 30)
admin_source_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', skiprows = 42, nrows = 27)

In [4]:
train = train_orig.copy()
test = test_orig.copy()
icd9 = icd9_orig.copy()
admin_id = admin_id_orig.copy()
discharge_id = discharge_id_orig.copy()
admin_source_id = admin_source_id_orig.copy()

In [5]:
train_rows = train.shape[0] 
test_rows = test.shape[0]
print('There are {} rows in train'.format(train_rows))
print('There are {} rows in test'.format(test_rows))

There are 91589 rows in train
There are 10177 rows in test


In [6]:
#combined[:train_rows].head(3)

In [7]:
def summary_df(file,col):
    value_summary = {'values':file[col].value_counts().index.tolist(),
                     'counts':file[col].value_counts().values.tolist(),
                     'frequency': file[col].value_counts(normalize = True).values.tolist()}
    col_df = pd.DataFrame(value_summary)
    return col_df

In [8]:
#combined[:train_rows].tail(3)

In [9]:
#combined[train_rows:].head(3)

**Conversion process of certain numeric columns to categorical, and for all 'Object' type columns to categorical and question marks to NaN**

In [10]:
train_colnames = train.columns.tolist()

In [11]:

# numericcols = []

# [numericcols.append(col) for col in train_colnames if train[col].dtype == 'int64']

numericcols = train.select_dtypes(exclude = ['object']).columns.tolist()
catcolumns = train.select_dtypes(include = ['object']).columns.tolist()

In [12]:
print('Numeric cols','\n',numericcols)
print('\n','Category cols','\n',catcolumns)

Numeric cols 
 ['encounter_id', 'patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

 Category cols 
 ['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [13]:
# catcolumns = []
# [catcolumns.append(col) for col in train_colnames if col not in numericcols]

In [14]:
train = train.replace('?',np.nan)
test = test.replace('?',np.nan)

In [15]:
#convert object columns to category type and replace ?s with NaN for accurate missingness representation

tocats = ['admission_type_id','discharge_disposition_id','admission_source_id']

for col in tocats:
    train.loc[:,col] = train.loc[:,col].astype('object')
    test.loc[:,col] = test.loc[:,col].astype('object')

for col in train_colnames:
    if train[col].dtypes == 'object':
        train.loc[:,col] = train.loc[:,col].astype('object')
    if test[col].dtypes == 'object':
        test.loc[:,col] = test.loc[:,col].astype('object')

### Before Further Analysis, need to confirm that basic assumptions for Logistic Reagression are upheld:

- logistic regression requires the observations to be independent of each other.  In other words, the observations should not come from repeated measurements or matched data.

In [16]:
#are there any duplicate encounter_ids?
print('There are {} unique encounter IDs in train.'.format(len(train['encounter_id'].unique())))
print('There are {} unique patient numbers in train.'.format(len(train['patient_nbr'].unique())))

print('There are {} unique encounter IDs in test.'.format(len(test['encounter_id'].unique())))
print('There are {} unique patient numbers in test.'.format(len(test['patient_nbr'].unique())))

There are 91589 unique encounter IDs in train.
There are 65908 unique patient numbers in train.
There are 10177 unique encounter IDs in test.
There are 9650 unique patient numbers in test.


In [17]:
summary_df(train,'readmitted')

Unnamed: 0,values,counts,frequency
0,NO,49382,0.53917
1,>30,31990,0.349278
2,<30,10217,0.111553


In [18]:
repeat_patients = train[train.duplicated(subset = 'patient_nbr', keep = 'first')].sort_values(by = ['patient_nbr'])
repeat_patients.head(15)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
31143,26264286,135,Caucasian,Female,[50-60),,1,1,7,3,,Surgery-Cardiovascular/Thoracic,31,1,14,0,0,1,998.0,41.0,250.0,5,,,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Ch,Yes,>30
73493,83281464,1152,AfricanAmerican,Female,[60-70),,1,1,7,12,,Hematology/Oncology,37,1,18,0,0,2,282.0,287.0,466.0,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
18843,80742510,1152,AfricanAmerican,Female,[60-70),,1,1,7,8,,,30,1,16,0,0,1,282.0,250.0,,2,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
3663,30180318,1152,AfricanAmerican,Female,[50-60),,1,1,7,6,,Hematology/Oncology,45,4,15,0,0,2,282.0,794.0,250.0,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
16182,8380170,1152,AfricanAmerican,Female,[50-60),,1,1,7,6,,Hematology/Oncology,43,2,13,0,0,1,282.0,250.01,,2,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
87871,60254142,1314,Caucasian,Female,[40-50),,2,1,1,2,,InternalMedicine,50,5,13,0,0,0,996.0,411.0,401.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
21698,70601076,1314,Caucasian,Female,[40-50),,1,1,7,3,,Cardiology,54,3,14,0,0,2,78.0,250.0,414.0,7,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
86122,33503946,5220,Caucasian,Male,[70-80),,2,1,1,11,,Cardiology,65,4,19,0,0,2,404.0,427.0,276.0,9,,>8,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
80960,60892254,5220,Caucasian,Male,[70-80),,1,1,7,1,,InternalMedicine,35,0,12,0,0,2,250.7,707.0,428.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
67762,7981038,5220,Caucasian,Male,[60-70),,1,1,7,2,,InternalMedicine,15,0,14,0,0,0,276.0,426.0,558.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30


In [19]:
num_repeat_patients = train_rows - len(train['patient_nbr'].unique())
print('There are {} encounters where it is a repeat patient.'.format(num_repeat_patients))

There are 25681 encounters where it is a repeat patient.


#**filter train and test to only consider patients who have unique encounters. For simplicity,
take the group of repeat encounters and consider only the first one in order to preserve the 
assumption of each observation being independent from one another.**

In [20]:
#drop repeat patients in train and test file to preserve indepedence assumption for each observation
train.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
test.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
train.reset_index(inplace = True)
test.reset_index(inplace = True)

del train['index']
del test['index']

In [21]:
#are there any duplicate encounter_ids?
print('There are {} unique encounter IDs in train.'.format(len(train['encounter_id'].unique())))
print('There are {} unique patient numbers in train.'.format(len(train['patient_nbr'].unique())))

print('There are {} unique encounter IDs in test.'.format(len(test['encounter_id'].unique())))
print('There are {} unique patient numbers in test.'.format(len(test['patient_nbr'].unique())))

There are 65908 unique encounter IDs in train.
There are 65908 unique patient numbers in train.
There are 9650 unique encounter IDs in test.
There are 9650 unique patient numbers in test.


In [22]:
def update_rows_cols(train_file,test_file):
    train_rows = train.shape[0] 
    test_rows = test.shape[0]
    train_colnames = train.columns.tolist()
    test_colnames = test.columns.tolist()
    print('There are now {} rows in train'.format(train_rows))
    print('There are now {} rows in test'.format(test_rows))
    print('There are now {} columns in train'.format(len(train_colnames)))
    print('There are now {} columns in test'.format(len(test_colnames)))
    return train_rows, test_rows, train_colnames, test_colnames

In [23]:
#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train,test)

There are now 65908 rows in train
There are now 9650 rows in test
There are now 50 columns in train
There are now 50 columns in test


In [24]:
#combined[combined['payer_code'].isnull()]

In [25]:
#pandas_profiling.ProfileReport(combined)
#combined.profile_report(correlations={'cramers': False})

In [26]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65908 entries, 0 to 65907
Data columns (total 50 columns):
encounter_id                65908 non-null int64
patient_nbr                 65908 non-null int64
race                        64179 non-null object
gender                      65908 non-null object
age                         65908 non-null object
weight                      2581 non-null object
admission_type_id           65908 non-null object
discharge_disposition_id    65908 non-null object
admission_source_id         65908 non-null object
time_in_hospital            65908 non-null int64
payer_code                  37974 non-null object
medical_specialty           34104 non-null object
num_lab_procedures          65908 non-null int64
num_procedures              65908 non-null int64
num_medications             65908 non-null int64
number_outpatient           65908 non-null int64
number_emergency            65908 non-null int64
number_inpatient            65908 non-null int64
d

**Filter out encounters where patients expired or who have been discharged to hospice (are not candidates for readmission as they have either passed away or are in preparation to pass away at hospice). Convert readmissions column to 0 (not readmitted or readmission > 30 days) or 1 (readmission < 30 days).** 

In [27]:
#Remove discharge_disposition_ids denoting expiry or discharge to hospice. Update train and test
#row variables for the removed rows.
train = train.loc[~train['discharge_disposition_id'].isin([11,13,14,19,20,21])]
test = test.loc[~test['discharge_disposition_id'].isin([11,13,14,19,20,21])]

#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train,test)

There are now 64193 rows in train
There are now 9412 rows in test
There are now 50 columns in train
There are now 50 columns in test


In [28]:
train.reset_index(inplace = True, drop = True)
test.reset_index(inplace = True, drop = True)

In [29]:
#missing in train?
for col in train_colnames:
    missing = np.sum(train.loc[:,col].isnull())
    if missing > 0:
        print(col, missing)

race 1701
weight 61677
payer_code 27277
medical_specialty 30887
diag_1 11
diag_2 270
diag_3 1061


In [30]:
#missing in test?
for col in test_colnames:
    missing = np.sum(test.loc[:,col].isnull())
    if missing > 0:
        print(col, missing)

race 222
weight 9077
payer_code 3828
medical_specialty 4575
diag_1 3
diag_2 32
diag_3 134


In [31]:
#drop columns weight and payer_code. Too many NaNs.

train.drop(['weight','payer_code'], axis = 1, inplace = True)
test.drop(['weight','payer_code'], axis = 1, inplace = True)

In [32]:
#convert readmission column to 1 (for readmission <30) and 0 (readmission = NO or readmission > 30)
train['readmitted'] = np.where((train['readmitted'] == '>30') | (train['readmitted'] == 'NO'), 0, 1)
test['readmitted'] = np.where((test['readmitted'] == '>30') | (test['readmitted'] == 'NO'), 0, 1)


In [33]:
#View the rows where all diagnoses are Missing
np.sum(train['diag_1'].isnull() & train['diag_2'].isnull() & train['diag_3'].isnull())

0

In [34]:
#Reset column order, and update row designation variables for train and test set.
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 64193 rows in train
There are now 9412 rows in test
There are now 48 columns in train
There are now 48 columns in test


##### Processing of diagnosis code columns to diagnosis descriptions, for ease of understanding

In [35]:
#icd9

In [36]:
#icd9.reset_index(inplace = True)

In [37]:
#create diagnosis dictionary
icd9 = icd9.set_index('Group name')
diag_dict = icd9['icd9 codes'].to_dict()
del diag_dict['Other']

In [38]:
#populate dictionary with relevant codes
two = [x for x in list(diag_dict.keys()) if x not in ['Diabetes','Injury','Musculoskeletal','Neoplasms']]
one = ['Injury','Musculoskeletal','Neoplasms']

for key in two:
        #value = diag_dict[key]
        temp = diag_dict[key].split(', ')
        value = list(range(int(temp[0].split('–')[0]),int(temp[0].split('–')[1])+1))
        value.append(int(temp[1]))
        diag_dict[key] = [str(x) for x in value]
            
for key in one:
    temp = diag_dict[key].split(', ')
    value = list(range(int(temp[0].split('–')[0]),int(temp[0].split('–')[1])+1))
    diag_dict[key] = [str(x) for x in value]


In [39]:
#create columns with the diagnosis descriptions/categories for train and test files. Missing
#diagnoses are placed into 'Unknown'
def get_condition(x):
    if '250' in x:
        return 'Diabetes'
    for key in diag_dict:
        if x in diag_dict[key]:
            return key
    return 'Other'

diagnoses = ['diag_1','diag_2','diag_3']
diag_descrip = []
for diagnosis in diagnoses:
    train.loc[:,diagnosis] = train.loc[:,diagnosis].fillna('Unknown')
    train[diagnosis +'_descrip'] = train[diagnosis].map(get_condition)
    diag_descrip.append(diagnosis +'_descrip')
    test.loc[:,diagnosis] = test.loc[:,diagnosis].fillna('Unknown')
    test[diagnosis +'_descrip'] = test[diagnosis].map(get_condition)

In [40]:
#Anything missing after populating the descriptions based on the above?
for diagnosis in diagnoses:
    missing = np.sum(train[diagnosis +'_descrip'].isnull())
    print(diagnosis +'_descrip', missing)

diag_1_descrip 0
diag_2_descrip 0
diag_3_descrip 0


**Update Rows and Columns following the creation of the diagnosis description columns.**

In [41]:
tmp1 = train_colnames[:train_colnames.index('number_diagnoses')]
tmp1.extend(diag_descrip)
tmp1.extend(train_colnames[train_colnames.index('number_diagnoses'):])

In [42]:
#Reset column order, and update row designation variables for train and test set.
train = train[tmp1]
test = test[tmp1]
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 64193 rows in train
There are now 9412 rows in test
There are now 51 columns in train
There are now 51 columns in test


**Fill out other columns with 'Unknown'. Update if I get a better imputation.**

In [43]:
def fill_na(file,series):
    file.loc[:,series] = file.loc[:,series].fillna('Unknown')

In [44]:
#missing in train?
fill = []
for col in train_colnames:
    missing = np.sum(train.loc[:,col].isnull())
    if missing > 0:
        fill.append(col)

        
fillt = []
for col in test_colnames:
    missing = np.sum(test.loc[:,col].isnull())
    if missing > 0:
        fillt.append(col)

print(fill)
print(fillt)

['race', 'medical_specialty']
['race', 'medical_specialty']


In [45]:
#fill columns with 'Unknown'
for series in fill:
    fill_na(train,series)
    fill_na(test,series)

In [46]:
catcols = train.select_dtypes(include = ['object']).columns.tolist()

In [47]:
for col in catcols:
    print(col,'='*(50 - len(col)))
    print(summary_df(train,col))

            values  counts  frequency
0        Caucasian   47982   0.747465
1  AfricanAmerican   11636   0.181266
2          Unknown    1701   0.026498
3         Hispanic    1363   0.021233
4            Other    1054   0.016419
5            Asian     457   0.007119
            values  counts  frequency
0           Female   34177   0.532410
1             Male   30014   0.467559
2  Unknown/Invalid       2   0.000031
     values  counts  frequency
0   [70-80)   16273   0.253501
1   [60-70)   14336   0.223327
2   [50-60)   11289   0.175860
3   [80-90)   10313   0.160656
4   [40-50)    6218   0.096864
5   [30-40)    2454   0.038228
6  [90-100)    1670   0.026015
7   [20-30)    1022   0.015921
8   [10-20)     478   0.007446
9    [0-10)     140   0.002181
   values  counts  frequency
0       1   32844   0.511645
1       3   12599   0.196268
2       2   11842   0.184475
3       6    3914   0.060972
4       5    2705   0.042139
5       8     265   0.004128
6       7      14   0.000218
7       4

   values  counts  frequency
0      No   63331   0.986572
1  Steady     764   0.011902
2      Up      71   0.001106
3    Down      27   0.000421
   values  counts  frequency
0      No   63740   0.992943
1  Steady     432   0.006730
2      Up      13   0.000203
3    Down       8   0.000125
   values  counts  frequency
0      No   64133   0.999065
1  Steady      57   0.000888
2      Up       2   0.000031
3    Down       1   0.000016
   values  counts  frequency
0      No   60811   0.947315
1  Steady    3037   0.047310
2      Up     218   0.003396
3    Down     127   0.001978
   values  counts  frequency
0      No   64192   0.999984
1  Steady       1   0.000016
   values  counts  frequency
0      No   56132   0.874426
1  Steady    7199   0.112146
2      Up     523   0.008147
3    Down     339   0.005281
   values  counts  frequency
0      No   57197   0.891016
1  Steady    6107   0.095135
2      Up     537   0.008365
3    Down     352   0.005483
   values  counts  frequency
0      No   64

In [48]:
def readmission_sum_df(file,col):
    #create counts
    tmp1  = file.groupby([col,'readmitted'])[['readmitted']].agg('count')
    tmp1.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp1.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp1.reset_index(inplace = True)
    
    return pd.merge(tmp1,pcts)

In [49]:
def by_readmission_df(file,col):
    #create counts
    tmp1  = file.groupby(['readmitted',col])[[col]].agg('count')
    tmp1.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp1.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp1.reset_index(inplace = True)
    
    return pd.merge(tmp1,pcts)

In [50]:
def col_sum_df(file,col,col2):
    #create counts
    tmp2  = file.groupby([col,col2])[[col2]].agg('count')
    tmp2.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp2.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp2.reset_index(inplace = True)
    
    return pd.merge(tmp2,pcts)

#**Impute the 'Unknown/Invalid' gender rows in train and test, based on frequencies of readmission and discharge_disposition_id**

In [51]:
readmission_sum_df(train,'gender')

Unnamed: 0,gender,readmitted,count,frequency
0,Female,0,31587,0.924218
1,Female,1,2590,0.075782
2,Male,0,27737,0.924135
3,Male,1,2277,0.075865
4,Unknown/Invalid,0,2,1.0


In [52]:
readmission_sum_df(test,'gender')

Unnamed: 0,gender,readmitted,count,frequency
0,Female,0,4508,0.888977
1,Female,1,563,0.111023
2,Male,0,3886,0.895392
3,Male,1,454,0.104608
4,Unknown/Invalid,0,1,1.0


In [53]:
#col_sum_df(train,'gender','discharge_disposition_id')

In [54]:
train.loc[train['gender'] == 'Unknown/Invalid']

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
17031,257364294,78119847,Unknown,Unknown/Invalid,[70-80),1,22,7,8,Unknown,59,2,21,0,0,0,850,805,808,Injury,Injury,Injury,9,,,Steady,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Ch,Yes,0
18516,226864668,60524946,Unknown,Unknown/Invalid,[60-70),1,1,7,1,Unknown,38,1,6,0,0,0,808,873,E813,Injury,Injury,Other,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0


In [55]:
#perform imputation
train.loc[train['encounter_id'] == 257364294,'gender'] = 'Female'
train.loc[train['encounter_id'] == 226864668,'gender'] = 'Male'
test.loc[test['gender'] == 'Unknown/Invalid','gender'] = 'Female'

### **Collapse admission id, discharge_disposition_id, and admission_source_id, based off above tables; convert other categorical columns to integer for modelling**

#**Admin_IDs and Discharge_IDs Treatment:**

In [56]:
#store original id columns into a separate variable, for future use.
train_discharge_id = train['discharge_disposition_id'].copy()
test_discharge_id = test['discharge_disposition_id'].copy()

train_admin_source_id = train['admission_source_id'].copy()
test_admin_source_id = test['admission_source_id'].copy()

train_admin_type_id = train['admission_type_id'].copy()
test_admin_type_id = test['admission_type_id'].copy()

In [57]:
def collapse_ids(file,id_col,oldlist,newlist):
    
    for item in list(zip(oldlist,newlist)): 
        file.loc[:,id_col] = file.loc[:,id_col].replace(item[0],item[1])

In [58]:
#Create arrays for collapse_ids function:

admin_source_old = [2,3,6,8,9,10,11,12,13,14,15,17,18,19,20,22,23,24,25,26]
admin_source_new = [1,1,5,1,26,4,7,7,7,7,26,26,5,5,26,4,7,4,7,5]

discharge_old = [4,7,8,9,10,12,15,16,17,18,22,23,24,25,30,27,28,29]
discharge_new = [3,3,6,5,2,5,5,1,1,26,5,3,3,26,5,5,5,5]

admin_type_old = [2,4,6,7,8]
admin_type_new = [1,3,9,1,9]

print('admin_source:',len(admin_source_old) == len(admin_source_new))
print('admin_source_old uniques:',len(np.unique(admin_source_old))) 
print('admin_source_new uniques:',len(np.unique(admin_source_new)))
print('discharge:',len(discharge_old) == len(discharge_new))
print('discharge old uniques:',len(np.unique(discharge_old)))
print('discharge new uniques:',len(np.unique(discharge_new)))
print('admin_type:',len(admin_type_old) == len(admin_type_new))
print('admin_type_old uniques:',len(np.unique(admin_type_old))) 
print('admin_type_new uniques:',len(np.unique(admin_type_new)))

admin_source: True
admin_source_old uniques: 20
admin_source_new uniques: 5
discharge: True
discharge old uniques: 18
discharge new uniques: 6
admin_type: True
admin_type_old uniques: 5
admin_type_new uniques: 3


In [59]:
#Conduct Replacement
collapse_ids(train,'discharge_disposition_id',discharge_old,discharge_new)
collapse_ids(test,'discharge_disposition_id',discharge_old,discharge_new)

collapse_ids(train,'admission_source_id',admin_source_old,admin_source_new)
collapse_ids(test,'admission_source_id',admin_source_old,admin_source_new)

collapse_ids(train,'admission_type_id',admin_type_old,admin_type_new)
collapse_ids(test,'admission_type_id',admin_type_old,admin_type_new)

In [60]:
#readmission_sum_df(train,'discharge_disposition_id')

In [61]:
#readmission_sum_df(train,'admission_type_id')

In [62]:
#readmission_sum_df(test,'admission_type_id')

In [63]:
#readmission_sum_df(train,'admission_source_id')

In [64]:
#readmission_sum_df(test,'admission_source_id')

In [65]:
pd.set_option('display.max_colwidth', 120)

In [66]:
#train.sample(15)

In [67]:
#test.sample(15)

#**Age Column Treatment:**

In [68]:
# del train['age_start']
# del train['age_end']
# del test['age_start']
# del test['age_end']

In [69]:
#Take Age bins and create age_start and age_end columns:
train['age_start'], train['age_end'] = zip(*train['age'].map(lambda x: x.split('-')))
train['age_start'] = train['age_start'].map(lambda x: int(x[1:]))
train['age_end'] = train['age_end'].map(lambda x: int(x[:-1]))
test['age_start'], test['age_end'] = zip(*test['age'].map(lambda x: x.split('-')))
test['age_start'] = test['age_start'].map(lambda x: int(x[1:]))
test['age_end'] = test['age_end'].map(lambda x: int(x[:-1]))

In [70]:
for col in train_colnames:
    print(col,'='*(50 - len(col)))
    print(summary_df(train,col))

          values  counts  frequency
0      113905662       1   0.000016
1      379682234       1   0.000016
2      110380806       1   0.000016
3       31071702       1   0.000016
4       91229652       1   0.000016
5      395460050       1   0.000016
6       69020400       1   0.000016
7       72142284       1   0.000016
8      124002612       1   0.000016
9       51834312       1   0.000016
10     168467910       1   0.000016
11     282507738       1   0.000016
12      59149764       1   0.000016
13      75017664       1   0.000016
14     201819798       1   0.000016
15     165244344       1   0.000016
16     119946642       1   0.000016
17     152744040       1   0.000016
18      63222048       1   0.000016
19     134800158       1   0.000016
20     183681828       1   0.000016
21     344849834       1   0.000016
22      56094120       1   0.000016
23     145990716       1   0.000016
24     157138338       1   0.000016
25     359052488       1   0.000016
26     268021152       1   0

70         Pediatrics-InfectiousDiseases       1   0.000016
     values  counts  frequency
0         1    2039   0.031764
1        43    1693   0.026374
2        44    1531   0.023850
3        45    1472   0.022931
4        46    1408   0.021934
5        38    1401   0.021825
6        40    1380   0.021498
7        41    1332   0.020750
8        47    1312   0.020438
9        37    1305   0.020329
10       49    1297   0.020205
11       42    1292   0.020127
12       39    1280   0.019940
13       48    1265   0.019706
14       51    1224   0.019067
15       36    1210   0.018849
16       50    1199   0.018678
17       35    1197   0.018647
18       54    1171   0.018242
19       55    1166   0.018164
20       56    1146   0.017852
21       52    1131   0.017619
22       53    1116   0.017385
23       57    1099   0.017120
24       58    1051   0.016373
25       34    1046   0.016295
26       59    1028   0.016014
27       61    1016   0.015827
28       60    1001   0.015594
29       6

   values  counts  frequency
0      No   50744   0.790491
1  Steady   12303   0.191656
2      Up     747   0.011637
3    Down     399   0.006216
   values  counts  frequency
0      No   63331   0.986572
1  Steady     764   0.011902
2      Up      71   0.001106
3    Down      27   0.000421
   values  counts  frequency
0      No   63740   0.992943
1  Steady     432   0.006730
2      Up      13   0.000203
3    Down       8   0.000125
   values  counts  frequency
0      No   64133   0.999065
1  Steady      57   0.000888
2      Up       2   0.000031
3    Down       1   0.000016
   values  counts  frequency
0      No   60811   0.947315
1  Steady    3037   0.047310
2      Up     218   0.003396
3    Down     127   0.001978
   values  counts  frequency
0      No   64192   0.999984
1  Steady       1   0.000016
   values  counts  frequency
0      No   56132   0.874426
1  Steady    7199   0.112146
2      Up     523   0.008147
3    Down     339   0.005281
   values  counts  frequency
0      No   57

#**Conversions to integer for other columns**

In [71]:
two_value_cols = ['diabetesMed','change','gender']

#convert two_value_columns to 0 and 1
#'No' = 0, 'Yes' = 1
train['diabetesMed'] = np.where((train['diabetesMed'] == 'No'), 0, 1)
test['diabetesMed'] = np.where((test['diabetesMed'] == 'No'), 0, 1)

#'No' = 0, 'CH' = 1
train['change'] = np.where((train['change'] == 'No'), 0, 1)
test['change'] = np.where((test['change'] == 'No'), 0, 1)

#'Female' = 0, 'Male' = 1
train['gender'] = np.where((train['gender'] == 'Female'), 0, 1)
test['gender'] = np.where((test['gender'] == 'Female'), 0, 1)

In [72]:
readmission_sum_df(train,'max_glu_serum')

Unnamed: 0,max_glu_serum,readmitted,count,frequency
0,>200,0,772,0.897674
1,>200,1,88,0.102326
2,>300,0,588,0.899083
3,>300,1,66,0.100917
4,,0,56537,0.925047
5,,1,4581,0.074953
6,Norm,0,1429,0.915439
7,Norm,1,132,0.084561


In [73]:
readmission_sum_df(train,'A1Cresult')

Unnamed: 0,A1Cresult,readmitted,count,frequency
0,>7,0,2356,0.931594
1,>7,1,173,0.068406
2,>8,0,5109,0.929754
3,>8,1,386,0.070246
4,,0,48658,0.922619
5,,1,4081,0.077381
6,Norm,0,3203,0.933819
7,Norm,1,227,0.066181


In [74]:
#Convert medical tests
tests = ['max_glu_serum','A1Cresult']

#high --> 2, normal -->1, none -->0
train['max_glu_serum'] = np.where((train['max_glu_serum'] == '>200') | (train['max_glu_serum'] == '>300'),\
                                  2,np.where(train['max_glu_serum'] == 'None',0,1))
test['max_glu_serum'] = np.where((test['max_glu_serum'] == '>200') | (test['max_glu_serum'] == '>300'),\
                                  2,np.where(test['max_glu_serum'] == 'None',0,1))
train['A1Cresult'] = np.where((train['A1Cresult'] == '>7') | (train['A1Cresult'] == '>8'),\
                                  2,np.where(train['A1Cresult'] == 'None',0,1))
test['A1Cresult'] = np.where((test['A1Cresult'] == '>7') | (test['A1Cresult'] == '>8'),\
                                  2,np.where(test['A1Cresult'] == 'None',0,1))

In [75]:
#collapse common terms in medical specialty
train.loc[:,'medical_specialty'] = np.where(train['medical_specialty'].str.contains('Surgery|Surgeon|Surgical'),'Surgeon',train['medical_specialty'])
test.loc[:,'medical_specialty'] = np.where(test['medical_specialty'].str.contains('Surgery|Surgeon|Surgical'),'Surgeon',test['medical_specialty'])
train.loc[:,'medical_specialty'] = np.where(train['medical_specialty'].str.contains('Orthopedics'),'Orthopedics',train['medical_specialty'])
test.loc[:,'medical_specialty'] = np.where(test['medical_specialty'].str.contains('Orthopedics'),'Orthopedics',test['medical_specialty'])

In [76]:
#collapse medical_specialty, due to high cardinality

toptrain = train['medical_specialty'].isin(train['medical_specialty'].value_counts().index[:7])
toptest = test['medical_specialty'].isin(test['medical_specialty'].value_counts().index[:7])
train.loc[~toptrain, 'medical_specialty'] = "Other_Specialty"
test.loc[~toptest, 'medical_specialty'] = "Other_Specialty"

In [77]:
print(train['medical_specialty'].value_counts().index[:7])
print(test['medical_specialty'].value_counts().index[:7])

Index(['Unknown', 'InternalMedicine', 'Other_Specialty',
       'Family/GeneralPractice', 'Emergency/Trauma', 'Cardiology', 'Surgeon'],
      dtype='object')
Index(['Unknown', 'InternalMedicine', 'Other_Specialty',
       'Family/GeneralPractice', 'Emergency/Trauma', 'Cardiology', 'Surgeon'],
      dtype='object')


In [78]:
#summary_df(train,'medical_specialty')

In [79]:
#summary_df(test,'medical_specialty')

#**Store Original Medication Columns to preserve initial data. Then convert dosages to integers
-0 for not given
-1 for up, down, and steady
the goal is reflect whethere the medication was given to the patient or not.**

In [80]:
train.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'diag_1_descrip', 'diag_2_descrip', 'diag_3_descrip',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'age_start', 'age_end'],
      dtype='object')

In [81]:
medications = ['metformin','repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',\
               'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide','pioglitazone',\
               'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone','tolazamide',\
               'examide', 'citoglipton', 'insulin','glyburide-metformin',\
               'glipizide-metformin','glimepiride-pioglitazone', 'metformin-rosiglitazone',\
               'metformin-pioglitazone']

print(len(medications))

medications_df = train[medications].copy()

23


In [82]:
#pandas_profiling.ProfileReport(combined)
combined = pd.concat([train,test], axis = 0, ignore_index = True)
combined.profile_report(correlations={'cramers': False})



In [83]:
#just track whether there was a change in the medication that was given
for col in medications:
    train[col] = np.where((train[col]=='No')|(train[col]=='Steady'),0,1)
    test[col] = np.where((test[col]=='No')|(test[col] == 'Steady'),0,1)

In [84]:
#drop columns that are constant values, as indicated in the profile up above.
#constant values and no changes at all

dropped_meds = ['acetohexamide','examide','glimepiride-pioglitazone','glipizide-metformin',\
                'citoglipton','tolazamide','tolbutamide','troglitazone','metformin-pioglitazone',\
                'metformin-rosiglitazone']

train.drop(dropped_meds, axis = 1, inplace = True)
test.drop(dropped_meds, axis = 1, inplace = True)

In [85]:
selected_meds = [x for x in medications if x not in dropped_meds]
print(len(selected_meds))

selected_meds_df = train[selected_meds].copy()

13


In [86]:
train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end
63035,252777984,44440983,Caucasian,1,[60-70),1,2,7,3,Unknown,47,0,14,0,0,0,239,342.0,401.0,Neoplasms,Other,Circulatory,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,60,70
34958,165232482,59598810,AfricanAmerican,0,[80-90),1,1,7,7,Unknown,67,0,12,0,0,0,415,453.0,276.0,Circulatory,Circulatory,Other,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,80,90
14980,74230530,20778696,Caucasian,0,[40-50),1,1,7,1,InternalMedicine,55,0,6,0,0,0,566,250.02,276.0,Digestive,Diabetes,Other,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,40,50
5632,97148298,23504346,Caucasian,0,[70-80),5,6,5,1,Family/GeneralPractice,11,0,13,2,0,0,428,425.0,250.6,Circulatory,Circulatory,Diabetes,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,70,80
28638,117950598,24077142,AfricanAmerican,1,[70-80),1,1,1,1,Cardiology,43,4,18,0,1,0,414,411.0,402.0,Circulatory,Circulatory,Circulatory,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,70,80


### **Feature Engineering**

In [None]:
#summary_df(train,'number_outpatient')

In [None]:
#summary_df(train,'number_emergency')

In [None]:
#summary_df(train,'number_inpatient')

In [87]:
#avg number of all procedures done per day in the hospital
train['avg_total_procedures'] = (train['num_lab_procedures'] + train['num_procedures'])/train['time_in_hospital']
test['avg_total_procedures'] = (test['num_lab_procedures'] + test['num_procedures'])/test['time_in_hospital']

#if diabetes medication was given, was it changed?
train['diab_med_changed'] = train['change'] * train['diabetesMed']
test['diab_med_changed'] = test['change'] * test['diabetesMed']

#how many times was the medication changed?
train['num_of_changes'] = 0
test['num_of_changes'] = 0
for col in selected_meds:
    train['num_of_changes'] = train['num_of_changes'] + train[col]
    test['num_of_changes'] = test['num_of_changes'] + test[col]

#patient 'score' variable. Here, I'm trying to 'reward' patients who have gone to see their
#doctor in the previous year (number_outpatient > 0; most in the dataset have not seen their
#doctor in the previous year). However, the older a patient is, the more likely they are to have
#multiple visits to the doctor. So this must be counter-balanced by number of medications given
#scaled by number of diagnoses assessed during the hospital stay.

train['patient_score'] = (((train['age_start'] + train['age_end'])/2)/(train['number_outpatient'] + 1)) *\
                         (train['num_medications']/train['number_diagnoses'])

test['patient_score'] = (((test['age_start'] + test['age_end'])/2)/(test['number_outpatient'] + 1)) *\
                        (test['num_medications']/test['number_diagnoses'])

In [88]:
#col_sum_df(train,'number_outpatient','num_medications')

In [89]:
#train1 = train.copy()

In [90]:
#train1['patient_score'] = (((train1['age_start'] + train1['age_end'])/2)/(train1['number_outpatient'] + 1)) *\
 #                           (train1['num_medications']/train1['number_diagnoses'])

In [91]:
#train2 = train1[train1['patient_score'] < 1000]

In [92]:
#ax = sns.boxplot(x="readmitted", y="patient_score", data=train1)

In [93]:
#ax = sns.boxplot(x="readmitted", y="patient_score", data=train2)

In [94]:
#train1[['patient_score']].describe()

In [95]:
#train2[['patient_score']].describe()

In [96]:
#train1['patient_score'].median()

In [97]:
#train1.groupby('readmitted')['patient_score'].agg({'patient_score': ['mean','median','min','max','std']})

In [98]:
#train2.groupby('readmitted')['patient_score'].agg({'patient_score': ['mean','median','min','max','std']})

In [99]:
train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
43156,113380590,24072417,Caucasian,0,[80-90),1,3,7,3,Unknown,38,2,26,0,0,0,821,285,E885,Injury,Other,Other,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,80,90,13.333333,1,0,245.555556
53149,332547920,35922753,Unknown,1,[60-70),1,2,7,1,Unknown,29,0,10,0,0,0,426,410,401,Circulatory,Circulatory,Circulatory,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,60,70,29.0,0,0,108.333333
43133,347760086,95415831,Caucasian,1,[70-80),1,6,4,4,InternalMedicine,31,2,17,0,0,2,V57,424,414,Other,Circulatory,Circulatory,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,70,80,8.25,1,0,141.666667
6839,275197032,38489958,Caucasian,1,[80-90),3,1,1,2,Unknown,12,6,26,0,0,0,414,411,250,Circulatory,Circulatory,Diabetes,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,80,90,9.0,0,0,368.333333
18269,195902646,84740958,Caucasian,1,[80-90),1,1,7,8,Unknown,59,3,21,0,0,0,410,578,280,Circulatory,Digestive,Other,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80,90,7.75,0,0,198.333333


In [100]:
train.reset_index(drop = True, inplace = True)
test.reset_index(drop = True, inplace = True)

train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 64193 rows in train
There are now 9412 rows in test
There are now 47 columns in train
There are now 47 columns in test


**Finished adding/removing columns. Now convert columns to numeric/non numeric**

In [101]:
addtocats = ['encounter_id', 'patient_nbr']
tocats.extend(addtocats)

In [102]:
for col in tocats:
    train.loc[:,col] = train.loc[:,col].astype('category')
    test.loc[:,col] = test.loc[:,col].astype('category')

In [103]:
#update numericcols for the feature engineered columns
numericcols = [x for x in numericcols if x not in tocats]
numericcols.extend(['age_start','age_end','avg_total_procedures','num_of_changes','patient_score'])

#make a list of the columns that were converted from str to number_encoded:
t1 = train.select_dtypes(exclude = ['object','category']).columns.tolist()
encoded_cols = [x for x in t1 if x not in numericcols]
encoded_cols

['gender',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'glipizide',
 'glyburide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'insulin',
 'glyburide-metformin',
 'change',
 'diabetesMed',
 'readmitted',
 'diab_med_changed']

In [104]:
encoded_cols.remove('readmitted')

In [105]:
tocats

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'encounter_id',
 'patient_nbr']

**Copy train and test before removing the above columns and proceeding with dummification and transformations**

In [106]:
remove = ['encounter_id','patient_nbr','age','diag_1','diag_2','diag_3']
train1 = train.copy()
test1 = test.copy()

train1.drop(remove,inplace = True, axis = 1)
test1.drop(remove,inplace = True, axis = 1)

In [107]:
train1_target = train1['readmitted']
train1.drop('readmitted', inplace = True, axis = 1)

test1_target = test1['readmitted']
test1.drop('readmitted', inplace = True, axis = 1)

In [108]:
print('train1 shape is {}'.format(train1.shape))
print('train1_target shape is {}'.format(train1_target.shape))

print('test1 shape is {}'.format(test1.shape))
print('test1_target shape is {}'.format(test1_target.shape))

train1 shape is (64193, 40)
train1_target shape is (64193,)
test1 shape is (9412, 40)
test1_target shape is (9412,)


In [109]:
numericcols

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'age_start',
 'age_end',
 'avg_total_procedures',
 'num_of_changes',
 'patient_score']

In [110]:
for col in encoded_cols:
    train1.loc[:,col] = train1.loc[:,col].astype('category')
    test1.loc[:,col] = test1.loc[:,col].astype('category')

In [111]:
train1.tail()

Unnamed: 0,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
64188,Caucasian,1,1,1,7,1,Cardiology,44,0,2,0,0,0,Circulatory,Circulatory,Other,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,40,50,44.0,0,0,18.0
64189,Caucasian,0,1,1,1,1,Unknown,50,1,20,0,0,0,Diabetes,Digestive,Digestive,9,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,70,80,51.0,1,1,166.666667
64190,Caucasian,1,3,3,1,3,Orthopedics,55,1,33,0,0,0,Musculoskeletal,Circulatory,Other,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,70,80,18.666667,1,1,275.0
64191,Caucasian,0,9,26,7,12,Family/GeneralPractice,77,2,21,0,0,0,Respiratory,Respiratory,Genitourinary,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,60,70,6.583333,1,1,151.666667
64192,Caucasian,0,9,1,5,1,Cardiology,2,5,17,0,0,0,Circulatory,Circulatory,Other,7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,40,50,7.0,1,1,109.285714


In [112]:
test1.head()

Unnamed: 0,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
0,Caucasian,0,1,1,5,11,InternalMedicine,68,0,20,0,0,0,Diabetes,Genitourinary,Other,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,70,80,6.181818,0,0,300.0
1,Caucasian,1,1,1,1,1,Unknown,20,0,7,0,0,0,Other,Circulatory,Other,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,50,60,20.0,0,0,48.125
2,Caucasian,0,3,6,1,4,Unknown,21,3,23,1,0,2,Musculoskeletal,Musculoskeletal,Musculoskeletal,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,60,70,6.0,0,0,106.785714
3,Caucasian,1,1,1,1,12,Other_Specialty,28,0,19,0,0,1,Respiratory,Other,Other,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,30,40,2.333333,0,0,95.0
4,AfricanAmerican,0,1,2,7,1,Unknown,21,0,6,0,0,0,Circulatory,Genitourinary,Circulatory,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,60,70,21.0,0,0,55.714286


In [113]:
combined = pd.concat([train1,test1], axis = 0, ignore_index = True)

In [None]:
#combined = pd.concat([train1,test1], axis = 0, ignore_index = True)
combined[numericcols].profile_report(correlations={'cramers': False})

In [114]:
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train1,test1)

There are now 64193 rows in train
There are now 9412 rows in test
There are now 47 columns in train
There are now 47 columns in test


##**Check for Skewness and apply BoxCox Transformation**

In [115]:
#test for skewness
for col in numericcols:
    print(col, stats.skewtest(train1[col]))

time_in_hospital SkewtestResult(statistic=98.0878907036654, pvalue=0.0)
num_lab_procedures SkewtestResult(statistic=-22.930088804288893, pvalue=2.328560427448543e-116)
num_procedures SkewtestResult(statistic=102.09930513613483, pvalue=0.0)
num_medications SkewtestResult(statistic=110.70413317173639, pvalue=0.0)
number_outpatient SkewtestResult(statistic=263.6778479345026, pvalue=0.0)
number_emergency SkewtestResult(statistic=376.5235624997525, pvalue=0.0)
number_inpatient SkewtestResult(statistic=198.12079268402027, pvalue=0.0)
number_diagnoses SkewtestResult(statistic=-70.50292217284226, pvalue=0.0)
age_start SkewtestResult(statistic=-60.18894303427823, pvalue=0.0)
age_end SkewtestResult(statistic=-60.18894303427823, pvalue=0.0)
avg_total_procedures SkewtestResult(statistic=127.9180295853125, pvalue=0.0)
num_of_changes SkewtestResult(statistic=119.83897270631931, pvalue=0.0)
patient_score SkewtestResult(statistic=146.77279448709692, pvalue=0.0)


In [116]:
#correct for skew:
withzeros = ['num_of_changes','num_procedures','number_emergency','number_inpatient','number_outpatient']

skewedvalues = combined[numericcols].apply(lambda x: skew(x))

#skewness should be within -1 and 1,apparently. Correct for anything greater than 0.75
skewedvariables = skewedvalues[abs(skewedvalues) > 0.75]

nonzeros = [x for x in skewedvariables.index.tolist() if x not in withzeros]

skewedvariables1 = combined[withzeros] + 1 #the transform fails if it attempts to divide by zero, so add 1.
skewedvariables_reg = combined[nonzeros]

In [117]:
nonzeros

['time_in_hospital',
 'num_medications',
 'number_diagnoses',
 'avg_total_procedures',
 'patient_score']

In [118]:
#Box-Cox Transformation

for variable1 in withzeros:
     combined[variable1] = stats.boxcox(skewedvariables1[variable1])[0]

for variable2 in nonzeros:
     combined[variable2] = stats.boxcox(skewedvariables_reg[variable2])[0]


In [119]:
#marker for after_boxcox transformation.

after_boxcox = combined.copy()

train2 = after_boxcox[:train_rows]
test2 = after_boxcox[train_rows:]

**dummification**

In [120]:
train2 = pd.get_dummies(train2, drop_first = True)
test2 = pd.get_dummies(test2, drop_first = True)

**scaling**

In [121]:
#robustscaler to standardize all numeric columns (not categorical)

robust = RobustScaler()
robustscaler = robust.fit(train2[numericcols])#:train_rows,numcolstx])

# standard = StandardScaler()
# standardscaler = standard.fit(after_box_cox.loc[:train_rows,numcolstx])


train2[numericcols] = robustscaler.transform(train2[numericcols])
test2[numericcols] = robustscaler.transform(test2[numericcols])

## **Modelling**

In [122]:
logit = LogisticRegression()
logit.set_params(class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logit.fit(train2, train1_target)
print(logit.score(train2, train1_target))
cm = confusion_matrix(train1_target, logit.predict(train2))
cm



0.6670509245556369


array([[39986, 19340],
       [ 2033,  2834]])

In [123]:
prediction_test = logit.predict(test2)

print("Accuracy is {0:.2f}".format(accuracy_score(test1_target, prediction_test)))
print("Precision is {0:.2f}".format(precision_score(test1_target, prediction_test)))
print("Recall is {0:.2f}".format(recall_score(test1_target, prediction_test)))
print("AUC is {0:.2f}".format(roc_auc_score(test1_target, prediction_test)))

Accuracy is 0.60
Precision is 0.16
Recall is 0.63
AUC is 0.61


In [124]:
# use stratifiedKFold
skf = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=99)

In [125]:
logit1 = LogisticRegression()
logit1.set_params(class_weight = 'balanced')#turn on Ridge for heavily penalized coefficients. 
params1 = {'C':np.logspace(-4,4, 20)}

gs_logit1 = ms.GridSearchCV(estimator = logit1,
                           param_grid = params1,
                           cv = skf,
                           verbose = True, n_jobs = -1)
#print(logit.score(train2, train1_target))
#cm = confusion_matrix(train1_target, logit.predict(train2))
#cm

In [126]:
gs_logit1.fit(train2, train1_target)
gs_logit1.best_params_


Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.3min finished


{'C': 78.47599703514607}

In [128]:
#save the best result
logit_best = gs_logit1.best_estimator_
logit_best_predict = logit_best.predict(train2)
cm1 = confusion_matrix(train1_target, logit_best.predict(train2))
cm1

array([[40003, 19323],
       [ 2030,  2837]])

In [None]:
print('Score isgs_logit.best_score_

In [None]:
test1_target.shape

In [None]:
#test1_target

In [None]:
#test1