In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import re
import time


from scipy import stats
from scipy.stats import skew

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn import linear_model, svm

import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection as ms
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit,\
     cross_val_score

from sklearn.metrics import confusion_matrix
from hpsklearn import HyperoptEstimator, svc


from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

pd.set_option('display.max_columns',99)
pd.set_option('display.max_rows',300)

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [2]:
#pip install lightgbm
#!pip install imblearn

In [2]:
train_orig = pd.read_csv('dataset_diabetes/train_dataset.csv')
test_orig = pd.read_csv('dataset_diabetes/test_dataset.csv')
icd9_orig = pd.read_csv('dataset_diabetes/icd9.csv')
admin_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', nrows = 8)
discharge_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', skiprows = 10, nrows = 30)
admin_source_id_orig = pd.read_csv('dataset_diabetes/IDs_mapping.csv', skiprows = 42, nrows = 27)

In [3]:
train = train_orig.copy()
test = test_orig.copy()
icd9 = icd9_orig.copy()
admin_id = admin_id_orig.copy()
discharge_id = discharge_id_orig.copy()
admin_source_id = admin_source_id_orig.copy()

In [4]:
train_rows = train.shape[0] 
test_rows = test.shape[0]
print('There are {} rows in train'.format(train_rows))
print('There are {} rows in test'.format(test_rows))

There are 91589 rows in train
There are 10177 rows in test


In [5]:
#combined[:train_rows].head(3)

In [6]:
def summary_df(file,col):
    value_summary = {'values':file[col].value_counts().index.tolist(),
                     'counts':file[col].value_counts().values.tolist(),
                     'frequency': file[col].value_counts(normalize = True).values.tolist()}
    col_df = pd.DataFrame(value_summary)
    return col_df

In [8]:
#combined[:train_rows].tail(3)

In [9]:
#combined[train_rows:].head(3)

**Conversion process of certain numeric columns to categorical, and for all 'Object' type columns to categorical and question marks to NaN**

In [12]:
train_colnames = train.columns.tolist()

In [13]:

# numericcols = []

# [numericcols.append(col) for col in train_colnames if train[col].dtype == 'int64']

numericcols = train.select_dtypes(exclude = ['object']).columns.tolist()
catcolumns = train.select_dtypes(include = ['object']).columns.tolist()

In [14]:
print('Numeric cols','\n',numericcols)
print('\n','Category cols','\n',catcolumns)

Numeric cols 
 ['encounter_id', 'patient_nbr', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

 Category cols 
 ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [15]:
# catcolumns = []
# [catcolumns.append(col) for col in train_colnames if col not in numericcols]

In [16]:
train = train.replace('?',np.nan)
test = test.replace('?',np.nan)

In [17]:
#convert object columns to category type and replace ?s with NaN for accurate missingness representation

tocats = ['admission_type_id','discharge_disposition_id','admission_source_id']

for col in tocats:
    train.loc[:,col] = train.loc[:,col].astype('object')
    test.loc[:,col] = test.loc[:,col].astype('object')

for col in train_colnames:
    if train[col].dtypes == 'object':
        train.loc[:,col] = train.loc[:,col].astype('object')
    if test[col].dtypes == 'object':
        test.loc[:,col] = test.loc[:,col].astype('object')

### Before Further Analysis, need to confirm that basic assumptions for Logistic Reagression are upheld:

- logistic regression requires the observations to be independent of each other.  In other words, the observations should not come from repeated measurements or matched data.

In [18]:
#are there any duplicate encounter_ids?
print('There are {} unique encounter IDs in train.'.format(len(train['encounter_id'].unique())))
print('There are {} unique patient numbers in train.'.format(len(train['patient_nbr'].unique())))

print('There are {} unique encounter IDs in test.'.format(len(test['encounter_id'].unique())))
print('There are {} unique patient numbers in test.'.format(len(test['patient_nbr'].unique())))

There are 91589 unique encounter IDs in train.
There are 65908 unique patient numbers in train.
There are 10177 unique encounter IDs in test.
There are 9650 unique patient numbers in test.


In [19]:
summary_df(train,'readmitted')

Unnamed: 0,values,counts,frequency
0,NO,49382,0.53917
1,>30,31990,0.349278
2,<30,10217,0.111553


In [178]:
10217/(49382+31990+10217)

0.11155269737632248

In [20]:
repeat_patients = train[train.duplicated(subset = 'patient_nbr', keep = 'first')].sort_values(by = ['patient_nbr'])
repeat_patients.head(15)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
31143,26264286,135,Caucasian,Female,[50-60),,1,1,7,3,,Surgery-Cardiovascular/Thoracic,31,1,14,0,0,1,998.0,41.0,250.0,5,,,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Ch,Yes,>30
73493,83281464,1152,AfricanAmerican,Female,[60-70),,1,1,7,12,,Hematology/Oncology,37,1,18,0,0,2,282.0,287.0,466.0,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
18843,80742510,1152,AfricanAmerican,Female,[60-70),,1,1,7,8,,,30,1,16,0,0,1,282.0,250.0,,2,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
3663,30180318,1152,AfricanAmerican,Female,[50-60),,1,1,7,6,,Hematology/Oncology,45,4,15,0,0,2,282.0,794.0,250.0,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
16182,8380170,1152,AfricanAmerican,Female,[50-60),,1,1,7,6,,Hematology/Oncology,43,2,13,0,0,1,282.0,250.01,,2,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
87871,60254142,1314,Caucasian,Female,[40-50),,2,1,1,2,,InternalMedicine,50,5,13,0,0,0,996.0,411.0,401.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
21698,70601076,1314,Caucasian,Female,[40-50),,1,1,7,3,,Cardiology,54,3,14,0,0,2,78.0,250.0,414.0,7,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
86122,33503946,5220,Caucasian,Male,[70-80),,2,1,1,11,,Cardiology,65,4,19,0,0,2,404.0,427.0,276.0,9,,>8,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
80960,60892254,5220,Caucasian,Male,[70-80),,1,1,7,1,,InternalMedicine,35,0,12,0,0,2,250.7,707.0,428.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
67762,7981038,5220,Caucasian,Male,[60-70),,1,1,7,2,,InternalMedicine,15,0,14,0,0,0,276.0,426.0,558.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30


In [21]:
num_repeat_patients = train_rows - len(train['patient_nbr'].unique())
print('There are {} encounters where it is a repeat patient.'.format(num_repeat_patients))

There are 25681 encounters where it is a repeat patient.


#**filter train and test to only consider patients who have unique encounters. For simplicity,
take the group of repeat encounters and consider only the first one in order to preserve the 
assumption of each observation being independent from one another.**

In [22]:
# #drop repeat patients in train and test file to preserve indepedence assumption for each observation
# train.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
# test.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
# train.reset_index(inplace = True)
# test.reset_index(inplace = True)

# del train['index']
# del test['index']

In [23]:
#are there any duplicate encounter_ids?
print('There are {} unique encounter IDs in train.'.format(len(train['encounter_id'].unique())))
print('There are {} unique patient numbers in train.'.format(len(train['patient_nbr'].unique())))

print('There are {} unique encounter IDs in test.'.format(len(test['encounter_id'].unique())))
print('There are {} unique patient numbers in test.'.format(len(test['patient_nbr'].unique())))

There are 91589 unique encounter IDs in train.
There are 65908 unique patient numbers in train.
There are 10177 unique encounter IDs in test.
There are 9650 unique patient numbers in test.


In [24]:
def update_rows_cols(train_file,test_file):
    train_rows = train_file.shape[0] 
    test_rows = test_file.shape[0]
    train_colnames = train_file.columns.tolist()
    test_colnames = test_file.columns.tolist()
    print('There are now {} rows in train'.format(train_rows))
    print('There are now {} rows in test'.format(test_rows))
    print('There are now {} columns in train'.format(len(train_colnames)))
    print('There are now {} columns in test'.format(len(test_colnames)))
    return train_rows, test_rows, train_colnames, test_colnames

In [25]:
#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train,test)

There are now 91589 rows in train
There are now 10177 rows in test
There are now 50 columns in train
There are now 50 columns in test


In [26]:
#combined[combined['payer_code'].isnull()]

In [27]:
#pandas_profiling.ProfileReport(combined)
#combined.profile_report(correlations={'cramers': False})

In [28]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91589 entries, 0 to 91588
Data columns (total 50 columns):
encounter_id                91589 non-null int64
patient_nbr                 91589 non-null int64
race                        89547 non-null object
gender                      91589 non-null object
age                         91589 non-null object
weight                      2850 non-null object
admission_type_id           91589 non-null object
discharge_disposition_id    91589 non-null object
admission_source_id         91589 non-null object
time_in_hospital            91589 non-null int64
payer_code                  55405 non-null object
medical_specialty           46615 non-null object
num_lab_procedures          91589 non-null int64
num_procedures              91589 non-null int64
num_medications             91589 non-null int64
number_outpatient           91589 non-null int64
number_emergency            91589 non-null int64
number_inpatient            91589 non-null int64
d

**Filter out encounters where patients expired or who have been discharged to hospice (are not candidates for readmission as they have either passed away or are in preparation to pass away at hospice). Convert readmissions column to 0 (not readmitted or readmission > 30 days) or 1 (readmission < 30 days).** 

In [29]:
#Remove discharge_disposition_ids denoting expiry or discharge to hospice. Update train and test
#row variables for the removed rows.
train = train.loc[~train['discharge_disposition_id'].isin([11,13,14,19,20,21])]
test = test.loc[~test['discharge_disposition_id'].isin([11,13,14,19,20,21])]

#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train,test)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 50 columns in train
There are now 50 columns in test


In [30]:
train.reset_index(inplace = True, drop = True)
test.reset_index(inplace = True, drop = True)

In [31]:
#missing in train?
for col in train_colnames:
    missing = np.sum(train.loc[:,col].isnull())
    if missing > 0:
        print(col, missing)

race 2010
weight 86626
payer_code 35412
medical_specialty 43773
diag_1 17
diag_2 322
diag_3 1280


In [32]:
#missing in test?
for col in test_colnames:
    missing = np.sum(test.loc[:,col].isnull())
    if missing > 0:
        print(col, missing)

race 224
weight 9592
payer_code 3986
medical_specialty 4843
diag_1 3
diag_2 34
diag_3 139


In [33]:
#drop columns weight and payer_code. Too many NaNs.

train.drop(['weight','payer_code'], axis = 1, inplace = True)
test.drop(['weight','payer_code'], axis = 1, inplace = True)

In [34]:
#convert readmission column to 1 (for readmission <30) and 0 (readmission = NO or readmission > 30)
train['readmitted'] = np.where((train['readmitted'] == '>30') | (train['readmitted'] == 'NO'), 0, 1)
test['readmitted'] = np.where((test['readmitted'] == '>30') | (test['readmitted'] == 'NO'), 0, 1)


In [35]:
#View the rows where all diagnoses are Missing
np.sum(train['diag_1'].isnull() & train['diag_2'].isnull() & train['diag_3'].isnull())

0

In [36]:
#Reset column order, and update row designation variables for train and test set.
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 48 columns in train
There are now 48 columns in test


##### Processing of diagnosis code columns to diagnosis descriptions, for ease of understanding

In [37]:
#icd9

In [38]:
#icd9.reset_index(inplace = True)

In [39]:
#create diagnosis dictionary
icd9 = icd9.set_index('Group name')
diag_dict = icd9['icd9 codes'].to_dict()
del diag_dict['Other']

In [40]:
#populate dictionary with relevant codes
two = [x for x in list(diag_dict.keys()) if x not in ['Diabetes','Injury','Musculoskeletal','Neoplasms']]
one = ['Injury','Musculoskeletal','Neoplasms']

for key in two:
        #value = diag_dict[key]
        temp = diag_dict[key].split(', ')
        value = list(range(int(temp[0].split('–')[0]),int(temp[0].split('–')[1])+1))
        value.append(int(temp[1]))
        diag_dict[key] = [str(x) for x in value]
            
for key in one:
    temp = diag_dict[key].split(', ')
    value = list(range(int(temp[0].split('–')[0]),int(temp[0].split('–')[1])+1))
    diag_dict[key] = [str(x) for x in value]


In [41]:
#create columns with the diagnosis descriptions/categories for train and test files. Missing
#diagnoses are placed into 'Unknown'
def get_condition(x):
    if '250' in x:
        return 'Diabetes'
    for key in diag_dict:
        if x in diag_dict[key]:
            return key
    return 'Other'

diagnoses = ['diag_1','diag_2','diag_3']
diag_descrip = []
for diagnosis in diagnoses:
    train.loc[:,diagnosis] = train.loc[:,diagnosis].fillna('Unknown')
    train[diagnosis +'_descrip'] = train[diagnosis].map(get_condition)
    diag_descrip.append(diagnosis +'_descrip')
    test.loc[:,diagnosis] = test.loc[:,diagnosis].fillna('Unknown')
    test[diagnosis +'_descrip'] = test[diagnosis].map(get_condition)

In [42]:
#Anything missing after populating the descriptions based on the above?
for diagnosis in diagnoses:
    missing = np.sum(train[diagnosis +'_descrip'].isnull())
    print(diagnosis +'_descrip', missing)

diag_1_descrip 0
diag_2_descrip 0
diag_3_descrip 0


**Update Rows and Columns following the creation of the diagnosis description columns.**

In [43]:
tmp1 = train_colnames[:train_colnames.index('number_diagnoses')]
tmp1.extend(diag_descrip)
tmp1.extend(train_colnames[train_colnames.index('number_diagnoses'):])

In [44]:
#Reset column order, and update row designation variables for train and test set.
train = train[tmp1]
test = test[tmp1]
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 51 columns in train
There are now 51 columns in test


**Fill out other columns with 'Unknown'. Update if I get a better imputation.**

In [45]:
def fill_na(file,series):
    file.loc[:,series] = file.loc[:,series].fillna('Unknown')

In [46]:
#missing in train?
fill = []
for col in train_colnames:
    missing = np.sum(train.loc[:,col].isnull())
    if missing > 0:
        fill.append(col)

        
fillt = []
for col in test_colnames:
    missing = np.sum(test.loc[:,col].isnull())
    if missing > 0:
        fillt.append(col)

print(fill)
print(fillt)

['race', 'medical_specialty']
['race', 'medical_specialty']


In [47]:
#fill columns with 'Unknown'
for series in fill:
    fill_na(train,series)
    fill_na(test,series)

In [48]:
catcols = train.select_dtypes(include = ['object']).columns.tolist()

In [49]:
for col in catcols:
    print(col,'='*(50 - len(col)))
    print(summary_df(train,col))

            values  counts  frequency
0        Caucasian   66790   0.747017
1  AfricanAmerican   16881   0.188806
2          Unknown    2010   0.022481
3         Hispanic    1817   0.020322
4            Other    1334   0.014920
5            Asian     577   0.006453
            values  counts  frequency
0           Female   48101   0.537988
1             Male   41306   0.461989
2  Unknown/Invalid       2   0.000022
     values  counts  frequency
0   [70-80)   22798   0.254986
1   [60-70)   19861   0.222136
2   [50-60)   15320   0.171347
3   [80-90)   14793   0.165453
4   [40-50)    8682   0.097104
5   [30-40)    3381   0.037815
6  [90-100)    2331   0.026071
7   [20-30)    1478   0.016531
8   [10-20)     619   0.006923
9    [0-10)     146   0.001633
   values  counts  frequency
0       1   47142   0.527262
1       3   16764   0.187498
2       2   16368   0.183069
3       6    4653   0.052042
4       5    4173   0.046673
5       8     285   0.003188
6       7      14   0.000157
7       4

      values  counts  frequency
0        250   10334   0.115581
1        401    7370   0.082430
2        276    4477   0.050073
3        428    3920   0.043843
4        427    3418   0.038229
5        414    3262   0.036484
6        496    2283   0.025534
7        403    2036   0.022772
8        272    1775   0.019853
9        585    1742   0.019483
10       599    1671   0.018689
11   Unknown    1280   0.014316
12       V45    1239   0.013858
13    250.02    1201   0.013433
14       780    1190   0.013310
15       707    1177   0.013164
16       285    1079   0.012068
17       425     990   0.011073
18     250.6     966   0.010804
19       424     933   0.010435
20       305     814   0.009104
21    250.01     808   0.009037
22       682     795   0.008892
23       584     787   0.008802
24       518     681   0.007617
25        41     655   0.007326
26       493     623   0.006968
27       278     600   0.006711
28       530     560   0.006263
29       786     519   0.005805
30      

3    Down     499   0.005581
   values  counts  frequency
0      No   79931   0.893993
1  Steady    8263   0.092418
2      Up     716   0.008008
3    Down     499   0.005581
   values  counts  frequency
0      No   89390   0.999787
1  Steady      19   0.000213
   values  counts  frequency
0      No   82876   0.926931
1  Steady    6231   0.069691
2      Up     201   0.002248
3    Down     101   0.001130
   values  counts  frequency
0      No   83735   0.936539
1  Steady    5440   0.060844
2      Up     156   0.001745
3    Down      78   0.000872
   values  counts  frequency
0      No   89135   0.996935
1  Steady     262   0.002930
2      Up       9   0.000101
3    Down       3   0.000034
   values  counts  frequency
0      No   89374   0.999609
1  Steady      29   0.000324
2    Down       4   0.000045
3      Up       2   0.000022
   values  counts  frequency
0      No   89406   0.999966
1  Steady       3   0.000034
   values  counts  frequency
0      No   89373   0.999597
1  Steady     

In [50]:
def readmission_sum_df(file,col):
    #create counts
    tmp1  = file.groupby([col,'readmitted'])[['readmitted']].agg('count')
    tmp1.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp1.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp1.reset_index(inplace = True)
    
    return pd.merge(tmp1,pcts)

In [51]:
def by_readmission_df(file,col):
    #create counts
    tmp1  = file.groupby(['readmitted',col])[[col]].agg('count')
    tmp1.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp1.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp1.reset_index(inplace = True)
    
    return pd.merge(tmp1,pcts)

In [52]:
def col_sum_df(file,col,col2):
    #create counts
    tmp2  = file.groupby([col,col2])[[col2]].agg('count')
    tmp2.columns = ['count']
    #create readmission frequencies from counts
    pcts = tmp2.groupby(level=0).apply(lambda x: x / float(x.sum()))
    pcts.columns = ['frequency']
    pcts.reset_index(inplace = True)
    
    tmp2.reset_index(inplace = True)
    
    return pd.merge(tmp2,pcts)

#**Impute the 'Unknown/Invalid' gender rows in train and test, based on frequencies of readmission and discharge_disposition_id**

In [53]:
readmission_sum_df(train,'gender')

Unnamed: 0,gender,readmitted,count,frequency
0,Female,0,42600,0.885636
1,Female,1,5501,0.114364
2,Male,0,36627,0.886723
3,Male,1,4679,0.113277
4,Unknown/Invalid,0,2,1.0


In [54]:
readmission_sum_df(test,'gender')

Unnamed: 0,gender,readmitted,count,frequency
0,Female,0,4726,0.882869
1,Female,1,627,0.117131
2,Male,0,4073,0.889301
3,Male,1,507,0.110699
4,Unknown/Invalid,0,1,1.0


In [55]:
#col_sum_df(train,'gender','discharge_disposition_id')

In [56]:
train.loc[train['gender'] == 'Unknown/Invalid']

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
18777,257364294,78119847,Unknown,Unknown/Invalid,[70-80),1,22,7,8,Unknown,59,2,21,0,0,0,850,805,808,Injury,Injury,Injury,9,,,Steady,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Ch,Yes,0
20612,226864668,60524946,Unknown,Unknown/Invalid,[60-70),1,1,7,1,Unknown,38,1,6,0,0,0,808,873,E813,Injury,Injury,Other,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0


In [57]:
#perform imputation
train.loc[train['encounter_id'] == 257364294,'gender'] = 'Female'
train.loc[train['encounter_id'] == 226864668,'gender'] = 'Male'
test.loc[test['gender'] == 'Unknown/Invalid','gender'] = 'Female'

### **Collapse admission id, discharge_disposition_id, and admission_source_id, based off above tables; convert other categorical columns to integer for modelling**

#**Admin_IDs and Discharge_IDs Treatment:**

In [58]:
#store original id columns into a separate variable, for future use.
train_discharge_id = train['discharge_disposition_id'].copy()
test_discharge_id = test['discharge_disposition_id'].copy()

train_admin_source_id = train['admission_source_id'].copy()
test_admin_source_id = test['admission_source_id'].copy()

train_admin_type_id = train['admission_type_id'].copy()
test_admin_type_id = test['admission_type_id'].copy()

In [59]:
def collapse_ids(file,id_col,oldlist,newlist):
    
    for item in list(zip(oldlist,newlist)): 
        file.loc[:,id_col] = file.loc[:,id_col].replace(item[0],item[1])

In [60]:
#Create arrays for collapse_ids function:

admin_source_old = [2,3,6,8,9,10,11,12,13,14,15,17,18,19,20,22,23,24,25,26]
admin_source_new = [1,1,5,1,26,4,7,7,7,7,26,26,5,5,26,4,7,4,7,5]

discharge_old = [4,7,8,9,10,12,15,16,17,18,22,23,24,25,30,27,28,29]
discharge_new = [3,3,6,5,2,5,5,1,1,26,5,3,3,26,5,5,5,5]

admin_type_old = [2,4,6,7,8]
admin_type_new = [1,3,9,1,9]

print('admin_source:',len(admin_source_old) == len(admin_source_new))
print('admin_source_old uniques:',len(np.unique(admin_source_old))) 
print('admin_source_new uniques:',len(np.unique(admin_source_new)))
print('discharge:',len(discharge_old) == len(discharge_new))
print('discharge old uniques:',len(np.unique(discharge_old)))
print('discharge new uniques:',len(np.unique(discharge_new)))
print('admin_type:',len(admin_type_old) == len(admin_type_new))
print('admin_type_old uniques:',len(np.unique(admin_type_old))) 
print('admin_type_new uniques:',len(np.unique(admin_type_new)))

admin_source: True
admin_source_old uniques: 20
admin_source_new uniques: 5
discharge: True
discharge old uniques: 18
discharge new uniques: 6
admin_type: True
admin_type_old uniques: 5
admin_type_new uniques: 3


In [61]:
#Conduct Replacement
collapse_ids(train,'discharge_disposition_id',discharge_old,discharge_new)
collapse_ids(test,'discharge_disposition_id',discharge_old,discharge_new)

collapse_ids(train,'admission_source_id',admin_source_old,admin_source_new)
collapse_ids(test,'admission_source_id',admin_source_old,admin_source_new)

collapse_ids(train,'admission_type_id',admin_type_old,admin_type_new)
collapse_ids(test,'admission_type_id',admin_type_old,admin_type_new)

In [62]:
#readmission_sum_df(train,'discharge_disposition_id')

In [63]:
#readmission_sum_df(train,'admission_type_id')

In [64]:
#readmission_sum_df(test,'admission_type_id')

In [65]:
#readmission_sum_df(train,'admission_source_id')

In [66]:
#readmission_sum_df(test,'admission_source_id')

In [67]:
pd.set_option('display.max_colwidth', 120)

In [68]:
#train.sample(15)

In [69]:
#test.sample(15)

#**Age Column Treatment:**

In [70]:
# del train['age_start']
# del train['age_end']
# del test['age_start']
# del test['age_end']

In [71]:
#Take Age bins and create age_start and age_end columns:
train['age_start'], train['age_end'] = zip(*train['age'].map(lambda x: x.split('-')))
train['age_start'] = train['age_start'].map(lambda x: int(x[1:]))
train['age_end'] = train['age_end'].map(lambda x: int(x[:-1]))
test['age_start'], test['age_end'] = zip(*test['age'].map(lambda x: x.split('-')))
test['age_start'] = test['age_start'].map(lambda x: int(x[1:]))
test['age_end'] = test['age_end'].map(lambda x: int(x[:-1]))

In [72]:
for col in train_colnames:
    print(col,'='*(50 - len(col)))
    print(summary_df(train,col))

          values  counts  frequency
0      116046918       1   0.000011
1      131099352       1   0.000011
2      168831696       1   0.000011
3      160892568       1   0.000011
4      117656106       1   0.000011
5      426661238       1   0.000011
6      149818068       1   0.000011
7      190150518       1   0.000011
8       69108600       1   0.000011
9      303775712       1   0.000011
10     235394142       1   0.000011
11     157866702       1   0.000011
12     245529306       1   0.000011
13     111788112       1   0.000011
14      20073180       1   0.000011
15      72768222       1   0.000011
16     266733354       1   0.000011
17     204450528       1   0.000011
18     286235328       1   0.000011
19     140753634       1   0.000011
20      20302278       1   0.000011
21      64056090       1   0.000011
22      36018918       1   0.000011
23      67726008       1   0.000011
24     169159344       1   0.000011
25       6993210       1   0.000011
26     168412812       1   0

   values  counts  frequency
0       7   50264   0.562181
1       1   27425   0.306736
2       5    8889   0.099420
3       4    2831   0.031663
    values  counts  frequency
0        3   15679   0.175363
1        2   15275   0.170844
2        1   12419   0.138901
3        4   12263   0.137156
4        5    8743   0.097787
5        6    6609   0.073919
6        7    5159   0.057701
7        8    3823   0.042759
8        9    2601   0.029091
9       10    2047   0.022895
10      11    1591   0.017795
11      12    1240   0.013869
12      13    1051   0.011755
13      14     909   0.010167
                                  values  counts  frequency
0                                Unknown   43773   0.489582
1                       InternalMedicine   12846   0.143677
2                       Emergency/Trauma    6658   0.074467
3                 Family/GeneralPractice    6505   0.072756
4                             Cardiology    4747   0.053093
5                        Surgery-General    2

    values  counts  frequency
0        9   43080   0.481831
1        5   10143   0.113445
2        8    9323   0.104274
3        7    9179   0.102663
4        6    8975   0.100381
5        4    4969   0.055576
6        3    2527   0.028263
7        2     919   0.010279
8        1     198   0.002215
9       16      38   0.000425
10      10      14   0.000157
11      13      13   0.000145
12      15       9   0.000101
13      11       9   0.000101
14      12       7   0.000078
15      14       6   0.000067
  values  counts  frequency
0   None   84773   0.948148
1   Norm    2274   0.025434
2   >200    1295   0.014484
3   >300    1067   0.011934
  values  counts  frequency
0   None   74250   0.830453
1     >8    7330   0.081983
2   Norm    4457   0.049850
3     >7    3372   0.037714
   values  counts  frequency
0      No   71548   0.800233
1  Steady   16388   0.183293
2      Up     958   0.010715
3    Down     515   0.005760
   values  counts  frequency
0      No   88045   0.984744
1  Stea

#**Conversions to integer for other columns**

In [73]:
two_value_cols = ['diabetesMed','change','gender']

#convert two_value_columns to 0 and 1
#'No' = 0, 'Yes' = 1
train['diabetesMed'] = np.where((train['diabetesMed'] == 'No'), 0, 1)
test['diabetesMed'] = np.where((test['diabetesMed'] == 'No'), 0, 1)

#'No' = 0, 'CH' = 1
train['change'] = np.where((train['change'] == 'No'), 0, 1)
test['change'] = np.where((test['change'] == 'No'), 0, 1)

#'Female' = 0, 'Male' = 1
train['gender'] = np.where((train['gender'] == 'Female'), 0, 1)
test['gender'] = np.where((test['gender'] == 'Female'), 0, 1)

In [74]:
readmission_sum_df(train,'max_glu_serum')

Unnamed: 0,max_glu_serum,readmitted,count,frequency
0,>200,0,1124,0.867954
1,>200,1,171,0.132046
2,>300,0,906,0.84911
3,>300,1,161,0.15089
4,,0,75201,0.887087
5,,1,9572,0.112913
6,Norm,0,1998,0.878628
7,Norm,1,276,0.121372


In [75]:
readmission_sum_df(train,'A1Cresult')

Unnamed: 0,A1Cresult,readmitted,count,frequency
0,>7,0,3035,0.900059
1,>7,1,337,0.099941
2,>8,0,6597,0.9
3,>8,1,733,0.1
4,,0,65581,0.883246
5,,1,8669,0.116754
6,Norm,0,4016,0.901055
7,Norm,1,441,0.098945


In [76]:
#Convert medical tests
tests = ['max_glu_serum','A1Cresult']

#high --> 2, normal -->1, none -->0
train['max_glu_serum'] = np.where((train['max_glu_serum'] == '>200') | (train['max_glu_serum'] == '>300'),\
                                  2,np.where(train['max_glu_serum'] == 'None',0,1))
test['max_glu_serum'] = np.where((test['max_glu_serum'] == '>200') | (test['max_glu_serum'] == '>300'),\
                                  2,np.where(test['max_glu_serum'] == 'None',0,1))
train['A1Cresult'] = np.where((train['A1Cresult'] == '>7') | (train['A1Cresult'] == '>8'),\
                                  2,np.where(train['A1Cresult'] == 'None',0,1))
test['A1Cresult'] = np.where((test['A1Cresult'] == '>7') | (test['A1Cresult'] == '>8'),\
                                  2,np.where(test['A1Cresult'] == 'None',0,1))

In [77]:
#collapse common terms in medical specialty
train.loc[:,'medical_specialty'] = np.where(train['medical_specialty'].str.contains('Surgery|Surgeon|Surgical'),'Surgeon',train['medical_specialty'])
test.loc[:,'medical_specialty'] = np.where(test['medical_specialty'].str.contains('Surgery|Surgeon|Surgical'),'Surgeon',test['medical_specialty'])
train.loc[:,'medical_specialty'] = np.where(train['medical_specialty'].str.contains('Orthopedics'),'Orthopedics',train['medical_specialty'])
test.loc[:,'medical_specialty'] = np.where(test['medical_specialty'].str.contains('Orthopedics'),'Orthopedics',test['medical_specialty'])

In [78]:
#collapse medical_specialty, due to high cardinality

toptrain = train['medical_specialty'].isin(train['medical_specialty'].value_counts().index[:7])
toptest = test['medical_specialty'].isin(test['medical_specialty'].value_counts().index[:7])
train.loc[~toptrain, 'medical_specialty'] = "Other_Specialty"
test.loc[~toptest, 'medical_specialty'] = "Other_Specialty"

In [79]:
print(train['medical_specialty'].value_counts().index[:7])
print(test['medical_specialty'].value_counts().index[:7])

Index(['Unknown', 'InternalMedicine', 'Other_Specialty', 'Emergency/Trauma',
       'Family/GeneralPractice', 'Cardiology', 'Surgeon'],
      dtype='object')
Index(['Unknown', 'InternalMedicine', 'Other_Specialty', 'Emergency/Trauma',
       'Family/GeneralPractice', 'Cardiology', 'Surgeon'],
      dtype='object')


In [80]:
#summary_df(train,'medical_specialty')

In [81]:
#summary_df(test,'medical_specialty')

#**Store Original Medication Columns to preserve initial data. Then convert dosages to integers
-0 for not given
-1 for up, down, and steady
the goal is reflect whethere the medication was given to the patient or not.**

In [82]:
train.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'diag_1_descrip', 'diag_2_descrip', 'diag_3_descrip',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'age_start', 'age_end'],
      dtype='object')

In [83]:
medications = ['metformin','repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',\
               'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide','pioglitazone',\
               'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone','tolazamide',\
               'examide', 'citoglipton', 'insulin','glyburide-metformin',\
               'glipizide-metformin','glimepiride-pioglitazone', 'metformin-rosiglitazone',\
               'metformin-pioglitazone']

print(len(medications))

medications_df = train[medications].copy()

23


In [84]:
# #pandas_profiling.ProfileReport(combined)
# combined = pd.concat([train,test], axis = 0, ignore_index = True)
# combined.profile_report(correlations={'cramers': False})

In [85]:
#just track whether there was a change in the medication that was given
for col in medications:
    train[col] = np.where((train[col]=='No')|(train[col]=='Steady'),0,1)
    test[col] = np.where((test[col]=='No')|(test[col] == 'Steady'),0,1)

In [86]:
#drop columns that are constant values, as indicated in the profile up above.
#constant values and no changes at all

dropped_meds = ['acetohexamide','examide','glimepiride-pioglitazone','glipizide-metformin',\
                'citoglipton','tolazamide','tolbutamide','troglitazone','metformin-pioglitazone',\
                'metformin-rosiglitazone']

train.drop(dropped_meds, axis = 1, inplace = True)
test.drop(dropped_meds, axis = 1, inplace = True)

In [87]:
selected_meds = [x for x in medications if x not in dropped_meds]
print(len(selected_meds))

selected_meds_df = train[selected_meds].copy()

13


In [88]:
train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end
73062,141933636,36231804,Caucasian,0,[90-100),1,3,1,3,Unknown,39,0,9,0,0,0,789,428,496.0,Other,Circulatory,Respiratory,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90,100
9422,24864336,16250544,Caucasian,1,[60-70),3,3,1,4,Other_Specialty,52,0,6,0,0,0,312,401,414.0,Other,Circulatory,Circulatory,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,60,70
53020,222279690,112803966,Caucasian,0,[80-90),3,3,1,9,Unknown,58,1,17,1,0,0,415,780,496.0,Circulatory,Other,Respiratory,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,80,90
34628,70815444,23608431,Caucasian,1,[50-60),5,1,5,7,Surgeon,22,3,18,0,0,2,540,403,250.01,Digestive,Circulatory,Diabetes,5,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,50,60
15337,154201092,103799052,Caucasian,1,[30-40),3,1,1,2,Family/GeneralPractice,39,2,18,0,0,0,278,518,780.0,Other,Respiratory,Other,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,30,40


### **Feature Engineering**

In [89]:
#summary_df(train,'number_outpatient')

In [90]:
#summary_df(train,'number_emergency')

In [91]:
#summary_df(train,'number_inpatient')

In [92]:
#avg number of all procedures done per day in the hospital
train['avg_total_procedures'] = (train['num_lab_procedures'] + train['num_procedures'])/train['time_in_hospital']
test['avg_total_procedures'] = (test['num_lab_procedures'] + test['num_procedures'])/test['time_in_hospital']

#if diabetes medication was given, was it changed?
train['diab_med_changed'] = train['change'] * train['diabetesMed']
test['diab_med_changed'] = test['change'] * test['diabetesMed']

#how many times was the medication changed?
train['num_of_changes'] = 0
test['num_of_changes'] = 0
for col in selected_meds:
    train['num_of_changes'] = train['num_of_changes'] + train[col]
    test['num_of_changes'] = test['num_of_changes'] + test[col]

#patient 'score' variable. Here, I'm trying to 'reward' patients who have gone to see their
#doctor in the previous year (number_outpatient > 0; most in the dataset have not seen their
#doctor in the previous year). However, the older a patient is, the more likely they are to have
#multiple visits to the doctor. So this must be counter-balanced by number of medications given
#scaled by number of diagnoses assessed during the hospital stay.

train['patient_score'] = (((train['age_start'] + train['age_end'])/2)/(train['number_outpatient'] + 1)) *\
                         (train['num_medications']/train['number_diagnoses'])

test['patient_score'] = (((test['age_start'] + test['age_end'])/2)/(test['number_outpatient'] + 1)) *\
                        (test['num_medications']/test['number_diagnoses'])

In [93]:
#col_sum_df(train,'number_outpatient','num_medications')

In [94]:
#train1 = train.copy()

In [95]:
#train1['patient_score'] = (((train1['age_start'] + train1['age_end'])/2)/(train1['number_outpatient'] + 1)) *\
 #                           (train1['num_medications']/train1['number_diagnoses'])

In [96]:
#train2 = train1[train1['patient_score'] < 1000]

In [97]:
#ax = sns.boxplot(x="readmitted", y="patient_score", data=train1)

In [98]:
#ax = sns.boxplot(x="readmitted", y="patient_score", data=train2)

In [99]:
#train1[['patient_score']].describe()

In [100]:
#train2[['patient_score']].describe()

In [101]:
#train1['patient_score'].median()

In [102]:
#train1.groupby('readmitted')['patient_score'].agg({'patient_score': ['mean','median','min','max','std']})

In [103]:
#train2.groupby('readmitted')['patient_score'].agg({'patient_score': ['mean','median','min','max','std']})

In [104]:
train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
46825,155941218,23906835,Caucasian,1,[70-80),1,1,7,3,InternalMedicine,34,0,14,0,0,0,486.0,496,276,Respiratory,Respiratory,Other,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,70,80,11.333333,0,0,131.25
72493,208890606,98021061,AfricanAmerican,1,[70-80),1,1,7,1,Emergency/Trauma,11,2,7,0,1,0,378.0,784,368,Other,Other,Other,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,70,80,13.0,1,0,75.0
63613,61574766,816597,Caucasian,1,[70-80),1,1,7,1,InternalMedicine,58,0,8,0,0,0,592.0,250,401,Genitourinary,Diabetes,Circulatory,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,70,80,58.0,0,0,100.0
86427,55873812,24030,Caucasian,0,[10-20),1,1,7,3,Emergency/Trauma,69,0,13,0,0,0,250.13,276,276,Diabetes,Other,Other,8,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,10,20,23.0,0,0,24.375
88758,10471320,59813991,Caucasian,1,[50-60),9,1,5,2,InternalMedicine,40,0,7,0,0,0,786.0,401,414,Respiratory,Circulatory,Circulatory,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,50,60,20.0,0,0,55.0


In [105]:
train.reset_index(drop = True, inplace = True)
test.reset_index(drop = True, inplace = True)

train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train,test)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 47 columns in train
There are now 47 columns in test


**Finished adding/removing columns. Now convert columns to numeric/non numeric**

In [106]:
addtocats = ['encounter_id', 'patient_nbr']
tocats.extend(addtocats)

In [107]:
for col in tocats:
    train.loc[:,col] = train.loc[:,col].astype('category')
    test.loc[:,col] = test.loc[:,col].astype('category')

In [108]:
#update numericcols for the feature engineered columns
numericcols = [x for x in numericcols if x not in tocats]
numericcols.extend(['age_start','age_end','avg_total_procedures','num_of_changes','patient_score'])

#make a list of the columns that were converted from str to number_encoded:
t1 = train.select_dtypes(exclude = ['object','category']).columns.tolist()
encoded_cols = [x for x in t1 if x not in numericcols]
encoded_cols

['gender',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'glipizide',
 'glyburide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'insulin',
 'glyburide-metformin',
 'change',
 'diabetesMed',
 'readmitted',
 'diab_med_changed']

In [109]:
encoded_cols.remove('readmitted')

In [110]:
tocats

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'encounter_id',
 'patient_nbr']

**Copy train and test before removing the above columns and proceeding with dummification and transformations**

In [111]:
remove = ['age','diag_1','diag_2','diag_3']#['encounter_id','patient_nbr','age','diag_1','diag_2','diag_3']
train1 = train.copy()
test1 = test.copy()

train1.drop(remove,inplace = True, axis = 1)
test1.drop(remove,inplace = True, axis = 1)

In [112]:
#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train1,test1)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 43 columns in train
There are now 43 columns in test


In [113]:
train1_target = train1['readmitted']
# train1.drop('readmitted', inplace = True, axis = 1)

test1_target = test1['readmitted']
# test1.drop('readmitted', inplace = True, axis = 1)

In [114]:
print('train1 shape is {}'.format(train1.shape))
print('train1_target shape is {}'.format(train1_target.shape))

print('test1 shape is {}'.format(test1.shape))
print('test1_target shape is {}'.format(test1_target.shape))

train1 shape is (89409, 43)
train1_target shape is (89409,)
test1 shape is (9934, 43)
test1_target shape is (9934,)


In [115]:
numericcols

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'age_start',
 'age_end',
 'avg_total_procedures',
 'num_of_changes',
 'patient_score']

In [116]:
for col in encoded_cols:
    train1.loc[:,col] = train1.loc[:,col].astype('category')
    test1.loc[:,col] = test1.loc[:,col].astype('category')

In [117]:
train1.tail()

Unnamed: 0,encounter_id,patient_nbr,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
89404,31296060,3344202,Caucasian,1,1,1,7,2,Cardiology,35,0,12,0,0,0,Respiratory,Circulatory,Circulatory,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,70,80,17.5,0,0,100.0
89405,159139902,93611655,Caucasian,1,5,1,1,5,Unknown,63,2,23,0,0,0,Other,Other,Other,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,60,70,13.0,1,1,166.111111
89406,232191828,85600899,Caucasian,1,3,3,1,3,Orthopedics,55,1,33,0,0,0,Musculoskeletal,Circulatory,Other,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,70,80,18.666667,1,1,275.0
89407,6740700,8208234,Caucasian,0,9,26,7,12,Family/GeneralPractice,77,2,21,0,0,0,Respiratory,Respiratory,Genitourinary,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,60,70,6.583333,1,1,151.666667
89408,60115668,77943780,Caucasian,0,9,1,5,1,Cardiology,2,5,17,0,0,0,Circulatory,Circulatory,Other,7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,40,50,7.0,1,1,109.285714


In [118]:
test1.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1_descrip,diag_2_descrip,diag_3_descrip,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,insulin,glyburide-metformin,change,diabetesMed,readmitted,age_start,age_end,avg_total_procedures,diab_med_changed,num_of_changes,patient_score
0,110939484,19274094,Caucasian,0,1,1,5,11,InternalMedicine,68,0,20,0,0,0,Diabetes,Genitourinary,Other,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,70,80,6.181818,0,0,300.0
1,170328306,65634327,Caucasian,1,1,1,1,1,Unknown,20,0,7,0,0,0,Other,Circulatory,Other,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,50,60,20.0,0,0,48.125
2,245688426,100657359,Caucasian,0,3,6,1,4,Unknown,21,3,23,1,0,2,Musculoskeletal,Musculoskeletal,Musculoskeletal,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,60,70,6.0,0,0,106.785714
3,150826224,83144448,Caucasian,1,1,1,1,12,Other_Specialty,28,0,19,0,0,1,Respiratory,Other,Other,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,30,40,2.333333,0,0,95.0
4,135993852,65234214,AfricanAmerican,0,1,2,7,1,Unknown,21,0,6,0,0,0,Circulatory,Genitourinary,Circulatory,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,60,70,21.0,0,0,55.714286


In [119]:
combined = pd.concat([train1,test1], axis = 0, ignore_index = True)

In [120]:
#combined = pd.concat([train1,test1], axis = 0, ignore_index = True)
# combined[numericcols].profile_report(correlations={'cramers': False})

In [121]:
train_rows, test_rows, train_colnames, test_colnames = update_rows_cols(train1,test1)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 43 columns in train
There are now 43 columns in test


##**Check for Skewness and apply BoxCox Transformation**

In [122]:
#test for skewness
for col in numericcols:
    print(col, stats.skewtest(train1[col]))

time_in_hospital SkewtestResult(statistic=113.40774689593859, pvalue=0.0)
num_lab_procedures SkewtestResult(statistic=-28.83299573225931, pvalue=8.279143504923623e-183)
num_procedures SkewtestResult(statistic=125.3889564654774, pvalue=0.0)
num_medications SkewtestResult(statistic=126.51425503304895, pvalue=0.0)
number_outpatient SkewtestResult(statistic=304.5288260627894, pvalue=0.0)
number_emergency SkewtestResult(statistic=394.5413920304001, pvalue=0.0)
number_inpatient SkewtestResult(statistic=218.71803745260752, pvalue=0.0)
number_diagnoses SkewtestResult(statistic=-91.83053560136733, pvalue=0.0)
age_start SkewtestResult(statistic=-70.28262307822187, pvalue=0.0)
age_end SkewtestResult(statistic=-70.28262307822187, pvalue=0.0)
avg_total_procedures SkewtestResult(statistic=152.6974843038501, pvalue=0.0)
num_of_changes SkewtestResult(statistic=132.1115255014073, pvalue=0.0)
patient_score SkewtestResult(statistic=170.966104522399, pvalue=0.0)


In [123]:
#correct for skew:
withzeros = ['num_of_changes','num_procedures','number_emergency','number_inpatient','number_outpatient']

skewedvalues = combined[numericcols].apply(lambda x: skew(x))

#skewness should be within -1 and 1,apparently. Correct for anything greater than 0.75
skewedvariables = skewedvalues[abs(skewedvalues) > 0.75]

nonzeros = [x for x in skewedvariables.index.tolist() if x not in withzeros]

skewedvariables1 = combined[withzeros] + 1 #the transform fails if it attempts to divide by zero, so add 1.
skewedvariables_reg = combined[nonzeros]

In [124]:
nonzeros

['time_in_hospital',
 'num_medications',
 'number_diagnoses',
 'avg_total_procedures',
 'patient_score']

In [125]:
#Box-Cox Transformation

for variable1 in withzeros:
     combined[variable1] = stats.boxcox(skewedvariables1[variable1])[0]

for variable2 in nonzeros:
     combined[variable2] = stats.boxcox(skewedvariables_reg[variable2])[0]


In [126]:
#marker for after_boxcox transformation.

after_boxcox = combined.copy()

train2 = after_boxcox[:train_rows]
test2 = after_boxcox[train_rows:]

In [127]:
print(train2.shape)
print(test2.shape)

(89409, 43)
(9934, 43)


In [126]:
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9934 entries, 89409 to 99342
Data columns (total 43 columns):
encounter_id                9934 non-null int64
patient_nbr                 9934 non-null int64
race                        9934 non-null object
gender                      9934 non-null category
admission_type_id           9934 non-null category
discharge_disposition_id    9934 non-null category
admission_source_id         9934 non-null category
time_in_hospital            9934 non-null float64
medical_specialty           9934 non-null object
num_lab_procedures          9934 non-null int64
num_procedures              9934 non-null float64
num_medications             9934 non-null float64
number_outpatient           9934 non-null float64
number_emergency            9934 non-null float64
number_inpatient            9934 non-null float64
diag_1_descrip              9934 non-null object
diag_2_descrip              9934 non-null object
diag_3_descrip              9934 non-null ob

**dummification**

In [128]:
train2 = pd.get_dummies(train2, drop_first = True)
test2 = pd.get_dummies(test2, drop_first = True)

In [129]:
#Update rows and column markers for train and test files:
train_rows,test_rows,train_colnames,test_colnames = update_rows_cols(train2,test2)

There are now 89409 rows in train
There are now 9934 rows in test
There are now 84 columns in train
There are now 84 columns in test


**Recombine dataframes, and then split into train test**

In [130]:
combined1 = pd.concat([train2,test2], axis = 0, ignore_index = True)
target1 = pd.concat([train1_target,test1_target], axis = 0, ignore_index = True)

In [131]:
print(combined1.shape)
print(target1.shape)

(99343, 84)
(99343,)


**scaling**

In [132]:
#robustscaler to standardize all numeric columns (not categorical)

#robust = RobustScaler()
#robustscaler = robust.fit(train2[numericcols])#:train_rows,numcolstx])


standard = StandardScaler()
standardscaler = standard.fit(combined1[numericcols])


combined1[numericcols] = standardscaler.transform(combined1[numericcols])
# train2[numericcols] = robustscaler.transform(train2[numericcols])
# test2[numericcols] = robustscaler.transform(test2[numericcols])

In [133]:
print(combined1.shape)
print(target1.shape)

(99343, 84)
(99343,)


#### Code only for Xing's proccess
#**++++++++++++++++++++++++++++++++++++++++++++**

In [135]:
combinedW = combined1.copy()
targetW = target1.copy()

print(combinedW.shape)
print(targetW.shape)

(99343, 84)
(99343,)


#**Whole**

In [136]:
removeW = ['encounter_id','patient_nbr']
combinedW.drop(removeW,inplace = True, axis = 1)
targetW = combinedW['readmitted']
combinedW.drop('readmitted', inplace = True, axis = 1)

X_trainW, X_testW, y_trainW, y_testW = ms.train_test_split(combinedW, targetW, test_size=0.2, random_state = 212)
trainW = pd.concat([X_trainW,y_trainW], axis = 1, ignore_index = False)
            #testW = pd.concat([X_testU,y_testU], axis = 1, ignore_index = False)
trainW.to_csv('dataset_diabetes/whole_df_train_EP.csv')
            #testW.to_csv('dataset_diabetes/df_test_UniqueEP')

#**Unique**

In [137]:
#combined1.to_csv('whole_train_EP.csv')
combinedU = combined1.copy()
targetU = target1.copy()

print(combinedU.shape)
print(targetU.shape)

(99343, 84)
(99343,)


In [138]:
#drop repeat patients in train and test file to preserve indepedence assumption for each observation
combinedU.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
combinedU.reset_index(inplace = True)
del combinedU['index']

removeU = ['encounter_id','patient_nbr']
combinedU.drop(removeU,inplace = True, axis = 1)
targetU = combinedU['readmitted']
combinedU.drop('readmitted', inplace = True, axis = 1)

X_trainU, X_testU, y_trainU, y_testU = ms.train_test_split(combinedU, targetU, test_size=0.2, random_state = 212)
trainU = pd.concat([X_trainU,y_trainU], axis = 1, ignore_index = False)
testU = pd.concat([X_testU,y_testU], axis = 1, ignore_index = False)
trainU.to_csv('dataset_diabetes/df_train_UniqueEP.csv')
testU.to_csv('dataset_diabetes/df_test_UniqueEP.csv')

In [139]:
#drop repeat patients in train and test file to preserve indepedence assumption for each observation

# repeat_patients = combinedU.copy()[combinedU.copy().duplicated(subset = 'patient_nbr', keep = 'first')].sort_values(by = ['patient_nbr'])
# repeat_patients.reset_index(inplace = True, keep = False)
# #del combinedU['index']
# removeU = ['encounter_id','patient_nbr']
# repeat_patients.drop(removeU,inplace = True, axis = 1)

# combinedU.drop_duplicates(subset = 'patient_nbr', keep = 'first', inplace = True)
# combinedU.reset_index(inplace = True)
# del combinedU['index']


# combinedU.drop(removeU,inplace = True, axis = 1)
# targetU = combinedU['readmitted']
# combinedU.drop('readmitted', inplace = True, axis = 1)


In [142]:
# repeat_patients.head()

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,glyburide-metformin,readmitted,age_start,age_end,avg_total_procedures,num_of_changes,patient_score,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_Unknown,gender_1,admission_type_id_3,admission_type_id_5,admission_type_id_9,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_26,admission_source_id_4,admission_source_id_5,admission_source_id_7,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Orthopedics,medical_specialty_Other_Specialty,medical_specialty_Surgeon,medical_specialty_Unknown,diag_1_descrip_Diabetes,diag_1_descrip_Digestive,diag_1_descrip_Genitourinary,diag_1_descrip_Injury,diag_1_descrip_Musculoskeletal,diag_1_descrip_Neoplasms,diag_1_descrip_Other,diag_1_descrip_Respiratory,diag_2_descrip_Diabetes,diag_2_descrip_Digestive,diag_2_descrip_Genitourinary,diag_2_descrip_Injury,diag_2_descrip_Musculoskeletal,diag_2_descrip_Neoplasms,diag_2_descrip_Other,diag_2_descrip_Respiratory,diag_3_descrip_Diabetes,diag_3_descrip_Digestive,diag_3_descrip_Genitourinary,diag_3_descrip_Injury,diag_3_descrip_Musculoskeletal,diag_3_descrip_Neoplasms,diag_3_descrip_Other,diag_3_descrip_Respiratory,max_glu_serum_1,max_glu_serum_2,A1Cresult_1,A1Cresult_2,metformin_1,repaglinide_1,nateglinide_1,chlorpropamide_1,glimepiride_1,glipizide_1,glyburide_1,pioglitazone_1,rosiglitazone_1,acarbose_1,miglitol_1,insulin_1,change_1,diabetesMed_1,diab_med_changed_1
30403,26264286,135,-0.249616,-0.607189,0.313392,-0.095072,-0.443808,-0.354559,1.240374,-1.301697,0,0,-0.67383,-0.67383,-0.112382,-0.611177,0.459758,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
18401,80742510,1152,1.205928,-0.658183,0.313392,0.167048,-0.443808,-0.354559,1.240374,-2.022061,0,0,-0.046678,-0.046678,-1.130929,-0.611177,2.784796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
15801,8380170,1152,0.759034,0.004746,0.86666,-0.235472,-0.443808,-0.354559,1.240374,-2.022061,0,0,-0.67383,-0.67383,-0.500249,-0.611177,1.97868,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
71744,83281464,1152,1.865735,-0.301221,0.313392,0.40838,-0.443808,-0.354559,1.560378,-1.301697,0,0,-0.046678,-0.046678,-1.301332,1.631053,1.167499,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
3576,30180318,1152,0.759034,0.106735,1.395547,0.038868,-0.443808,-0.354559,1.560378,-0.894041,0,0,-0.67383,-0.67383,-0.40999,1.631053,0.282523,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1


In [140]:
print(combinedU.shape)
print(targetU.shape)

(69990, 81)
(69990,)


In [141]:
print(trainU.shape)
print(testU.shape)

(55992, 82)
(13998, 82)


In [142]:
print(combined1.shape)
print(target1.shape)

(99343, 84)
(99343,)


#### Over/undersampling train using SMOTE oversample and near miss to address the unbalanced classes

In [None]:
#Train test split
X_train, X_test, y_train, y_test = ms.train_test_split(combined1, target1, test_size=0.2, random_state = 212)

In [143]:

smt = SMOTE()
X_smoteW, y_smoteW = smt.fit_sample(X_trainW, y_trainW)
X_smoteU, y_smoteU = smt.fit_sample(X_trainU, y_trainU)

nr = NearMiss()
X_nearmissW, y_nearmissW = nr.fit_sample(X_trainW, y_trainW)
X_nearmissU, y_nearmissU = nr.fit_sample(X_trainU, y_trainU)

print('The number of observations in the WHOLE dataset for each class using SMOTE are now {}'.format(np.bincount(y_smoteW)))
print('The number of observations in the UNIQUE dataset for each class using SMOTE are now {}'.format(np.bincount(y_smoteU)))
print('The number of observations in the WHOLE dataset for each class using Near Miss are now {}'.format(np.bincount(y_nearmissW)))
print('The number of observations in the UNIQUE dataset for each class using Near Miss are now {}'.format(np.bincount(y_nearmissU)))

The number of observations in the WHOLE dataset for each class using SMOTE are now [70432 70432]
The number of observations in the UNIQUE dataset for each class using SMOTE are now [51806 51806]
The number of observations in the WHOLE dataset for each class using Near Miss are now [9042 9042]
The number of observations in the UNIQUE dataset for each class using Near Miss are now [4186 4186]


In [163]:
#X_smote
y_smoteU.head()

0    0
1    0
2    0
3    0
4    0
Name: readmitted, dtype: int64

## **Modelling**

In [None]:
logit = LogisticRegression()
logit.set_params(class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logit.fit(X_train, y_train)
print(logit.score(X_train, y_train))
cm = confusion_matrix(y_train, logit.predict(X_train))
cm

In [None]:
#Logistic Regression with SMOTE, UNIQUE

logitSU = LogisticRegression()
logitSU.set_params(max_iter = 2000)#class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logitSU.fit(X_smoteU, y_smoteU)
print(logitSU.score(X_smoteU, y_smoteU))
cmSU = confusion_matrix(y_smoteU, logitSU.predict(X_smoteU))
cmSU

In [None]:
#Logistic Regression with SMOTE, WHOLE

logitSW = LogisticRegression()
logitSW.set_params(max_iter = 2000)#class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logitSW.fit(X_smoteW, y_smoteW)
print(logitSW.score(X_smoteW, y_smoteW))
cmSW = confusion_matrix(y_smoteW, logitSW.predict(X_smoteW))
cmSW

In [None]:
#Logistic Regression with NEAR MISS, UNIQUE

logitNMU = LogisticRegression()
logitNMU.set_params(max_iter = 2000)#class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logitNMU.fit(X_nearmissU, y_nearmissU)
print(logitNMU.score(X_nearmissU, y_nearmissU))
cmNMU = confusion_matrix(y_nearmissU, logitNMU.predict(X_nearmissU))
cmNMU

In [None]:
#Logistic Regression with NEAR MISS, WHOLE

logitNMW = LogisticRegression()
logitNMW.set_params(max_iter = 2000)#class_weight = 'balanced') #turn on Ridge for heavily penalized coefficients. 
logitNMW.fit(X_nearmissW, y_nearmissW)
print(logitNMW.score(X_nearmissW, y_nearmissW))
cmNMW = confusion_matrix(y_nearmissW, logitNMW.predict(X_nearmissW))
cmNMW

In [None]:
prediction_test = logit.predict(X_test)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, prediction_test)))
print("Precision is {0:.2f}".format(precision_score(y_test, prediction_test)))
print("Recall is {0:.2f}".format(recall_score(y_test, prediction_test)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test, prediction_test)))

In [None]:
#Logistic Regression with SMOTE, UNIQUE
predictionSU_test = logitSU.predict(X_testU)

print("Accuracy is {0:.2f}".format(accuracy_score(y_testU, predictionSU_test)))
print("Precision is {0:.2f}".format(precision_score(y_testU, predictionSU_test)))
print("Recall is {0:.2f}".format(recall_score(y_testU, predictionSU_test)))
print("AUC is {0:.2f}".format(roc_auc_score(y_testU, predictionSU_test)))

In [None]:
#Logistic Regression with NEARM MISS
predictionNM_test = logitNM.predict(X_testU)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, predictionNM_test)))
print("Precision is {0:.2f}".format(precision_score(y_test, predictionNM_test)))
print("Recall is {0:.2f}".format(recall_score(y_test, predictionNM_test)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test, predictionNM_test)))

In [None]:
# use stratifiedKFold
skf = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=99)

In [None]:
logit1 = LogisticRegression()
logit1.set_params(class_weight = 'balanced')#turn on Ridge for heavily penalized coefficients. 
params1 = {'C':np.logspace(-4,4, 50)}

gs_logit1 = ms.GridSearchCV(estimator = logit1,
                           param_grid = params1,
                           cv = skf,
                           verbose = True, n_jobs = -1)
#print(logit.score(train2, train1_target))
#cm = confusion_matrix(train1_target, logit.predict(train2))
#cm

In [None]:
gs_logit1.fit(X_train, y_train)
gs_logit1.best_params_


In [None]:
#save the best result
logit_best = gs_logit1.best_estimator_
logit_best_predict = logit_best.predict(X_train)
cm1 = confusion_matrix(y_train, logit_best.predict(X_train))
cm1

In [None]:
#large C is hard margin

svm_model = svm.SVC(kernel='rbf') #try radial


In [None]:
# C_range = np.logspace(-2, 10, 20)

# gamma_range = np.logspace(-9, 3, 20)
# param_grid = dict(gamma=gamma_range, C=C_range)

# sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# gs_svm = ms.GridSearchCV(estimator = svm_model, 
#                          param_grid = param_grid,
#                          cv = sss,
#                          verbose = True,
#                          n_jobs = -1)
# gs_svm.fit(X_train, y_train)

In [None]:
# estim = HyperoptEstimator( classifier=svc('mySVC') )
# estim.fit(x_svm,y_svm)

In [None]:
# #X_svm_train = X_train.copy()
# #X_svm_train.reset_index(inplace = True, drop = True)
# x_svm = np.array(X_svm_train)

In [None]:
# y_svm_train = y_train.copy()
# y_svm_train.reset_index(inplace = True, drop = True)
# y_svm = np.array(y_svm_train)

In [146]:
num_eval = 75

param_hyperopt= {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']), #check lightgbm for types
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}

In [144]:
# def hyperopt(param_space, X_smoteU, y_smoteU, X_testU, y_testU, num_eval):
    
#     start = time.time()
    
#     def objective_function(params):
#         clf = lgb.LGBMClassifier(**params)
#         score = cross_val_score(clf, X_smoteU, y_smoteU, cv=5).mean()
#         return {'loss': -score, 'status': STATUS_OK}

#     trials = Trials()
#     best_param = fmin(objective_function, 
#                       param_space, 
#                       algo=tpe.suggest, 
#                       max_evals=num_eval, 
#                       trials=trials,
#                       rstate= np.random.RandomState(1))
#     loss = [x['result']['loss'] for x in trials.trials]
    
#     best_param_values = [x for x in best_param.values()]
    
#     if best_param_values[0] == 0:
#         boosting_type = 'gbdt'
#     else:
#         boosting_type= 'dart'
    
#     clf_best = lgb.LGBMClassifier(learning_rate=best_param_values[2],
#                                   num_leaves=int(best_param_values[5]),
#                                   max_depth=int(best_param_values[3]),
#                                   n_estimators=int(best_param_values[4]),
#                                   boosting_type=boosting_type,
#                                   colsample_bytree=best_param_values[1],
#                                   reg_lambda=best_param_values[6],
#                                  )
                                  
#     clf_best.fit(X_smoteU, y_smoteU)
    
#     print("")
#     print("##### Results")
#     print("Score best parameters: ", min(loss)*-1)
#     print("Best parameters: ", best_param)
#     print("Test Score: ", clf_best.score(X_testU, y_testU))
#     print("Time elapsed: ", time.time() - start)
#     print("Parameter combinations evaluated: ", num_eval)
    
#     return trials

In [147]:
results_hyperopt = hyperopt(param_hyperopt, X_smoteU, y_smoteU, X_testU, y_testU, num_eval)

100%|██████████| 75/75 [02:55<00:00,  2.44s/it, best loss: -0.9511747378870936]

##### Results
Score best parameters:  0.9511747378870936
Best parameters:  {'boosting_type': 1, 'colsample_by_tree': 0.6273900956160131, 'learning_rate': 0.6203348337290178, 'max_depth': 9.0, 'n_estimators': 33.0, 'num_leaves': 44.0, 'reg_lambda': 0.9159444942633306}
Test Score:  0.9212744677811116
Time elapsed:  175.6831259727478
Parameter combinations evaluated:  75


In [150]:
results_hyperopt

<hyperopt.base.Trials at 0x1a3e6fe710>

In [None]:
num_eval = 75

param_hyperopt= {
                'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
                'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
                'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
                'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
                'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']), #check lightgbm for types
                'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
                'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
                }

In [152]:
def objective_function(params):
    clf = lgb.LGBMClassifier(**params)
    score = cross_val_score(clf, X_smoteU, y_smoteU, cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}             

In [156]:
trials = Trials()
best_param = fmin(objective_function, 
                  param_hyperopt, 
                  algo=tpe.suggest, 
                  max_evals=num_eval, 
                  trials=trials,
                  rstate= np.random.RandomState(1))
loss = [x['result']['loss'] for x in trials.trials]
best_param_values = [x for x in best_param.values()]

100%|██████████| 75/75 [02:57<00:00,  2.69s/it, best loss: -0.9511747378870936]


In [177]:
best_param_values

[1,
 0.6273900956160131,
 0.6203348337290178,
 9.0,
 33.0,
 44.0,
 0.9159444942633306]

In [176]:

if best_param_values[0] == 0:
    boosting_type = 'gbdt'
else:
    boosting_type= 'dart'
    
clf_best = lgb.LGBMClassifier(learning_rate=best_param_values[2],
                              num_leaves=int(best_param_values[5]),
                              max_depth=int(best_param_values[3]),
                              n_estimators=int(best_param_values[4]),
                              boosting_type=boosting_type,
                              colsample_bytree=best_param_values[1],
                              reg_lambda=best_param_values[6],
                              )
                                  
clf_best.fit(X_smoteU, y_smoteU)

LGBMClassifier(boosting_type='dart', class_weight=None,
               colsample_bytree=0.6273900956160131, importance_type='split',
               learning_rate=0.6203348337290178, max_depth=9,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=33, n_jobs=-1, num_leaves=44, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.9159444942633306,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [None]:
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, predictionNM_test)))
print("Precision is {0:.2f}".format(precision_score(y_test, predictionNM_test)))
print("Recall is {0:.2f}".format(recall_score(y_test, predictionNM_test)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test, predictionNM_test)))

#**Experiment**

In [171]:
def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [157]:

#from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

In [159]:
lr=LogisticRegression(random_state = 42)
lr.set_params(max_iter = 2000)
lr.fit(X_smoteU, y_smoteU)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [173]:
y_train_preds = lr.predict_proba(X_smoteU)[:,1]
y_valid_preds = lr.predict_proba(X_testU)[:,1]

thresh = 0.5

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_testU,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_testU,y_valid_preds, thresh)

Logistic Regression
Training:


ValueError: Found input variables with inconsistent numbers of samples: [13998, 103612]

In [162]:
y_train_preds = lr.predict_proba(np.array(y_smoteU).reshape(-1,1))
y_train_preds = lr.predict_proba(np.array(y_testU).reshape(-1,1))

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 1 1 1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [164]:
np.array(y_smoteU)

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [167]:
np.array(y_smoteU).reshape(-1,1)

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [None]:
from sklear.utils import resample
resample(df,)

In [None]:
pd.concat(df[df.target == 0], resample(df[df.target ==1],1000), axis = 0)