In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
test_dataset = pd.read_csv('./test_dataset.csv')
train_dataset = pd.read_csv('./train_dataset.csv')
#import datasets

In [3]:
print(train_dataset.shape)
print(test_dataset.shape)
#get shape of train and test data set

(91589, 50)
(10177, 50)


In [4]:
train_dataset['dataset'] = 'train'
test_dataset['dataset'] = 'test'
#add new column to make separating easier later

In [5]:
merged_data = pd.concat([train_dataset, test_dataset], axis=0)
merged_data.shape
#merge data set

(101766, 51)

In [6]:
merged_data.info()
#get the column and dtype

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101766 entries, 0 to 10176
Data columns (total 51 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            101766

In [7]:
merged_data.describe()
#summarize the numerical valuesa

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [8]:
df = merged_data.copy(deep=True)
#make a copy for data exploration

### Data Exploration

#### Exploring the target variable, readmitted rate

In [9]:
df.readmitted.nunique()
#how many categories of 'readmitted columns'


3

In [10]:
df.groupby('readmitted').size()
#get size of the readmitted categories


readmitted
<30    11357
>30    35545
NO     54864
dtype: int64

In [11]:
df['readmitted'] = df['readmitted'].replace('>30', 0) #for now, we will replace >30 with 2, technically this is should be 0, but we can replace it later
df['readmitted'] = df['readmitted'].replace('<30', 1) 
df['readmitted'] = df['readmitted'].replace('NO', 0)
#replace target variable with numerical value


### feature examination, left to right

In [12]:
df.race.unique()

array(['Caucasian', 'AfricanAmerican', '?', 'Asian', 'Other', 'Hispanic'],
      dtype=object)

In [13]:
dict_race = {'?': 0, 'Caucasian':1, 'AfricanAmerican':2,'Asian':3,'Other':4,'Hispanic':5}
df.race.replace(dict_race, inplace=True)
#impute race feature with nominal values

In [14]:
df.gender.unique()


array(['Female', 'Male', 'Unknown/Invalid'], dtype=object)

In [15]:
dict_gender = {'Unknown/Invalid': 0, 'Female':1, 'Male':2}
df.gender.replace(dict_gender, inplace=True)
#impute gender feature with nominal value


In [16]:
df.age.unique()


array(['[60-70)', '[40-50)', '[80-90)', '[70-80)', '[50-60)', '[20-30)',
       '[90-100)', '[30-40)', '[0-10)', '[10-20)'], dtype=object)

In [17]:
dict_age = {'[60-70)': 65, 
            '[40-50)':45, 
            '[80-90)':85,
               '[70-80)':75,
               '[50-60)':55,
               '[20-30)':25,
               '[90-100)':95,
               '[30-40)':35,
               '[0-10)':5,
               '[10-20)':15}
df.age.replace(dict_age, inplace=True)
#impute age feature with median num


In [18]:
# Creating additional columns for diagnosis
df['level1_diag1'] = df['diag_1']
df['level1_diag2'] = df['diag_2']
df['level1_diag3'] = df['diag_3']


In [19]:
df.loc[df['diag_1'].str.contains('V'), ['level1_diag1']] = 0
df.loc[df['diag_1'].str.contains('E'), ['level1_diag1']] = 0
df.loc[df['diag_2'].str.contains('V'), ['level1_diag2']] = 0
df.loc[df['diag_2'].str.contains('E'), ['level1_diag2']] = 0
df.loc[df['diag_3'].str.contains('V'), ['level1_diag3']] = 0
df.loc[df['diag_3'].str.contains('E'), ['level1_diag3']] = 0
df['level1_diag1'] = df['level1_diag1'].replace('?', -1)
df['level1_diag2'] = df['level1_diag2'].replace('?', -1)
df['level1_diag3'] = df['level1_diag3'].replace('?', -1)


In [20]:
df['level1_diag1'] = df['level1_diag1'].astype(float)
df['level1_diag2'] = df['level1_diag2'].astype(float)
df['level1_diag3'] = df['level1_diag3'].astype(float)


In [21]:
for index, row in df.iterrows():
    if (row['level1_diag1'] >= 390 and row['level1_diag1'] < 460) or (np.floor(row['level1_diag1']) == 785):
        df.loc[index, 'level1_diag1'] = 1
    elif (row['level1_diag1'] >= 460 and row['level1_diag1'] < 520) or (np.floor(row['level1_diag1']) == 786):
        df.loc[index, 'level1_diag1'] = 2
    elif (row['level1_diag1'] >= 520 and row['level1_diag1'] < 580) or (np.floor(row['level1_diag1']) == 787):
        df.loc[index, 'level1_diag1'] = 3
    elif (np.floor(row['level1_diag1']) == 250):
        df.loc[index, 'level1_diag1'] = 4
    elif (row['level1_diag1'] >= 800 and row['level1_diag1'] < 1000):
        df.loc[index, 'level1_diag1'] = 5
    elif (row['level1_diag1'] >= 710 and row['level1_diag1'] < 740):
        df.loc[index, 'level1_diag1'] = 6
    elif (row['level1_diag1'] >= 580 and row['level1_diag1'] < 630) or (np.floor(row['level1_diag1']) == 788):
        df.loc[index, 'level1_diag1'] = 7
    elif (row['level1_diag1'] >= 140 and row['level1_diag1'] < 240):
        df.loc[index, 'level1_diag1'] = 8
    else:
        df.loc[index, 'level1_diag1'] = 0
        
    if (row['level1_diag2'] >= 390 and row['level1_diag2'] < 460) or (np.floor(row['level1_diag2']) == 785):
        df.loc[index, 'level1_diag2'] = 1
    elif (row['level1_diag2'] >= 460 and row['level1_diag2'] < 520) or (np.floor(row['level1_diag2']) == 786):
        df.loc[index, 'level1_diag2'] = 2
    elif (row['level1_diag2'] >= 520 and row['level1_diag2'] < 580) or (np.floor(row['level1_diag2']) == 787):
        df.loc[index, 'level1_diag2'] = 3
    elif (np.floor(row['level1_diag2']) == 250):
        df.loc[index, 'level1_diag2'] = 4
    elif (row['level1_diag2'] >= 800 and row['level1_diag2'] < 1000):
        df.loc[index, 'level1_diag2'] = 5
    elif (row['level1_diag2'] >= 710 and row['level1_diag2'] < 740):
        df.loc[index, 'level1_diag2'] = 6
    elif (row['level1_diag2'] >= 580 and row['level1_diag2'] < 630) or (np.floor(row['level1_diag2']) == 788):
        df.loc[index, 'level1_diag2'] = 7
    elif (row['level1_diag2'] >= 140 and row['level1_diag2'] < 240):
        df.loc[index, 'level1_diag2'] = 8
    else:
        df.loc[index, 'level1_diag2'] = 0
    
    if (row['level1_diag3'] >= 390 and row['level1_diag3'] < 460) or (np.floor(row['level1_diag3']) == 785):
        df.loc[index, 'level1_diag3'] = 1
    elif (row['level1_diag3'] >= 460 and row['level1_diag3'] < 520) or (np.floor(row['level1_diag3']) == 786):
        df.loc[index, 'level1_diag3'] = 2
    elif (row['level1_diag3'] >= 520 and row['level1_diag3'] < 580) or (np.floor(row['level1_diag3']) == 787):
        df.loc[index, 'level1_diag3'] = 3
    elif (np.floor(row['level1_diag3']) == 250):
        df.loc[index, 'level1_diag3'] = 4
    elif (row['level1_diag3'] >= 800 and row['level1_diag3'] < 1000):
        df.loc[index, 'level1_diag3'] = 5
    elif (row['level1_diag3'] >= 710 and row['level1_diag3'] < 740):
        df.loc[index, 'level1_diag3'] = 6
    elif (row['level1_diag3'] >= 580 and row['level1_diag3'] < 630) or (np.floor(row['level1_diag3']) == 788):
        df.loc[index, 'level1_diag3'] = 7
    elif (row['level1_diag3'] >= 140 and row['level1_diag3'] < 240):
        df.loc[index, 'level1_diag3'] = 8
    else:
        df.loc[index, 'level1_diag3'] = 0

In [22]:
dict_max_glu_seru = {'None': -999, 
                     'Norm':0, 
                     '>300':1,
                    '>200':1}
df.max_glu_serum.replace(dict_max_glu_seru, inplace=True)
#replace max_glu_serum with numerical values

In [23]:
df.A1Cresult.unique()

array(['Norm', 'None', '>7', '>8'], dtype=object)

In [24]:
dict_A1Cresult = {'None': -999, 
                     'Norm':0, 
                     '>7':1,
                    '>8':1}
df.A1Cresult.replace(dict_A1Cresult, inplace=True)
#replace A1C with numerical values

In [25]:
drug_list = df.loc[:, 'metformin':'diabetesMed']
for column in list(drug_list):
    n = drug_list[column].unique()
    print(n)
#values in drug list is has similar values, make 1 library

['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady']
['No' 'Steady' 'Down' 'Up']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady' 'Down' 'Up']
['No' 'Steady' 'Up' 'Down']
['No' 'Steady']
['No' 'Steady' 'Up']
['No']
['No']
['No' 'Down' 'Steady' 'Up']
['No' 'Steady' 'Down' 'Up']
['No' 'Steady']
['No' 'Steady']
['No' 'Steady']
['No' 'Steady']
['No' 'Ch']
['No' 'Yes']


In [26]:
dict_drug = {'No': 0, 
            'Down':1, 
            'Steady':2,
            'Up':3,
             'Ch':1,
             'Yes':1
            }
#replace drug text with numerical value

In [27]:
drug_list = df.loc[:, 'metformin':'diabetesMed']

In [28]:
for column in drug_list:
    drug_list[column] = drug_list[column].map(dict_drug)


In [29]:
df.loc[:, 'metformin':'diabetesMed'] = drug_list


In [30]:
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,dataset,level1_diag1,level1_diag2,level1_diag3
0,7733208,3291489,1,1,65,?,1,1,7,2,...,0,0,0,0,0,0,train,4.0,7.0,0.0
1,152449578,84529188,1,2,45,?,1,2,7,3,...,0,0,0,0,0,0,train,0.0,1.0,0.0
2,440311646,121372727,1,2,45,?,1,3,7,13,...,0,0,0,1,1,0,train,6.0,6.0,6.0
3,106684962,24066279,1,1,85,?,5,3,17,4,...,0,0,0,1,1,0,train,2.0,0.0,0.0
4,139779162,86645961,2,2,65,?,1,4,1,3,...,0,0,0,1,1,0,train,1.0,7.0,1.0
5,223277988,59559255,1,2,75,?,3,22,1,5,...,0,0,0,1,1,0,train,8.0,4.0,0.0
6,127005696,72034335,1,2,85,?,3,1,1,4,...,0,0,0,1,1,0,train,3.0,4.0,1.0
7,37320456,23560803,1,1,65,?,5,3,17,6,...,0,0,0,1,1,1,train,5.0,0.0,0.0
8,144033276,6382269,2,2,55,?,1,1,7,2,...,0,0,0,1,1,0,train,2.0,1.0,0.0
9,166008048,89445645,1,2,75,?,2,1,1,2,...,0,0,0,0,1,0,train,4.0,1.0,0.0


### Missing Values

In [31]:
df = df.replace('?', np.nan)
#saw in dataset missing value or unknown has '?', will replace with nan


In [32]:
df.isnull().sum().sort_values(ascending=False).head(10)
#list col with missing values


weight               98569
medical_specialty    49949
payer_code           40256
diag_3                1423
diag_2                 358
diag_1                  21
level1_diag3             0
num_medications          0
metformin                0
A1Cresult                0
dtype: int64

In [33]:
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1)
#dropping columns with too many missing values

df = df.drop(['examide', 'citoglipton'], axis = 1)
#these 2 drugs were all 'no'


In [34]:
df = df.loc[~df.discharge_disposition_id.isin([11,13,14,19,20,21])]
#11, 13, 14, 19, 20, 21 are patients who are expired so no re-admission possible


In [35]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3'],
      dtype='object')

In [36]:
df = df.drop(['diag_1', 
              'diag_2', 
              'diag_3'], axis = 1)
#remove the columns that were engineered into other columns


In [37]:
df.shape

(99343, 46)

In [38]:
df['random'] = np.random.randint(-999, 999, df.shape[0])
df.random

0        179
1        270
2       -213
3         83
4        380
5        490
6        972
7       -855
8        527
9        898
10       -32
11      -146
12       321
13       165
14      -300
15        47
16      -707
17       389
18      -105
19       -86
20      -857
22       588
23      -145
24       388
25       886
26       889
27       464
28       624
29       807
30       440
        ... 
10146   -118
10147    266
10148    933
10149   -176
10150    422
10151   -545
10152   -105
10153   -170
10154    317
10155    289
10156    734
10157    694
10158    161
10159    395
10160    154
10161   -266
10162    454
10163    873
10164    430
10165   -481
10166    185
10167    559
10168   -514
10169    112
10170    339
10171    922
10172   -511
10174      4
10175    106
10176    -29
Name: random, Length: 99343, dtype: int64

In [39]:
df.to_csv('./merged_data_preprocessed.csv')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99343 entries, 0 to 10176
Data columns (total 47 columns):
encounter_id                99343 non-null int64
patient_nbr                 99343 non-null int64
race                        99343 non-null int64
gender                      99343 non-null int64
age                         99343 non-null int64
admission_type_id           99343 non-null int64
discharge_disposition_id    99343 non-null int64
admission_source_id         99343 non-null int64
time_in_hospital            99343 non-null int64
num_lab_procedures          99343 non-null int64
num_procedures              99343 non-null int64
num_medications             99343 non-null int64
number_outpatient           99343 non-null int64
number_emergency            99343 non-null int64
number_inpatient            99343 non-null int64
number_diagnoses            99343 non-null int64
max_glu_serum               99343 non-null int64
A1Cresult                   99343 non-null int64
metformin

In [41]:
df.isnull().sum().sort_values(ascending=False)


random                      0
num_medications             0
nateglinide                 0
repaglinide                 0
metformin                   0
A1Cresult                   0
max_glu_serum               0
number_diagnoses            0
number_inpatient            0
number_emergency            0
number_outpatient           0
num_procedures              0
glimepiride                 0
num_lab_procedures          0
time_in_hospital            0
admission_source_id         0
discharge_disposition_id    0
admission_type_id           0
age                         0
gender                      0
race                        0
patient_nbr                 0
chlorpropamide              0
acetohexamide               0
level1_diag3                0
glipizide-metformin         0
level1_diag2                0
level1_diag1                0
dataset                     0
readmitted                  0
diabetesMed                 0
change                      0
metformin-pioglitazone      0
metformin-

In [42]:
df.to_csv('./merged_data_preprocessed.csv')

In [43]:
df.dataset.unique()

array(['train', 'test'], dtype=object)

In [44]:
df_train = df[df.dataset=='train']
df_train = df_train.drop(columns='dataset')
df_train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
49564,230116170,87185628,2,2,65,1,1,7,4,46,...,0,0,0,0,0,0,3.0,0.0,0.0,282
56802,156643446,35687133,1,2,65,1,1,7,7,37,...,0,0,0,1,1,0,5.0,2.0,1.0,193
45217,73228146,1699029,1,1,65,3,3,1,7,24,...,0,0,0,0,1,0,6.0,4.0,0.0,655
74592,431476676,41827590,0,2,85,1,6,7,5,45,...,0,0,0,0,1,0,2.0,1.0,1.0,960
29477,41239698,24485670,1,1,55,2,1,1,2,51,...,0,0,0,0,1,0,3.0,4.0,0.0,402


In [45]:
df_test = df[df.dataset=='test']
df_test = df_test.drop(columns='dataset')
df_test.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
9091,136287744,24481782,1,1,75,1,3,7,8,64,...,0,0,0,1,1,0,2.0,1.0,2.0,-79
4221,122782782,99637119,1,2,75,2,6,7,4,25,...,0,0,0,1,1,0,1.0,1.0,0.0,43
1744,190664100,60206454,1,2,55,1,1,7,4,49,...,0,0,0,1,1,0,5.0,4.0,1.0,-865
129,148870182,74016756,2,2,55,1,1,7,5,46,...,0,0,0,1,1,0,0.0,4.0,0.0,-228
4295,15254370,4383171,1,2,55,1,2,7,4,39,...,0,0,0,1,1,0,1.0,1.0,4.0,-408


In [46]:
df.corr().abs()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
encounter_id,1.0,0.513446,0.017388,0.005916,0.070989,0.157088,0.143416,0.110612,0.063806,0.029005,...,0.002121,0.00672,0.007379,0.096787,0.052211,0.007992,0.014005,0.032001,0.007295,0.000896
patient_nbr,0.513446,1.0,0.001953,0.006925,0.07008,0.009455,0.145774,0.0306,0.025033,0.01172,...,0.001056,0.009588,0.001058,0.056542,0.019005,0.008765,0.013789,0.012821,0.009122,0.00147
race,0.017388,0.001953,1.0,0.032038,0.139067,0.020048,0.001065,0.005506,0.006321,0.009122,...,0.001246,0.007545,0.001246,0.007123,0.002401,0.003157,0.000263,0.006677,0.000731,0.001125
gender,0.005916,0.006925,0.032038,1.0,0.050532,0.014805,0.023388,0.003925,0.030237,0.002746,...,0.002939,0.004843,0.002939,0.015297,0.016956,0.002512,0.018798,0.023048,0.009421,0.00013
age,0.070989,0.07008,0.139067,0.050532,1.0,0.004857,0.096318,0.040766,0.107077,0.016323,...,0.000148,0.002605,0.000148,0.033694,0.018653,0.022196,0.018116,0.015802,0.043006,0.003437
admission_type_id,0.157088,0.009455,0.020048,0.014805,0.004857,1.0,0.09364,0.103586,0.012865,0.141733,...,0.00226,9.4e-05,0.002127,0.006844,0.001169,0.013218,0.054168,0.018127,0.017616,0.002009
discharge_disposition_id,0.143416,0.145774,0.001065,0.023388,0.096318,0.09364,1.0,0.009605,0.161163,0.009782,...,0.001541,1.5e-05,0.000317,0.005675,0.022022,0.063374,0.028421,0.001891,0.005912,0.005007
admission_source_id,0.110612,0.0306,0.005506,0.003925,0.040766,0.103586,0.009605,1.0,0.007218,0.052851,...,0.00099,0.001911,0.003693,0.003962,0.003561,0.007877,0.070802,0.022565,0.023092,0.002812
time_in_hospital,0.063806,0.025033,0.006321,0.030237,0.107077,0.012865,0.161163,0.007218,1.0,0.319855,...,0.002543,0.000573,0.001732,0.107613,0.060719,0.046886,0.001306,0.015729,0.035342,0.003027
num_lab_procedures,0.029005,0.01172,0.009122,0.002746,0.016323,0.141733,0.009782,0.052851,0.319855,1.0,...,0.000794,0.00128,0.003221,0.065333,0.033979,0.024044,0.034308,0.004042,0.023004,0.0022


In [47]:
y_test_dataset = df_test.readmitted
X_test_dataset = df_test.drop(columns='readmitted')

In [48]:
y_train_dataset = df_train.readmitted
X_train_dataset = df_train.drop(columns='readmitted')

In [49]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_dataset, y_train_dataset, test_size=0.2)


In [51]:
#logistic regression

from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(C=1e4)
logistic.fit(X_train, y_train)
logistic.score(X_train, y_train)



0.8855397262572176

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, logistic.predict(X_train))

array([[63340,     0],
       [ 8187,     0]])

In [53]:
y_train.value_counts()

0    63340
1     8187
Name: readmitted, dtype: int64

In [54]:
logistic.score(X_valid, y_valid)

0.8885471423778101

In [55]:
y_valid_predict = logistic.predict(X_valid)


In [56]:
pd.crosstab(pd.Series(y_valid, name = 'Actual'), pd.Series(y_valid_predict, name = 'Predict'), margins = True)


Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3087,3087
1,395,395
All,3482,3482


In [61]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


In [62]:
thresh = 0.5


In [64]:
def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

In [69]:
y_train_preds = logistic.predict_proba(X_train)
y_valid_preds = logistic.predict_proba(X_valid)

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:


ValueError: bad input shape (71527, 2)