## Data Analysis

## Data preprocessing

1) Drop unwanted columns : patient_nbr, encounter_id, weight, payer_code, medical_specialty.

2) Drop columns 'citoglipton', 'examide' which has the same values across all the rows. So these columns won't help us in modeling

2) cleaned up race column ( replaced ? to another category and applied LabelEncoder).

3) drop rows which has invalid gender.

4) cleaned age column by removing the interval and putting the median value. 

5) assigned 3 categories to admission_type_id column

6) assigned 3 categories to admission_source_id column

7) remove any rows where the patient is expired based on discharge_disposition_id = 11

8) cleaning up the max glu serum into 3 categories

9) cleaning up the A1Cresult

10) cleaning up the diag columns


In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

df = pd.read_csv("diabetic_data.csv")
index_mapping =  pd.read_csv("IDs_mapping.csv")


def replaceColumn(df, col, oldval, newval):
    df[col] = df[col].replace(oldval, newval)
    return df

def replaceColumnList(df, col, listOfOldVal, newval):
    newDf = df
    for oldVal in listOfOldVal:
        newDf = replaceColumn(df, col, oldVal, newval)
    return newDf

def transformLabelEncoder(df, col):
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    return df;

def scale(df, col):
    x = df[[col]].values.astype(int)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df[col] = x_scaled
    return df;

def diagColn(df):
    colList = ['diag_1','diag_2','diag_3']
    for col in colList:
        df.loc[df[col].str.contains('E'), col] = '0'
        df.loc[df[col].str.contains('V'), col] = '0'
        df = replaceColumn(df, col, '?', -1)
        df = replaceColumn(df, col, '0', 0)
        df[col] = pd.to_numeric(df[col])

        df[col] = np.where(df[col].between(1, 139, inclusive=True), 1, df[col])
        df[col] = np.where(df[col].between(140, 239, inclusive=True), 2, df[col])
        df[col] = np.where(df[col].between(240, 279, inclusive=True), 3, df[col])
        df[col] = np.where(df[col].between(280, 289, inclusive=True), 4, df[col])
        df[col] = np.where(df[col].between(290, 319, inclusive=True), 5, df[col])
        df[col] = np.where(df[col].between(320, 389, inclusive=True), 6, df[col])
        df[col] = np.where(df[col].between(390, 459, inclusive=True), 7, df[col])
        df[col] = np.where(df[col].between(460, 519, inclusive=True), 8, df[col])
        df[col] = np.where(df[col].between(520, 579, inclusive=True), 9, df[col])
        df[col] = np.where(df[col].between(580, 629, inclusive=True), 10, df[col])
        df[col] = np.where(df[col].between(630, 679, inclusive=True), 11, df[col])
        df[col] = np.where(df[col].between(680, 709, inclusive=True), 12, df[col])
        df[col] = np.where(df[col].between(710, 739, inclusive=True), 13, df[col])
        df[col] = np.where(df[col].between(740, 759, inclusive=True), 14, df[col])
        df[col] = np.where(df[col].between(760, 779, inclusive=True), 15, df[col])
        df[col] = np.where(df[col].between(780, 799, inclusive=True), 16, df[col])
        df[col] = np.where(df[col].between(800, 999, inclusive=True), 17, df[col])
    return df

# drop few columns such as patient_nbr, encounter_id, weight, payer_code, medical_specialty
df = df.drop(['patient_nbr', 'encounter_id', 'weight', 'payer_code', 'medical_specialty', 'citoglipton', 'examide'], axis=1)

# fixing race column
df = replaceColumn(df, 'race', '?', "unknown")
df = transformLabelEncoder(df, "race")

# dropping rows where gender is invalid
df = df[df.gender != 'Unknown/Invalid']

# fixing age column
# TODO maybe send a list of tuple to do mass replace in 1 shot
df = replaceColumn(df, 'age', '[0-10)', 4)
df = replaceColumn(df, 'age', '[10-20)', 14)
df = replaceColumn(df, 'age', '[20-30)', 24)
df = replaceColumn(df, 'age', '[30-40)', 34)
df = replaceColumn(df, 'age', '[40-50)', 44)
df = replaceColumn(df, 'age', '[50-60)', 54)
df = replaceColumn(df, 'age', '[60-70)', 64)
df = replaceColumn(df, 'age', '[70-80)', 74)
df = replaceColumn(df, 'age', '[80-90)', 84)
df = replaceColumn(df, 'age', '[90-100)', 94)

# fixing admission_type_id column
df = replaceColumnList(df, 'admission_type_id', [2,7], 1)
df = replaceColumnList(df, 'admission_type_id', [6,8], 5)
df = replaceColumnList(df, 'admission_type_id', [4], 3)

# fixing admission_source_id column
df = replaceColumnList(df, 'admission_source_id', [2,3], 1)
df = replaceColumnList(df, 'admission_source_id', [5,6,22], 4)
df = replaceColumnList(df, 'admission_source_id', [10,25], 7)
df = replaceColumnList(df, 'admission_source_id', [17,20], 9)
df = replaceColumnList(df, 'admission_source_id', [13,14], 11)

print('admission_type_id', df['admission_type_id'][df['admission_type_id'] == 7].count())
print('admission_source_id', df['admission_source_id'][df['admission_source_id'] == 2].count())
print('discharge_disposition_id', df['discharge_disposition_id'][df['discharge_disposition_id'] == 11].count())

# dropping people who expired already
df = df[df.discharge_disposition_id != 11]

print('discharge_disposition_id', df['discharge_disposition_id'][df['discharge_disposition_id'] == 11].count())

# cleaning up the max glu serum
df = replaceColumnList(df, 'max_glu_serum', ['>300', '>200'], 1)
df = replaceColumnList(df, 'max_glu_serum', ['Norm'], 0)
df = replaceColumnList(df, 'max_glu_serum', ['None'], -1)

print('max_glu_serum : 1', df['max_glu_serum'][df['max_glu_serum'] == 1].count())
print('max_glu_serum : 0', df['max_glu_serum'][df['max_glu_serum'] == 0].count())

# cleaning up the A1Cresult
df = replaceColumnList(df, 'A1Cresult', ['>7', '>8'], 1)
df = replaceColumnList(df, 'A1Cresult', ['Norm'], 0)
df = replaceColumnList(df, 'A1Cresult', ['None'], -1)

print('A1Cresult : 1', df['A1Cresult'][df['A1Cresult'] == 1].count())
print('A1Cresult : 0', df['A1Cresult'][df['A1Cresult'] == 0].count())

# cleaning up the diag columns
df = diagColn(df)

print('diag_1 : 0', df['diag_1'][df['diag_1'] == 250.83].count())
    
# Run the normalizer on the dataframe
# df_norm = scale(df, 'time_in_hospital')
# df_norm = scale(df_norm, 'num_lab_procedures')
# df_norm = scale(df_norm, 'num_procedures')
# df_norm = scale(df_norm, 'num_medications')
# df_norm = scale(df_norm, 'number_outpatient')
# df_norm = scale(df_norm, 'number_emergency')
# df_norm = scale(df_norm, 'number_inpatient')
# df_norm

drugEncoder = preprocessing.LabelEncoder()
drugEncoder.fit(df['metformin'])
for name,values in df.loc[:, 'metformin': 'metformin-pioglitazone'].iteritems():
    df[name] = drugEncoder.transform(values)
print("map of encoder: " + str(list(drugEncoder.classes_)))


df = transformLabelEncoder(df, 'change')
df = transformLabelEncoder(df, 'diabetesMed')
df = replaceColumnList(df, 'readmitted', ['>30','<30'], 1)
df = replaceColumnList(df, 'readmitted', ['NO'], 0)

df = transformLabelEncoder(df, 'gender')

df.to_csv("dataCategorizedNew.csv", sep=',', header=True)



admission_type_id 0
admission_source_id 0
discharge_disposition_id 1642
discharge_disposition_id 0
max_glu_serum : 1 2651
max_glu_serum : 0 2574
A1Cresult : 1 11935
A1Cresult : 0 4942
diag_1 : 0 0
map of encoder: ['Down', 'No', 'Steady', 'Up']


In [40]:
cols = pd.read_csv("dataCategorizedNew.csv", nrows=1).columns

Preprocessed_df_x = pd.read_csv("dataCategorizedNew.csv", usecols=cols[:-1])
Preprocessed_df_y = pd.read_csv("dataCategorizedNew.csv", usecols=cols[-1:])

In [41]:
Preprocessed_df_x

Unnamed: 0.1,Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,0,2,0,4,5,25,1,1,41,0,...,1,1,1,1,1,1,1,1,1,0
1,1,2,0,14,1,1,7,3,59,0,...,1,1,3,1,1,1,1,1,0,1
2,2,0,0,24,1,1,7,2,11,5,...,1,1,1,1,1,1,1,1,1,1
3,3,2,1,34,1,1,7,2,44,1,...,1,1,3,1,1,1,1,1,0,1
4,4,2,1,44,1,1,7,1,51,0,...,1,1,2,1,1,1,1,1,0,1
5,5,2,1,54,1,1,1,3,31,6,...,1,1,2,1,1,1,1,1,1,1
6,6,2,1,64,3,1,1,4,70,1,...,1,1,2,1,1,1,1,1,0,1
7,7,2,1,74,1,1,7,5,73,0,...,1,1,1,1,1,1,1,1,1,1
8,8,2,0,84,1,1,4,13,68,2,...,1,1,2,1,1,1,1,1,0,1
9,9,2,0,94,3,3,4,12,33,3,...,1,1,2,1,1,1,1,1,0,1


In [42]:
Preprocessed_df_x.corr()

Unnamed: 0.1,Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
Unnamed: 0,1.0,0.081784,0.004736,0.078267,-0.145881,-0.150084,-0.018618,-0.066695,-0.024632,-0.020139,...,-0.009465,-0.02034,0.051261,0.040706,0.002788,0.002819,0.006099,0.005133,-0.11136,0.058457
race,0.081784,1.0,0.0628,0.113806,0.064847,-0.022534,-0.026735,-0.018885,-0.014693,0.024162,...,0.001482,-0.001649,-0.001456,0.010674,0.003085,0.000856,0.014821,0.000856,-0.014743,0.011376
gender,0.004736,0.0628,1.0,-0.050733,0.01356,-0.022588,-0.009868,-0.030499,-0.003073,0.060519,...,0.005907,0.007357,0.0036,0.003877,0.007023,-0.002929,0.004823,-0.002929,-0.015376,0.016717
age,0.078267,0.113806,-0.050733,1.0,-0.005052,0.105074,0.039205,0.10819,0.01788,-0.029493,...,-0.00143,0.005405,-0.023758,-0.00109,0.002154,-0.000165,0.00257,-0.000165,0.034388,-0.019786
admission_type_id,-0.145881,0.064847,0.01356,-0.005052,1.0,0.091328,-0.18849,-0.023274,-0.158191,0.117486,...,0.002261,0.007571,-0.009192,-0.001577,-0.002757,-0.001861,0.000727,0.002889,-0.009714,-0.002356
discharge_disposition_id,-0.150084,-0.022534,-0.022588,0.105074,0.091328,1.0,-0.003204,0.164144,0.013025,0.008976,...,0.006691,0.00979,-0.024944,-0.003889,0.000711,-0.001566,-8.2e-05,-0.00036,0.007192,-0.023664
admission_source_id,-0.018618,-0.026735,-0.009868,0.039205,-0.18849,-0.003204,1.0,0.001787,0.160561,-0.19497,...,0.003573,0.002068,-0.006754,-0.017331,0.001961,0.002063,-0.001737,-0.004519,-0.007901,0.001716
time_in_hospital,-0.066695,-0.018885,-0.030499,0.10819,-0.023274,0.164144,0.001787,1.0,0.319741,0.189963,...,0.003577,-0.003211,0.045079,-0.003582,-0.000608,-0.002539,-0.000585,0.001711,-0.10758,0.060832
num_lab_procedures,-0.024632,-0.014693,-0.003073,0.01788,-0.158191,0.013025,0.160561,0.319741,1.0,0.051744,...,0.003829,-0.000317,0.035095,-0.00958,-0.007518,-0.000796,0.001266,-0.003212,-0.065362,0.034357
num_procedures,-0.020139,0.024162,0.060519,-0.029493,0.117486,0.008976,-0.19497,0.189963,0.051744,1.0,...,-0.004284,0.002228,0.00727,-0.00129,-0.00531,-0.002474,0.004388,-0.000615,-0.002924,-0.009345


In [43]:
Preprocessed_df_x.cov()

Unnamed: 0.1,Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
Unnamed: 0,864037200.0,2368.186731,69.400334,36688.8656,-5706.920862,-23113.952635,-1576.599941,-5831.416819,-14206.240778,-1006.533028,...,-1.522968,-12.2433,1262.871337,101.2691,0.9337472,0.261904,0.8012151,0.4768361,-1632.422325,721.100883
race,2368.187,0.970431,0.030843,1.787877,0.085018,-0.116302,-0.075873,-0.055337,-0.283982,0.040471,...,7.991928e-06,-3.327316e-05,-0.001202,0.0008899499,3.463169e-05,2.663976e-06,6.525604e-05,2.663976e-06,-0.007243,0.004703
gender,69.40033,0.030843,0.248558,-0.403362,0.008997,-0.059002,-0.014173,-0.045229,-0.030059,0.051302,...,1.612093e-05,7.511347e-05,0.001504,0.0001635862,3.98933e-05,-4.614373e-06,1.074728e-05,-4.614373e-06,-0.003823,0.003498
age,36688.87,1.787877,-0.403362,254.320098,-0.107213,8.779275,1.801148,5.132084,5.594649,-0.799739,...,-0.000124804,0.001765164,-0.317545,-0.001470882,0.0003913972,-8.307966e-06,0.0001831444,-8.307966e-06,0.273482,-0.132418
admission_type_id,-5706.921,0.085018,0.008997,-0.107213,1.771218,0.636816,-0.722681,-0.092136,-4.130756,0.265861,...,1.647367e-05,0.0002063316,-0.010253,-0.000177637,-4.181157e-05,-7.826128e-06,4.323773e-06,1.21499e-05,-0.006447,-0.001316
discharge_disposition_id,-23113.95,-0.116302,-0.059002,8.779275,0.636816,27.450206,-0.048358,2.558091,1.338899,0.079967,...,0.0001918883,0.00105032,-0.109534,-0.001724509,4.24627e-05,-2.592937e-05,-1.918673e-06,-5.953344e-06,0.01879,-0.05203
admission_source_id,-1576.6,-0.075873,-0.014173,1.801148,-0.722681,-0.048358,8.299344,0.015313,9.075557,-0.955041,...,5.634124e-05,0.0001219716,-0.016308,-0.004225683,6.43611e-05,1.878041e-05,-2.236726e-05,-4.114767e-05,-0.011352,0.002074
time_in_hospital,-5831.417,-0.055337,-0.045229,5.132084,-0.092136,2.558091,0.015313,8.847795,18.660661,0.960768,...,5.824574e-05,-0.0001955568,0.112383,-0.0009019001,-2.060751e-05,-2.386615e-05,-7.780238e-06,1.608591e-05,-0.159583,0.075935
num_lab_procedures,-14206.24,-0.283982,-0.030059,5.594649,-4.130756,1.338899,9.075557,18.660661,384.967088,1.726255,...,0.0004112055,-0.0001271948,0.577117,-0.01590789,-0.001680621,-4.937444e-05,0.0001109994,-0.0001991947,-0.639544,0.282893
num_procedures,-1006.533,0.040471,0.051302,-0.799739,0.265861,0.079967,-0.955041,0.960768,1.726255,2.891113,...,-3.987644e-05,7.758301e-05,0.01036,-0.0001855967,-0.0001028818,-1.329215e-05,3.334379e-05,-3.304132e-06,-0.002479,-0.006669


In [44]:
from sklearn.neural_network import MLPClassifier

In [49]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
...                     hidden_layer_sizes=(5, 2), random_state=1)

In [46]:
clf.fit(Preprocessed_df_x,Preprocessed_df_y)

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [47]:
from sklearn.model_selection import train_test_split

In [87]:
X_train, X_test, Y_train, Y_test = train_test_split(Preprocessed_df_x, Preprocessed_df_y, test_size=0.20, random_state=80)

In [88]:
from sklearn.linear_model import LogisticRegression

In [111]:
LRclassifier = LogisticRegression(class_weight = 'balanced',random_state = 20, solver = 'liblinear',multi_class = 'ovr',verbose = 2,)
LRclassifier.fit(X_train,Y_train)
Y_predict = LRclassifier.predict(X_test)
print(Y_predict)

  y = column_or_1d(y, warn=True)


[0 1 1 ... 0 1 0]


In [112]:
from sklearn.metrics import f1_score,accuracy_score
f1_score(Y_test, Y_predict, average='macro')
accuracy_score(Y_test, Y_predict)

0.6239700374531835