## Data Analysis

## Data preprocessing

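1) Drop unwanted columns: patient_nbr, encounter_id, weight, payer_code, medical_specialty.

2) Drop the columns 'citoglipton' and 'examide', which have the same value in every row and therefore cannot help the model.

3) Clean up the race column (replace '?' with an "unknown" category and apply LabelEncoder).

4) Drop rows that have an invalid gender ('Unknown/Invalid').

5) Clean the age column by replacing each 10-year interval with a single representative value (e.g. '[40-50)' becomes 44).

6) Collapse admission_type_id into 3 categories (ids 2, 7 → 1; 6, 8 → 5; 4 → 3).

7) Collapse admission_source_id into a smaller set of categories (ids 2, 3 → 1; 5, 6, 22 → 4; 10, 25 → 7; 17, 20 → 9; 13, 14 → 11).

8) Remove rows where the patient expired (discharge_disposition_id = 11).

9) Recode max_glu_serum into 3 categories ('>200' and '>300' → 1, 'Norm' → 0, 'None' → -1).

10) Recode A1Cresult the same way ('>7' and '>8' → 1, 'Norm' → 0, 'None' → -1).

11) Clean up the diag_1, diag_2 and diag_3 columns: codes containing 'E' or 'V' become 0, '?' becomes -1, and the remaining numeric codes are bucketed into 17 ranges.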


In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Load the main data table and the ID lookup table from the dataset folder.
df, index_mapping = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset_diabetes/diabetic_data.csv",
        "dataset_diabetes/IDs_mapping.csv",
    )
)


def replaceColumn(df, col, oldval, newval):
    """Replace ``oldval`` with ``newval`` in column ``col`` of ``df``.

    The column is reassigned on ``df`` itself, so the caller's frame is
    modified; the same frame object is returned for convenience.
    """
    updated = df[col].replace(to_replace=oldval, value=newval)
    df[col] = updated
    return df

def replaceColumnList(df, col, listOfOldVal, newval):
    """Replace every value listed in ``listOfOldVal`` with ``newval`` in ``df[col]``.

    All old values are swapped in a single vectorised ``replace`` call rather
    than one full-column pass per value. The column is reassigned on ``df``
    itself and the same frame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update (modified in place).
    col : str
        Name of the column to update.
    listOfOldVal : iterable
        Values to be replaced. An empty iterable leaves ``df`` untouched.
    newval : scalar
        Replacement written for every matching cell.
    """
    old_values = list(listOfOldVal)
    # Nothing to replace: return the frame untouched.
    if old_values:
        df[col] = df[col].replace(old_values, newval)
    return df

def transformLabelEncoder(df, col):
    """Integer-encode column ``col`` of ``df`` with a fresh ``LabelEncoder``.

    The encoder is fitted on the column's current values and the encoded
    result is written back onto ``df``; the same frame is returned. The
    fitted encoder itself is discarded.
    """
    encoder = preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(df[col])
    df[col] = encoded
    return df

def scale(df, col):
    """Min-max scale column ``col`` of ``df`` and return ``df``.

    The column is cast to ``int`` first, then a fresh ``MinMaxScaler`` is
    fitted on it and the scaled values are written back onto ``df``.
    """
    # Double brackets keep the selection 2-D before it is handed to the scaler.
    as_int = df[[col]].values.astype(int)
    scaler = preprocessing.MinMaxScaler()
    df[col] = scaler.fit_transform(as_int)
    return df

def diagColn(df):
    """Recode the ``diag_1``, ``diag_2`` and ``diag_3`` columns into small numeric groups.

    For each of the three columns (expected to hold strings):

    * any code containing the letter ``E`` or ``V`` becomes ``0``;
    * the placeholder ``'?'`` becomes ``-1``;
    * the remaining codes are parsed as numbers and those in ``[1, 1000)``
      are replaced by a group number from 1 to 17.

    Groups are half-open intervals ``[low, next_low)``, so a fractional code
    such as ``139.5`` or ``799.9`` falls into the group of its integer part
    instead of slipping between two integer-bounded ranges. Values outside
    ``[1, 1000)`` (including the ``0`` / ``-1`` markers) are left unchanged.

    The columns are reassigned on ``df`` itself and ``df`` is returned.
    """
    # Lower edge of each group followed by the final exclusive upper edge;
    # group k covers [group_edges[k - 1], group_edges[k]).
    group_edges = [1, 140, 240, 280, 290, 320, 390, 460, 520,
                   580, 630, 680, 710, 740, 760, 780, 800, 1000]
    for col in ['diag_1', 'diag_2', 'diag_3']:
        raw = df[col]
        # na=False: cells that are not strings simply do not match.
        has_letter_code = raw.str.contains('[EV]', na=False)
        # Keep everything textual until the single numeric conversion below.
        as_text = raw.mask(has_letter_code, '0').replace('?', '-1')
        codes = np.asarray(pd.to_numeric(as_text))
        # digitize returns k such that group_edges[k - 1] <= code < group_edges[k].
        group = np.digitize(codes, group_edges)
        in_range = (codes >= group_edges[0]) & (codes < group_edges[-1])
        df[col] = np.where(in_range, group, codes)
    return df

# Drop patient_nbr, encounter_id, weight, payer_code and medical_specialty,
# plus 'citoglipton' and 'examide' (described in the preprocessing notes as
# having the same value in every row).
df = df.drop(['patient_nbr', 'encounter_id', 'weight', 'payer_code', 'medical_specialty', 'citoglipton', 'examide'], axis=1)

# fixing race column: '?' becomes its own "unknown" category, then label-encode
df = replaceColumn(df, 'race', '?', "unknown")
df = transformLabelEncoder(df, "race")

# dropping rows where gender is invalid; .copy() gives an independent frame so
# the column assignments that follow do not trigger SettingWithCopyWarning
df = df[df.gender != 'Unknown/Invalid'].copy()

# fixing age column: each 10-year bracket '[lo-hi)' is replaced by the number
# lo + 4 (e.g. '[40-50)' -> 44)
for decade_start in range(0, 100, 10):
    bracket = '[{}-{})'.format(decade_start, decade_start + 10)
    df = replaceColumn(df, 'age', bracket, decade_start + 4)

# fixing admission_type_id column: merge ids into the groups 1, 5 and 3
for old_ids, new_id in [([2, 7], 1), ([6, 8], 5), ([4], 3)]:
    df = replaceColumnList(df, 'admission_type_id', old_ids, new_id)

# fixing admission_source_id column: merge ids into the groups 1, 4, 7, 9 and 11
for old_ids, new_id in [([2, 3], 1), ([5, 6, 22], 4), ([10, 25], 7), ([17, 20], 9), ([13, 14], 11)]:
    df = replaceColumnList(df, 'admission_source_id', old_ids, new_id)

# Sanity checks: the first two counts should be 0 because those ids were just
# merged away; the third shows how many expired-patient rows are about to go.
print('admission_type_id', df['admission_type_id'][df['admission_type_id'] == 7].count())
print('admission_source_id', df['admission_source_id'][df['admission_source_id'] == 2].count())
print('discharge_disposition_id', df['discharge_disposition_id'][df['discharge_disposition_id'] == 11].count())

# dropping people who expired already; .copy() gives an independent frame so
# the column assignments that follow do not trigger SettingWithCopyWarning
df = df[df.discharge_disposition_id != 11].copy()

# should now print 0
print('discharge_disposition_id', df['discharge_disposition_id'][df['discharge_disposition_id'] == 11].count())

# cleaning up the max glu serum: elevated readings -> 1, normal -> 0, not measured -> -1
df = replaceColumnList(df, 'max_glu_serum', ['>300', '>200'], 1)
df = replaceColumnList(df, 'max_glu_serum', ['Norm'], 0)
df = replaceColumnList(df, 'max_glu_serum', ['None'], -1)
# pandas >= 2.0 parses the literal text "None" as a missing value on read, so
# those cells arrive as NaN instead of 'None'; map them to the same -1 code.
df['max_glu_serum'] = df['max_glu_serum'].fillna(-1)

print('max_glu_serum : 1', df['max_glu_serum'][df['max_glu_serum'] == 1].count())
print('max_glu_serum : 0', df['max_glu_serum'][df['max_glu_serum'] == 0].count())

# cleaning up the A1Cresult: elevated readings -> 1, normal -> 0, not measured -> -1
df = replaceColumnList(df, 'A1Cresult', ['>7', '>8'], 1)
df = replaceColumnList(df, 'A1Cresult', ['Norm'], 0)
df = replaceColumnList(df, 'A1Cresult', ['None'], -1)
# Same "None"-read-as-NaN handling as for max_glu_serum.
df['A1Cresult'] = df['A1Cresult'].fillna(-1)

print('A1Cresult : 1', df['A1Cresult'][df['A1Cresult'] == 1].count())
print('A1Cresult : 0', df['A1Cresult'][df['A1Cresult'] == 0].count())

# cleaning up the diag columns (diag_1, diag_2 and diag_3 are recoded by diagColn)
df = diagColn(df)

# Count the rows whose diag_1 still equals the raw code 250.83 after recoding.
# NOTE(review): the printed label says "0" while the check is for 250.83 -
# presumably the expected count is 0; confirm the intended label.
raw_code_rows = df['diag_1'][df['diag_1'] == 250.83]
print('diag_1 : 0', raw_code_rows.count())
    
# Run the normalizer on the dataframe
# df_norm = scale(df, 'time_in_hospital')
# df_norm = scale(df_norm, 'num_lab_procedures')
# df_norm = scale(df_norm, 'num_procedures')
# df_norm = scale(df_norm, 'num_medications')
# df_norm = scale(df_norm, 'number_outpatient')
# df_norm = scale(df_norm, 'number_emergency')
# df_norm = scale(df_norm, 'number_inpatient')
# df_norm

# Encode every drug column with one shared encoder so a given label maps to the
# same integer in each column. The encoder is fitted on 'metformin' only, so
# the other columns must not contain labels absent from 'metformin'.
drugEncoder = preprocessing.LabelEncoder()
drugEncoder.fit(df['metformin'])
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# and also exists in older releases.
for name, values in df.loc[:, 'metformin': 'metformin-pioglitazone'].items():
    df[name] = drugEncoder.transform(values)
print("map of encoder: " + str(list(drugEncoder.classes_)))


df = transformLabelEncoder(df, 'change')
df = transformLabelEncoder(df, 'diabetesMed')
# Binary target: any readmission ('>30' or '<30') -> 1, no readmission -> 0
df = replaceColumnList(df, 'readmitted', ['>30','<30'], 1)
df = replaceColumnList(df, 'readmitted', ['NO'], 0)

df = transformLabelEncoder(df, 'gender')

# index=False: do not write the row index, otherwise it comes back from
# read_csv as an extra "Unnamed: 0" column and ends up among the features.
df.to_csv("dataCategorizedNew.csv", sep=',', header=True, index=False)



IOError: File diabetic_data.csv does not exist

In [None]:
# Read the preprocessed table once and split it, instead of parsing the file
# three times.
preprocessed = pd.read_csv("dataCategorizedNew.csv")
# A CSV saved together with its row index comes back with a header-less first
# column that read_csv names "Unnamed: 0"; it is a row number, not a feature.
preprocessed = preprocessed.loc[:, ~preprocessed.columns.str.startswith("Unnamed")]
cols = preprocessed.columns

# Features are every column but the last; the target is the last column,
# kept as a one-column DataFrame.
Preprocessed_df_x = preprocessed[cols[:-1]]
Preprocessed_df_y = preprocessed[cols[-1:]]

In [None]:
# Preview the first rows only instead of rendering the whole feature frame.
Preprocessed_df_x.head()

In [None]:
# Pairwise correlation between the feature columns (pandas' default method).
Preprocessed_df_x.corr()

In [None]:
# Pairwise covariance between the feature columns.
Preprocessed_df_x.cov()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    Preprocessed_df_x,
    Preprocessed_df_y,
    test_size=0.20,
    random_state=80,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Class-weighted logistic regression. `multi_class` is no longer passed: it is
# deprecated in recent scikit-learn and the liblinear solver is one-vs-rest
# anyway, so the fitted model is the same.
LRclassifier = LogisticRegression(
    class_weight='balanced',
    random_state=20,
    solver='liblinear',
    verbose=2,
)
# Y_train is a one-column frame; flatten it to the 1-D label array that
# scikit-learn expects (avoids DataConversionWarning).
LRclassifier.fit(X_train, Y_train.values.ravel())
Y_predict = LRclassifier.predict(X_test)
print(Y_predict)

In [None]:
from sklearn.metrics import f1_score,accuracy_score
# Only a cell's last expression is displayed, so the F1 score on the first
# line was computed and silently discarded; print both metrics explicitly.
print(f1_score(Y_test, Y_predict, average='macro'))
print(accuracy_score(Y_test, Y_predict))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Shallow random forest as a second model on the same split.
clf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
# Y_train is a one-column frame; flatten it to the 1-D label array that
# scikit-learn expects (avoids DataConversionWarning).
clf.fit(X_train, Y_train.values.ravel())
# NOTE: this overwrites the logistic-regression predictions held in Y_predict.
Y_predict = clf.predict(X_test)

print(f1_score(Y_test, Y_predict, average='macro'))
print(accuracy_score(Y_test, Y_predict))