In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from tqdm import tqdm
import joblib

In [2]:
#Load the dataset
path = "D:/Datasets/Diabetes/diabetic_data.csv"
#This notebook handles missing values itself via the "?" placeholder.
#keep_default_na=False stops read_csv from turning literal strings such as
#"None" (a real category of A1Cresult / max_glu_serum below) into NaN, which
#recent pandas versions do by default.
df = pd.read_csv(path, keep_default_na=False)

df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
#Check data types: object columns are the ones that still need encoding below
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

# 1. Value transformation

In [4]:
#acetohexamide, citoglipton and examide hold a single value only;
#encounter_id, patient_nbr and payer_code carry no predictive meaning.
columns_to_drop = [
    "acetohexamide", "citoglipton", "examide",
    "encounter_id", "patient_nbr", "payer_code",
]
df = df.drop(columns=columns_to_drop)
df.columns

Index(['race', 'gender', 'age', 'weight', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
       'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
       'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
#For A1Cresult column
def Replace_values_A1C(a):
    """Recode one A1Cresult entry.

    ">7" and ">8" are merged into "Abnorm", the "?" placeholder becomes NaN,
    and every other value is returned unchanged.
    """
    if a in (">7", ">8"):
        return "Abnorm"
    if a == "?":
        # np.nan (lower case) is the spelling that exists in every NumPy version;
        # the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return a

#Recode A1Cresult element-wise, then show the resulting class counts
df["A1Cresult"] = df["A1Cresult"].map(Replace_values_A1C)
df["A1Cresult"].value_counts()

None      84748
Abnorm    12028
Norm       4990
Name: A1Cresult, dtype: int64

In [6]:
#For max_glu_serum column
def Replace_values_MGS(a):
    """Recode one max_glu_serum entry.

    ">200" and ">300" are merged into "Abnorm", the "?" placeholder becomes
    NaN, and every other value is returned unchanged.
    """
    if a in (">200", ">300"):
        return "Abnorm"
    if a == "?":
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return a
    
#Recode max_glu_serum element-wise, then show the resulting class counts
df["max_glu_serum"] = df["max_glu_serum"].map(Replace_values_MGS)
df["max_glu_serum"].value_counts()

None      96420
Abnorm     2749
Norm       2597
Name: max_glu_serum, dtype: int64

In [7]:
#For age column
def Replace_values_Age(a):
    """Collapse one age-bracket label into a coarse age group.

    "[0-10)" -> "child", "[10-20)" -> "young", the four brackets from
    "[20-30)" to "[50-60)" -> "adult", "?" -> NaN.
    Any other input (the older brackets, but also any unexpected value)
    falls through to "elderly".
    """
    if a == "[0-10)":
        return "child"
    if a == "[10-20)":
        return "young"
    if a in ("[20-30)", "[30-40)", "[40-50)", "[50-60)"):
        return "adult"
    if a == "?":
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return "elderly"
    
#Group the age brackets element-wise, then show the resulting class counts
df["age"] = df["age"].map(Replace_values_Age)
df["age"].value_counts()

elderly    68541
adult      32373
young        691
child        161
Name: age, dtype: int64

In [8]:
#For diag_1, diag_2 and diag_3 columns
#Each column gets its own mapping: every distinct code is numbered in order of
#first appearance and the "?" placeholder is mapped to NaN.
diag_maps = {}
for diag_col in ("diag_1", "diag_2", "diag_3"):
    codes = df[diag_col].unique().tolist()
    #Raises ValueError when the column holds no "?" (e.g. when this cell is
    #re-run on already encoded data), exactly like the original cell did.
    codes.remove("?")
    code_to_id = dict(zip(codes, np.arange(len(codes))))
    #np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
    code_to_id["?"] = np.nan
    diag_maps[diag_col] = code_to_id
    df[diag_col] = df[diag_col].map(code_to_id)

#Keep the per-column mapping names that the original cell exposed
map_d1 = diag_maps["diag_1"]
map_d2 = diag_maps["diag_2"]
map_d3 = diag_maps["diag_3"]

df["diag_1"].unique()

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
       110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
       121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131.,
       132., 133., 134., 135., 136., 137., 138.,  nan, 139., 140., 141.,
       142., 143., 144., 145., 146., 147., 148., 14

In [9]:
#For readmitted column
#Readmitted within 30 days -> 1,  not -> 0
def Replace_values_Readmitted(a):
    """Binarise one readmitted label.

    "<30" -> 1, the "?" placeholder -> NaN, every other value -> 0.
    """
    if a == "<30":
        return 1
    if a == "?":
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return 0
    
#Binarise the target element-wise, then show the class balance
df["readmitted"] = df["readmitted"].map(Replace_values_Readmitted)
df["readmitted"].value_counts()

0    90409
1    11357
Name: readmitted, dtype: int64

In [10]:
#Map other nominal features
def Map_Object_Column(column):
    """Integer-encode a Series of nominal values.

    Distinct values are numbered 0, 1, 2, ... in order of first appearance.
    The "?" placeholder is never given a number; it is mapped to NaN instead.

    Parameters
    ----------
    column : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        ``column.map(...)`` result holding the integer codes (NaN for "?").
    """
    values = [value for value in column.unique().tolist() if value != "?"]

    map_column = {value: code for code, value in enumerate(values)}
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
    map_column["?"] = np.nan

    return column.map(map_column)

#Integer-encode every column that is still of object dtype
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    df[name] = Map_Object_Column(df[name])


#Store the dataset after value transformation
df.to_csv("D:/Datasets/Diabetes/Diabetes_withMissingValues.csv")

# 2. Missing value handling and data balancing
### 2.1 Drop the columns with missing value > 40%

In [None]:
#Find the missing values
def Missing_Value_Identification(a):
    """Return NaN for the "?" placeholder and the value itself otherwise."""
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
    return np.nan if a == "?" else a

#Turn any remaining "?" placeholder into NaN, one column at a time
for col_name in df.columns:
    df[col_name] = df[col_name].map(Missing_Value_Identification)

#Proportion of missing values
df.isnull().sum() / len(df)

In [None]:
#Preview the first rows of the frame
df.head()

In [None]:
#weight and medical_specialty exceed the 40% missing-value threshold: remove them
sparse_columns = ["weight", "medical_specialty"]
df = df.drop(columns=sparse_columns)
df.columns

In [None]:
#Number of missing values per column
df.isnull().sum()

In [None]:
#Registry of missing-value positions, filled by Index_Nan_In_Row:
#one entry per row that contains at least one NaN
idex_nans = {}

def Index_Nan_In_Row(row_id):
    """Record the column positions of the missing values in one row of ``df``.

    Reads the module-level frame ``df`` and writes to the module-level dict
    ``idex_nans``. When the row contains at least one missing value, the entry
    ``idex_nans[str(row_id)]`` is set to the text of the list of column
    positions, e.g. ``"[3, 15]"``. Rows without missing values add no entry.

    Parameters
    ----------
    row_id : int
        Positional index of the row to inspect.
    """
    # isna() detects missing values directly instead of comparing str(x) to "nan",
    # and avoids integer-key access on a Series whose index holds column names.
    missing_mask = df.iloc[row_id, :].isna().to_numpy()

    # Plain Python ints keep the stored text independent of how the installed
    # NumPy version prints its scalars.
    idx_nan = [int(position) for position in np.flatnonzero(missing_mask)]

    if idx_nan:
        idex_nans[str(row_id)] = str(idx_nan)

#Visit every row position and register where its missing values are
n_rows_total = df.shape[0]
for row_id in range(n_rows_total):
    Index_Nan_In_Row(row_id)

In [None]:
#Persist the missing-value registry as JSON
import json

missing_idx_path = "D:/Datasets/Diabetes/missing_value_idx.json"
with open(missing_idx_path, "w") as json_file:
    json.dump(idex_nans, json_file)

### 2.2 Arithmetic mean and mode

In [None]:
#Since the remaining columns with missing values are of nominal type,
#use the mode to fill the missing data

#Check the nominal features
df_mode = df.copy()

#Series.mode() returns the most frequent value(s) with NaN excluded; take the
#first one. Indexing value_counts() with [0] returns a frequency (the count
#stored under label 0, or the largest count), not the category itself, so the
#gaps would be filled with a count.
mode_race = df_mode["race"].mode().iloc[0]
mode_d1 = df_mode["diag_1"].mode().iloc[0]
mode_d2 = df_mode["diag_2"].mode().iloc[0]
mode_d3 = df_mode["diag_3"].mode().iloc[0]

mode_race, mode_d1, mode_d2, mode_d3

In [None]:
#Data types of the frame at this point of the notebook
df.dtypes

In [None]:
#Fill each nominal column with its own fill value in one call.
#Calling fillna(..., inplace=True) on a column selected with df_mode[...]
#acts on an intermediate object and leaves df_mode untouched once pandas
#Copy-on-Write is active, so the result is assigned back instead.
df_mode = df_mode.fillna({
    "race": mode_race,
    "diag_1": mode_d1,
    "diag_2": mode_d2,
    "diag_3": mode_d3,
})

#Total number of missing cells left
df_mode.isnull().sum().sum()

### 2.3 SMOTE and undersampling

In [None]:
#Predictors are all columns but the last one; the last column is the target

X, Y = df_mode.iloc[:, :-1], df_mode.iloc[:, -1]

In [None]:
#SMOTE the minority class, undersample the majority class
#The seed is set on SMOTE itself: SMOTEENN uses a user-supplied SMOTE object
#as given and applies its own random_state only to the SMOTE it would create
#by default, so the original resampling was not reproducible.
#SMOTE's n_jobs argument is deprecated in recent imbalanced-learn releases
#and is therefore no longer passed.
smote = SMOTE(random_state=100)
enn = EditedNearestNeighbours(n_jobs=-1)
smoteenn = SMOTEENN(smote=smote, enn=enn, random_state=100)

X_resampled, Y_resampled = smoteenn.fit_resample(X, Y)

Y_resampled.value_counts()

### 2.3 Random Forest method

#### 2.3.1 Tuning of RF

In [None]:
# #1. Pre-train the RF
# #Tune with Hyperparameter range
# param_grid={
#     "n_estimators":[800,900,1000],
#     "max_depth":[3,4,5,6],
#     "min_impurity_decrease":[0,0.025,0.05],
#     "max_samples":[15, 20, 30]
# }

# rf_test = RandomForestClassifier(criterion="gini", bootstrap=True, oob_score=True, n_jobs=-1, random_state=1)
# search = GridSearchCV(rf_test, param_grid, n_jobs=-1)
# search.fit(X_resampled, Y_resampled)

# print("Best parameter (ob score=%0.3f):" % search.best_score_)
# print(search.best_params_)

In [None]:
#2. Pretrain a RF with the best hyperparameters
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion="gini",
    max_depth=5,
    min_impurity_decrease=0.,
    max_samples=30,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=1,
)
rf.fit(X_resampled, Y_resampled)

print("OOB acc: ", rf.oob_score_)

#Persist the fitted forest so later cells can reload it
joblib.dump(rf, "D:/RF_pretrained.sav")

In [None]:
#Load the dataset whose gaps were filled by the previous RF iteration and
#discard the first column of the file
df_rf1 = pd.read_csv("D:/Diabetes_FillMissing_by_RF1.csv").iloc[:, 1:]
df_rf1.head()

In [None]:
#Reload the pretrained forest and apply it to the predictors
#(all columns but the last) of the dataset holding the initial guesses
rf = joblib.load("D:/RF_pretrained.sav")
features_rf1 = df_rf1.iloc[:, :-1]
leaf_idx = rf.apply(features_rf1)

np.unique(leaf_idx, return_counts=True)

#### 2.3.2 Proximity Matrix

In [None]:
#3. Build proximity matrix
import time
from numba import jit,guvectorize, int64, vectorize, int32
from numba import cuda

# Compiled through numba's jit decorator with parallel=True requested.
@jit(parallel=True)
def Comparison(arr1, arr2):
    """Return how many positions of arr1 and arr2 compare equal.

    NOTE(review): presumably both arguments are 1-D arrays of equal length
    (one row of leaf indices each) -- confirm against the caller.
    """
    # Element-wise equality gives a boolean array; summing it counts the matches.
    comparison = arr1 == arr2
    return np.sum(comparison)
    
    
def Proximity_Matrix(pm, leaf_idx):
    """Accumulate pairwise row-match counts of ``leaf_idx`` into ``pm`` in place.

    For every pair of rows ``i < j`` the number of positions at which the two
    rows hold the same value is added to both ``pm[i, j]`` and ``pm[j, i]``.
    The diagonal of ``pm`` is never touched. A progress line with the elapsed
    and the estimated remaining time is printed every 200 rows.

    Parameters
    ----------
    pm : numpy.ndarray
        Square 2-D array with one row and column per row of ``leaf_idx``.
        Modified in place.
    leaf_idx : numpy.ndarray
        2-D array whose rows are compared with each other.

    Returns
    -------
    None
    """
    n_samples = leaf_idx.shape[0]
    n_total = n_samples * (n_samples - 1) / 2

    start = time.time()
    for i in range(n_samples - 1):
        # Compare row i with all later rows in one vectorised step instead of
        # one Python-level call per pair.
        matches = np.count_nonzero(leaf_idx[i + 1:] == leaf_idx[i], axis=1)
        matches = matches.astype(pm.dtype, copy=False)

        pm[i, i + 1:] += matches
        pm[i + 1:, i] += matches

        #Demonstrate the progress
        if i % 200 == 0:
            elapsed_sec = time.time() - start

            # Pairs finished after rows 0..i: (n-1) + (n-2) + ... + (n-1-i)
            n_done = (i + 1) * (2 * n_samples - i - 2) / 2
            # Remaining time = average time per finished pair * pairs still open.
            # The old estimate divided a value already in minutes by 60 again
            # and reported the total instead of the remaining time.
            remaining_sec = elapsed_sec / n_done * (n_total - n_done)

            print("PM progress... {}-th 200-sample is done! Elapsed time {} mins\n"
                  "            Rest time: {} mins".format(
                      (i + 1) // 200, elapsed_sec // 60, remaining_sec // 60 + 1))
                
    

In [None]:
#Square int16 matrix with one row and one column per row of leaf_idx,
#filled in place by Proximity_Matrix
n_rows = leaf_idx.shape[0]
pm = np.zeros((n_rows, n_rows), dtype=np.int16)
Proximity_Matrix(pm, leaf_idx)

In [None]:
#Scratch check of the argwhere/size counting idiom used by Get_Value_Frequency:
#the value 4 occurs three times in this array
a = np.asarray([1, 2, 3, 4, 5, 4, 7, 8, 4])
idx = np.argwhere(a == 4)
a[idx].size

In [None]:
#Persist the proximity matrix to disk in NumPy's .npy format
np.save("D:/pm2.npy", pm)

In [None]:
#Get the value frequency in DF
@jit
def Get_Value_Frequency(value, arr):
    """Return the fraction of elements of ``arr`` that are equal to ``value``.

    NOTE(review): presumably ``arr`` is a 1-D numeric column -- confirm
    against the caller.
    """
    #Positions at which the column holds the desired value
    idx = np.argwhere(arr==value)
    
    #Number of elements selected by those positions, i.e. the match count
    value_sum = arr[idx].size
    
    #Relative frequency: matches divided by the total number of elements
    return value_sum/arr.size


#Calculate weight frequency
@jit(parallel=True)
def Weight_Frequency(pm, row_id, P_value, F_value):
    """Return ``F_value`` scaled by the share ``P_value`` has in row ``row_id`` of ``pm``.

    The result is ``F_value * (P_value / pm[row_id].sum())``. No guard exists
    for a row whose sum is zero.
    """
    #Sum of row row_id of the proximity matrix
    P_values = pm[row_id].sum()
    #Share of that sum that belongs to the value under consideration
    W_value = P_value/P_values

    #Weight Frequency of the value
    return F_value*W_value


#4. Calculate the final guess

def Refine_Guess(row_id, column_id, df, pm, nominal):
    """Pick the replacement for one missing cell from the proximity matrix.

    For every distinct non-NaN value of column ``column_id`` the weighted
    frequency ``Weight_Frequency(pm, row_id, P_value, F_value)`` is computed,
    where ``F_value`` is the value's relative frequency in the column
    (``Get_Value_Frequency``) and ``P_value`` is the sum of ``pm[row_id][v]``
    over all rows ``v`` holding that value. The value with the highest
    weighted frequency wins.

    Parameters
    ----------
    row_id : int
        Row of the cell to fill (selects the row of ``pm`` that is used).
    column_id : int
        Column of the cell to fill.
    df : numpy.ndarray
        2-D numeric array holding the dataset (indexed as ``df[:, column_id]``).
    pm : numpy.ndarray
        Proximity matrix, indexed by row position.
    nominal : bool
        When true, the winning value is returned.

    Returns
    -------
    The winning value when ``nominal`` is true; ``None`` otherwise, because
    the weighted-average branch for numeric features is not implemented.

    Raises
    ------
    ValueError
        From ``np.argmax`` when the column holds no non-NaN value.
    """
    column = df[:, column_id]
    proximities = pm[row_id]

    #All the unique, non-missing values in the column
    arr_unique = np.unique(column)
    values = arr_unique[~np.isnan(arr_unique)]

    #For nominal feature, use weight frequency
    WF_values = []
    for value in tqdm(values):
        #Frequency of the value in the column
        F_value = Get_Value_Frequency(value=value, arr=column)

        #Proximity of the value: one masked sum instead of a Python loop over
        #every row. ndarray.sum() accumulates small integer dtypes in the
        #platform integer, so an int16 proximity matrix cannot overflow here,
        #which the element-by-element "+=" on int16 scalars could do.
        P_value = proximities[column == value].sum()

        WF_values.append(Weight_Frequency(pm, row_id, P_value, F_value))

    #Find the highest WF_value's index for nominal feature
    idx = np.argmax(WF_values)

    #Return the final guessed value
    if nominal==True:
        return values[idx]

In [None]:
#5. Repeat 1~ 4 steps
#Until convergence between current guesses and the last guesses
#or difference below some tolerance

def _parse_column_ids(entry):
    """Return the column positions stored in one idex_nans entry as ints."""
    if isinstance(entry, str):
        import re
        # Entries are stored as the text of a list, e.g. "[3, 15]". When the
        # list was built from NumPy scalars on NumPy >= 2 the text reads
        # "[np.int64(3), ...]"; the look-behind skips the digits of "int64".
        return [int(token) for token in re.findall(r"(?<![\w.])\d+", entry)]
    return [int(position) for position in entry]


def Reat_Until_Converge(df, pm, max_iters, toler):
    """Refine the guesses of all missing cells until they stop changing.

    Every cell registered in the module-level dict ``idex_nans`` is re-estimated
    with ``Refine_Guess`` (nominal mode). One pass over all registered cells
    is one iteration; the mean absolute difference between the previous and
    the new guesses is that iteration's error. Iteration stops as soon as the
    error is ``<= toler`` or after ``max_iters`` passes.

    The random forest and the proximity matrix are NOT rebuilt between
    iterations; ``pm`` is used as given.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the current guesses. It is not modified.
    pm : numpy.ndarray
        Proximity matrix passed through to ``Refine_Guess``.
    max_iters : int
        Maximum number of passes.
    toler : float
        Error threshold at which iteration stops.

    Returns
    -------
    (numpy.ndarray, float or None)
        The refined data as an array and the error of the last pass
        (``None`` when ``max_iters`` is 0). The same two-element tuple is
        returned whether or not the tolerance was reached.
    """
    #Copy once so each pass starts from the guesses of the previous pass;
    #copying inside the loop would discard them and repeat the same pass.
    df_rf = df.copy().to_numpy()
    error = None

    for itera in range(max_iters):
        values_last_guessed = []
        values_current_guess = []

        #Keys are used exactly as stored (strings), int() is only applied
        #where a row position is needed for indexing.
        for row_key in tqdm(list(idex_nans.keys())):
            row_id = int(row_key)
            for column_id in _parse_column_ids(idex_nans[row_key]):
                #Get the last guess
                values_last_guessed.append(df_rf[row_id, column_id])

                #Get the next guess
                value_current_guess = Refine_Guess(row_id, column_id, df_rf, pm, nominal=True)
                values_current_guess.append(value_current_guess)

                #Change the dataset according to current guess
                df_rf[row_id, column_id] = value_current_guess

        values_last_guessed = np.array(values_last_guessed)
        values_current_guess = np.array(values_current_guess)

        if values_last_guessed.shape[0] == 0:
            #Nothing to refine: avoid dividing by zero
            error = 0.0
        else:
            error = np.sum(np.abs(values_last_guessed-values_current_guess))/values_last_guessed.shape[0]
        print("Iteration: ", itera, "  error: ", error)

        if error <= toler:
            break

    return df_rf, error

In [None]:
#Run a single refinement pass on the RF1 dataset with zero tolerance
df_rf, error = Reat_Until_Converge(df_rf1, pm, max_iters=1, toler=0)
df_rf.shape, error

In [None]:
#Wrap the refined array in a frame carrying df_mode's column names and
#write it out as the RF-imputed dataset
rf_output_path = "D:/Diabetes_FillMissing_by_RF2.csv"
dataframe_rf = pd.DataFrame(data=df_rf, columns=df_mode.columns)
dataframe_rf.to_csv(rf_output_path)