In [1]:
import numpy as np
import pandas as pd
import dt

from sklearn.decomposition import PCA

# Turn off pandas' chained-assignment check for the whole session, so no
# SettingWithCopyWarning is emitted by any later cell.
pd.options.mode.chained_assignment = None  # default='warn'

Scores: [100.0, 100.0, 100.0, 100.0, 100.0]
Mean Accuracy: 100.000%


In [2]:
def temp(col):
    """Column filter for ``read_csv``: keep every column except 'payer_code'."""
    return col != 'payer_code'


# Read the pre-processed table; a literal '?' in the file is parsed as NaN.
df = pd.read_csv(
    'dataset/pre_processed.csv',
    usecols=temp,
    na_values='?',
)

# Binary target: 'No' -> 0, 'Yes' -> 1 (any other value becomes NaN).
label_codes = {'No': 0, 'Yes': 1}
y = df['diabetesMed'].map(label_codes)
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),[75-100),1.000000,1.000000,1.000000,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),[75-100),1.000000,1.000000,7.000000,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),[75-100),1.000000,1.000000,7.000000,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),[75-100),1.000000,1.000000,7.000000,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),[75-100),2.000000,1.000000,7.000000,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),[75-100),3.000000,1.000000,2.000000,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),[75-100),1.000000,1.000000,2.000000,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),[75-100),2.000000,1.000000,7.000000,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),[75-100),3.000000,3.000000,4.000000,9,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),[75-100),1.000000,1.000000,4.000000,7,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
def _age_sort_key(label):
    """Sort key for age-bucket labels such as '[10-20)'.

    Buckets sort by their numeric lower bound. Anything that cannot be
    parsed that way (NaN included) sorts after every parseable bucket;
    because ``sorted`` is stable, such labels keep their relative order.
    """
    try:
        return (0, int(str(label).lstrip('[(').split('-')[0]))
    except ValueError:
        return (1, 0)


def feature_engineering(data):
    """Build a numeric feature matrix from the raw encounter table.

    Steps, in order:
      1. drop the identifier columns and the ``diabetesMed`` label;
      2. drop columns whose values never differ from the first row;
      3. drop object columns with more than two distinct values, except
         ``gender``, ``race`` and ``age``;
      4. replace ``age`` bucket labels by their ordinal rank;
      5. one-hot encode the remaining object columns as 0/1 ``uint8``;
      6. drop every column whose name ends in ``No`` or ``id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``encounter_id``, ``patient_nbr`` and ``diabetesMed``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.

    Raises
    ------
    KeyError
        If one of the three required columns is missing.
    IndexError
        If ``data`` has no rows (the constant-column check reads row 0).
    """
    # Delete id columns and label. The `columns=` keyword is used because
    # pandas 2.0 removed the positional `axis` argument of DataFrame.drop.
    df = data.drop(columns=['encounter_id', "patient_nbr", "diabetesMed"])

    # Delete constant columns: keep a column only if some row differs from
    # the first one. NaN != NaN, so a column containing NaN is always kept.
    df = df.loc[:, (df != df.iloc[0]).any()]

    # Delete categorical columns with more than 2 values (simplicity)
    excluded_col = ['gender', 'race', 'age']
    obj_col = [col for col in df.columns
               if df.dtypes[col] == 'O'
               and len(df[col].unique()) > 2
               and col not in excluded_col]
    df = df.drop(columns=obj_col)

    # Convert age buckets to their ordinal rank (0, 1, 2, ...). Ranking by
    # the bucket's lower bound keeps the encoding correct even when the
    # buckets do not first appear in ascending order in the data.
    if 'age' in df.columns:
        ordered_ages = sorted(df['age'].unique(), key=_age_sort_key)
        age_rank = {label: rank for rank, label in enumerate(ordered_ages)}
        df = df.assign(age=df['age'].map(age_rank))

    # One-hot encode every remaining object column. dtype is pinned so the
    # dummies are 0/1 integers on every pandas version (pandas >= 2 would
    # otherwise return booleans).
    df = pd.get_dummies(df, dtype='uint8')

    # Delete columns with suffix No (the redundant "No" dummy) and id
    no_col = [col for col in df.columns if col.endswith(("No", "id"))]
    return df.drop(columns=no_col)

In [4]:
# Run the feature-engineering pipeline on the loaded table; `df` itself is
# passed in as the source frame and the result is bound to a new name.
clean_df = feature_engineering(data=df)
clean_df

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,race_Other,gender_Female,gender_Male,acetohexamide_Steady,tolbutamide_Steady,troglitazone_Steady,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,change_Ch
0,0,1,41.000000,0.00000,1,0,0,0,250.83,276.00,...,0,1,0,0,0,0,0,0,0,0
1,1,3,59.000000,0.00000,18,0,0,0,276.00,250.01,...,0,1,0,0,0,0,0,0,0,1
2,2,2,11.000000,1.00000,13,2,0,1,648.00,250.00,...,0,1,0,0,0,0,0,0,0,0
3,3,2,44.000000,0.00000,16,0,0,0,8.00,250.43,...,0,0,1,0,0,0,0,0,0,1
4,4,1,51.000000,1.00000,8,0,0,0,197.00,157.00,...,0,0,1,0,0,0,0,0,0,1
5,5,3,31.000000,0.00000,16,0,0,0,414.00,411.00,...,0,0,1,0,0,0,0,0,0,0
6,6,4,70.000000,2.00000,21,0,0,0,414.00,411.00,...,0,0,1,0,0,0,0,0,0,1
7,7,5,73.000000,3.00000,12,0,0,0,428.00,492.00,...,0,0,1,0,0,0,0,0,0,0
8,8,9,68.000000,2.00000,28,0,0,0,398.00,427.00,...,0,1,0,0,0,0,0,0,0,1
9,9,7,33.000000,0.00000,18,0,0,0,434.00,198.00,...,0,1,0,0,0,0,0,0,0,1


In [5]:
# Project the engineered features onto their first 15 principal components.
# random_state is pinned because scikit-learn's default svd_solver='auto' may
# choose the randomized solver, whose output otherwise changes between runs.
# NOTE(review): the features are not scaled before PCA, so columns with large
# numeric ranges dominate the leading components - confirm this is intended.
pca = PCA(15, random_state=0)
pca_arr = pca.fit_transform(clean_df)

# Carry over clean_df's index so each component row stays aligned with the
# row it was computed from.
pca_df = pd.DataFrame(data=pca_arr, index=clean_df.index)
pca_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-219.271598,-242.426232,-27.480207,3.460728,-13.955241,-2.942746,-1.045970,5.891416,-1.027586,-0.440124,0.734116,-0.834894,-0.589664,-0.098963,-0.080818
1,-193.372784,-256.742950,-6.076473,-14.620996,2.936194,-0.914464,1.982260,4.647890,-1.042275,-0.449101,0.678760,-0.798896,0.440311,-0.179940,-0.100130
2,177.207136,-225.229708,2.977753,33.446545,-1.865584,-1.918549,0.859384,3.933693,-0.037805,0.567608,0.862539,0.689332,-0.297287,1.891792,-0.155812
3,-472.861092,-187.071817,97.652413,0.416969,0.986289,-1.896516,2.790842,2.627975,-1.029691,-0.428920,-0.720948,-0.463622,0.448257,-0.174028,-0.101968
4,-266.953228,-338.425056,47.133971,-6.636382,-6.987853,-2.991229,-1.336745,1.898483,0.011208,-0.442977,-0.745273,-0.359254,0.434777,-0.198134,-0.119668
5,-63.581335,-123.972430,-108.588250,13.481226,1.060574,-0.946145,0.653139,0.911700,-0.998385,-0.448124,-0.771863,-0.251540,-0.505783,-0.118416,-0.091315
6,-63.578178,-123.945281,-108.587050,-25.542673,5.891980,0.024373,-0.343492,0.026696,1.021483,-0.468498,-0.803741,-0.187228,0.482918,-0.215839,-0.126124
7,-53.649613,-60.707467,-160.159220,-28.460090,-3.124595,1.013441,-0.471961,-0.935488,2.035421,-0.460143,-0.799396,-0.085709,-0.484356,-0.124307,-0.097747
8,-61.954218,-248.408365,-281.681569,-23.659076,12.955565,4.981003,-0.553020,-2.142793,1.030160,-0.485179,0.545989,-0.261303,0.533490,-0.223428,-0.121545
9,-53.565791,-134.442749,209.617935,11.463878,3.012006,3.013036,-1.698626,-2.957753,-0.950307,-0.465711,0.536019,-0.218086,0.543313,-0.226722,-0.132991
