### Impute missing values, then encode binary variables as ±1 and categorical variables as one-hot

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Seed NumPy's global random number generator so any NumPy randomness is repeatable.
np.random.seed(1)

In [3]:
# Load the cleaned dataset from the working directory and preview the first rows.
df = pd.read_csv("data_cleaned.csv")
df.head()

Unnamed: 0,site,gender,age,race_ethnicity,maritalstatus,education,lowhealthliteracy,employment status,healthinsurance,strength_comfort_religion,...,antidepressant_hosp,antidiabetic_hosp,betablocker,P2Y12,atrialfib_complication,vtachvfib_complication,Acute_Kidney_Injury_complication,bleeding_complication,cardiac_rehab,died_2_year
0,Massachusetts,Male,60,NHW,Married,CollegeGraduate,No,Employed,No,,...,No,No,Yes,Yes,No,No,No,No,Yes,No
1,Massachusetts,Male,61,NHW,Married,Somecollege,Yes,Unemployed/retired,No,Little/Some,...,No,Yes,Yes,Yes,No,No,No,No,Yes,No
2,Massachusetts,Male,56,NHW,NotMarried,HighSchoolorless,Yes,Unemployed/retired,Yes,Greatdeal,...,No,No,Yes,Yes,No,No,No,No,No,No
3,Massachusetts,Male,48,NHW,NotMarried,HighSchoolorless,No,Employed,No,Little/Some,...,Yes,No,Yes,No,No,No,No,No,No,No
4,Massachusetts,Male,49,NHW,NotMarried,HighSchoolorless,No,Unemployed/retired,No,Little/Some,...,Yes,No,Yes,Yes,Yes,No,No,No,Yes,No


In [4]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2068 entries, 0 to 2067
Data columns (total 60 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   site                              2068 non-null   object 
 1   gender                            2053 non-null   object 
 2   age                               2068 non-null   int64  
 3   race_ethnicity                    2059 non-null   object 
 4   maritalstatus                     2068 non-null   object 
 5   education                         2067 non-null   object 
 6   lowhealthliteracy                 2068 non-null   object 
 7   employment status                 2068 non-null   object 
 8   healthinsurance                   2068 non-null   object 
 9   strength_comfort_religion         2068 non-null   object 
 10  petition_prayer_health            2068 non-null   object 
 11  intercessory_prayers_health       2068 non-null   object 
 12  surviv

### Find variable type

In [5]:
# Separate features and target: the target is the last column of df,
# every other column is a feature.
target = df.columns[-1]
print(target)

dfy = df[target]
dfx = df.drop(columns=[target])

died_2_year


In [6]:
# Number of distinct values in each feature column; missing values are not
# counted (DataFrame.nunique drops NaN by default).
nu = dfx.nunique().to_numpy()
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[   2    2   65    3    2    3    2    2    2    3    2    2  726    5
    4    2    2    2   16 2065 2065   17    3    3   26  255  116  106
  155  308   42  246  186 1303    2    2    2    2    2    2    2    2
    2    3    3    2    2    2    2    2    2    2    2    2    2    2
    2    2    2]


In [7]:
def define_variable_type(df,nu,category_threshold=7):
    """
    Classify each variable from its number of distinct values.

    The rule is a heuristic and is not always correct: a numeric variable
    that happens to take only a few distinct values is classified as
    categorical. Adjust ``category_threshold`` to suit the data.

    Parameters
    ----------
    df : unused; kept so existing calls keep working.
    nu : 1D array-like of int
        Number of distinct (non-missing) values of each variable.
    category_threshold : int, default 7
        A non-binary variable with fewer than this many distinct values is
        treated as categorical; all others are continuous.

    Returns
    -------
    variable_type : 1D float numpy array, same length as ``nu``
        1 = binary (exactly 2 distinct values), 2 = categorical,
        3 = continuous.

    Side effects
    ------------
    Prints the positions of the binary and categorical variables.
    """
    nu = np.asarray(nu)

    is_binary = nu == 2
    # Anything below the threshold that is not binary counts as categorical;
    # this includes columns with 0 or 1 distinct values.
    is_category = ~is_binary & (nu < category_threshold)

    print('i_binary:',np.flatnonzero(is_binary).tolist())
    print('i_category:',np.flatnonzero(is_category).tolist())

    variable_type = np.full(len(nu),3.)  # continuous unless shown otherwise
    variable_type[is_binary] = 1         # binary
    variable_type[is_category] = 2       # categorical

    return variable_type

In [8]:
# Classify every feature column from its distinct-value count
# (1 = binary, 2 = categorical, 3 = continuous).
variable_type = define_variable_type(dfx,nu)
print('variable type:',variable_type)

i_binary: [0, 1, 4, 6, 7, 8, 10, 11, 15, 16, 17, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]
i_category: [3, 5, 9, 13, 14, 22, 23, 43, 44]
variable type: [1. 1. 3. 2. 1. 2. 1. 1. 1. 2. 1. 1. 3. 2. 2. 1. 1. 1. 3. 3. 3. 3. 2. 2.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### Replace missing values with the column mean (continuous variables) or the column mode (binary and categorical variables)

In [9]:
def impute_missing_mean(df,variable_type):
    """
    Fill missing values column by column.

    Binary and categorical columns (variable_type < 3) are filled with the
    most frequent value of the column; continuous columns are filled with
    the column mean.

    Parameters
    ----------
    df : pandas DataFrame
    variable_type : 1D sequence, one entry per column of ``df``
        1 = binary, 2 = categorical, 3 = continuous.

    Returns
    -------
    df2 : pandas DataFrame
        Copy of ``df`` with the missing values filled.

    Notes
    -----
    A continuous column stored as text is converted with
    ``pd.to_numeric(errors='coerce')``, so entries that cannot be parsed
    become NaN and are then filled with the mean. As in the previous
    version, the converted column is also written back into ``df`` (the
    caller's frame), so a later ``df.isna()`` reports the unparsable
    entries as missing.
    """
    df2 = df.copy()
    for i,col in enumerate(df.columns):
        if variable_type[i] < 3: # binary or category
            modes = df[col].mode()
            # mode() is empty when the whole column is missing; there is
            # nothing to impute from, so the column is left untouched.
            if len(modes) > 0:
                df2[col] = df[col].fillna(modes.iloc[0])
        else: # continuous
            # Decide from the column dtype, not from the first element:
            # the first element may itself be a missing value.
            is_text = (pd.api.types.is_object_dtype(df[col])
                       or pd.api.types.is_string_dtype(df[col]))
            if is_text:
                df[col] = pd.to_numeric(df[col],errors='coerce')

            df2[col] = df[col].fillna(df[col].mean())
    return df2

In [10]:
# Fill the missing feature values (mode for binary/categorical columns,
# mean for continuous columns) and check that no nulls remain.
dfx_imputed = impute_missing_mean(dfx,variable_type)
dfx_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2068 entries, 0 to 2067
Data columns (total 59 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   site                              2068 non-null   object 
 1   gender                            2068 non-null   object 
 2   age                               2068 non-null   int64  
 3   race_ethnicity                    2068 non-null   object 
 4   maritalstatus                     2068 non-null   object 
 5   education                         2068 non-null   object 
 6   lowhealthliteracy                 2068 non-null   object 
 7   employment status                 2068 non-null   object 
 8   healthinsurance                   2068 non-null   object 
 9   strength_comfort_religion         2068 non-null   object 
 10  petition_prayer_health            2068 non-null   object 
 11  intercessory_prayers_health       2068 non-null   object 
 12  surviv

## Data Processing

### Attributes

In [11]:
def convert_binary_and_category(x,variable_type):
    """
    Encode a feature matrix column by column as a float matrix.

    * binary column (type 1): the smallest value (in ``np.unique`` order)
      becomes -1, every other value becomes +1;
    * categorical column (type 2): one-hot encoded, one output column per
      distinct value, ordered by sorted value;
    * any other type (continuous): copied unchanged.

    The one-hot step is done with NumPy alone, so it does not depend on the
    scikit-learn version (``OneHotEncoder(sparse=...)`` was renamed to
    ``sparse_output`` in newer releases).

    Parameters
    ----------
    x : 2D numpy array, shape (n_rows, n_variables)
        Must not contain values that cannot be sorted together in a
        binary/categorical column, nor non-numeric values in a continuous
        column.
    variable_type : 1D sequence, one entry per column of ``x``
        1 = binary, 2 = categorical, 3 = continuous.

    Returns
    -------
    2D float numpy array with ``n_rows`` rows; the encoded columns appear in
    the same order as the input variables.
    """
    n_rows = x.shape[0]

    # Collect the encoded pieces and stack them once at the end instead of
    # growing the matrix inside the loop.
    pieces = []
    for i,i_type in enumerate(variable_type):
        column = x[:,i]

        if i_type == 1: # binary
            first_value = np.unique(column)[0]
            encoded = np.where(column == first_value,-1.,1.)
            pieces.append(encoded[:,np.newaxis])

        elif i_type == 2: # category
            categories,codes = np.unique(column,return_inverse=True)
            # Row k of the identity matrix is the one-hot vector of code k.
            pieces.append(np.eye(len(categories))[codes.ravel()])

        else: # continuous
            pieces.append(column[:,np.newaxis])

    if not pieces: # no variables at all
        return np.zeros((n_rows,0))

    return np.hstack(pieces).astype(float)

In [12]:
# Encode the mean/mode-imputed features: binary columns to -1/+1,
# categorical columns to one-hot, continuous columns unchanged.
x = np.array(dfx_imputed)
x_new = convert_binary_and_category(x,variable_type)

print(x_new.shape)
print(x_new)

(2068, 80)
[[ 1.  1. 60. ... -1. -1.  1.]
 [ 1.  1. 61. ... -1. -1.  1.]
 [ 1.  1. 56. ... -1. -1. -1.]
 ...
 [ 1.  1. 73. ... -1. -1.  1.]
 [ 1. -1. 58. ... -1. -1.  1.]
 [ 1.  1. 72. ... -1. -1.  1.]]


### Target

In [13]:
y = np.array(dfy)
print(np.unique(y,return_counts=True))

# Encode the target as -1/+1: the first label in sorted order maps to -1,
# every other label maps to +1.
unique_value = np.unique(y)
y_new = np.where(y == unique_value[0],-1.,1.)
print(np.unique(y_new,return_counts=True))

(array(['No', 'Yes'], dtype=object), array([1945,  123]))
(array([-1.,  1.]), array([1945,  123]))


In [14]:
# Append the encoded target as the last column and write the matrix as text.
xy_new = np.column_stack((x_new,y_new))
np.savetxt('data_processed_mean.dat',xy_new,fmt='%f')

## Impute missing values by k-NN

In [15]:
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
from scipy.stats import mode

In [16]:
def binary_category_to_integers(df,variable_type):
    """
    Label-encode the binary and categorical columns of a data frame.

    Parameters
    ----------
    df : pandas DataFrame
    variable_type : 1D sequence, one entry per column of ``df``
        1 = binary, 2 = categorical, 3 = continuous.

    Returns
    -------
    pandas DataFrame
        Copy of ``df`` in which every column with variable_type < 3 is
        replaced by the integer codes produced by ``LabelEncoder``;
        continuous columns are left as they are.
    """
    encoder = LabelEncoder()
    encoded = df.copy()

    discrete_columns = [name for position,name in enumerate(df.columns)
                        if variable_type[position] < 3]
    for name in discrete_columns:
        # The encoder is refitted on each column, so codes are per column.
        encoded[name] = encoder.fit_transform(df[name])

    return encoded

In [17]:
def impute_missing_knn(x,variable_type,i_missed,j_missed,k_nn):
    """
    Re-impute the listed cells of ``x`` from each row's k nearest rows.

    Row-to-row distance is the Euclidean distance over the standardised
    numeric columns plus the number of mismatching binary/categorical
    columns (Hamming fraction times the number of such columns).

    Parameters
    ----------
    x : 2D numpy array, shape (n_rows, n_variables)
        Expected to be fully numeric with no NaN (the notebook passes a
        mean/mode-imputed, label-encoded matrix); the values at the listed
        positions are placeholders that take part in the distance.
    variable_type : 1D array-like, one entry per column of ``x``
        1 = binary, 2 = categorical, 3 = numeric.
    i_missed, j_missed : 1D sequences of equal length
        Row and column indices of the cells to impute.
    k_nn : int
        Number of neighbours; capped at ``n_rows - 1`` so that a row is
        never its own neighbour.

    Returns
    -------
    x_imputed : 2D numpy array
        Copy of ``x`` with the listed cells replaced. ``x`` is not modified.
        A numeric cell gets the mean of its neighbours' values. A
        binary/categorical cell gets the most frequent neighbour value when
        it occurs more than once (smallest value on ties), otherwise the
        value of the single closest row.
    """
    # Accept a plain list as well as an array (the comparison below needs
    # an array).
    variable_type = np.asarray(variable_type)
    i_numeric = np.flatnonzero(variable_type > 2)
    i_nonnumeric = np.flatnonzero(variable_type <= 2)

    n_rows = x.shape[0]
    x_imputed = x.copy()

    k_eff = min(k_nn,n_rows - 1)
    if k_eff < 1: # no other row to borrow values from
        return x_imputed

    d = np.zeros((n_rows,n_rows))

    # distance between numeric features (standard-scaled)
    if len(i_numeric) > 0:
        x1 = x[:,i_numeric].astype(float)
        scale = x1.std(axis=0)
        # A constant column would divide by zero and turn every distance
        # into NaN; leave such a column unscaled (it contributes 0).
        scale[scale == 0] = 1.
        x1 = (x1 - x1.mean(axis=0))/scale
        d += cdist(x1,x1,metric='euclidean')

    # distance between binary/category features: hamming is a fraction of
    # mismatching columns, so multiply by the column count to get a count
    if len(i_nonnumeric) > 0:
        x2 = x[:,i_nonnumeric]
        d += len(i_nonnumeric)*cdist(x2,x2,metric='hamming')

    # NaN sorts last, which removes each row from its own neighbour list
    np.fill_diagonal(d,np.nan)
    i_nn = np.argsort(d,axis=1)[:,:k_eff]

    numeric_columns = set(i_numeric.tolist())

    for i,j in zip(i_missed,j_missed):
        neighbor_values = x[i_nn[i],j] # ordered from closest to farthest

        if j in numeric_columns:
            x_imputed[i,j] = neighbor_values.mean()

        else: # binary or category
            # np.unique returns sorted values, so argmax picks the smallest
            # of the most frequent values (same tie rule as scipy's mode).
            values,counts = np.unique(neighbor_values,return_counts=True)
            top = counts.argmax()

            if counts[top] > 1: # a value shared by several neighbours
                x_imputed[i,j] = values[top]
            else: # all neighbours differ: use the closest row
                x_imputed[i,j] = neighbor_values[0]

    return x_imputed

In [18]:
# Label-encode the binary/categorical columns of the mean/mode-imputed
# features so that the whole matrix is numeric for the k-NN step.
dfx2_imputed = binary_category_to_integers(dfx_imputed,variable_type)

# Row and column positions of the missing values, read from dfx (the feature
# frame before filling); these are the cells the k-NN step re-imputes.
i_missed,j_missed = np.where(dfx.isna())

In [19]:
# Neighbourhood sizes to try: 2..10 plus round(sqrt(number of rows)).
# The matching entry of k_nn_name is the suffix of the output file name.
k_nn_list = list(range(2,11)) + [int(round(np.sqrt(dfx2_imputed.shape[0])))]
k_nn_name = list(range(2,11)) + ['_sqrt']

for ik,(k_nn,suffix) in enumerate(zip(k_nn_list,k_nn_name)):
    print('ik,knn:',ik,k_nn)

    # re-impute the originally missing cells from the k nearest rows
    x_knn = impute_missing_knn(np.array(dfx2_imputed),variable_type,
                               i_missed,j_missed,k_nn)

    # convert binary to +-1, category to onehot
    x_knn_new = convert_binary_and_category(x_knn,variable_type)

    # append the target as the last column and save to a file
    xy_knn = np.column_stack((x_knn_new,y_new))
    np.savetxt('data_processed_knn%s.dat'%(suffix),xy_knn,fmt='%f')

ik,knn: 0 2
ik,knn: 1 3
ik,knn: 2 4
ik,knn: 3 5
ik,knn: 4 6
ik,knn: 5 7
ik,knn: 6 8
ik,knn: 7 9
ik,knn: 8 10
ik,knn: 9 45
