In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import Imputer

In [2]:
np.random.seed(1)

In [3]:
df = pd.read_csv("heat.csv")
#df

In [4]:
#df.info()

In [5]:
# replace empty by nan
df = df.replace(r'^\s+$', np.nan, regex=True)
#df

In [6]:
#df.info()

In [7]:
df = df.replace('\t','',regex=True)
df = df.replace(' ','',regex=True)
df = df.replace('\?','np.nan',regex=True)

## Data Cleaning

In [8]:
# Drop the column `ID` as it is unrelated to the dependent variable
df = df.drop(['Location_1outdoor_indoor', 'PreSBP', 'PreRR','PreBT','PreHR','Abdminal','Muscular','CRP'],axis=1)
#df

In [9]:
# Remove rows where taget are missing
missing_row = df['Deadtodischarge'].isnull()
print('Number of rows where target are missing:')
print(sum(missing_row))

df = df[~missing_row]
#df

Number of rows where target are missing:
367


In [21]:
missing_row[79]

True

We determine and drop the variables with excessive missing values from the dataset.

In [10]:
def remove_bad_columns(df,bad_column_threshold):
    # find bad columns having too many missing values
    n_null = np.array(df.isnull().sum(axis=0))
    bad_col = np.array([]).astype(int)
    for i in range(len(n_null)):
        if n_null[i] >= bad_column_threshold:
            bad_col = np.append(bad_col,i)

    #print(bad_col)
    print('number of bad columns:',len(bad_col))

    # delete bad columns
    df = df.drop(df.columns[bad_col],axis=1)
    #df.info()
    return df   

In [11]:
df = remove_bad_columns(df,400)
df.info()

number of bad columns: 3
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2810 entries, 5 to 3174
Data columns (total 25 columns):
Sex_1male_2female                            2783 non-null float64
Age                                          2803 non-null float64
Functionaldependency_1notdisable_2disable    2502 non-null float64
HT                                           2810 non-null float64
HeartDisease                                 2810 non-null float64
Pscyco                                       2810 non-null float64
DM                                           2810 non-null float64
CerevD                                       2810 non-null float64
ParkinD                                      2810 non-null float64
CKD                                          2810 non-null float64
Dementia                                     2810 non-null float64
GCS                                          2736 non-null float64
SBP                                          2641 non-null float

We find bad rows which contain too many missing values, then remove them.

In [12]:
def remove_bad_rows(df,bad_row_threshold):   
    # find bad rows having too many missing values
    n_null = np.array(df.isnull().sum(axis=1))
    bad_row = np.array([]).astype(int)
    for t in range(len(n_null)):
        if n_null[t] >= bad_row_threshold:
            bad_row = np.append(bad_row,t)

    #print(bad_row)
    print('number of bad rows:',len(bad_row))

    # delete bad rows
    df = df.drop(bad_row)
    #df.info()
    return df

In [15]:
n_null = np.array(df.isnull().sum(axis=1))
print(n_null)

[0 0 0 ... 0 1 0]


In [18]:
bad_row_threshold = 5
bad_row = np.array([]).astype(int)
for t in range(len(n_null)):
    if n_null[t] >= bad_row_threshold:
        bad_row = np.append(bad_row,t)

In [19]:
bad_row

array([   7,   16,   35,   40,   50,   52,   61,   79,   80,  106,  125,
        144,  145,  146,  147,  169,  236,  300,  303,  306,  307,  314,
        315,  316,  317,  319,  323,  324,  325,  326,  329,  330,  332,
        333,  363,  388,  394,  397,  398,  405,  409,  412,  419,  420,
        423,  426,  427,  428,  429,  430,  437,  440,  449,  477,  479,
        480,  543,  544,  546,  549,  558,  587,  590,  604,  607,  661,
        681,  682,  762,  763,  765,  775,  777,  782,  826,  830,  851,
        903,  907,  924,  930,  935,  937,  944,  947,  955,  959,  961,
        966,  985, 1001, 1025, 1029, 1039, 1111, 1112, 1127, 1133, 1136,
       1164, 1167, 1201, 1266, 1267, 1270, 1273, 1279, 1280, 1296, 1301,
       1309, 1310, 1357, 1361, 1375, 1384, 1396, 1408, 1459, 1462, 1463,
       1464, 1467, 1468, 1469, 1471, 1476, 1507, 1515, 1517, 1525, 1538,
       1565, 1590, 1593, 1611, 1612, 1640, 1672, 1673, 1674, 1684, 1699,
       1704, 1706, 1707, 1709, 1710, 1736, 1792, 18

In [13]:
df = remove_bad_rows(df,5)
df.info()

number of bad rows: 247


KeyError: '[  16   79  145  300  315  397  419  420  479  544  546  549  681  924\n  944 1001 1279 1309 1463 1468 1469 1905 2086 2434 2769 2789] not found in axis'

For convenience, we separate independents `X` and dependent `y` from the data.

In [None]:
dfx = df.drop('Deadtodischarge',axis=1)
dfy = df['Deadtodischarge']

In [None]:
# number of uniques of each column (excluding NaN)
nu = np.array([len(pd.unique(dfx[col].dropna())) for col in dfx.columns])
print('number of uniques of each variable:')
print(nu)

In [None]:
def define_variable_type(df,nu):
    i_binary = [] ; i_category = [] ; i_continuous = []
    for i in range(len(nu)):
        if nu[i] == 2: # binary 
            i_binary.append(i)
        elif nu[i] < 5: # !!!! NOTE: this is not always correct, depending on data
            i_category.append(i)
        else:
            i_continuous.append(i)

    print('i_binary:',i_binary)
    print('i_category:',i_category)   
    #i_binary, i_category, i_continuous
    
    variable_type  = np.ones(len(nu))  # binary
    variable_type[i_category] = 2   # categorical
    variable_type[i_continuous] = 3 # continuous

    return variable_type #,i_binary,i_category,i_continuous

In [None]:
variable_type = define_variable_type(dfx,nu)
print('variable type:',variable_type)

In [None]:
def impute_missing(df,variable_type):
    # impute binary and categorical variables by the most frequency (in each column)
    # continuous variable by median
    # input: df: pandas data frame, variable_type: list
    # output: df2: pandas data frame
    df2 = df.copy()
    for i,col in enumerate(df.columns):
        if variable_type[i] < 3: # binary or caterogy
            df2[col] = df[col].fillna(df[col].mode().iloc[0])
        else: # continuous
            df2[col] = df[col].fillna(df[col].median())    
    return df2       

In [None]:
dfx_imputed = impute_missing(dfx,variable_type)
dfx_imputed

In [None]:
dfx_imputed.info()

## Data Processing

### Attributes

In [None]:
def convert_binary_and_category(x,variable_type):
    """
    convert binary to +-1, category to one hot; remain continuous.
    """
    
    onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

    # create 2 initial columns
    x_new = np.zeros((x.shape[0],2))

    for i,i_type in enumerate(variable_type):
        if i_type == 1: # binary
            unique_value = np.unique(x[:,i])
            x1 = np.array([-1. if value == unique_value[0] else 1. for value in x[:,i]])        
            x_new = np.hstack((x_new,x1[:,np.newaxis]))

        elif i_type == 2: # category
            x1 = onehot_encoder.fit_transform(x[:,i].reshape(-1,1))
            x_new = np.hstack((x_new,x1))
            
        else: # continuous      
            x_new = np.hstack((x_new,x[:,i][:,np.newaxis]))      

    # drop the 2 initial column
    x_new = x_new[:,2:]
    
    return x_new.astype(float)

In [None]:
# convert x
x = np.array(dfx_imputed)
x_new = convert_binary_and_category(x,variable_type)

print(x_new.shape)
print(x_new)

### Target

In [None]:
y = np.array(dfy)
print(np.unique(y,return_counts=True))


# if target is already 0 and 1
y_new = y #and delete lines below convert  
print(np.unique(y_new,return_counts=True))

In [None]:
# combine X and y and save to a file
xy_new = np.hstack((x_new,y_new[:,np.newaxis]))
np.savetxt('heat_processed.dat',xy_new,fmt='%f')