# [9660] Data Preparation 2
Data file:
* https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/heart_failure_survival.csv

Data columns
* age	: Patient age (years)  
* anemia :	Decrease of red blood cells or hemoglobin (boolean)  
* creatinine_phosphokinase (CPK) : Level of the CPK enzyme in the blood	(mcg/L)
* diabetes : If the patient has diabetes (boolean)  
* ejection_fraction :	Percentage of blood leaving the heart at each	contraction (%)  
* high_blood_pressure :	If a patient has hypertension (boolean)  
* platelets	: Platelets in the blood (kiloplatelets/mL)  
* serum_creatinine : Level of creatinine in the blood	(mg/dL)  
* serum_sodium : Level of sodium in the blood	(mEq/L)  
* gender : Patient gender (woman or man)
* smoking : If the patient smokes	(boolean)  
* time : Follow-up period	(days)  
* DEATH_EVENT : target variable - If the patient died during the follow-up period	(boolean)

In [None]:
from datetime import datetime

# Timestamp layout: MM/DD/YY HH:MM:SS.
# Spelled out with explicit directives instead of the "%D %T" shorthands:
# Python only guarantees the C89 strftime directives, and "%D"/"%T" are
# rejected on some platforms (e.g. Windows). The rendered text is unchanged.
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 10/09/24 10:55:59


### Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Load data

In [None]:
# Read heart_failure_survival.csv straight from GitHub into a dataframe
#  NOTE: the file is pipe-delimited ('|'), not comma-delimited
DATA_URL = 'https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/heart_failure_survival.csv'
df = pd.read_csv(DATA_URL, sep='|')
df.shape

(299, 13)

In [None]:
# Review first few records in dataframe (head() shows the first 5 rows by default)
df.head()

Unnamed: 0,age,anemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,gender,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,man,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,man,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,man,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,man,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,woman,0,8,1


### Data imputation

In [None]:
# Count the missing (null/NaN) cells in every column
df.isna().sum()

Unnamed: 0,0
age,12
anemia,0
creatinine_phosphokinase,0
diabetes,0
ejection_fraction,0
high_blood_pressure,0
platelets,0
serum_creatinine,0
serum_sodium,0
gender,15


#### Use SimpleImputer to replace missing values

In [None]:
# Mean imputer: each NaN cell is replaced with the mean of its column
#  Both hyperparameters are spelled out explicitly for teaching purposes
imp_mean = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [None]:
# Display every hyperparameter of the imputer, including those left at their defaults
imp_mean.get_params()

{'add_indicator': False,
 'copy': True,
 'fill_value': None,
 'keep_empty_features': False,
 'missing_values': nan,
 'strategy': 'mean'}

In [None]:
# Column(s) whose missing values will be filled by the mean imputer
cols_to_impute_1 = ['age']

In [None]:
# Learn the column mean(s) and write the imputed values back into df
# NOTE(review): the imputer is fit on the full dataframe; if a train/test split
#  is performed later, fit on the training rows only to avoid leaking test statistics
df[cols_to_impute_1] = imp_mean.fit_transform(df[cols_to_impute_1])

In [None]:
# Mode imputer: each NaN cell is replaced with the most frequent value of its column
imp_most_freq = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [None]:
# Column(s) whose missing values will be filled by the most-frequent-value imputer
cols_to_impute_2 = ['gender']

In [None]:
# Learn the most frequent value per column and write the imputed values back into df
# NOTE(review): fit on the full dataframe; if a train/test split is performed later,
#  fit on the training rows only to avoid leaking test statistics
df[cols_to_impute_2] = imp_most_freq.fit_transform(df[cols_to_impute_2])

In [None]:
# Confirm the imputation worked: recount missing cells per column
df.isna().sum()

Unnamed: 0,0
age,0
anemia,0
creatinine_phosphokinase,0
diabetes,0
ejection_fraction,0
high_blood_pressure,0
platelets,0
serum_creatinine,0
serum_sodium,0
gender,0


### Data normalization
Normalization scales (transforms) values to a given range (typically 0-1)


In [None]:
# Work on an independent (deep) copy so the original dataframe is not modified
df2 = df.copy(deep=True)

In [None]:
# Peek at the platelets values before any scaling is applied
df2['platelets'].head()

Unnamed: 0,platelets
0,265000.0
1,263358.03
2,162000.0
3,210000.0
4,327000.0


In [None]:
# Summary statistics of the platelets column before scaling
#  NOTE: pandas .std() is the sample standard deviation (ddof=1)
col_name = 'platelets'
col_min = df2[col_name].min()
col_max = df2[col_name].max()
col_mean = df2[col_name].mean()
col_stddev = df2[col_name].std()
# The label is taken from col_name so it always names the column being summarized
#  (the hard-coded "Sub_metering_1" label did not match the platelets column)
print(f"{col_name}: min={col_min}, max={col_max}, mean={col_mean}, std_dev={col_stddev}")

Sub_metering_1: min=25100.0, max=850000.0, mean=263358.02926421404, std_dev=97804.2368685983


In [None]:
# Normalize platelets with MinMaxScaler, a common data normalization method
minmaxscaler = MinMaxScaler()
# Double brackets keep the selection 2-D (n_rows, 1) before converting to an array
platelets_2d = df2[['platelets']].to_numpy()
df2['platelets'] = minmaxscaler.fit_transform(platelets_2d)
df2['platelets'].head()

Unnamed: 0,platelets
0,0.290823
1,0.288833
2,0.16596
3,0.224148
4,0.365984


In [None]:
# Summary statistics of the platelets column after min-max normalization
#  NOTE: pandas .std() is the sample standard deviation (ddof=1)
col_name = 'platelets'
col_min = df2[col_name].min()
col_max = df2[col_name].max()
col_mean = df2[col_name].mean()
col_stddev = df2[col_name].std()
# The label is taken from col_name so it always names the column being summarized
#  (the hard-coded "Sub_metering_1" label did not match the platelets column)
print(f"{col_name}: min={col_min}, max={col_max}, mean={col_mean}, std_dev={col_stddev}")

Sub_metering_1: min=0.0, max=0.9999999999999999, mean=0.2888326212440466, std_dev=0.11856496165425905


### Data standardization
Standardization scales (transforms) values such that
- the column's mean will be approximately 0, AND
- the column's standard deviation will be approximately 1 (unit variance)

In [None]:
# Peek at the creatinine_phosphokinase values before standardization
df2['creatinine_phosphokinase'].head()

Unnamed: 0,creatinine_phosphokinase
0,582
1,7861
2,146
3,111
4,160


In [None]:
# Summary statistics of the creatinine_phosphokinase column before standardization
#  NOTE: pandas .std() is the sample standard deviation (ddof=1)
col_name = 'creatinine_phosphokinase'
col_min = df2[col_name].min()
col_max = df2[col_name].max()
col_mean = df2[col_name].mean()
col_stddev = df2[col_name].std()
# The label is taken from col_name so it always names the column being summarized
#  (the hard-coded "Global_intensity" label did not match this column)
print(f"{col_name}: min={col_min}, max={col_max}, mean={col_mean}, std_dev={col_stddev}")

Global_intensity: min=23, max=7861, mean=581.8394648829432, std_dev=970.2878807124362


In [None]:
# Standardize creatinine_phosphokinase with StandardScaler, a common data standardization method
scaler = StandardScaler()
# Double brackets keep the selection 2-D (n_rows, 1) before converting to an array
cpk_2d = df2[['creatinine_phosphokinase']].to_numpy()
df2['creatinine_phosphokinase'] = scaler.fit_transform(cpk_2d)
df2['creatinine_phosphokinase'].head()

Unnamed: 0,creatinine_phosphokinase
0,0.000166
1,7.51464
2,-0.449939
3,-0.486071
4,-0.435486


In [None]:
# Summary statistics of the creatinine_phosphokinase column after standardization
#  NOTE: pandas .std() is the sample standard deviation (ddof=1) while StandardScaler
#  divides by the population standard deviation (ddof=0), so std_dev prints as
#  slightly above 1 rather than exactly 1
col_name = 'creatinine_phosphokinase'
col_min = df2[col_name].min()
col_max = df2[col_name].max()
col_mean = df2[col_name].mean()
col_stddev = df2[col_name].std()
# The label is taken from col_name so it always names the column being summarized
#  (the hard-coded "Global_intensity" label did not match this column)
print(f"{col_name}: min={col_min}, max={col_max}, mean={col_mean}, std_dev={col_stddev}")

Global_intensity: min=-0.5769177795755219, max=7.514639528854929, mean=0.0, std_dev=1.0016764471115345


### Scale all independent variables
Normalization and standardization can each be used for feature scaling, but ONLY one of the two methods should be applied to a given set of features — do not stack both on the same columns.

In [None]:
# Back to df (imputed, but not scaled); the scaling demos above only modified the copy df2
df.head()

Unnamed: 0,age,anemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,gender,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,man,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,man,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,man,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,man,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,woman,0,8,1


In [None]:
# One-hot encode the categorical column(s): each category becomes its own 0/1 column
cols_to_one_hot_encode = ['gender']
# This cell overwrites df, so after the first run the source column(s) no longer exist.
# Encoding only the columns still present makes the cell safe to re-run
# (a second run previously raised KeyError because 'gender' had been consumed).
cols_still_present = [col for col in cols_to_one_hot_encode if col in df.columns]
if cols_still_present:
    df = pd.get_dummies(df, columns=cols_still_present, dtype=int)
df.head()

Unnamed: 0,age,anemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,smoking,time,DEATH_EVENT,gender_man,gender_woman
0,75.0,0,582,0,20,1,265000.0,1.9,130,0,4,1,1,0
1,55.0,0,7861,0,38,0,263358.03,1.1,136,0,6,1,1,0
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,7,1,1,0
3,50.0,1,111,0,20,0,210000.0,1.9,137,0,7,1,1,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,8,1,0,1


In [None]:
# Split the dataframe into features (X: every column except the target)
#  and the target (y: DEATH_EVENT)
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

In [None]:
# Standardize every feature column by refitting the existing scaler on the
#  full feature matrix; the result is a NumPy array
X_scaled = scaler.fit(X).transform(X)
X_scaled

array([[ 1.23522376e+00, -8.71104775e-01,  1.65728387e-04, ...,
        -1.62950241e+00,  7.19559933e-01, -7.19559933e-01],
       [-5.09122362e-01, -8.71104775e-01,  7.51463953e+00, ...,
        -1.60369074e+00,  7.19559933e-01, -7.19559933e-01],
       [ 3.63050697e-01, -8.71104775e-01, -4.49938761e-01, ...,
        -1.59078490e+00,  7.19559933e-01, -7.19559933e-01],
       ...,
       [-1.38129542e+00, -8.71104775e-01,  1.52597865e+00, ...,
         1.90669738e+00, -1.38973830e+00,  1.38973830e+00],
       [-1.38129542e+00, -8.71104775e-01,  1.89039811e+00, ...,
         1.93250906e+00,  7.19559933e-01, -7.19559933e-01],
       [-9.45208891e-01, -8.71104775e-01, -3.98321274e-01, ...,
         1.99703825e+00,  7.19559933e-01, -7.19559933e-01]])

In [None]:
# Wrap the scaled array back into a dataframe with the original feature names.
# Reusing X's index keeps the scaled rows aligned with y (and df) even if the
# index is not the default 0..n-1 range, e.g. after rows have been filtered.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled_df.head()

Unnamed: 0,age,anemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,smoking,time,gender_man,gender_woman
0,1.235224,-0.871105,0.000166,-0.847579,-1.53056,1.359272,0.01681648,0.490057,-1.504036,-0.687682,-1.629502,0.71956,-0.71956
1,-0.509122,-0.871105,7.51464,-0.847579,-0.007077,-0.735688,7.53566e-09,-0.284552,-0.141976,-0.687682,-1.603691,0.71956,-0.71956
2,0.363051,-0.871105,-0.449939,-0.847579,-1.53056,-0.735688,-1.038073,-0.0909,-1.731046,1.454161,-1.590785,0.71956,-0.71956
3,-0.945209,1.147968,-0.486071,-0.847579,-1.53056,-0.735688,-0.5464741,0.490057,0.085034,-0.687682,-1.590785,0.71956,-0.71956
4,0.363051,1.147968,-0.435486,1.17983,-1.53056,-0.735688,0.6517986,1.264666,-4.682176,-0.687682,-1.577879,-1.389738,1.389738
