# 2. Data preparation

---

### Setup

In [1]:
import sys
sys.path.insert(1, '../../utils')

import pandas as pd

# Load the profiled heart-failure dataset; the file is semicolon-separated.
original: pd.DataFrame = pd.read_csv('../heart_failure_clinical_records_dataset_after_profiling.csv', sep=';')

# Sub-frames (not lists of names) holding the numeric and the boolean columns.
# NOTE(review): neither is referenced again in the visible part of this notebook,
# which re-derives the same split as data_num / data_bool — confirm before removing.
cols_nr = original.select_dtypes(include='number')
cols_sb = original.select_dtypes(include='bool')

# Last expression of the cell: summary statistics rendered as the cell output.
original.describe()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,581.839465,38.083612,263358.029264,1.39388,136.625418,130.26087
std,11.894809,970.287881,11.834841,97804.236869,1.03451,4.412477,77.614208
min,40.0,23.0,14.0,25100.0,0.5,113.0,4.0
25%,51.0,116.5,30.0,212500.0,0.9,134.0,73.0
50%,60.0,250.0,38.0,262000.0,1.1,137.0,115.0
75%,70.0,582.0,45.0,303500.0,1.4,140.0,203.0
max,95.0,7861.0,80.0,850000.0,9.4,148.0,285.0


## 2.0. Data preparation

---

### Discretization

---

In [2]:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Per-column cardinality before binning, for comparison with the
# "After discretization" printout produced later in the notebook.
print("Before discretization:\n\n" + str(original.nunique()))

# Split the frame by dtype: numeric columns on one side, boolean columns on the other.
data_num = original.select_dtypes('number')
data_bool = original.select_dtypes('bool')

# NumPy arrays of the same data; the helper functions index them positionally ([:, column]).
data_num_np = data_num.to_numpy()
data_bool_np = data_bool.to_numpy()

# Requested number of quantile bins per numeric column, matched by position to
# data_num's columns. bins1 is the finer setting, bins2 the coarser one.
# NOTE(review): the third entry is smaller in both lists, presumably because the
# third numeric column (ejection_fraction) has only 17 distinct values — confirm.
bins1 = [25, 25, 10, 25, 25, 25, 25]
bins2 = [10, 10, 5, 10, 10, 10, 10]

Before discretization:

age                          47
anaemia                       2
creatinine_phosphokinase    208
diabetes                      2
ejection_fraction            17
high_blood_pressure           2
platelets                   176
serum_creatinine             40
serum_sodium                 27
sex                           2
smoking                       2
time                        148
DEATH_EVENT                   2
dtype: int64


In [3]:
def discretize(bins, data=None):
    """One-hot encode numeric columns into quantile-based interval indicators.

    For the column at position ``i`` the cut points are the "lower" empirical
    quantiles at ``k / bins[i]`` for ``k = 1 .. bins[i] - 1``; consecutive
    duplicate cut points are collapsed, so a column may end up with fewer
    intervals than requested. Each interval becomes one boolean column:

    * ``"<name><<q0>"``            -- values strictly below the first cut point
    * ``"<qk><=<name><<qk+1>"``    -- values in the half-open interval [qk, qk+1)
    * ``"<name>>=<qlast>"``        -- values at or above the last cut point

    Parameters
    ----------
    bins : sequence of int
        Requested number of bins per column, matched by position to the
        columns of ``data``. Only the first ``len(bins)`` columns are processed.
    data : pandas.DataFrame, optional
        Numeric frame to discretize. When omitted, the notebook-level
        ``data_num`` / ``data_num_np`` pair is used, so existing
        ``discretize(bins)`` calls behave as before.

    Returns
    -------
    list of pandas.DataFrame
        One frame of boolean indicator columns per processed input column,
        in input order, sharing the index of ``data`` so the frames can be
        concatenated column-wise with other frames derived from the same source.

    Raises
    ------
    ZeroDivisionError
        If an entry of ``bins`` is 0.
    ValueError
        If a cut point is NaN (it cannot be formatted for the column name).
    """
    if data is None:
        # Backward-compatible default: the frame/array pair defined by the notebook.
        data, values = data_num, data_num_np
    else:
        values = data.to_numpy()

    def rm_flt(num):
        # Render whole-valued cut points without a trailing ".0" in column names.
        return str(int(num)) if num == int(num) else str(num)

    new_data_by_var = []
    for var, n_bins in enumerate(bins):
        curr_data = values[:, var]
        name = data.columns[var]
        bin_size = 1 / n_bins

        # `method=` replaces the `interpolation=` keyword deprecated in NumPy 1.22.
        # 'lower' makes every cut point an actual data value.
        quantiles = [np.quantile(curr_data, bin_size, method='lower')]
        for step in range(2, n_bins):
            q = np.quantile(curr_data, bin_size * step, method='lower')
            # Quantiles are non-decreasing, so comparing with the previous one
            # is enough to keep the cut points strictly increasing (and the
            # generated column names unique).
            if quantiles[-1] != q:
                quantiles.append(q)

        # Vectorised masks, collected in insertion order and turned into a
        # frame once instead of inserting column by column.
        columns = {name + "<" + rm_flt(quantiles[0]): curr_data < quantiles[0]}
        for low, high in zip(quantiles, quantiles[1:]):
            label = rm_flt(low) + "<=" + name + "<" + rm_flt(high)
            columns[label] = (low <= curr_data) & (curr_data < high)
        columns[name + ">=" + rm_flt(quantiles[-1])] = curr_data >= quantiles[-1]

        new_data_by_var.append(pd.DataFrame(columns, index=data.index))

    return new_data_by_var

In [10]:
def sep_true_false():
    """Pair every boolean column with an explicit negated counterpart.

    Reads the notebook-level ``data_bool`` frame and its array form
    ``data_bool_np``.

    Returns a two-element list:
      * a deep copy of ``data_bool`` in which ``sex`` is renamed to ``sex_M``;
      * a new frame holding the element-wise negation of every column, named
        ``sex_F`` for ``sex`` and ``no_<column>`` for all others.
    """
    # Renaming is a no-op when there is no 'sex' column.
    positives = data_bool.copy(deep=True).rename(columns={'sex': 'sex_M'})

    negatives = pd.DataFrame()
    for position, column in enumerate(data_bool.columns):
        label = 'sex_F' if column == 'sex' else 'no_' + column
        flipped = [not flag for flag in data_bool_np[:, position]]
        negatives.insert(position, label, flipped)

    return [positives, negatives]

def add_m_f():
    """Split ``sex`` into explicit ``sex_M`` / ``sex_F`` indicator columns.

    Reads the notebook-level ``data_bool`` frame.

    Returns a two-element list:
      * a deep copy of ``data_bool`` with ``sex`` renamed to ``sex_M``;
      * a single-column frame ``sex_F`` holding the negation of ``sex``.
    """
    males_and_rest = data_bool.copy(deep=True).rename(columns={'sex': 'sex_M'})

    females = pd.DataFrame()
    females.insert(0, 'sex_F', [not is_male for is_male in data_bool['sex']])

    return [males_and_rest, females]

In [11]:
# Boolean part of the prepared data: original flags plus the explicit sex_F column.
new_data_bool = add_m_f()

# Two prepared variants that differ only in the bin counts used for the numeric columns;
# the boolean frames are appended unchanged after the per-variable indicator frames.
discretized1 = pd.concat([*discretize(bins1), *new_data_bool], axis=1)
discretized2 = pd.concat([*discretize(bins2), *new_data_bool], axis=1)

# After one-hot binning every column is expected to hold at most two distinct values.
print(f"\n\nAfter discretization:\n\n{discretized1.nunique()}")



After discretization:

age<42                 2
42<=age<45             2
45<=age<50             2
50<=age<53             2
53<=age<55             2
                      ..
high_blood_pressure    2
sex_M                  2
smoking                2
DEATH_EVENT            2
sex_F                  2
Length: 133, dtype: int64


In [14]:
# Visual sanity check: sex_F should be the element-wise complement of sex_M.
for sex_column in ('sex_F', 'sex_M'):
    print(discretized1[sex_column])

0      False
1      False
2      False
3      False
4       True
       ...  
294    False
295     True
296     True
297    False
298    False
Name: sex_F, Length: 299, dtype: bool
0       True
1       True
2       True
3       True
4      False
       ...  
294     True
295    False
296    False
297     True
298     True
Name: sex_M, Length: 299, dtype: bool


In [15]:
# Persist both discretized variants as semicolon-separated CSV files without the index column.
for prepared, destination in (
    (discretized1, 'data/prepared_d1.csv'),
    (discretized2, 'data/prepared_d2.csv'),
):
    prepared.to_csv(destination, sep=';', index=False)

### Feature Selection

---

#### Unsupervised Selection

By definition, unsupervised selection aims only to eliminate redundancies among the
variables, yielding the smallest possible set.

In [7]:
import seaborn as sns

# Feature selection is run on the finer-grained discretization.
df = discretized1

# Work on a copy without the target, so DEATH_EVENT neither takes part in the
# correlation matrix nor can be dropped by the filter below.
# NOTE(review): the name `copy` would shadow the stdlib `copy` module if that
# were ever imported in this notebook.
copy = df.copy(deep=True)
copy = copy.drop('DEATH_EVENT', axis = 1)
# Pairwise correlation between all remaining columns (DataFrame.corr defaults).
corrs = copy.corr()

# Names of the columns dropped so far, in the order they were dropped.
removed = []

# Greedy, order-dependent pass: each column x is compared only with the columns
# y that precede it; x is dropped as soon as it correlates with a y that is
# still kept. Changing the column order can therefore change the result.
for x in corrs:
    for y in corrs:
        if x == y:
            # Reached the diagonal: all earlier columns were checked, keep x.
            break
        corr = corrs[x][y]
        # 0.2 is the absolute-correlation threshold above which x counts as
        # redundant. A NaN correlation never satisfies the comparison, so such
        # pairs do not trigger a drop.
        if abs(corr) >= 0.2 and x not in removed and y not in removed:
            copy = copy.drop(x, axis=1)
            removed.append(x)
            break

# Re-attach the target as the last column of the reduced frame.
copy['DEATH_EVENT'] = df['DEATH_EVENT']

data_dict_unsupervised = copy.copy(deep=True)
# NOTE(review): this prints the whole frame as text; a .head() or .shape would keep the output short.
print(data_dict_unsupervised)

     age<42  42<=age<45  45<=age<50  50<=age<53  53<=age<55  55<=age<58  \
0     False       False       False       False       False       False   
1     False       False       False       False       False        True   
2     False       False       False       False       False       False   
3     False       False       False        True       False       False   
4     False       False       False       False       False       False   
..      ...         ...         ...         ...         ...         ...   
294   False       False       False       False       False       False   
295   False       False       False       False       False        True   
296   False       False        True       False       False       False   
297   False       False        True       False       False       False   
298   False       False       False        True       False       False   

     58<=age<60  60<=age<61  61<=age<63  63<=age<65  ...  205<=time<210  \
0         False       Fa

In [8]:
# Persist the correlation-filtered frame as a semicolon-separated CSV without the index column.
data_dict_unsupervised.to_csv('data/prepared_d_s.csv', sep=';', index=False)

### Summary

---

***Are all variables on the same scale? If not, how does scaling impact the results?***



***Is the dataset unbalanced? If yes, what is the best balancing technique to apply?***

