In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Preprocessing objects shared by the cells below.
#   min_max_scaler - MinMaxScaler created with its default settings
#   impute_cat     - fills NaN entries with the most frequent value of the column
#   impute_num     - fills NaN entries with the column mean
min_max_scaler = MinMaxScaler()
impute_cat = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
impute_num = SimpleImputer(strategy='mean', missing_values=np.nan)

In [3]:
# Feature groups and the column -> dtype mappings used for loading and casting.
#
#   num / cat  - names of the numerical / categorical features
#   types      - numerical columns as 'float', categorical columns as 'Int64'
#                (pandas' nullable integer dtype, so missing values survive
#                the CSV load)
#   types2     - numerical columns as 'float', categorical columns as plain
#                'int' (intended for use once missing values have been imputed)

num = ['AGE', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB',
       'DLIT_AG', 'ZSN_A', 'S_AD_ORIT', 'D_AD_ORIT', 'ant_im', 'lat_im',
       'inf_im', 'post_im', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD',
       'L_BLOOD', 'ROE', 'TIME_B_S']

num_type = ['float']

cat = ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_01', 'nr_02', 'nr_03', 'nr_04',
       'nr_07', 'nr_08', 'np_01', 'np_04', 'np_05', 'np_07', 'np_08', 'np_09',
       'np_10', 'endocr_01', 'endocr_02', 'endocr_03', 'zab_leg_01',
       'zab_leg_02', 'zab_leg_03', 'zab_leg_04', 'zab_leg_06', 'O_L_POST',
       'K_SH_POST', 'MP_TP_POST', 'SVT_POST', 'GT_POST', 'FIB_G_POST',
       'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04',
       'ritm_ecg_p_06', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01',
       'n_r_ecg_p_02', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05',
       'n_r_ecg_p_06', 'n_r_ecg_p_08', 'n_r_ecg_p_09', 'n_r_ecg_p_10',
       'n_p_ecg_p_01', 'n_p_ecg_p_03', 'n_p_ecg_p_04', 'n_p_ecg_p_05',
       'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_08', 'n_p_ecg_p_09',
       'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_01',
       'fibr_ter_02', 'fibr_ter_03', 'fibr_ter_05', 'fibr_ter_06',
       'fibr_ter_07', 'fibr_ter_08', 'GIPO_K', 'GIPER_NA', 'NA_KB',
       'NOT_NA_KB', 'LID_KB', 'NITR_S', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n',
       'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n']

cat_type = ['Int64']
cat_type2 = ['int']

# Dictionaries for type casting: every column of a group is mapped to the
# dtype of its *_type list. The comprehensions iterate in the same order as
# nested for-loops would (if a *_type list ever held several entries the last
# one wins) and do not leave loop variables behind in the notebook namespace.
num_dict = {column: dtype for dtype in num_type for column in num}
cat_dict = {column: dtype for dtype in cat_type for column in cat}
cat2_dict = {column: dtype for dtype in cat_type2 for column in cat}

# Merge via dict unpacking: same result as `num_dict | cat_dict`, but it also
# runs on Python versions older than 3.9 (unpacking works from 3.5 on).
types = {**num_dict, **cat_dict}
types2 = {**num_dict, **cat2_dict}

In [4]:
# Load the training data: the first CSV column becomes the index and each
# listed feature is parsed with the dtype given in `types`.
train = pd.read_csv(
    './jly-dl-project/train',
    index_col=0,
    dtype=types,
)

In [5]:
# Fill in missing values group by group: each imputer is fitted on its own
# feature group of the training data and the imputed values are written back
# (numerical features first, then categorical ones).
for imputer, columns in ((impute_num, num), (impute_cat, cat)):
    train[columns] = imputer.fit_transform(train[columns])

In [6]:
# Cast every feature to its final dtype in a single pass: the mapping from
# `types2`, with AGE overridden to 'int'.
# NOTE: AGE belongs to the mean-imputed numerical group, so filled-in values
# may be fractional; the float -> int cast drops the fractional part.
train = train.astype({**types2, 'AGE': 'int'})

In [7]:
# Fit the min-max scaler on the numerical features and overwrite them with
# their scaled values.
numerical_values = train[num]
train[num] = min_max_scaler.fit_transform(numerical_values)

In [8]:
# Separate the target column from the features:
#   y_train - the 'LET_IS' column
#   X_train - every remaining column
y_train = train['LET_IS']
X_train = train.drop('LET_IS', axis=1)