# Data preparation

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to every figure created after this point.
plt.style.use('ggplot')

#### Loading data

In [2]:
# Read the raw dataset from disk.
data = pd.read_csv('data/dataset-MI+FG_reduced_corrected.csv')
# Encode sex numerically: 'M' -> 0, any other value (i.e. 'F') -> 1.
data['sex'] = data['sex'].ne('M').astype('int64')

### 1 - Handling Missing Values

#### 1.1 - Remove NaNs

In [3]:
# Keep only the rows with no missing value in any column, then renumber
# the surviving rows 0..n-1 (the old index is discarded, not kept as a column).
data_without_nan = (
    data
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

#### 1.2 - Replace NaNs

In [4]:
# Columns holding discrete (ordinal / categorical) scores. The next cell fills
# their missing values with the column mode, which always yields a value that
# actually occurs in the column. These are the same columns that section 1.3
# imputes with a classifier.
discrete_columns = [
    'Mallampati', 'Tipo di Russamento', 'Frequenza della stanchezza',
    'Frequenza dei risvegli', 'Frequenza Apnee', 'Frequenza Addormentamento alla guida'
]
# Columns holding real-valued measurements. The next cell fills their missing
# values with the column mean. These are the same columns that section 1.3
# imputes with a regressor.
# NOTE: the two lists were previously swapped, so measurements were mode-filled
# and ordinal scores were mean-filled (producing non-existent fractional classes).
real_columns = ['BMI', 'ESS', 'paO2', 'paCO2', 'FVC%', 'FEV1%', 'Tiff']

# Work on a copy so the raw `data` frame stays untouched.
data_mv_replaced = data.copy()

In [5]:
# Build one replacement value per column, then fill everything in a single
# `fillna` call. Assigning the result back (instead of calling
# `frame[column].fillna(..., inplace=True)`) avoids chained in-place
# modification, which is deprecated and has no effect under pandas
# Copy-on-Write.
fill_values = {}
for column in data_mv_replaced.columns:
    if column in discrete_columns:
        # mode() is empty when the whole column is NaN; in that case there is
        # nothing sensible to fill with, so the column is left as is.
        modes = data_mv_replaced[column].mode()
        if not modes.empty:
            fill_values[column] = modes.iloc[0]
    elif column in real_columns:
        fill_values[column] = data_mv_replaced[column].mean()

# Columns that are not listed in either group are left unchanged.
data_mv_replaced = data_mv_replaced.fillna(value=fill_values)

#### 1.3 - Predict NaNs

In [6]:
columns_without_mv = ['Severita', 'AHI', 'aa', 'sex', 'SMOKE', 'IPERTENSIONE.', 'CARDIOPATIA', 'DIABETE']
columns_to_regress = ['BMI', 'ESS', 'paO2', 'paCO2', 'FVC%', 'FEV1%', 'Tiff']
columns_to_predict = [
    'Mallampati', 'Tipo di Russamento', 'Frequenza della stanchezza',
    'Frequenza dei risvegli', 'Frequenza Apnee', 'Frequenza Addormentamento alla guida'
]
data_mv_predicted = data.copy()

In [7]:
%%time
pd.options.mode.chained_assignment = None  # default='warn'

for col_to_predict in columns_to_predict:
    training_data = data_mv_predicted[columns_without_mv + [col_to_predict]]
    training_data = training_data.dropna(subset=[col_to_predict])
    
    # discretizing real features
    for col_to_discretize in ['AHI', 'aa']:
        training_data[col_to_discretize] = pd.qcut(training_data[col_to_discretize], 3, labels=[1,2,3])
    
    Y = training_data[col_to_predict].tolist()
    training_data = training_data.drop(col_to_predict, axis=1)
    Xs   = [row.tolist() for i, row in training_data.iterrows()]
    
    classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    classifier.fit(Xs, Y)
    
    for i, row in data_mv_predicted.iterrows():
        if row[col_to_predict] != row[col_to_predict]: # check if it's NaN
            prediction = classifier.predict([row[columns_without_mv].tolist()])
            data_mv_predicted.set_value(i, col_to_predict, prediction)
    

CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s
Wall time: 1.11 s


In [8]:
%%time

for col_to_regress in columns_to_regress:
    training_data = data[columns_without_mv + [col_to_regress]]
    training_data = training_data.dropna(subset=[col_to_regress])
    
    Y = training_data[col_to_regress].tolist()
    training_data = training_data.drop(col_to_regress, axis=1)
    Xs   = [row.tolist() for i, row in training_data.iterrows()]
    
    regressor = SVR(kernel='rbf', C=1e3, gamma=0.1)
    regressor.fit(Xs, Y)
    
    for i, row in data.iterrows():
        if row[col_to_regress] != row[col_to_regress]: # check if it's NaN
            prediction = regressor.predict([row[columns_without_mv].tolist()])
            data_mv_predicted.set_value(i, col_to_regress, prediction)
    

CPU times: user 628 ms, sys: 0 ns, total: 628 ms
Wall time: 627 ms


### 2 - Normalize
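Each feature is first standardized to mean 0 and variance 1, and then rescaled so that every feature column has unit L2 norm (both steps are feature-wise). The `Severita` and `AHI` columns are kept at their original values.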

In [9]:
def standardize_and_normalize(frame, untouched_columns=('Severita', 'AHI')):
    """Return a feature-wise standardized and L2-normalized copy of `frame`.

    Every column is first standardized to mean 0 / variance 1
    (`preprocessing.scale`, axis=0) and then rescaled to unit L2 norm
    (`preprocessing.normalize`, axis=0), so the final columns have unit norm
    rather than unit variance.

    Parameters
    ----------
    frame : pandas.DataFrame
        All-numeric frame to transform; it is not modified.
    untouched_columns : iterable of str
        Columns whose original values are restored after scaling.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as `frame`.
    """
    scaled = preprocessing.scale(frame.values, axis=0)  # axis 0 means feature wise
    scaled = preprocessing.normalize(scaled, axis=0)    # axis 0 means feature wise

    # Reuse the input index so restoring the untouched columns below aligns
    # row by row whatever the index of `frame` looks like.
    result = pd.DataFrame(scaled, columns=frame.columns, index=frame.index)
    for column in untouched_columns:
        result[column] = frame[column]
    return result


data_without_nan_norm = standardize_and_normalize(data_without_nan)
# This variant used to be computed from `data_mv_predicted` by mistake, which
# made it a copy of the "predicted" variant with only Severita/AHI differing.
data_mv_replaced_norm = standardize_and_normalize(data_mv_replaced)
data_mv_predicted_norm = standardize_and_normalize(data_mv_predicted)

### 3 - Sampling Severita
Stratum size = number of elements in the smallest `Severita` class (which is 1.0). The same number of rows is drawn, without replacement, from every class.

#### 3.1 - Sampling data without NaNs

In [10]:
# Balance the classes: draw the same number of rows, without replacement,
# from every Severita class.
severita_values = data_without_nan_norm['Severita'].unique()

# Size of the smallest class, computed from the data instead of assuming that
# class 1.0 is the smallest (sampling would raise if another class were smaller).
class_counts = data_without_nan_norm['Severita'].value_counts()
stratum_size = int(class_counts.min()) if len(class_counts) else 0

# Collect the per-class samples and concatenate once (DataFrame.append was
# removed in pandas 2.0). A fixed random_state keeps the sample reproducible.
strata = [
    data_without_nan_norm[data_without_nan_norm['Severita'] == val]
    .sample(stratum_size, replace=False, random_state=42)
    for val in severita_values
]
if strata:
    data_without_nan_norm_sampled = pd.concat(strata)
else:
    # No rows at all: keep an empty frame with the expected columns.
    data_without_nan_norm_sampled = pd.DataFrame(columns=data_without_nan_norm.columns)

#### 3.2 - Sampling data replaced NaNs

In [11]:
# Balance the classes: draw the same number of rows, without replacement,
# from every Severita class.
severita_values = data_mv_replaced_norm['Severita'].unique()

# Size of the smallest class, computed from the data instead of assuming that
# class 1.0 is the smallest (sampling would raise if another class were smaller).
class_counts = data_mv_replaced_norm['Severita'].value_counts()
stratum_size = int(class_counts.min()) if len(class_counts) else 0

# Collect the per-class samples and concatenate once (DataFrame.append was
# removed in pandas 2.0). A fixed random_state keeps the sample reproducible.
strata = [
    data_mv_replaced_norm[data_mv_replaced_norm['Severita'] == val]
    .sample(stratum_size, replace=False, random_state=42)
    for val in severita_values
]
if strata:
    data_mv_replaced_norm_sampled = pd.concat(strata)
else:
    # No rows at all: keep an empty frame with the expected columns.
    data_mv_replaced_norm_sampled = pd.DataFrame(columns=data_mv_replaced_norm.columns)

#### 3.3 - Sampling data predicted NaNs

In [12]:
# Balance the classes: draw the same number of rows, without replacement,
# from every Severita class.
severita_values = data_mv_predicted_norm['Severita'].unique()

# Size of the smallest class, computed from the data instead of assuming that
# class 1.0 is the smallest (sampling would raise if another class were smaller).
class_counts = data_mv_predicted_norm['Severita'].value_counts()
stratum_size = int(class_counts.min()) if len(class_counts) else 0

# Collect the per-class samples and concatenate once (DataFrame.append was
# removed in pandas 2.0). A fixed random_state keeps the sample reproducible.
strata = [
    data_mv_predicted_norm[data_mv_predicted_norm['Severita'] == val]
    .sample(stratum_size, replace=False, random_state=42)
    for val in severita_values
]
if strata:
    data_mv_predicted_norm_sampled = pd.concat(strata)
else:
    # No rows at all: keep an empty frame with the expected columns.
    data_mv_predicted_norm_sampled = pd.DataFrame(columns=data_mv_predicted_norm.columns)

###### Printing to file

In [13]:
# Create the output directory via the shell; `-p` also creates missing parent
# directories and does not fail when the directory already exists.
!mkdir -p data/preprocessed

In [14]:
# Every dataset variant paired with the CSV file it is written to,
# listed in the order in which the files are produced.
exports = [
    # missing values handled, original scale
    (data_without_nan, 'data/preprocessed/data_without_nan.csv'),
    (data_mv_replaced, 'data/preprocessed/data_mv_replaced.csv'),
    (data_mv_predicted, 'data/preprocessed/data_mv_predicted.csv'),
    # normalized
    (data_without_nan_norm, 'data/preprocessed/data_without_nan_norm.csv'),
    (data_mv_replaced_norm, 'data/preprocessed/data_mv_replaced_norm.csv'),
    (data_mv_predicted_norm, 'data/preprocessed/data_mv_predicted_norm.csv'),
    # normalized and sampled
    (data_without_nan_norm_sampled, 'data/preprocessed/data_without_nan_norm_sampled.csv'),
    (data_mv_replaced_norm_sampled, 'data/preprocessed/data_mv_replaced_norm_sampled.csv'),
    (data_mv_predicted_norm_sampled, 'data/preprocessed/data_mv_predicted_norm_sampled.csv'),
]

# Comma-separated output without the row index.
for frame, destination in exports:
    frame.to_csv(destination, index=False, sep=',')