In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Plot appearance: seaborn grid style and 'talk' context, followed by
# matplotlib rcParams overrides applied after the seaborn calls.
sns.set_style('whitegrid')
sns.set_context('talk')

# Legend, axis-label, title and tick-label text all share the same size.
params = dict.fromkeys(
    (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ),
    '12',
)
params['figure.figsize'] = (16, 9)   # default figure size in inches
params['patch.linewidth'] = 0.0      # no outline around patches (e.g. bars)

plt.rcParams.update(params)

In [3]:
# Load the test frame and the main dataset with identical CSV parsing options,
# then print the column/dtype/non-null summary of the main dataset.
csv_options = dict(sep=',', decimal='.')
data_test = pd.read_csv("../datasets/test_dataset.csv", **csv_options)
data = pd.read_csv("../datasets/dataset.csv", **csv_options)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       866 non-null    float64
 1   sex       866 non-null    float64
 2   cp        866 non-null    float64
 3   trestbps  809 non-null    float64
 4   chol      840 non-null    float64
 5   fbs       784 non-null    float64
 6   restecg   865 non-null    float64
 7   thalach   811 non-null    float64
 8   exang     811 non-null    float64
 9   oldpeak   806 non-null    float64
 10  slope     570 non-null    float64
 11  ca        297 non-null    float64
 12  thal      413 non-null    float64
 13  label     866 non-null    int64  
 14  split     866 non-null    object 
dtypes: float64(13), int64(1), object(1)
memory usage: 101.6+ KB


1. age: edad del paciente       
2. sex: sexo del paciente       
3. cp: tipo de dolor de pecho:
    - 1: angina típica
    - 2: angina atípica
    - 3: dolor no-anginoso
    - 4: asintomático
4. trestbps: presión arterial en reposo (en mm Hg al ingreso en el hospital)
5. chol: colesterol sérico en mg/dl
6. fbs: glucemia en ayunas > 120 mg/dl (1 = sí; 0 = no)
7. restecg: resultados electrocardiográficos en reposo
    - 0: normal
    - 1: presenta anormalidad de la onda ST-T (inversiones de la onda T y/o elevación o depresión del ST > 0,05 mV)
    - 2: presenta probable o definida hipertrofia ventricular izquierda
8. thalach: frecuencia cardiaca máxima alcanzada
9. exang: angina inducida por el ejercicio (1 = sí; 0 = no)
10. oldpeak: depresión del ST inducida por el ejercicio en relación con el reposo
11. slope: la pendiente del segmento ST en ejercicio máximo
    - 1: pendiente ascendente
    - 2: plano
    - 3: pendiente descendente
12. ca: número de vasos mayores (0-3) coloreados por fluoroscopia
13. thal: trastorno talasémico
    - 3: normal
    - 6: defecto fijo 
    - 7: defecto reversible
14. label (target): indica la ausencia (0) o el grado de enfermedad cardíaca (1-4)

  

In [4]:
# Preview the first rows of the raw test frame (before any preprocessing).
data_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,56.0,1.0,2.0,124.0,224.0,1.0,0.0,161.0,0.0,2.0,2.0,,
1,61.0,1.0,4.0,125.0,0.0,0.0,0.0,105.0,1.0,0.0,3.0,,7.0
2,52.0,1.0,4.0,135.0,0.0,,0.0,128.0,1.0,2.0,2.0,,7.0
3,64.0,1.0,4.0,120.0,0.0,1.0,1.0,106.0,0.0,2.0,2.0,,
4,49.0,1.0,1.0,130.0,0.0,0.0,1.0,145.0,0.0,3.0,2.0,,


In [5]:
# Column groups passed to data_preprocessing: categorical and numerical variables
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

def data_preprocessing(df, categorical_vars, numerical_vars, n_neighbors=11,
                       iqr_factor=1.5,
                       clip_vars=('trestbps', 'chol', 'thalach', 'oldpeak')):
    """Clean, impute and clip a heart-disease DataFrame.

    Steps, in order:
      1. Drop duplicated rows (the caller's frame is never modified).
      2. Replace ``ca == 9`` with 3.
      3. Turn zeros in ``trestbps`` and ``chol`` into missing values.
      4. Encode missing categorical values as ``-1``.
      5. Impute the remaining missing values of
         ``numerical_vars + categorical_vars`` with a ``KNNImputer`` fitted
         on this very frame.
      6. Clip every column in ``clip_vars`` to
         ``[q1 - iqr_factor * IQR, q3 + iqr_factor * IQR]``.
      7. Cast the categorical columns to the nullable ``Int64`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Must contain the columns ``ca``, ``trestbps`` and
        ``chol``, every column listed in ``categorical_vars`` and
        ``numerical_vars``, and every column listed in ``clip_vars``.
    categorical_vars : sequence of str
        Columns treated as categorical.
    numerical_vars : sequence of str
        Columns treated as numerical.
    n_neighbors : int, default 11
        Number of neighbours used by the ``KNNImputer``.
    iqr_factor : float, default 1.5
        Multiplier applied to the inter-quartile range to build the
        clipping bounds.
    clip_vars : sequence of str
        Columns whose outliers are clipped.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame. Columns not mentioned above (for example
        a label column) are passed through unchanged.
    """
    # Work on lists so that tuples (or any other sequence) are accepted and
    # column selection with `data[...]` always means "these columns".
    categorical_vars = list(categorical_vars)
    numerical_vars = list(numerical_vars)
    imputed_vars = numerical_vars + categorical_vars

    # Copy after de-duplicating so that every assignment below acts on an
    # independent frame and never on the caller's data.
    data = df.drop_duplicates().copy()

    # ca = 9 is an outlier; set it to 3, the maximum allowed by the data spec.
    data['ca'] = data['ca'].replace(9.0, 3)

    # A value of 0 in trestbps / chol is treated as "not recorded".
    data[['trestbps', 'chol']] = data[['trestbps', 'chol']].replace(0, np.nan)

    # Normalise missing markers, then encode missing categories as -1 so
    # that only numerical gaps are left for the imputer to fill.
    data = data.replace(pd.NA, np.nan)
    data[categorical_vars] = data[categorical_vars].replace(np.nan, -1)

    # NOTE(review): the imputer is fitted on whatever frame is passed in, so
    # train and test frames are imputed from their own statistics. Confirm
    # this is intended.
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors)
    data[imputed_vars] = imputer.fit_transform(data[imputed_vars])

    # Clip outliers using IQR-based bounds.
    for var in clip_vars:
        # NOTE(review): only strictly positive values feed the quantiles.
        # That excludes zero / negative `oldpeak` readings from the bounds;
        # confirm this is intended for that column.
        positive_values = data.loc[data[var] > 0, var].dropna()
        q1, median, q3 = np.quantile(positive_values, [0.25, 0.5, 0.75])

        # Fallback for columns that were not part of the imputation above.
        data.loc[data[var].isna(), var] = median

        cut_off = (q3 - q1) * iqr_factor
        data[var] = data[var].clip(lower=q1 - cut_off, upper=q3 + cut_off)

    # Store categorical columns as integers to get rid of the decimals.
    data[categorical_vars] = data[categorical_vars].astype(float).astype('Int64')
    return data

In [6]:
# Run the same preprocessing on both frames, then summarise the test frame.
# Each call builds and fits its own KNNImputer and derives its own clipping
# bounds from the frame it receives.
# NOTE(review): this means the test frame is imputed/clipped from its own
# statistics rather than from the training data — confirm this is intended.
data_test = data_preprocessing(data_test, categorical_vars, numerical_vars)
data = data_preprocessing(data, categorical_vars, numerical_vars)
data_test.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,55.12,0.78,3.28,132.339545,259.048182,0.1,0.58,134.94,0.46,1.360545,1.2,-0.58,1.72
std,7.922172,0.418452,1.03095,16.426197,45.228863,0.614452,0.835195,22.18421,0.503457,1.254798,1.355262,0.905539,3.528543
min,35.0,0.0,1.0,94.0,153.0,-1.0,-1.0,92.0,0.0,0.0,-1.0,-1.0,-1.0
25%,50.0,1.0,2.25,120.977273,226.5,0.0,0.0,120.0,0.0,0.0,1.0,-1.0,-1.0
50%,56.0,1.0,4.0,130.0,263.136364,0.0,0.0,140.0,0.0,1.136364,2.0,-1.0,-1.0
75%,61.0,1.0,4.0,140.0,277.454545,0.0,1.0,150.0,1.0,2.0,2.0,-1.0,6.0
max,76.0,1.0,4.0,168.534091,353.886364,1.0,2.0,179.0,1.0,5.0,3.0,3.0,7.0


In [7]:
# Summary statistics of the preprocessed main dataset (includes `label`).
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
count,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0,865.0
mean,53.431214,0.788439,3.254335,131.87567,244.075145,0.049711,0.601156,137.11939,0.29711,0.840011,0.819653,-0.421965,1.889017,1.084393
std,9.47549,0.408651,0.920476,16.939933,44.021174,0.486939,0.807516,25.495844,0.579993,0.980733,1.402654,0.971219,3.303573,1.232676
min,28.0,0.0,1.0,90.0,137.045455,-1.0,-1.0,66.0,-1.0,-0.5,-1.0,-1.0,-1.0,0.0
25%,47.0,1.0,3.0,120.0,216.0,0.0,0.0,120.0,0.0,0.0,-1.0,-1.0,-1.0,0.0
50%,54.0,1.0,4.0,130.0,241.636364,0.0,0.0,138.0,0.0,0.5,1.0,-1.0,-1.0,1.0
75%,60.0,1.0,4.0,140.0,268.636364,0.0,1.0,156.0,1.0,1.5,2.0,0.0,6.0,2.0
max,77.0,1.0,4.0,170.0,347.590909,1.0,2.0,202.0,1.0,3.5,3.0,3.0,7.0,4.0


In [11]:
# Discard rows whose age is below 35 or above 76, then summarise what is left.
# NOTE(review): 35 and 76 match the min/max age shown for data_test in the
# describe() output above — presumably chosen for that reason; confirm.
age_out_of_range = (data['age'] < 35) | (data['age'] > 76)
data = data.drop(index=data.index[age_out_of_range])
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
count,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0,843.0
mean,53.884935,0.790036,3.275208,132.107301,244.56314,0.052195,0.603796,136.381106,0.300119,0.855527,0.845789,-0.415184,1.937129,1.097272
std,8.91913,0.407525,0.907969,16.888001,43.761912,0.491862,0.806497,25.18335,0.583899,0.984798,1.399844,0.972719,3.316923,1.233934
min,35.0,0.0,1.0,90.0,137.045455,-1.0,0.0,66.0,-1.0,-0.5,-1.0,-1.0,-1.0,0.0
25%,48.0,1.0,3.0,120.0,216.0,0.0,0.0,120.0,0.0,0.0,-1.0,-1.0,-1.0,0.0
50%,54.0,1.0,4.0,130.0,242.0,0.0,0.0,137.0,0.0,0.518182,1.0,-1.0,-1.0,1.0
75%,60.0,1.0,4.0,140.0,268.727273,0.0,1.0,155.5,1.0,1.5,2.0,0.0,6.0,2.0
max,76.0,1.0,4.0,170.0,347.590909,1.0,2.0,195.0,1.0,3.5,3.0,3.0,7.0,4.0


In [22]:
# Highest cholesterol value among the rows with label == 0.
max_chol = data.loc[data['label'] == 0, 'chol'].max()

# Drop every row whose chol equals that value.
# NOTE(review): the comparison is not restricted to label == 0, so rows of
# any label with this chol value are removed (the row count goes from 843 to
# 818 in the outputs). Confirm that dropping all of them is intended.
# NOTE(review): this cell is not idempotent — re-running it recomputes the
# maximum on the already-filtered frame and removes further rows.
data = data.drop(index=data.index[data['chol'] == max_chol])

data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
count,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0
mean,53.849633,0.795844,3.265281,131.846633,241.41437,0.050122,0.599022,136.40609,0.298289,0.852045,0.841076,-0.410758,1.94132,1.09291
std,8.947043,0.40333,0.911932,16.864385,40.484219,0.490977,0.802749,25.253481,0.582505,0.992214,1.401756,0.977085,3.314813,1.230448
min,35.0,0.0,1.0,90.0,137.045455,-1.0,0.0,66.0,-1.0,-0.5,-1.0,-1.0,-1.0,0.0
25%,47.25,1.0,3.0,120.0,215.181818,0.0,0.0,120.0,0.0,0.0,-1.0,-1.0,-1.0,0.0
50%,54.0,1.0,4.0,130.0,240.818182,0.0,0.0,137.0,0.0,0.5,1.0,-1.0,-1.0,1.0
75%,60.0,1.0,4.0,140.0,266.181818,0.0,1.0,155.75,1.0,1.5,2.0,0.0,6.0,2.0
max,76.0,1.0,4.0,170.0,347.0,1.0,2.0,195.0,1.0,3.5,3.0,3.0,7.0,4.0


In [23]:
# Inspect the rows labelled 0 (absence of heart disease per the data dictionary).
data.loc[data['label'] == 0]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,split
0,56.0,1,2,126.0,166.0,0,1,140.0,0,0.0,-1,-1,-1,0,val
1,52.0,1,3,170.0,199.0,1,0,162.0,0,0.5,1,0,7,0,train
3,66.0,1,3,110.0,213.0,1,2,99.0,1,1.3,2,-1,-1,0,train
5,54.0,0,2,120.0,221.0,0,0,138.0,0,1.0,1,-1,-1,0,val
7,48.0,1,2,130.0,245.0,0,0,160.0,0,0.0,-1,-1,-1,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,51.0,1,3,125.0,245.0,1,2,166.0,0,2.4,2,0,3,0,train
851,53.0,1,4,142.0,226.0,0,2,111.0,1,0.0,1,0,7,0,train
852,42.0,1,3,134.0,240.0,-1,0,160.0,0,0.0,-1,-1,-1,0,train
855,41.0,1,4,112.0,250.0,0,0,142.0,0,0.0,-1,-1,-1,0,train
