In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 00. Análisis EDA

En este apartado solo estudiaremos el CSV 'train.csv' ya que no hay que tratar los datos, solo estudiarlos.

In [9]:
data_train = pd.read_csv('/Users/emart/Documents/GitHub/mdata_dp3/data/train.csv')
data_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,-9.0,-9.0,7.0,2
2,63.0,1.0,4.0,140.0,0.0,?,2.0,149.0,0.0,2.0,1.0,?,?,2
3,52.0,0.0,2.0,140.0,-9.0,0.0,0.0,140.0,0.0,0.0,-9.0,-9.0,-9.0,0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3


Como podemos observar, tenemos 14 variables, 2 descriptivas del paciente ('age' y 'sex') y una que es la resultante a estudiar ('label'). Se detectan a primera vista valores negativos y carácteres como '?'. 

En un primer lugar, se va a explorar la naturaleza del dataset, haciendo énfasis en los valores que a primera vista parecen incoherentes.

In [10]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       732 non-null    float64
 1   sex       732 non-null    float64
 2   cp        732 non-null    float64
 3   trestbps  732 non-null    object 
 4   chol      732 non-null    object 
 5   fbs       732 non-null    object 
 6   restecg   732 non-null    float64
 7   thalach   732 non-null    object 
 8   exang     732 non-null    object 
 9   oldpeak   732 non-null    object 
 10  slope     732 non-null    object 
 11  ca        732 non-null    object 
 12  thal      732 non-null    object 
 13  label     732 non-null    int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 80.2+ KB


Se entiende que los valores son númericos y que por lo tanto que existan carácteres '?' no tiene sentido. Por lo tanto se necesita realizar una transformación de esos valores a valores NaN (Not a Number) para poder seguir nuestro análisis.

In [11]:
columns_train = data_train.columns
for column in columns_train:
    """ Si el valor de la columna es '?', se reemplaza por None. Después, se convierte el tipo de dato a float. """
    data_train[column] = data_train[column].replace('?', None)

data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       732 non-null    float64
 1   sex       732 non-null    float64
 2   cp        732 non-null    float64
 3   trestbps  685 non-null    object 
 4   chol      727 non-null    object 
 5   fbs       674 non-null    object 
 6   restecg   732 non-null    float64
 7   thalach   688 non-null    object 
 8   exang     688 non-null    object 
 9   oldpeak   683 non-null    object 
 10  slope     637 non-null    object 
 11  ca        483 non-null    object 
 12  thal      563 non-null    object 
 13  label     732 non-null    int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 80.2+ KB


Observamos que en 9 de las 14 columnas tenemos valores nulos. A continuación, en función de la explicación obtenida del enunciado, vamos a definir que variables son continuas y cuáles son categóricas:
- **Continuas**: age, trestbps, chol, thalach, oldpeak.
- **Categóricas**: sex, cp, fbs, restecg, exang, slope, ca, thal
- **Resultado**: label

Por lo tanto, observamos que no todas las variables cumplen con su naturaleza de dato. Procedemos entonces a estipular el tipo en cada una de ellas.

In [13]:
# Númericas - Continuas
data_train['age'] = data_train['age'].astype(int)
data_train['trestbps'] = data_train['trestbps'].astype(float)
data_train['chol'] = data_train['chol'].astype(float)
data_train['thalach'] = data_train['thalach'].astype(float)
data_train['oldpeak'] = data_train['oldpeak'].astype(float)

# Categóricas
data_train['sex'] = data_train['sex'].astype('category')
data_train['cp'] = data_train['cp'].astype('category')
data_train['fbs'] = data_train['fbs'].astype('category')
data_train['restecg'] = data_train['restecg'].astype('category')
data_train['exang'] = data_train['exang'].astype('category')
data_train['slope'] = data_train['slope'].astype('category')
data_train['ca'] = data_train['ca'].astype('category')
data_train['thal'] = data_train['thal'].astype('category')
data_train['label'] = data_train['label'].astype('category')

data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       732 non-null    int64   
 1   sex       732 non-null    category
 2   cp        732 non-null    category
 3   trestbps  685 non-null    float64 
 4   chol      727 non-null    float64 
 5   fbs       674 non-null    category
 6   restecg   732 non-null    category
 7   thalach   688 non-null    float64 
 8   exang     688 non-null    category
 9   oldpeak   683 non-null    float64 
 10  slope     637 non-null    category
 11  ca        483 non-null    category
 12  thal      563 non-null    category
 13  label     732 non-null    category
dtypes: category(9), float64(4), int64(1)
memory usage: 37.3 KB


Antes de tratar los nulos, estableciendo correctamente los tipos de datos, pasamos a hacer un estudio exhaustivo de las variables.

In [16]:
data_train.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
count,732.0,732.0,732.0,685.0,727.0,674.0,732.0,688.0,688.0,683.0,637.0,483.0,563.0,732.0
unique,,2.0,4.0,,,5.0,3.0,,4.0,,7.0,8.0,7.0,5.0
top,,1.0,4.0,,,0.0,0.0,,0.0,,2.0,-9.0,-9.0,0.0
freq,,578.0,391.0,,,415.0,439.0,,327.0,,187.0,230.0,210.0,327.0
mean,53.364754,,,131.975182,195.357634,,,138.132267,,0.881259,,,,
std,9.306868,,,19.203305,113.90813,,,25.963443,,1.11296,,,,
min,28.0,,,0.0,-9.0,,,60.0,,-2.6,,,,
25%,47.0,,,120.0,168.0,,,120.0,,0.0,,,,
50%,54.0,,,130.0,222.0,,,140.0,,0.5,,,,
75%,60.0,,,140.0,267.0,,,158.25,,1.55,,,,


In [21]:
print("Valores nulos por columna:")
print("--------------------------")
print(data_train.isnull().sum())

Valores nulos por columna:
--------------------------
age           0
sex           0
cp            0
trestbps     47
chol          5
fbs          58
restecg       0
thalach      44
exang        44
oldpeak      49
slope        95
ca          249
thal        169
label         0
dtype: int64


In [25]:
print("Valores negativos por columna:")
print("--------------------------")
print((data_train == -9).sum())

Valores negativos por columna:
--------------------------
age          0
sex          0
cp           0
trestbps     0
chol        16
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
label        0
dtype: int64
