In [1]:
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
root_path = Path(os.getcwd()).resolve().parent

In [3]:
sys.path.append(str(root_path))

In [4]:
pd.set_option('display.max_columns',500)

In [5]:
df = pd.read_csv("../data/processed/heart_attack_prediction_dataset1_processed.csv")

In [6]:
data = df.copy()

In [7]:
data

Unnamed: 0,Edad,Sexo,TipoDolorTorax,PresionArterialReposo,Colesterol,GlucosaEnAyunas,ECGReposo,FreqCardiacaMaxima,AnginaDeEsfuerzo,DescensoST,PendienteST,EnfermedadCardiaca
0,59,M,ASY,156,96.0,0,LVH,177,Y,1.6,Flat,1
1,57,M,ASY,135,413.0,0,Normal,125,Y,-0.1,Up,1
2,58,M,ATA,130,410.0,0,Normal,124,N,1.2,Up,1
3,65,M,ASY,134,149.0,0,ST,190,N,1.0,Flat,1
4,57,F,ATA,125,156.0,0,Normal,140,Y,-0.1,Flat,0
...,...,...,...,...,...,...,...,...,...,...,...,...
18557,53,M,ASY,161,8.0,0,Normal,100,Y,2.0,Up,0
18558,56,M,ASY,127,60.0,0,Normal,124,N,-0.1,Flat,0
18559,55,M,ASY,144,517.0,0,Normal,112,Y,-0.1,Up,1
18560,60,M,ASY,130,253.0,0,Normal,144,Y,0.1,Up,1


Transformar las varibales categoricas nominales a categoricas ordinales

In [8]:
# data['Sexo'].replace({'M':0, 'F':1}, inplace=True);
# data['TipoDolorTorax'].replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':3}, inplace=True);
# data['ECGReposo'].replace({'Normal':0, 'ST':1, 'LVH':2}, inplace=True);
# data['AnginaDeEsfuerzo'].replace({'N':0, 'Y':1}, inplace=True);
# data['PendienteST'].replace({'Up':0, 'Flat':1, 'Down':2}, inplace=True);

Tabla con descripcion de cada variable

Separación del conjunto de datos en features y target

In [9]:
X = data.drop(columns='EnfermedadCardiaca')

In [10]:
y = data['EnfermedadCardiaca']

Transformando las variables utilizando onehotencoder para evitar que el modelo interprete cada variable utilizando diferentes pesos

In [11]:
X_scaled = X.copy()
X_scaled = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,2,6,8,10])],
                         remainder='passthrough').fit_transform(X)

Analisis de la escala de cada variable

In [13]:
X.describe()

Unnamed: 0,Edad,PresionArterialReposo,Colesterol,GlucosaEnAyunas,FreqCardiacaMaxima,DescensoST
count,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0
mean,56.207844,132.015354,209.013719,0.043907,136.584204,0.6491
std,8.793589,18.440124,97.482676,0.204894,25.388117,0.998975
min,28.0,59.0,1.0,0.0,60.0,-1.6
25%,50.0,120.0,141.0,0.0,119.0,-0.1
50%,56.0,132.0,209.013719,0.0,137.0,0.2
75%,62.0,144.0,271.0,0.0,154.0,1.3
max,77.0,200.0,603.0,1.0,202.0,5.7


Se nota que los datos no están escalados, lo que puede causar problemas durante los estudos realizados aquí en este conjunto de datos. Utilizaremos el metodo StandardScaler de la libreria Sklearn para transformar nuestros datos.

In [14]:
scaler = MinMaxScaler() ##pruebar minmax scaler

In [15]:
X_scaled = scaler.fit_transform(X_scaled)

In [16]:
X_scaled_df = pd.DataFrame(X_scaled)

In [17]:
X_scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.632653,0.687943,0.157807,0.0,0.823944,0.438356
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.591837,0.539007,0.684385,0.0,0.457746,0.205479
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.612245,0.503546,0.679402,0.0,0.450704,0.383562
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.755102,0.531915,0.245847,0.0,0.915493,0.356164
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.591837,0.468085,0.257475,0.0,0.563380,0.205479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18557,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.510204,0.723404,0.011628,0.0,0.281690,0.493151
18558,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.571429,0.482270,0.098007,0.0,0.450704,0.205479
18559,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.551020,0.602837,0.857143,0.0,0.366197,0.205479
18560,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.653061,0.503546,0.418605,0.0,0.591549,0.232877


In [19]:
X_scaled_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0,18562.0
mean,0.211777,0.788223,0.54019,0.188019,0.220343,0.051449,0.206497,0.601713,0.19179,0.597888,0.402112,0.066157,0.501077,0.432766,0.57567,0.517839,0.345538,0.043907,0.539325,0.308096
std,0.408578,0.408578,0.498396,0.390737,0.414489,0.220918,0.404802,0.489558,0.393719,0.490337,0.490337,0.248562,0.500012,0.495472,0.179461,0.130781,0.161931,0.204894,0.17879,0.136846
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.44898,0.432624,0.232558,0.0,0.415493,0.205479
50%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.571429,0.51773,0.345538,0.0,0.542254,0.246575
75%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.693878,0.602837,0.448505,0.0,0.661972,0.39726
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Resumen de preprocesamento

* Separacion de los datos en X e y
    * X = Conjunto de todas las variables excepto la columna target
    * y = columna target, variable que identifica se el paciente tiene problema cardiaco o no.
* Tratamiento de la X
    * Transformacción de las variables a traves de la libreria OneHotEncoder
    * Escalar las variables a traves del metodo StandardScaler

Reducción de dimensionalidad utilizando PCA

In [20]:
pca = PCA(n_components=.95)

In [21]:
X_pca = pca.fit_transform(X_scaled)

In [22]:
X_pca

array([[ 1.13381346,  0.23339192, -0.77118868, ..., -0.082418  ,
        -0.0417583 , -0.07338805],
       [ 0.46632167, -1.02269523,  0.33569488, ..., -0.09316578,
        -0.03624417, -0.0349845 ],
       [-0.92438298, -0.47121129,  0.60980931, ..., -0.08166159,
        -0.11322882, -0.01311665],
       ...,
       [ 0.46723371, -1.02268102,  0.33489584, ..., -0.09401439,
        -0.03891227, -0.02640199],
       [ 0.46499136, -1.02261168,  0.33683977, ..., -0.09199755,
        -0.03225383, -0.04740113],
       [-0.25326485,  0.82086097, -0.60294715, ..., -0.07007082,
        -0.09207959, -0.02632416]])

In [23]:
pca.explained_variance_ratio_

array([0.17889136, 0.16941341, 0.13262867, 0.12992448, 0.12248744,
       0.07448541, 0.07283209, 0.0340473 , 0.02355214, 0.01542019])

In [24]:
pca.explained_variance_ratio_.cumsum()

array([0.17889136, 0.34830477, 0.48093343, 0.61085791, 0.73334536,
       0.80783076, 0.88066286, 0.91471015, 0.9382623 , 0.95368249])

* Varianza explicada:
    * Los dos primeros componentes retienen el 96% de la información relevante.
* Componentes marginales:
    * Los siguientes componentes aportan información mínima.
* Visualización y eficiencia:
    * Dos dimensiones facilitan la visualización y reducen la carga computacional."

Separacion del conjunto de datos entre datos de entrenamiento y prueba

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

Kernel ``rbf``

In [26]:
svm = SVC(kernel='rbf', random_state=42, C=10)
svm.fit(X_train, y_train)

In [27]:
svm_pred = svm.predict(X_test)

In [28]:
svm_pred

array([1, 0, 0, ..., 1, 1, 1])

In [29]:
print("Acuracia: %.2f%%" % (accuracy_score(y_test, svm_pred)*100))

Acuracia: 52.52%


In [30]:
confusion_matrix(y_test, svm_pred)

array([[ 264, 1493],
       [ 270, 1686]])

In [31]:
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.49      0.15      0.23      1757
           1       0.53      0.86      0.66      1956

    accuracy                           0.53      3713
   macro avg       0.51      0.51      0.44      3713
weighted avg       0.51      0.53      0.45      3713



Kernel ``linear``

In [32]:
svm = SVC(kernel='linear', random_state=42, C=10)
svm.fit(X_train, y_train)

In [33]:
svm_pred = svm.predict(X_test)

In [34]:
svm_pred

array([1, 1, 1, ..., 1, 1, 1])

In [35]:
print("Acuracia: %.2f%%" % (accuracy_score(y_test, svm_pred)*100))

Acuracia: 52.68%


In [36]:
confusion_matrix(y_test, svm_pred)

array([[   0, 1757],
       [   0, 1956]])

In [37]:
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1757
           1       0.53      1.00      0.69      1956

    accuracy                           0.53      3713
   macro avg       0.26      0.50      0.35      3713
weighted avg       0.28      0.53      0.36      3713



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross validation

In [38]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [39]:
model = SVC(kernel='rbf', random_state=42, C=10)
resultado = cross_val_score(model, X_pca, y, cv=kfold)

In [40]:
print("Acuracia média: %.2f%%" % (resultado.mean()*100))

Acuracia média: 53.75%
