In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Definir la ruta al archivo dentro de Google Drive
file_path = '/content/drive/My Drive/IA_PROJECT_1_EDU/dataset_con_nan.csv'
data = pd.read_csv(file_path)

In [4]:
# Convertir columnas específicas a tipo categórico
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Convertir a tipo categórico
for col in categorical_columns:
    data[col] = data[col].astype('category')

# Verificar los cambios
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       732 non-null    float64 
 1   sex       732 non-null    category
 2   cp        732 non-null    category
 3   trestbps  685 non-null    float64 
 4   chol      576 non-null    float64 
 5   fbs       666 non-null    category
 6   restecg   732 non-null    category
 7   thalach   688 non-null    float64 
 8   exang     688 non-null    category
 9   oldpeak   683 non-null    float64 
 10  slope     485 non-null    category
 11  ca        253 non-null    category
 12  thal      353 non-null    category
 13  label     732 non-null    int64   
dtypes: category(8), float64(5), int64(1)
memory usage: 41.3 KB


In [5]:
# Calcular la moda de la columna 'ca'
moda_ca = data['ca'].mode()[0]

# Reemplazar los valores 9 en la columna 'ca' con la moda
data['ca'] = data['ca'].replace(9, moda_ca)

# Verificar el resultado
print(data['ca'].unique())

[1.0, NaN, 0.0, 2.0, 3.0]
Categories (4, float64): [0.0, 1.0, 2.0, 3.0]


In [6]:
# Contar los valores '?' y '-9' y '0' por separado en cada columna
count_question_marks = (data == '?').sum()
count_minus_nine = ((data == -9.0) | (data == '-9.0') | (data == '9.0') | (data == 9.0)).sum()  # Se compara tanto con -9 como con '-9.0' como texto
count_0 =  ((data == 0) | (data == '0')).sum()  # Se compara tanto con -9 como con '-9.0' como texto

# Crear un DataFrame con los resultados
count_table = pd.DataFrame({
    'Cuenta de "?":': count_question_marks,
    'Cuenta de "-9":': count_minus_nine,
    'Cuenta de "0"': count_0
})

# Mostrar la tabla
print(count_table)

          Cuenta de "?":  Cuenta de "-9":  Cuenta de "0"
age                    0                0              0
sex                    0                0            154
cp                     0                0              0
trestbps               0                0              0
chol                   0                0              0
fbs                    0                0            554
restecg                0                0            439
thalach                0                0              0
exang                  0                0            422
oldpeak                0                0            297
slope                  0                0              0
ca                     0                0            147
thal                   0                0              0
label                  0                0            327


### Rellenamos con media y moda

In [7]:
# RELLENAMOS NUMERICOS CON MEDIA O CATEGORICOS CON MEDIANA

# Rellenar NaN en columnas de tipo float con la media
for col in data.select_dtypes(include=['float']):
    data[col] = data[col].fillna(data[col].mean())

# Rellenar NaN en columnas de tipo category con la moda
for col in data.select_dtypes(include=['category']):
    data[col] = data[col].fillna(data[col].mode()[0])

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       732 non-null    float64 
 1   sex       732 non-null    category
 2   cp        732 non-null    category
 3   trestbps  732 non-null    float64 
 4   chol      732 non-null    float64 
 5   fbs       732 non-null    category
 6   restecg   732 non-null    category
 7   thalach   732 non-null    float64 
 8   exang     732 non-null    category
 9   oldpeak   732 non-null    float64 
 10  slope     732 non-null    category
 11  ca        732 non-null    category
 12  thal      732 non-null    category
 13  label     732 non-null    int64   
dtypes: category(8), float64(5), int64(1)
memory usage: 41.3 KB


PASAMOS TODAS LAS CATEGORICAS A ONE HOT ENCODING

---


In [9]:
import pandas as pd

# Lista de columnas categóricas
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Aplicar One-Hot Encoding asegurando que los valores sean 0 y 1 (int)
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype=int)

# Mostrar el resultado
print(data_encoded.head())

    age  trestbps        chol  thalach  oldpeak  label  sex_1.0  cp_2.0  \
0  51.0     125.0  213.000000    125.0      1.4      0        1       0   
1  54.0     120.0  237.000000    150.0      1.5      2        1       0   
2  63.0     140.0  246.821181    149.0      2.0      2        1       0   
3  52.0     140.0  246.821181    140.0      0.0      0        0       1   
4  55.0     140.0  217.000000    111.0      5.6      3        1       0   

   cp_3.0  cp_4.0  ...  restecg_1.0  restecg_2.0  exang_1.0  slope_2.0  \
0       0       0  ...            0            1          1          0   
1       1       0  ...            0            0          1          1   
2       0       1  ...            0            1          0          0   
3       0       0  ...            0            0          0          1   
4       0       1  ...            0            0          1          0   

   slope_3.0  ca_1.0  ca_2.0  ca_3.0  thal_6.0  thal_7.0  
0          0       1       0       0         

APLICAMOS AHORA SMOTE


---



In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.utils import resample
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


def balancear_clases(data, target_col, estrategia="SMOTE"):
    """
    Balancea un DataFrame utilizando técnicas como submuestreo, sobremuestreo, o combinadas.

    Parámetros:
        df (pd.DataFrame): Conjunto de datos a balancear.
        target_col (str): Nombre de la columna objetivo.
        estrategia (str): Técnica a usar ("submuestreo", "sobremuestreo", "combinado").

    Retorna:
        pd.DataFrame: DataFrame balanceado.
    """
    # Separar características (X) y objetivo (y)
    X = data_encoded.drop(columns=[target_col])
    y = data_encoded[target_col]

    # Elegir estrategia
    if estrategia == "submuestreo":
        sampler = RandomUnderSampler(random_state=42)
    elif estrategia == "sobremuestreo":
        sampler = SMOTE(random_state=42)
    elif estrategia == "combinado":
        sampler = SMOTEENN(random_state=42)  # Combina SMOTE y edición de vecinos
    else:
        raise ValueError("Estrategia desconocida. Usa 'submuestreo', 'sobremuestreo' o 'combinado'.")

    # Aplicar balanceo
    X_res, y_res = sampler.fit_resample(X, y)

    # Reconstruir DataFrame balanceado
    balanced_df = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res, columns=[target_col])], axis=1)
    return balanced_df


balanced_data = balancear_clases(data_encoded, "label", estrategia="combinado")
balanced_data_sub = balancear_clases(data_encoded, "label", estrategia="submuestreo")
print(balanced_data["label"].value_counts())
print(balanced_data_sub["label"].value_counts())

label
4    214
3    167
2    149
1     90
0     57
Name: count, dtype: int64
label
0    34
1    34
2    34
3    34
4    34
Name: count, dtype: int64


NORMALIZAMOS AHORA LAS VARIABLES NUMERICAS

---



In [11]:
# Selección de Características
X = balanced_data.drop(columns =['label'])
y = balanced_data['label']

len(X), len(y), X.iloc[0], y.iloc[0]

(677,
 677,
 age             35.0
 trestbps       122.0
 chol           192.0
 thalach        174.0
 oldpeak          0.0
 sex_1.0          1.0
 cp_2.0           1.0
 cp_3.0           0.0
 cp_4.0           0.0
 fbs_1.0          0.0
 restecg_1.0      0.0
 restecg_2.0      0.0
 exang_1.0        0.0
 slope_2.0        0.0
 slope_3.0        0.0
 ca_1.0           0.0
 ca_2.0           0.0
 ca_3.0           0.0
 thal_6.0         0.0
 thal_7.0         0.0
 Name: 0, dtype: float64,
 0)

In [12]:
# Hacemos un split de los datos
from sklearn.model_selection import train_test_split


X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Verificar
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(473, 20) (473,)
(102, 20) (102,)
(102, 20) (102,)


In [13]:
from sklearn.preprocessing import StandardScaler
# Normalizar las características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [14]:
X_train[0], y_train[0]

(array([ 0.8658503 , -1.51783109, -0.01096867, -0.37186034, -0.93046994,
         0.42025815, -0.18711211, -0.29556103,  0.79084104, -0.28268123,
        -0.32821995, -0.31622777,  1.43924583,  0.72173425, -0.1392715 ,
        -0.15430335, -0.1392715 , -0.06516352, -0.09235142, -1.74434373]),
 0)

# AHORA SERIA YA PROBAR LOS MODELOS

LOGISTIC REGRESSION

In [15]:
# import pandas as pd
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# import matplotlib.pyplot as plt
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score , recall_score


# # Implementación de Regresión Logística
# model = LogisticRegression(max_iter=1000)
# #model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=1000)
# model.fit(X_train, y_train)
# # Predecir y Evaluar el Modelo
# y_pred = model.predict(X_val)

# len(y_pred), y_pred[0]


# # Evaluar el modelo
# accuracy = accuracy_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred, average='weighted')
# print(f"Precisión del modelo: {accuracy}")
# print(f'F1-Score en el conjunto de validación: {f1}')

RANDOM FOREST

In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score

from sklearn.ensemble import RandomForestClassifier

# Crear el modelo de Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Entrenar el modelo
rf_model.fit(X_train, y_train)

# Hacer predicciones
y_pred = rf_model.predict(X_val)

# Evaluar el modelo
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')
print(f"Precisión del modelo: {accuracy}")
print(f'F1-Score en el conjunto de validación: {f1}')

Precisión del modelo: 0.7941176470588235
F1-Score en el conjunto de validación: 0.7925853963282182


XGBOOST

In [17]:
# from xgboost import XGBClassifier

# # Crear el modelo de XGBoost
# xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, use_label_encoder=True)

# # Entrenar el modelo
# xgb_model.fit(X_train, y_train)

# # Hacer predicciones
# y_pred = xgb_model.predict(X_val)
# print(y_pred)
# # Evaluar el modelo
# accuracy = accuracy_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred, average='weighted')
# recall = recall_score(y_val, y_pred, average='macro')

# print(f"Precisión del modelo: {accuracy}")
# print(f'F1-Score en el conjunto de validación: {f1}')
# print("Recall:", recall)

SVM

In [18]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report

# # Crear el modelo de SVM
# svm_model = SVC(kernel='rbf', C=1, random_state=42)

# # Entrenar el modelo
# svm_model.fit(X_train, y_train)

# # Hacer predicciones
# y_pred = svm_model.predict(X_val)
# print(y_pred)
# # Evaluar el modelo
# accuracy = accuracy_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred, average='macro')  # 'macro' para considerar cada clase por igual
# recall = recall_score(y_val, y_pred, average='macro')

# print("Accuracy:", accuracy)
# print("F1 Score:", f1)
# print("Recall:", recall)

KNN

In [19]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, f1_score, recall_score

# # Crear el modelo de KNN
# knn_model = KNeighborsClassifier(n_neighbors=5)  # n_neighbors es el número de vecinos

# # Entrenar el modelo
# knn_model.fit(X_train, y_train)

# # Hacer predicciones en X_val
# y_pred = knn_model.predict(X_val)

# # Evaluar el modelo
# accuracy = accuracy_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred, average='macro')  # 'macro' para considerar cada clase por igual
# recall = recall_score(y_val, y_pred, average='macro')

# print("Accuracy:", accuracy)
# print("F1 Score:", f1)
# print("Recall:", recall)

# HACEMOS AHORA LO MISMO CON EL TEST

In [20]:
print(rf_model)

RandomForestClassifier(random_state=42)


In [21]:
# Definir la ruta al archivo dentro de Google Drive
file_path = '/content/drive/MyDrive/IA_PROJECT_1_EDU/dataset_con_nan_test.csv'
data_test = pd.read_csv(file_path)

In [22]:
data_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,57.0,1.0,4.0,156.0,173.0,0.0,2.0,119.0,1.0,3.0,3.0,,
1,52.0,1.0,2.0,160.0,196.0,0.0,0.0,165.0,0.0,0.0,,,
2,48.0,1.0,2.0,100.0,,0.0,0.0,100.0,0.0,0.0,,,
3,62.0,1.0,4.0,115.0,,,0.0,128.0,1.0,2.5,3.0,,
4,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,54.0,1.0,4.0,200.0,198.0,0.0,0.0,142.0,1.0,2.0,2.0,,
180,55.0,1.0,2.0,110.0,214.0,1.0,1.0,180.0,0.0,,,,
181,67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0
182,59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0


In [23]:
# Convertir columnas específicas a tipo categórico
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Convertir a tipo categórico
for col in categorical_columns:
    data_test[col] = data_test[col].astype('category')

# Verificar los cambios
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       184 non-null    float64 
 1   sex       184 non-null    category
 2   cp        184 non-null    category
 3   trestbps  172 non-null    float64 
 4   chol      139 non-null    float64 
 5   fbs       161 non-null    category
 6   restecg   182 non-null    category
 7   thalach   173 non-null    float64 
 8   exang     173 non-null    category
 9   oldpeak   171 non-null    float64 
 10  slope     123 non-null    category
 11  ca        56 non-null     category
 12  thal      80 non-null     category
dtypes: category(8), float64(5)
memory usage: 9.9 KB


In [24]:
# Calcular la moda de la columna 'ca'
moda_ca = data_test['ca'].mode()[0]

# Reemplazar los valores 9 en la columna 'ca' con la moda
data_test['ca'] = data_test['ca'].replace(9, moda_ca)

# Verificar el resultado
print(data_test['ca'].unique())

[NaN, 0.0, 1.0, 3.0, 2.0]
Categories (4, float64): [0.0, 1.0, 2.0, 3.0]


  data_test['ca'] = data_test['ca'].replace(9, moda_ca)


In [25]:
# Contar los valores '?' y '-9' y '0' por separado en cada columna
count_question_marks = (data_test == '?').sum()
count_minus_nine = ((data_test == -9.0) | (data_test == '-9.0') | (data_test == '9.0') | (data_test == 9.0)).sum()  # Se compara tanto con -9 como con '-9.0' como texto
count_0 =  ((data_test == 0) | (data_test == '0')).sum()  # Se compara tanto con -9 como con '-9.0' como texto

# Crear un DataFrame con los resultados
count_table = pd.DataFrame({
    'Cuenta de "?":': count_question_marks,
    'Cuenta de "-9":': count_minus_nine,
    'Cuenta de "0"': count_0
})

# Mostrar la tabla
print(count_table)

          Cuenta de "?":  Cuenta de "-9":  Cuenta de "0"
age                    0                0              0
sex                    0                0             40
cp                     0                0              0
trestbps               0                0              0
chol                   0                0              0
fbs                    0                0            136
restecg                0                0            110
thalach                0                0              0
exang                  0                0            103
oldpeak                0                0             72
slope                  0                0              0
ca                     0                0             34
thal                   0                0              0


In [26]:
# RELLENAMOS NUMERICOS CON MEDIA O CATEGORICOS CON MEDIANA

# Rellenar NaN en columnas de tipo float con la media
for col in data_test.select_dtypes(include=['float']):
    data_test[col] = data_test[col].fillna(data_test[col].mean())

# Rellenar NaN en columnas de tipo category con la moda
for col in data_test.select_dtypes(include=['category']):
    data_test[col] = data_test[col].fillna(data_test[col].mode()[0])

In [27]:
# Obtener los valores únicos de cada columna
for column in data_test.columns:
    print(f"Valores únicos en la columna '{column}':")
    print(data_test[column].unique())
    print()  # Línea en blanco para separar cada columna

Valores únicos en la columna 'age':
[57. 52. 48. 62. 51. 54. 45. 64. 70. 32. 55. 60. 38. 50. 74. 72. 65. 67.
 77. 33. 56. 44. 47. 68. 39. 59. 58. 61. 46. 53. 40. 34. 63. 35. 42. 43.
 66. 71. 69. 37. 29.]

Valores únicos en la columna 'sex':
[1.0, 0.0]
Categories (2, float64): [0.0, 1.0]

Valores únicos en la columna 'cp':
[4.0, 2.0, 3.0, 1.0]
Categories (4, float64): [1.0, 2.0, 3.0, 4.0]

Valores únicos en la columna 'trestbps':
[156.         160.         100.         115.         110.
 130.         125.         140.         120.         144.
 155.          95.         150.         124.         132.80813953
 118.         129.         128.         152.         145.
  80.         106.         101.         108.         142.
 105.         154.         200.         138.         126.
 158.         102.         132.         180.         136.
 112.         170.         122.         135.        ]

Valores únicos en la columna 'chol':
[173.         196.         246.58273381 175.         201.
 25

PASAMOS TODAS LAS CATEGORICAS A ONE HOT ENCODING

---


In [28]:
import pandas as pd

# Lista de columnas categóricas
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Aplicar One-Hot Encoding asegurando que los valores sean 0 y 1 (int)
data_test = pd.get_dummies(data_test, columns=categorical_columns, drop_first=True, dtype=int)

# Mostrar el resultado
print(data_test.head())

    age  trestbps        chol  thalach  oldpeak  sex_1.0  cp_2.0  cp_3.0  \
0  57.0     156.0  173.000000    119.0      3.0        1       0       0   
1  52.0     160.0  196.000000    165.0      0.0        1       1       0   
2  48.0     100.0  246.582734    100.0      0.0        1       1       0   
3  62.0     115.0  246.582734    128.0      2.5        1       0       0   
4  51.0     110.0  175.000000    123.0      0.6        1       0       1   

   cp_4.0  fbs_1.0  restecg_1.0  restecg_2.0  exang_1.0  slope_2.0  slope_3.0  \
0       1        0            0            1          1          0          1   
1       0        0            0            0          0          1          0   
2       0        0            0            0          0          1          0   
3       1        0            0            0          1          0          1   
4       0        0            0            0          0          0          0   

   ca_1.0  ca_2.0  ca_3.0  thal_6.0  thal_7.0  
0       

NORMALIZAMOS AHORA LAS VARIABLES NUMERICAS

---



In [29]:
from sklearn.preprocessing import StandardScaler
# Normalizar las características
scaler = StandardScaler()
X_test = scaler.fit_transform(data_test)

In [30]:
X_test[0]

array([ 0.29014836,  1.29930679, -1.61933028, -0.64794676,  2.24356589,
        0.52704628, -0.48466414, -0.49319696,  0.87705802, -0.39652579,
       -0.46748588,  1.92819831,  1.27615494, -1.51185789,  3.7859389 ,
       -0.2757234 , -0.18359702, -0.12874232, -0.22677868, -0.45883147])

In [33]:
y_test= rf_model.predict(X_test)
# print("Informe de Clasificación:\n", classification_report(y_test, y_pred))
# print("Matriz de Confusión:\n", confusion_matrix(y_test, y_pred))
# print("ROC AUC:", roc_auc_score(y_test, y_pred))
len(y_test)

184