In [162]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score


In [163]:
ruta = '/home/roko/cursos/mineria/lab2/datos_sin_out_balanceados/wine_so_2_clases.csv'
df_wine = pd.read_csv(ruta)
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Calidad
0,7.4,0.590,0.08,4.4,0.086,6.0,29.0,0.99740,3.38,0.50,9.0,Baja
1,5.7,1.130,0.09,1.5,0.172,7.0,19.0,0.99400,3.50,0.48,9.8,Baja
2,8.8,0.610,0.30,2.8,0.088,17.0,46.0,0.99760,3.26,0.51,9.3,Baja
3,4.6,0.520,0.15,2.1,0.054,8.0,65.0,0.99340,3.90,0.56,13.1,Baja
4,8.3,0.675,0.26,2.1,0.084,11.0,43.0,0.99760,3.31,0.53,9.2,Baja
...,...,...,...,...,...,...,...,...,...,...,...,...
1196,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,Baja
1197,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,Alta
1198,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,Alta
1199,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,Baja


# Preprocesado del df

In [164]:
# Preprocesamiento: convertir la columna 'Calidad' a valores numéricos (0 = Baja, 1 = Alta)
df_wine['Calidad'] = df_wine['Calidad'].map({'Baja': 0, 'Alta': 1})

# Eliminar las columnas 'fixed acidity' y 'density' porque son multicolineales
df_wine = df_wine.drop(columns=['fixed acidity', 'density'])

In [165]:
# Estandarizar

In [166]:
# Separar características y etiquetas
X = df_wine.drop(columns=['Calidad'])  # Eliminar la columna de 'Calidad'
y = df_wine['Calidad']  # La variable objetivo

# Inicializar el escalador
scaler = StandardScaler()

# Estandarizar los datos
X_scaled = scaler.fit_transform(X)

# Convertir el resultado a un DataFrame
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_wine = pd.concat([df_scaled, y.reset_index(drop=True)], axis=1)
df_wine

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,Calidad
0,0.379473,-0.918915,4.158348,0.470039,-1.023011,-0.492965,0.397503,-1.118965,-1.385780,0
1,3.578531,-0.863820,-1.346968,5.901724,-0.907765,-0.875526,1.287346,-1.288358,-0.581824,0
2,0.497957,0.293185,1.120932,0.596357,0.244694,0.157388,-0.492341,-1.034269,-1.084296,0
3,-0.035220,-0.533247,-0.207937,-1.551054,-0.792519,0.884254,4.253491,-0.610787,2.734491,0
4,0.883029,0.072803,-0.207937,0.343720,-0.446782,0.042620,-0.121572,-0.864876,-1.184791,0
...,...,...,...,...,...,...,...,...,...,...
1196,0.438715,-0.918915,-0.397776,0.722675,1.973382,0.080876,0.916578,-0.441394,0.121636,0
1197,0.142506,-0.808725,-0.018099,-1.045781,2.780103,0.348669,1.435654,1.083141,0.825097,1
1198,-0.094461,-0.643438,0.171740,-0.161553,1.627644,-0.072148,0.694117,0.998444,0.624108,1
1199,0.705303,-0.698534,-0.397776,-0.224712,1.973382,0.080876,1.806422,0.659659,-0.179847,0


In [167]:
# Separar variables independientes (X) y dependiente (y)
X = df_wine.drop(columns=['Calidad'])  # Todas las columnas excepto 'Calidad'
y = df_wine['Calidad']  # La clase

In [194]:
def entrenar_modelo(X_train, y_train):
    # Crear y entrenar un modelo de regresión logística
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model

In [224]:
def generar_modelos(X, y, num_modelos, porcentaje = 0.6):
    modelos = []
    n = len(X)
    
    # Calcular el número de muestras según el porcentaje
    num_muestras = int(n * porcentaje)

    for _ in range(num_modelos):
        # Muestreo aleatorio con reemplazo
        indices = np.random.choice(n, size=num_muestras, replace=True)
        #print(indices)
        X_sample = X.iloc[indices]  # Usar iloc para seleccionar filas por índice
        y_sample = y.iloc[indices]
        
        # Entrenar el modelo con la muestra
        modelo = entrenar_modelo(X_sample, y_sample)
        modelos.append(modelo)
    
    return modelos

In [225]:
from sklearn.metrics import classification_report, confusion_matrix

def predecir_con_ensamble(modelos, X_test, y_test):
    # Hacer predicciones con todos los modelos
    predicciones = np.array([modelo.predict(X_test) for modelo in modelos])
    #print(predicciones,len(predicciones), len(predicciones[1]))
    
    # Votación simple: obtener la predicción más común (moda) para cada observación
    pred_final = mode(predicciones, axis=0).mode[0]
    #print(pred_final,len(pred_final))
    
    # Calcular la matriz de confusión
    conf_matrix = confusion_matrix(y_test, pred_final)
    
    # Obtener un informe de clasificación con métricas
    report = classification_report(y_test, pred_final, output_dict=True)
    
    # Crear un DataFrame con las predicciones y los valores reales
    
    return pred_final, conf_matrix, report

In [226]:
# Separar variables independientes (X) y dependiente (y)
X = df_wine.drop(columns=['Calidad'])  # Todas las columnas excepto 'Calidad'
y = df_wine['Calidad']  # La clase\
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("train: ",len(X_train))
print("test: ", len(X_test))

train:  840
test:  361


In [227]:
# Generar 10 modelos
modelos = generar_modelos(X_train, y_train, num_modelos=400)

In [228]:
# Predecir con el ensamble de modelos
predicciones_finales, matriz_confusion, metrics_report = predecir_con_ensamble(modelos, X_test, y_test)

# Imprimir la matriz de confusión
print("Matriz de Confusión:")
print(matriz_confusion)

# Imprimir el informe de clasificación
print("\nInforme de Clasificación:")
print(metrics_report)



Matriz de Confusión:
[[116  42]
 [ 56 147]]

Informe de Clasificación:
{'0': {'precision': 0.6744186046511628, 'recall': 0.7341772151898734, 'f1-score': 0.703030303030303, 'support': 158}, '1': {'precision': 0.7777777777777778, 'recall': 0.7241379310344828, 'f1-score': 0.75, 'support': 203}, 'accuracy': 0.7285318559556787, 'macro avg': {'precision': 0.7260981912144703, 'recall': 0.7291575731121781, 'f1-score': 0.7265151515151516, 'support': 361}, 'weighted avg': {'precision': 0.7325402449411984, 'recall': 0.7285318559556787, 'f1-score': 0.7294426257030134, 'support': 361}}


In [184]:
# Separar variables independientes (X) y dependiente (y)
X = df_wine.drop(columns=['Calidad'])  # Todas las columnas excepto 'Calidad'
y = df_wine['Calidad']  # La clase\
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)