# Modelos de Clasificación

In [None]:
!uv pip install ucimlrepo

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fetch the Breast Cancer Wisconsin (Original) dataset from the UCI repository.
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# Features and targets are exposed as pandas DataFrames.
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# Recode the target: 1 = malignant (original code 4), 0 = benign.
# `assign` builds a new frame instead of writing into the object handed back
# by the library, so the fetched data is left untouched and pandas has no
# reason to raise a chained-assignment warning. `y` stays a DataFrame.
y = y.assign(Class=(y["Class"] == 4).astype(int))

# Remove 'Bare_nuclei' from the predictors.
# NOTE(review): presumably dropped because it contains missing values -- confirm.
X = X.drop(columns='Bare_nuclei')

## Procesamiento de los datos

In [None]:
# 60/20/20 train / validation / test partition, stratified on the target.
# Step 1: hold out 40% of the rows as a temporary pool.
X_train, X_tem, y_train, y_tem = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Step 2: split the temporary pool in half -> validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_tem, y_tem, test_size=0.5, stratify=y_tem, random_state=42
)

In [None]:
# Keep the feature names; they label the coefficient tables further down.
var_names = X.columns

# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_z, X_val_z, X_test_z = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
model_rl = LogisticRegression()

# Fit on the standardized training split. y_train is a DataFrame (a column
# vector), which makes scikit-learn emit a DataConversionWarning, so it is
# flattened to 1-D first.
model_rl.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = model_rl.predict(X_val_z)
y_pred_score = model_rl.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the predictions of the baseline logistic regression.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)  # computed from scores, not labels

# Report: accuracy, confusion matrix, then precision / recall / ROC-AUC.
print(f"Accuracy del modelo: {accuracy}")
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

print("\nInforme de clasificación:")
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

In [None]:
# ROC curve on the validation split: false-positive and true-positive rates
# together with the score thresholds that produce them.
fpr, tpr, thresholds = roc_curve(y_true=y_val, y_score=y_pred_score)

In [None]:
# Plot the ROC curve against the diagonal of a random classifier.
plt.figure(num='ROC Regresion Logistica', figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='orange',
    label=f'Curva ROC (AUC = {roc_auc:.2f})',
)
plt.plot(
    [0, 1],
    [0, 1],
    color='darkblue',
    linestyle='--',
    label='Clasificador aleatorio',
)
plt.title('Curva ROC')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.legend()
plt.show()

## Cross Validation

In [None]:
# Leave-one-out cross-validation of the logistic regression on the training split.
loo = LeaveOneOut()

# Flatten the target once. y_train is a DataFrame, so indexing its 2-D
# `.values` would pass a column vector to `fit` (DataConversionWarning on every
# iteration) and store length-1 arrays instead of scalar labels in y_true_vec.
y_train_1d = np.ravel(y_train)

y_pred_vec = []  # predicted label of each held-out row
y_true_vec = []  # true label of each held-out row
for train_index, test_index in loo.split(X_train_z):
    X_train_cv, X_test_cv = X_train_z[train_index], X_train_z[test_index]
    y_train_cv, y_test_cv = y_train_1d[train_index], y_train_1d[test_index]

    # Fit a fresh model on every row except the held-out one.
    model_cv = LogisticRegression()
    model_cv.fit(X_train_cv, y_train_cv)

    # Predict the single held-out row.
    y_pred = model_cv.predict(X_test_cv)

    y_pred_vec.append(y_pred[0])
    y_true_vec.append(y_test_cv[0])

In [None]:
# Metrics pooled over all leave-one-out predictions.
accuracy = accuracy_score(y_true_vec, y_pred_vec)
precision = precision_score(y_true_vec, y_pred_vec)
recall = recall_score(y_true_vec, y_pred_vec)

# Report: accuracy, confusion matrix, then precision and recall.
print(f"accuracy del modelo: {accuracy}")
print("Matriz de confusión:")
print(confusion_matrix(y_true_vec, y_pred_vec))

print("\nInforme de clasificación:")
print(f'Precision: {precision}')
print(f'Recall: {recall}')

## K Folds

In [None]:
# Number of folds.
k = 3

# K-fold splitter over the training split (consecutive folds, no shuffling).
kf = KFold(n_splits=k)

# Flatten the target once: y_train is a DataFrame and indexing its 2-D
# `.values` would pass column vectors to `fit` (DataConversionWarning).
y_train_1d = np.ravel(y_train)

# Per-fold metric values.
recall_vec = []
precision_vec = []
accuracy_vec = []
for train_index, test_index in kf.split(X_train_z):
    X_train_cv, X_test_cv = X_train_z[train_index], X_train_z[test_index]
    y_train_cv, y_test_cv = y_train_1d[train_index], y_train_1d[test_index]

    # Fit a fresh model on the k-1 training folds.
    model_cv = LogisticRegression()
    model_cv.fit(X_train_cv, y_train_cv)

    # Predict the held-out fold.
    y_pred = model_cv.predict(X_test_cv)

    # Score this fold.
    accuracy = accuracy_score(y_test_cv, y_pred)
    precision = precision_score(y_test_cv, y_pred)
    recall = recall_score(y_test_cv, y_pred)

    recall_vec.append(recall)
    precision_vec.append(precision)
    accuracy_vec.append(accuracy)

In [None]:
# Average each metric across the folds and print it.
for metric_name, fold_values in (
    ('Recall', recall_vec),
    ('Precision', precision_vec),
    ('Accuracy', accuracy_vec),
):
    print(f'{metric_name}: {np.mean(fold_values)}')

In [None]:
# Alternative: let scikit-learn drive the cross-validation loop.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the training split.
# NOTE: this rebinds `model_rl` to a fresh LogisticRegression instance,
# replacing whatever was stored under that name before.
model_rl = LogisticRegression()
# The target is flattened to 1-D; passing the y_train DataFrame (a column
# vector) triggers a DataConversionWarning inside each fit.
accuracy_scores = cross_val_score(
    model_rl, X_train_z, np.ravel(y_train), cv=5, scoring='accuracy'
)
print(f'Accuracy (CV): {np.mean(accuracy_scores)}')

## Regresión Logística con Regularización

### Lasso

In [None]:
# L1-penalized (lasso) logistic regression, solved with liblinear.
model_rg_lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)

# Candidate values for C: 100 points log-spaced between 1e-3 and 1e3.
parameters = {'C': np.logspace(-3, 3, 100)}

# 5-fold grid search over C on the training split. The target is flattened to
# 1-D because the y_train DataFrame (a column vector) would trigger a
# DataConversionWarning on every internal fit.
clf = GridSearchCV(model_rg_lasso, parameters, cv=5)
clf.fit(X_train_z, np.ravel(y_train))

# Best C found by the search.
best_C = clf.best_params_['C']
print(f"Mejor valor de C: {best_C}")

In [None]:
# Show the selected C again (same value the grid-search cell already printed).
print(f"Mejor valor de C: {best_C}")

In [None]:
# Variable selection: one row per feature with the coefficient of the best
# estimator found by the grid search stored in `clf`.
df_vars = pd.DataFrame(
    {
        'vars': var_names.values,
        'Coef': clf.best_estimator_.coef_[0],
    }
)
df_vars

In [None]:
# Refit the lasso model with the best C found by the grid search.
model_rg_lasso = LogisticRegression(penalty='l1', C=best_C, solver='liblinear', random_state=0)
# Flatten the target: the y_train DataFrame is a column vector and would
# trigger a DataConversionWarning.
model_rg_lasso.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = model_rg_lasso.predict(X_val_z)
y_pred_score = model_rg_lasso.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the current y_pred / y_pred_score.
accuracy = accuracy_score(y_val, y_pred)
# The value is the accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

### Ridge

In [None]:
# L2-penalized (ridge) logistic regression, solved with lbfgs.
model_rg_ridge = LogisticRegression(penalty='l2', solver='lbfgs', random_state=0)

# Candidate values for C: 100 points log-spaced between 1e-3 and 1e3.
parameters = {'C': np.logspace(-3, 3, 100)}

# 5-fold grid search over C on the training split. The target is flattened to
# 1-D because the y_train DataFrame (a column vector) would trigger a
# DataConversionWarning on every internal fit.
clf = GridSearchCV(model_rg_ridge, parameters, cv=5)
clf.fit(X_train_z, np.ravel(y_train))

# Best C found by the search.
best_C = clf.best_params_['C']
print(f"Mejor valor de C: {best_C}")

In [None]:
# Coefficient table: one row per feature with the coefficient of the best
# estimator found by the grid search stored in `clf`.
coef_columns = {
    'vars': var_names.values,
    'Coef': clf.best_estimator_.coef_[0],
}
df_vars = pd.DataFrame(coef_columns)
df_vars

In [None]:
# Refit the ridge model with the best C found by the grid search.
model_rg_ridge = LogisticRegression(penalty='l2', C=best_C, solver='lbfgs', random_state=0)
# Flatten the target: the y_train DataFrame is a column vector and would
# trigger a DataConversionWarning.
model_rg_ridge.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = model_rg_ridge.predict(X_val_z)
y_pred_score = model_rg_ridge.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the current y_pred / y_pred_score.
accuracy = accuracy_score(y_val, y_pred)
# The value is the accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

## KNN

In [None]:
# k-nearest-neighbours classifier that votes among the 10 closest training points.
# NOTE: `k` is rebound here; it previously held the number of CV folds.
k = 10
knn = KNeighborsClassifier(n_neighbors=k)

# Fit on the standardized training split. The target is flattened to 1-D
# because the y_train DataFrame (a column vector) would trigger a
# DataConversionWarning.
knn.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = knn.predict(X_val_z)
y_pred_score = knn.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the current y_pred / y_pred_score.
accuracy = accuracy_score(y_val, y_pred)
# The value is the accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

# Árbol de Decisión

In [None]:
### Arbol de decisión
from sklearn.datasets import make_classification
from sklearn.tree import plot_tree

In [None]:
# 1. Synthetic dataset with two features (2-D so it can be plotted).
# NOTE: this rebinds X and y, which held the real dataset up to this point.
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

# 2. Depth-limited decision tree fitted on the synthetic data.
# NOTE: `clf` is rebound too; it previously held a grid-search object.
clf = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X, y)


In [None]:
# 3. Draw the fitted tree, colouring each node by its majority class.
plt.figure(figsize=(10, 6))
plot_tree(
    clf,
    feature_names=["x1", "x2"],
    class_names=["Clase 0", "Clase 1"],
    filled=True,
)
plt.title("Árbol de Decisión")
plt.show()

In [None]:
# 4. Decision boundary: classify every node of a dense grid covering the data
#    (padded by 1 unit on each side, 0.02 step).
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.02),
    np.arange(y_min, y_max, 0.02),
)

# Flatten the grid into (n_points, 2), predict, and restore the grid shape.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(grid_points).reshape(xx.shape)

# Filled contours show the predicted class; the samples are drawn on top.
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, edgecolor="k", cmap=plt.cm.RdYlBu)
plt.title("Frontera de Decisión del Árbol")
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

In [None]:
# Decision tree fitted on the real (standardized) data.
# random_state is fixed so results are reproducible, matching the other
# tree-based models in this notebook.
tree = DecisionTreeClassifier(random_state=42)

# Fit on the training split; the target is flattened to 1-D since y_train is a
# DataFrame (a column vector).
tree.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split with the tree itself. The previous version
# called `knn` here, so the metrics reported for the tree were really KNN's.
y_pred = tree.predict(X_val_z)
y_pred_score = tree.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the current y_pred / y_pred_score.
accuracy = accuracy_score(y_val, y_pred)
# The value is the accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

In [None]:
# Random forest with 100 trees and a fixed seed.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Flatten the target: the y_train DataFrame is a column vector and would
# trigger a DataConversionWarning.
forest.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = forest.predict(X_val_z)
y_pred_score = forest.predict_proba(X_val_z)[:, 1]

# Validation accuracy (printed by the next cell).
accuracy = accuracy_score(y_val, y_pred)

In [None]:
# `accuracy` holds an accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC on validation.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))
print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')

In [None]:
# Gradient boosting: 100 boosting stages, learning rate 0.1, fixed seed.
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
# Flatten the target: the y_train DataFrame is a column vector and would
# trigger a DataConversionWarning.
gbc.fit(X_train_z, np.ravel(y_train))

# Predict on the validation split: hard labels and the score for class 1.
y_pred = gbc.predict(X_val_z)
y_pred_score = gbc.predict_proba(X_val_z)[:, 1]

In [None]:
# Validation metrics for the current y_pred / y_pred_score.
accuracy = accuracy_score(y_val, y_pred)
# The value is the accuracy score, so it is labelled as such; precision is a
# different metric and is reported separately below.
print(f"Accuracy del modelo: {accuracy}")

# Confusion matrix followed by precision, recall and ROC-AUC.
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))
print("\nInforme de clasificación:")
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_score)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')