# Clasificación de vinos de alta calidad con KNN

Importar librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set(style="whitegrid")

Cargar datos

In [None]:
# Load the dataset from the CSV file and preview its first rows.

csv_path = 'vino_knn.csv'
df = pd.read_csv(csv_path)
df.head()

EDA

In [None]:
# EDA: structure, summary statistics, missing values, target distribution
# and pairwise correlations of the raw dataframe.

# info() prints its report directly.
df.info()

# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the summary table must be passed to display() explicitly;
# otherwise it is computed and silently discarded.
display(df.describe().T)

print('Valores nulos:\n', df.isnull().sum())
print('\nDistribución ALTA_CALIDAD:\n', df['ALTA_CALIDAD'].value_counts())

# Annotated heatmap of the correlation matrix of all columns.
plt.figure(figsize=(6,5))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlación entre variables')
plt.show()

Separar variables

In [None]:
# Split the dataframe into the target vector y (column ALTA_CALIDAD) and the
# feature matrix X (every other column), then preview the features.

target_col = 'ALTA_CALIDAD'
y = df[target_col]
X = df.drop(columns=target_col)
X.head()

Escalar

In [None]:
# Standardise every feature column with StandardScaler and wrap the resulting
# array back into a DataFrame that keeps the original column names.

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_scaled.head()

Train/Test split

In [None]:
# Train/test split (70/30, stratified on the target, fixed seed), followed by
# scaling that is fitted on the training rows only.
#
# Splitting X_scaled would leak test-set statistics into training, because
# that scaler was fitted on every row of the dataset. Splitting the raw
# features and fitting the scaler on X_train keeps the test split unseen
# until evaluation. The same seed and stratification select the same rows.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Rebind `scaler` to the one learned from the training rows, and apply that
# single transformation to both splits (keeping column names and row index).
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_train.shape, X_test.shape

Modelo inicial

In [None]:
# Baseline model: k-NN with k=5, fitted on the training split and then used
# to predict the labels of the test split.

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

Perfil por clase

In [None]:
# Per-class profile: the mean of every column and the number of rows for
# each value of ALTA_CALIDAD.

by_class = df.groupby('ALTA_CALIDAD')
display(by_class.mean())
display(by_class.size())

Evaluación

In [None]:
# Evaluation of the baseline predictions on the test split: accuracy,
# confusion matrix, per-class report and (binary targets only) ROC / AUC.

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.4f}')

# Confusion matrix drawn as an annotated heatmap (rows: real, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_xlabel('Predicho')
ax_cm.set_ylabel('Real')
ax_cm.set_title('Matriz de confusión')
plt.show()

print(classification_report(y_test, y_pred))

# The ROC section only runs when the test labels contain exactly two classes.
is_binary = len(np.unique(y_test)) == 2
if is_binary:
    # Second probability column is used as the score for the ROC analysis.
    y_proba = knn.predict_proba(X_test)[:, 1]
    print('ROC AUC:', roc_auc_score(y_test, y_proba))
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig_roc, ax_roc = plt.subplots(figsize=(6,4))
    ax_roc.plot(fpr, tpr)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax_roc.plot([0,1],[0,1],'k--')
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.set_title('ROC Curve')
    plt.show()

Elegir K óptimo

In [None]:
# Choose k by 5-fold cross-validation on the training split.
#
# Scoring each candidate k on X_test would tune the hyper-parameter on the
# same rows that are used to report final performance, making that report
# optimistic. Cross-validating inside X_train leaves the test split untouched.

from sklearn.model_selection import cross_val_score

ks = range(1,21)

# Error rate for each k = 1 - mean cross-validated accuracy.
error_rate = [
    1 - cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for k in ks
]

plt.figure(figsize=(8,4))
plt.plot(ks, error_rate, marker='o')
plt.xlabel('k')
plt.ylabel('Tasa de error')
plt.title('Error rate vs k')
plt.xticks(ks)
plt.show()

# argmin returns the first minimum, so ties resolve to the smallest k.
best_k = ks[int(np.argmin(error_rate))]
print('Mejor k por menor error:', best_k)