**COMPARACIÓN DE MODELOS DE MACHINE LEARNING PARA LA PREDICCIÓN DEL SCORE CREDITICIO**

**Preprocesamiento de datos**

In [None]:
# Librerías 
from sklearn.model_selection import GridSearchCV  # Búsqueda de hiperparámetros mediante validación cruzada.
import matplotlib.pyplot as plt  # Librería de visualización de datos en Python.
from sklearn.preprocessing import StandardScaler  # Preprocesamiento de datos: estandarización y codificación one-hot.
from sklearn.pipeline import Pipeline  # Encadenamiento de pasos de procesamiento y modelado.
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score  # Métricas de evaluación: precisión, área bajo la curva ROC, sensibilidad, especificidad, puntuación F1, error cuadrático medio, coeficiente de determinación.
from sklearn.feature_selection import SelectKBest, mutual_info_classif  # Selección de características: mejores K características, información mutua.
import pandas as pd

In [None]:
# Instalación de complementos para leer un archivo en Excel
!pip install xlrd==2.0.1

In [None]:
# Location of the source workbook.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
archivo_excel = 'D:/backups/phising 2.0/data science/dataset5 CLUB v2.0.xls'

# sheet_name=None makes pandas load every sheet into a dict keyed by sheet name.
hojas = pd.read_excel(archivo_excel, sheet_name=None)

# Pick the first two sheets by position.
nombres_hojas = list(hojas)
df_sheet1 = hojas[nombres_hojas[0]]  # first sheet
df_sheet2 = hojas[nombres_hojas[1]]  # second sheet

# Stack both sheets row-wise and renumber the index from 0.
df = pd.concat([df_sheet1, df_sheet2], ignore_index=True)

In [None]:
# Print a summary of the combined DataFrame (columns, dtypes, non-null counts).
df.info()

In [8]:
# Fill missing values with the mean of each numeric column.
# Only numeric columns have a mean, so gaps in the other columns are left as they are.
medias_numericas = df.mean(numeric_only=True)
df = df.fillna(value=medias_numericas)

In [9]:
# Remove the column-count limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

In [10]:
# Names of the columns stored as object or category dtype (the ones to be encoded).
tipos_categoricos = ['object', 'category']
categorical_columns = df.select_dtypes(include=tipos_categoricos).columns

In [None]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Encode every categorical column as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# code -> original value mapping of each column stays available
# (e.g. label_encoders[col].inverse_transform(...)).
label_encoders = {}

for col in categorical_columns:
    # FECHAADJUDICACION is parsed with pd.to_datetime in a later cell. If it was read
    # as text it would show up here, and turning it into integer codes first would
    # make that parsing produce meaningless dates, so it is skipped.
    if col == 'FECHAADJUDICACION':
        continue
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [None]:
# Replace FECHAADJUDICACION with two numeric features: its year and its month.
fechas_adjudicacion = pd.to_datetime(df['FECHAADJUDICACION'])

df = (
    df.assign(
        ANIO_ADJUDICACION=fechas_adjudicacion.dt.year,
        MES_ADJUDICACION=fechas_adjudicacion.dt.month,
    )
    .drop(columns='FECHAADJUDICACION')
)

In [13]:
# Remove the 'INGRESOSANUALES_1' and 'TEACONSEGURO_1' columns.
columnas_descartadas = ['INGRESOSANUALES_1', 'TEACONSEGURO_1']
df = df.drop(columns=columnas_descartadas)

In [None]:
# StandardScaler
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise.
# SCORE (the modelling target) is part of the list, so from this cell on
# df['SCORE'] holds standardised values; the modelling cells cut it at -3, -2, -1 and 0.
# The scaler is fitted on the whole DataFrame, i.e. before any train/test split.
columns_to_scale = [
    'SCORE',
    'ACTIVOSTOTALES',
    'PASIVOSTOTALES',
    'INGRESOSANUALES',
    'EGRESOSANUALES',
    'CARGASFAMILIARES',
    'EDAD',
    'AHORROS',
    'DPFS',
    'DEUDASVIGENTES',
    'CUOTASTOTALESVIGENTES',
    'NUMPRESTVIGENTES',
    'MONTOPRESTAMO',
    'SALDOPRESTAMO',
    'NUMEROCUOTAS',
    'TEACONSEGURO',
    'TASAINTERES',
    'VALORPROMEDIOCUOTA',
    'DIASMOROSIDAD',
    'NUMPRESTMORA',
    'ANIOSTRABAJO',
]

scaler = StandardScaler()
valores_escalados = scaler.fit_transform(df[columns_to_scale])
df[columns_to_scale] = valores_escalados
print(df.head())


In [None]:
# Display the preprocessed DataFrame as the cell output.
df

**RandomForestRegressor**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

# Regression task: SCORE is the target, every other column is a feature.
y = df['SCORE']
X = df.drop(columns='SCORE')

# Hold out 30% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 1 selects features using the importances of a 100-tree forest;
# step 2 is the forest whose hyper-parameters are tuned below.
selector = SelectFromModel(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
)
regressor = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline = Pipeline(steps=[
    ('feature_selection', selector),
    ('regressor', regressor),
])

# Hyper-parameter grid for the final regressor (the selector is not tuned).
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# 5-fold grid search on the training split, ranked by negative MSE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted on the whole training split with the best parameters.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the held-out test split.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for nombre_metrica, valor in [
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R²)", r2),
]:
    print(f"{nombre_metrica}: {valor:.4f}")

# 5-fold cross-validation of the best pipeline over the full data set
# (train and test rows together); scores are negative MSE, hence the sign flip.
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print("Cross-validated RMSE:", cv_rmse.mean())


**DecisionTreeClassifier**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Discretise the (standardised) SCORE into five ordered classes.
# The outer edges are open-ended so there are always exactly six edges for the
# five labels. Deriving them from SCORE.min()/max() breaks when the minimum is
# >= -3 or the maximum is <= 0: edges collapse or change order, which either
# raises in pd.cut or silently shifts every label by one interval.
bins = [float('-inf'), -3, -2, -1, 0, float('inf')]
labels = ['Muy Bajo', 'Bajo', 'Medio-Bajo', 'Medio-Alto', 'Alto']
y_categorico = pd.cut(df['SCORE'], bins=bins, labels=labels, include_lowest=True)

# Features are every column except SCORE; the target is the binned SCORE.
X = df.drop('SCORE', axis=1)
y = y_categorico
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# imblearn pipeline: the over-sampler only acts while fitting, so inside the grid
# search it is applied to the training folds and never to the validation fold.
pipeline = ImbPipeline(steps=[
    ('oversampling', RandomOverSampler(random_state=42)),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=15)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Hyper-parameter grid for the decision tree.
param_grid = {
    'classifier__max_depth': [3, 5, 10, None],             # tree depth
    'classifier__min_samples_split': [2, 5, 10],           # minimum samples to split a node
    'classifier__min_samples_leaf': [1, 2, 4],             # minimum samples in a leaf
    'classifier__criterion': ['gini', 'entropy']           # split quality criterion
}

# 5-fold grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predict the held-out test split with the best pipeline.
y_pred = best_model.predict(X_test)

# Class-frequency-weighted metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# labels=labels lists the classes from lowest to highest score band instead of
# alphabetically, so rows/columns of both reports read in ordinal order.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=labels))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=labels))


**GaussianNB**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt

# Histogram of the (standardised) SCORE values.
plt.figure(figsize=(10, 6))
plt.hist(df['SCORE'], bins=50, color='blue', edgecolor='black')
plt.title('Distribución de los valores de SCORE', fontsize=15)
plt.xlabel('SCORE', fontsize=12)
plt.ylabel('Frecuencia', fontsize=12)
plt.show()

# Discretise SCORE into five ordered classes.
# The outer edges are open-ended so there are always exactly six edges for the
# five labels. Deriving them from SCORE.min()/max() breaks when the minimum is
# >= -3 or the maximum is <= 0 (edges collapse or change order).
bins = [float('-inf'), -3, -2, -1, 0, float('inf')]
labels = ['Muy Bajo', 'Bajo', 'Medio-Bajo', 'Medio-Alto', 'Alto']
y_categorico = pd.cut(df['SCORE'], bins=bins, labels=labels, include_lowest=True)

# Features are every column except SCORE; hold out 30% of the rows for testing.
X = df.drop('SCORE', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y_categorico, test_size=0.3, random_state=42)

# This section evaluates Gaussian Naive Bayes. The cell previously built a
# DecisionTreeClassifier (copied from the decision-tree section), so its
# results duplicated that model instead of measuring GaussianNB.
pipeline = ImbPipeline(steps=[
    ('oversampling', RandomOverSampler(random_state=42)),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=15)),
    ('classifier', GaussianNB())
])

# var_smoothing is the tunable GaussianNB hyper-parameter: the fraction of the
# largest feature variance added to every variance for numerical stability.
param_grid = {
    'classifier__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# 5-fold grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predict the held-out test split with the best pipeline.
y_pred = best_model.predict(X_test)

# Class-frequency-weighted metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Classification report and confusion matrix, with classes listed from the
# lowest to the highest score band (labels=labels) rather than alphabetically.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=labels))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=labels))

**KNeighborsClassifier**

In [None]:
# Imports used by this cell
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

X = df.drop('SCORE', axis=1)  # features
y = df['SCORE']  # numeric target (the model below is trained on its binned version)

# Discretise SCORE into five ordered classes.
# The outer edges are open-ended so there are always exactly six edges for the
# five labels. Deriving them from SCORE.min()/max() breaks when the minimum is
# >= -3 or the maximum is <= 0 (edges collapse or change order).
bins = [float('-inf'), -3, -2, -1, 0, float('inf')]
labels = ['Muy Bajo', 'Bajo', 'Medio-Bajo', 'Medio-Alto', 'Alto']
y_categorico = pd.cut(df['SCORE'], bins=bins, labels=labels, include_lowest=True)

# Class distribution
print(y_categorico.value_counts())

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorico, test_size=0.3, random_state=42)

# imblearn pipeline: over-sample, keep the 15 best features, then classify.
# NOTE(review): only the columns scaled during preprocessing are standardised;
# encoded categoricals and the year/month features are not, and k-NN distances
# are scale-sensitive. Consider adding a scaler step here.
pipeline = ImbPipeline(steps=[
    ('oversampling', RandomOverSampler(random_state=42)),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=15)),
    ('classifier', KNeighborsClassifier())
])

# Hyper-parameter grid for k-NN.
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9],  # number of neighbours
    'classifier__weights': ['uniform', 'distance'],  # neighbour weighting
    'classifier__p': [1, 2]  # Minkowski power: Manhattan (p=1) or Euclidean (p=2)
}

# 5-fold grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Pipeline refitted with the best hyper-parameters.
best_model = grid_search.best_estimator_

# Predict the held-out test split.
y_pred = best_model.predict(X_test)

# Class-frequency-weighted metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# labels=labels lists the classes from lowest to highest score band instead of
# alphabetically, so rows/columns of both reports read in ordinal order.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=labels))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=labels))

**LinearDiscriminantAnalysis**

In [None]:
# Imports used by this cell
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

X = df.drop('SCORE', axis=1)  # features
y = df['SCORE']               # numeric target (the model below is trained on its binned version)

# Discretise SCORE into five ordered classes.
# The outer edges are open-ended so there are always exactly six edges for the
# five labels. Deriving them from SCORE.min()/max() breaks when the minimum is
# >= -3 or the maximum is <= 0 (edges collapse or change order).
bins = [float('-inf'), -3, -2, -1, 0, float('inf')]
labels = ['Muy Bajo', 'Bajo', 'Medio-Bajo', 'Medio-Alto', 'Alto']
y_categorico = pd.cut(df['SCORE'], bins=bins, labels=labels, include_lowest=True)

# Class distribution
print(y_categorico.value_counts())

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorico, test_size=0.3, random_state=42)

# imblearn pipeline: over-sample, keep the 15 best features, then classify.
pipeline = ImbPipeline(steps=[
    ('oversampling', RandomOverSampler(random_state=42)),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=15)),
    ('classifier', LinearDiscriminantAnalysis())
])

# Shrinkage is only implemented for the 'lsqr' and 'eigen' solvers; combining
# solver='svd' with shrinkage='auto' makes the fit fail. The grid is therefore
# split so that only valid solver/shrinkage pairs are tried.
param_grid = [
    {
        'classifier__solver': ['svd'],
        'classifier__shrinkage': [None],
    },
    {
        'classifier__solver': ['lsqr', 'eigen'],
        'classifier__shrinkage': [None, 'auto'],
    },
]

# 5-fold grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Pipeline refitted with the best hyper-parameters.
best_model = grid_search.best_estimator_

# Predict the held-out test split.
y_pred = best_model.predict(X_test)

# Class-frequency-weighted metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# labels=labels lists the classes from lowest to highest score band instead of
# alphabetically, so rows/columns of both reports read in ordinal order.
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=labels))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=labels))


**LinearRegression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression task: SCORE is the target, every other column is a feature.
y = df['SCORE']
X = df.drop(columns='SCORE')

# Hold out 30% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Keep the 15 features with the highest mutual information with the target,
# then fit an ordinary least-squares model on them.
selector = SelectKBest(score_func=mutual_info_regression, k=15)
pipeline = Pipeline(steps=[
    ('feature_selection', selector),
    ('regressor', LinearRegression()),
])

# The only tuned option: fit with or without an intercept.
param_grid = {'regressor__fit_intercept': [True, False]}

# 5-fold grid search on the training split, ranked by negative MSE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted on the whole training split with the best parameters.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the held-out test split.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for nombre_metrica, valor in [
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R²)", r2),
]:
    print(f"{nombre_metrica}: {valor:.4f}")

# 5-fold cross-validation of the best pipeline over the full data set
# (train and test rows together); scores are negative MSE, hence the sign flip.
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)

print(f"Cross-validated RMSE: {cv_rmse.mean():.4f}")


**SVC**

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
# The pipeline must be imblearn's: scikit-learn's Pipeline only accepts
# transformers as intermediate steps and rejects RandomOverSampler, which is a
# sampler (fit_resample) and has no transform method.
from imblearn.pipeline import Pipeline as ImbPipeline
import numpy as np

# Discretise SCORE into five ordered classes.
# The outer edges are open-ended so there are always exactly six edges for the
# five labels. Deriving them from SCORE.min()/max() breaks when the minimum is
# >= -3 or the maximum is <= 0 (edges collapse or change order).
bins = [float('-inf'), -3, -2, -1, 0, float('inf')]
labels = ['Muy Bajo', 'Bajo', 'Medio-Bajo', 'Medio-Alto', 'Alto']

# Binned SCORE used as the classification target.
y_categorico = pd.cut(df['SCORE'], bins=bins, labels=labels, include_lowest=True)

# Class distribution
print(y_categorico.value_counts())

X = df.drop('SCORE', axis=1)  # features

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorico, test_size=0.3, random_state=42)

# Standardise every feature; the scaler is fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# imblearn pipeline: over-sample, keep the 15 best features, then classify.
pipeline = ImbPipeline(steps=[
    ('oversampling', RandomOverSampler(random_state=42)),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=15)),
    ('classifier', SVC(random_state=42))
])

# Hyper-parameter grid for the SVM.
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf', 'poly'],
    'classifier__gamma': ['scale', 'auto']
}

# 5-fold grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters from Grid Search:", grid_search.best_params_)

# Predict the held-out test split with the best pipeline.
y_pred = best_model.predict(X_test)

# Class-frequency-weighted metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Detailed per-class report, classes listed from lowest to highest score band.
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=labels))
