In [1]:
import pandas as pd
import pickle
import mlflow

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier



In [2]:
data_train = pd.read_csv('../data/processed/features_for_model.csv')
data_test = pd.read_csv('../data/processed/test_dataset.csv')

In [3]:
x_features = data_train.drop(['Loan_Status'], axis=1)
y_target = data_train['Loan_Status']

x_features_test = data_test.drop(['Loan_Status'], axis=1)
y_target_test = data_test['Loan_Status']

In [4]:
x_features.shape

(429, 12)

### Leemos el Pipeline pre-configurado

In [5]:
with open('../artifacts/pipeline.pkl', 'rb') as  f:
    loan_prediction_model_pipeline = pickle.load(f)

In [6]:
x_features_test_arr = loan_prediction_model_pipeline.transform(x_features_test)
df_features_test = pd.DataFrame(x_features_test_arr, columns=x_features_test.columns)
df_features_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.837587,0.08695,0.228154,1.409525,0.269448,-3.161607e-16,1.122887,0.48978,-0.729481,0.551318,0.371391
1,-1.161664,-0.434812,0.254479,-0.267519,0.269448,0.4565051,-1.341146,0.48978,-0.729481,0.551318,0.371391
2,-1.053596,-0.361671,0.141611,-0.435223,-2.610618,-2.391217,-0.10913,0.48978,1.370837,0.551318,0.371391
3,-1.161664,-0.065746,-0.512235,-0.9024,0.269448,-2.391217,1.122887,0.48978,-0.729481,0.551318,0.371391
4,-1.053596,0.308512,-0.512235,0.0,0.269448,0.4565051,1.122887,-2.041733,1.370837,0.551318,-2.692582


In [8]:
df_features_test.shape

(185, 11)

Entrenamiento de Modelos

In [9]:
# 1. Lista de modelos para evaluar
models = [
    ('Logistic Regression',1, LogisticRegression(penalty='l2',C=1.0,
                                                solver='lbfgs',max_iter=1000)),
    ('Logistic Regression',2, LogisticRegression(penalty='l1',C=0.5,
                                                solver='liblinear',max_iter=2000)),
    ('Logistic Regression',3, LogisticRegression(penalty=None,max_iter=5000,
                                                tol=1e-5,solver='saga')),
    ('Random Forest', 1,RandomForestClassifier(n_estimators=50, max_depth=5)),
    ('Random Forest', 2,RandomForestClassifier(n_estimators=100, max_depth=10)),
    ('Random Forest', 3,RandomForestClassifier(n_estimators=300, max_depth=15)),
    ('XGBoost', 1,RandomForestClassifier(n_estimators=50, max_depth=5)),
    ('XGBoost', 2,RandomForestClassifier(n_estimators=100, max_depth=10)),
    ('XGBoost', 3,RandomForestClassifier(n_estimators=200, max_depth=15)),
    ('SVM',1, SVC(kernel='linear', C=1)),
    ('SVM',2, SVC(kernel='rbf', C=10)),
    ('SVM',3, SVC(kernel='poly', C=0.1, degree=3)),    
    ('KNN',1, KNeighborsClassifier(n_neighbors=3, metric='euclidean')),
    ('KNN',2, KNeighborsClassifier(n_neighbors=5, metric='manhattan')),
    ('KNN',3, KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=3))
]

In [10]:
# configuración de servidor
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Loan Predict Model - Training Modelos")

<Experiment: artifact_location='mlflow-artifacts:/733735587012203752', creation_time=1733808788090, experiment_id='733735587012203752', last_update_time=1733808788090, lifecycle_stage='active', name='Loan Predict Model - Training Modelos', tags={}>

In [11]:
# Entrenamiento de los modelos
results = []

with mlflow.start_run():
    for name, num, model in models:
        model.fit(x_features, y_target)  # Entrenar modelo
        y_pred = model.predict(df_features_test)  # Predecir
        acc = accuracy_score(y_target_test, y_pred)  # Calcular accuracy    
        results.append((name,num, acc))  # Guardar nombre y accuracy

        # registramos hiper-parametros 
        mlflow.log_params(model.get_params())

        # registramos métricas
        mlflow.log_metric("accuracy score", acc)

        # registramos modelo y entrenado.
        mlflow.sklearn.log_model(model, name)

        mlflow.end_run()
    

🏃 View run serious-skink-42 at: http://127.0.0.1:5000/#/experiments/733735587012203752/runs/f429371fbe9e4a299fae2aca45d5b914
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/733735587012203752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Unnamed: 0


In [9]:
print("Resultados de accuracy para cada modelo:")
for name, num, acc in results:
    print(f"{name} - {num}: {acc:.8f}")

Resultados de accuracy para cada modelo:
Logistic Regression - 1: 0.76216216
Logistic Regression - 2: 0.77297297
Logistic Regression - 3: 0.76216216
Random Forest - 1: 0.76756757
Random Forest - 2: 0.76756757
Random Forest - 3: 0.75675676
XGBoost - 1: 0.77297297
XGBoost - 2: 0.76216216
XGBoost - 3: 0.75135135
SVM - 1: 0.77297297
SVM - 2: 0.74594595
SVM - 3: 0.74594595
KNN - 1: 0.74594595
KNN - 2: 0.76216216
KNN - 3: 0.74594595


In [10]:
# 4. Encontrar el mejor modelo
best_model = max(results, key=lambda x: x[2])

print(f"\nEl mejor modelo es {best_model[0]} - {best_model[1]} con un accuracy de {best_model[2]:.8f}")



El mejor modelo es Logistic Regression - 2 con un accuracy de 0.77297297


In [11]:
modelo = [model.get_params() for name, num, model in models if name == best_model[0] and num == best_model[1]]
print(modelo)
 

[{'C': 0.5, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}]


In [12]:
modelo = [model for name, num, model in models if name == best_model[0] and num == best_model[1]]

if (best_model[0]=="Logistic Regression"):
    loan_prediction_model_pipeline.steps.append(
        ("modelo_regresion_logistica",modelo[0])
    )
if (best_model[0]=="Random Forest"):
    loan_prediction_model_pipeline.steps.append(
        ("modelo_random_forest",modelo[0])
    )
if (best_model[0]=="XGBoost"):
    loan_prediction_model_pipeline.steps.append(
        ("modelo_xgboost",modelo[0])
    )
if (best_model[0]=="SVM"):
    loan_prediction_model_pipeline.steps.append(
        ("modelo_svm",modelo[0])
    )
if (best_model[0]=="KNN"):
    loan_prediction_model_pipeline.steps.append(
        ("modelo_knn",modelo[0])
    )


In [13]:
train_dataset = pd.read_csv('../data/raw/loan_sanction_train.csv')
train_dataset.drop(["Loan_ID"], axis=1, inplace=True)
train_dataset_features = train_dataset.drop("Loan_Status", axis=1)
train_dataset_target = train_dataset["Loan_Status"].map({'Y': 1, 'N': 0})

In [14]:
loan_prediction_model_pipeline.fit(train_dataset_features,train_dataset_target) 

In [15]:
with open('../artifacts/pipeline_model.pkl', 'wb') as f:
    pickle.dump(loan_prediction_model_pipeline, f)