In [37]:
import pandas as pd
import numpy as np

import pickle
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import CountFrequencyEncoder

from sklearn.preprocessing import StandardScaler


In [38]:
# Read the raw loan-sanction training set from disk.
raw_csv_path = "../data/raw/loan_sanction_train.csv"
dataset = pd.read_csv(raw_csv_path)

# Show the first five rows as the cell output.
dataset.head(5)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [39]:
# Pipeline configuration: the target column and the variable groups that
# each preprocessing step operates on.
TARGET = "Loan_Status"

# Columns removed from the feature matrix before the train/test split.
# NOTE(review): the CSV preview also shows an "Unnamed: 0" column that is not
# listed here, so it stays in the features — confirm that is intended.
VARS_TO_DROP = ["Loan_ID", TARGET]

# Numeric variables passed to the mean imputer.
CONTINUE_VARS_TO_IMPUTATION = [
    "LoanAmount",
    "Loan_Amount_Term",
    "Credit_History",
    "ApplicantIncome",
    "CoapplicantIncome",
]

# Categorical variables passed to the most-frequent-category imputer.
CATEGORICAL_VARS_TO_IMPUTATION = [
    "Gender",
    "Married",
    "Dependents",
    "Education",
    "Self_Employed",
    "Property_Area",
]

# Categorical variables that are one-hot encoded.
OHE_VAR_TO_ENCODE = [
    "Gender",
    "Married",
    "Education",
    "Self_Employed",
]

# Categorical variables that are count/frequency encoded.
FREQENC_VAR_TO_ENCODE = [
    "Dependents",
    "Property_Area",
]



In [40]:
# Separate the predictors from the label; the label is mapped to 1 ("Y") / 0 ("N").
x_features = dataset.drop(columns=VARS_TO_DROP)
y_target = dataset[TARGET].map({"Y": 1, "N": 0})

# Hold out 30% of the rows for testing, shuffling with a fixed seed so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_features,
    y_target,
    test_size=0.3,
    shuffle=True,
    random_state=2025,
)

In [41]:
# Preprocessing pipeline for the loan data. `Pipeline` and the transformers
# come from the notebook's import cell; the redundant re-import that used to
# live in this cell has been removed so all imports stay in one place.
loan_prediction_model = Pipeline(
    steps=[
        # Impute missing values of the continuous variables with the mean.
        (
            "continues_var_mean_imputacion",
            MeanMedianImputer(
                imputation_method="mean",
                variables=CONTINUE_VARS_TO_IMPUTATION,
            ),
        ),
        # Impute missing values of the categorical variables with the most
        # frequent category.
        (
            "categorical_var_freq_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=CATEGORICAL_VARS_TO_IMPUTATION,
            ),
        ),
        # Encode the categorical variables: one-hot (dropping the last
        # category) for one group, category counts for the other.
        (
            "categorical_encoding_ohe",
            OneHotEncoder(variables=OHE_VAR_TO_ENCODE, drop_last=True),
        ),
        (
            "categorical_encoding_freq_enc",
            CountFrequencyEncoder(
                encoding_method="count",
                variables=FREQENC_VAR_TO_ENCODE,
            ),
        ),
        # Standardize every resulting feature.
        ("feature_scaling", StandardScaler()),
    ]
)

In [None]:
 Ajustar el modelo antes de transformar los datos
loan_prediction_model.fit(x_train, y_train)


In [42]:
# Apply the fitted pipeline to the training features.
# NOTE(review): the AttributeError about `__sklearn_tags__` recorded under this
# cell looks like a scikit-learn / feature_engine version mismatch — confirm
# the installed versions are compatible.
x_features_processed = loan_prediction_model.transform(x_train)

# The encoding steps change the column layout (the one-hot step drops its
# source columns and appends the dummy columns), so x_train.columns no longer
# describes the transformed matrix. Take the names the scaler actually
# received so every column keeps its correct label.
processed_columns = loan_prediction_model.named_steps["feature_scaling"].feature_names_in_
df_features_process = pd.DataFrame(x_features_processed, columns=processed_columns)

# Attach the target positionally; the new frame has a fresh 0..n-1 index.
df_features_process[TARGET] = y_train.to_numpy()

# Save the processed data used to train the models.
df_features_process.to_csv('../data/processed/features_for_model.csv', index=False)
df_features_process.head()



AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [8]:
# Build the test set with its target in a new frame instead of mutating
# x_test in place, so x_test stays a pure feature matrix that can still be
# passed to the pipeline, and the cell is safe to re-run.
test_dataset = x_test.assign(**{TARGET: y_test})
test_dataset.to_csv('../data/processed/test_dataset.csv', index=False)

# Persist the preprocessing pipeline for later reuse.
with open('../artifacts/pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(loan_prediction_model, pipeline_file)