# 3. Ingeniería de Características

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## 3.1. Cargamos Datos

In [35]:
# Read the raw training data and preview its first rows.
dataset_train = pd.read_csv(filepath_or_buffer="../data/raw/loan_sanction_train.csv")
dataset_train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [36]:
# Load the dictionary that groups column names by variable type
# (keys: 'categoricas', 'continuas', 'discretas').
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open("../artifacts/variables_escaling.pkl", "rb") as artifact_file:
    variables_escaling = pickle.load(artifact_file)

variables_escaling

{'categoricas': ['Gender',
  'Married',
  'Dependents',
  'Education',
  'Self_Employed',
  'Property_Area',
  'Loan_Status'],
 'continuas': ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'],
 'discretas': ['Loan_Amount_Term', 'Credit_History']}

## 3.2. Eliminamos variables no útiles

In [37]:
# Remove the row identifier: it carries no predictive information.
# Reassigning (instead of inplace=True) and using errors="ignore" keeps this
# cell idempotent, so re-running it does not raise KeyError once the column is gone.
dataset_train = dataset_train.drop(columns=["Loan_ID"], errors="ignore")
dataset_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## 3.3. Imputación de Variables

In [38]:
# Number of missing values per column.
print("\nConteo de Valores Nulos:", dataset_train.isna().sum(), sep="\n")


Conteo de Valores Nulos:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [39]:
# Fraction (0-1) of missing values per column.
print("\nPorcentaje de Valores Nulos:")
print(dataset_train.isna().mean())
 


Porcentaje de Valores Nulos:
Gender               0.021173
Married              0.004886
Dependents           0.024430
Education            0.000000
Self_Employed        0.052117
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.035831
Loan_Amount_Term     0.022801
Credit_History       0.081433
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64


    Se puede observar que las variables con valores nulos se pueden imputar, ya que la proporción de nulos es baja (como máximo ~8 %, en Credit_History).

### 3.3.1 Imputamos Variables Categóricas

Verificamos qué columnas categóricas tienen nulos y las imputamos con la moda.

In [40]:
# Print every categorical column that contains at least one missing value.
for column in variables_escaling["categoricas"]:
    has_nulls = dataset_train[column].isna().any()
    if has_nulls:
        print(column)

Gender
Married
Dependents
Self_Employed


In [41]:
# Relative frequency of each Gender category (missing values excluded).
dataset_train.loc[:, "Gender"].value_counts(normalize=True)

Gender
Male      0.813644
Female    0.186356
Name: proportion, dtype: float64

In [42]:
# Fill missing Gender values with the most frequent category and keep that
# value so it can be stored with the feature-engineering configuration.
mode_Gender = dataset_train["Gender"].mode().iloc[0]
dataset_train["Gender"] = dataset_train["Gender"].fillna(value=mode_Gender)
mode_Gender


'Male'

In [43]:
# Relative frequency of each Married category (missing values excluded).
dataset_train.loc[:, "Married"].value_counts(normalize=True)

Married
Yes    0.651391
No     0.348609
Name: proportion, dtype: float64

In [44]:
# Fill missing Married values with the most frequent category.
mode_Married = dataset_train["Married"].mode().iloc[0]
dataset_train["Married"] = dataset_train["Married"].fillna(value=mode_Married)
mode_Married

'Yes'

In [45]:
# Relative frequency of each Dependents category (missing values excluded).
dataset_train.loc[:, "Dependents"].value_counts(normalize=True)

Dependents
0     0.575960
1     0.170284
2     0.168614
3+    0.085142
Name: proportion, dtype: float64

In [46]:
# Fill missing Dependents values with the most frequent category.
mode_Dependents = dataset_train["Dependents"].mode().iloc[0]
dataset_train["Dependents"] = dataset_train["Dependents"].fillna(value=mode_Dependents)
mode_Dependents

'0'

In [47]:
# Relative frequency of each Self_Employed category (missing values excluded).
dataset_train.loc[:, "Self_Employed"].value_counts(normalize=True)

Self_Employed
No     0.859107
Yes    0.140893
Name: proportion, dtype: float64

In [48]:
# Fill missing Self_Employed values with the most frequent category.
mode_Self_Employed = dataset_train["Self_Employed"].mode().iloc[0]
dataset_train["Self_Employed"] = dataset_train["Self_Employed"].fillna(value=mode_Self_Employed)
mode_Self_Employed

'No'

### 3.3.2 Imputamos Variables Continuas y Discretas

Verificamos qué columnas continuas y discretas tienen nulos y las imputamos con la media.

In [None]:
# Continuous variables: print each column that contains missing values.
for column in variables_escaling["continuas"]:
    has_nulls = dataset_train[column].isna().any()
    if has_nulls:
        print(column)

LoanAmount


In [50]:
# Fill missing LoanAmount values with the column mean and keep that value
# so it can be stored with the feature-engineering configuration.
media_LoanAmount = dataset_train["LoanAmount"].mean()
dataset_train["LoanAmount"] = dataset_train["LoanAmount"].fillna(value=media_LoanAmount)
media_LoanAmount

146.41216216216216

In [53]:
# Discrete variables: print each column that contains missing values.
for column in variables_escaling["discretas"]:
    has_nulls = dataset_train[column].isna().any()
    if has_nulls:
        print(column)

Loan_Amount_Term
Credit_History


In [55]:
# Fill missing Loan_Amount_Term values with the column mean.
media_Loan_Amount_Term = dataset_train["Loan_Amount_Term"].mean()
dataset_train["Loan_Amount_Term"] = dataset_train["Loan_Amount_Term"].fillna(value=media_Loan_Amount_Term)
media_Loan_Amount_Term

342.0

In [None]:
# Fill missing Credit_History values with the column mean.
# NOTE(review): Credit_History looks like a 0/1 flag in the data preview, so a
# mean fill introduces a fractional value -- confirm this is intended.
media_Credit_History = dataset_train["Credit_History"].mean()
dataset_train["Credit_History"] = dataset_train["Credit_History"].fillna(value=media_Credit_History)
media_Credit_History

0.8421985815602837

Verificamos si aún quedan valores nulos, para confirmar que la imputación fue completa.

In [57]:
# Re-check the fraction of missing values per column after imputation.
print("\nPorcentaje de Valores Nulos:", dataset_train.isna().mean(), sep="\n")


Porcentaje de Valores Nulos:
Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64


## 3.4 Codificación de Variables Categóricas

In [73]:
# Summary (count / unique / top / freq) of the categorical columns.
dataset_train.loc[:, variables_escaling["categoricas"]].describe()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,614,614,614,614,614,614
unique,2,2,4,2,2,3,2
top,Male,Yes,0,Graduate,No,Semiurban,Y
freq,502,401,360,480,532,233,422


Como Dependents y Property_Area tienen más de dos valores distintos, se utilizará Frequency Encoding para ellas; para las demás variables categóricas se utilizará One Hot Encoding.

Codificación con Frequency Encoding de las variables que tienen más de dos valores distintos

In [87]:
# Print the categorical columns that have more than two distinct values.
for column in variables_escaling["categoricas"]:
    distinct_values = dataset_train[column].nunique(dropna=False)
    if distinct_values > 2:
        print(column)

Dependents
Property_Area


In [91]:
# Frequency encoding: replace each Dependents category with its absolute count.
# Caution: the column is overwritten, so re-running this cell would rebuild the
# encoder from the already-encoded values.
codificador_Dependents = dataset_train.loc[:, "Dependents"].value_counts()
dataset_train["Dependents"] = dataset_train.loc[:, "Dependents"].map(codificador_Dependents)
codificador_Dependents

Dependents
0     360
1     102
2     101
3+     51
Name: count, dtype: int64

In [92]:
# Frequency encoding: replace each Property_Area category with its absolute count.
# Caution: the column is overwritten, so re-running this cell would rebuild the
# encoder from the already-encoded values.
codificador_Property_Area = dataset_train.loc[:, "Property_Area"].value_counts()
dataset_train["Property_Area"] = dataset_train.loc[:, "Property_Area"].map(codificador_Property_Area)
codificador_Property_Area

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

Codificación con One Hot Encoding de las variables categóricas con menos de 3 valores distintos

In [90]:
# Encode categorical columns with fewer than three distinct values as integer
# indicators. drop_first=True drops the first dummy column, so a two-level
# column collapses to a single 0/1 column.
for column in variables_escaling["categoricas"]:
    if dataset_train[column].nunique(dropna=False) >= 3:
        continue
    dummies = pd.get_dummies(dataset_train[column], drop_first=True)
    dataset_train[column] = dummies.astype(int)

In [93]:
# Preview the fully encoded dataset.
dataset_train.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,360,0,0,5849,0.0,146.412162,360.0,1.0,202,1
1,1,1,102,0,0,4583,1508.0,128.0,360.0,1.0,179,0
2,1,1,360,0,1,3000,0.0,66.0,360.0,1.0,202,1
3,1,1,360,1,0,2583,2358.0,120.0,360.0,1.0,202,1
4,1,0,360,0,0,6000,0.0,141.0,360.0,1.0,202,1


# 4. Guardamos el Dataset Procesado.

In [94]:
# Persist the processed features without the index column.
dataset_train.to_csv(path_or_buf='../data/processed/features_for_model.csv', index=False)

In [95]:
# Gather every value fitted in this notebook (frequency encoders, mean and
# mode imputation values) so the same transformations can be re-applied later.
# NOTE(review): the binary dummy encodings applied to the two-level columns are
# not stored here -- confirm whether downstream code needs that mapping too.
feature_eng_configs = {
    'codificador_Property_Area': codificador_Property_Area,
    'codificador_Dependents': codificador_Dependents,
    'media_Credit_History': media_Credit_History,
    'media_Loan_Amount_Term': media_Loan_Amount_Term,
    'media_LoanAmount': media_LoanAmount,
    'mode_Self_Employed': mode_Self_Employed,
    'mode_Dependents': mode_Dependents,
    'mode_Married': mode_Married,
    'mode_Gender': mode_Gender,
}

# pickle is already imported in the notebook's first code cell, so the
# redundant mid-cell import was removed.
with open("../artifacts/feature_eng_configs.pkl", "wb") as artifact_file:
    pickle.dump(feature_eng_configs, artifact_file)