# **Encoding**

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import pickle


import sys
import warnings
# NOTE(review): this silences every warning for the whole session,
# deprecation notices included — confirm that is intended.
warnings.filterwarnings('ignore')

# Add the parent directory to the import path so the local `src` package
# can be imported from this notebook.
sys.path.append('../')
from src import funciones_encoding as fe

pd.set_option('display.max_columns', None) # show every column when displaying DataFrames


In [52]:
# Load the DataFrame saved by the previous stage and renumber its rows from 0
# so that positional and label indexing agree.
ruta_entrada = "../datos/dataframes/df_nonulls.pkl"
df = pd.read_pickle(ruta_entrada).reset_index(drop=True)
df.head(2)

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating
0,3.0,4.0,2.0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0,3,3
1,3.0,2.0,4.0,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4,2,4


In [53]:
# Columns that are not numeric are the candidates for encoding.
df_no_numerico = df.select_dtypes(exclude=np.number)
df_no_numerico.columns

Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus',
       'StockOptionLevel', 'TrainingTimesLastYear', 'JobInvolvement',
       'PerformanceRating'],
      dtype='object')

Cuando tengamos un problema de clasificación, para decidir el tipo de encoding usaremos tablas de contingencia: con ellas comprobamos si, para cada variable categórica, hay diferencias significativas entre sus categorías respecto a la variable respuesta.

In [54]:
# Sort every non-numeric predictor into one of two groups, depending on what
# fe.TestEstadisticos reports for it against the response variable.
var_objetivo = "Attrition"
lista_col_categoricas = df.select_dtypes(exclude=np.number).columns

cols_con_orden = []
cols_sin_orden = []

for col in lista_col_categoricas:
    # The response is still non-numeric at this point, so it appears in the
    # list. It is not a predictor: skip it so it is neither compared against
    # itself nor filed in either group.
    if col == var_objetivo:
        continue

    test = fe.TestEstadisticos(df, var_objetivo, col)
    # NOTE(review): relies on detectar_orden_problema_categorico() returning a
    # truthy value when the column "has order" — confirm in funciones_encoding.
    if test.detectar_orden_problema_categorico():
        cols_con_orden.append(col)
    else:
        cols_sin_orden.append(col)

print(f"Las columnas con orden son {cols_con_orden}")
print(f"Las columnas que NO tienen orden son {cols_sin_orden}")

Estamos comparando las variables de ENVIRONMENTSATISFACTION
El p-valor es 3.0188665104797826e-14
La variable ENVIRONMENTSATISFACTION SÍ tiene orden ✅

---------------------------------------------

Estamos comparando las variables de JOBSATISFACTION
El p-valor es 5.02288323219608e-11
La variable JOBSATISFACTION SÍ tiene orden ✅

---------------------------------------------

Estamos comparando las variables de WORKLIFEBALANCE
El p-valor es 8.489055178330413e-10
La variable WORKLIFEBALANCE SÍ tiene orden ✅

---------------------------------------------

Estamos comparando las variables de BUSINESSTRAVEL
El p-valor es 5.097055600861411e-15
La variable BUSINESSTRAVEL SÍ tiene orden ✅

---------------------------------------------

Estamos comparando las variables de DEPARTMENT
El p-valor es 6.065982503633906e-06
La variable DEPARTMENT SÍ tiene orden ✅

---------------------------------------------

Estamos comparando las variables de EDUCATION
El p-valor es 0.2340907093606658
La variable 

A las columnas con orden les vamos a aplicar el Target Encoder. De las columnas sin orden, vamos a devolver a su estado numérico todas las que sea posible y a aplicar one-hot encoding a Gender. Si las métricas obtenidas no fueran muy buenas, podríamos volver a este paso y aplicar one-hot encoding a todas las que no tuvieran orden.

In [55]:
# Columns without order that hold integer codes go back to an integer dtype.
cols_cat_a_num = ['Education', 'JobLevel', 'StockOptionLevel', 'PerformanceRating']
df[cols_cat_a_num] = df[cols_cat_a_num].astype("int")

# Binarise the response (Yes -> 1, No -> 0).
# The guard makes the cell safe to re-run: mapping an already-numeric column
# through the Yes/No dictionary would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Attrition"]):
    df["Attrition"] = df["Attrition"].map({"Yes": 1, "No": 0})

df.head(2)

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating
0,3.0,4.0,2.0,51,0,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0,3,3
1,3.0,2.0,4.0,31,1,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4,2,4


In [56]:
# Encoding plan: which columns get one-hot encoding and which get target
# encoding. The response variable is what the target encoder is fitted against.
diccionario_encoding = {
    "onehot": ["Gender"],
    "target": [
        'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
        'BusinessTravel', 'Department', 'EducationField', 'JobRole',
        'MaritalStatus', 'TrainingTimesLastYear', 'JobInvolvement',
    ],
}
var_respuesta = "Attrition"
# NOTE(review): the encoders are built from the full DataFrame; if a train/test
# split happens later, the target-encoded values will already have seen the
# test labels — confirm this is intended.
encoding = fe.Encoding(df, diccionario_encoding, var_respuesta)

# -------------- One hot encoder --------------
df_ohe, one_hot_encoder = encoding.one_hot_encoding()
# Remove the source columns of the one-hot step, taken from the plan above
# so the two lists cannot drift apart.
# NOTE(review): the drop is kept in place on purpose. The final frame shown
# below has no 'Gender' column, which suggests target_encoding() works on this
# same object — confirm in funciones_encoding before changing it.
df_ohe.drop(columns=diccionario_encoding["onehot"], inplace=True)

# Persist the fitted encoder so it can be reused later.
with open('../datos/preprocesamiento/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder, f)

# -------------- Target encoder --------------
df_ohe_target, target_encoder = encoding.target_encoding()

with open('../datos/preprocesamiento/target_encoder.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)

df_ohe_target.head(3)

Unnamed: 0,Attrition,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,Gender_Female,Gender_Male
0,0,0.137225,0.113943,0.166169,51,0.15077,0.151492,6,2,0.167044,1,0.145889,0.127476,131160,1.0,11,0,1.0,0.063158,1,0,0,0.153057,3,1.0,0.0
1,1,0.137225,0.164034,0.177778,31,0.245983,0.15782,10,1,0.167044,1,0.183935,0.253256,41890,0.0,23,1,6.0,0.175817,5,1,4,0.161232,4,1.0,0.0
2,0,0.150179,0.164034,0.309322,32,0.245983,0.15782,17,4,0.126582,4,0.169456,0.127476,193280,1.0,15,3,5.0,0.171053,5,0,3,0.153057,3,0.0,1.0


Guardamos el df ya con el encoding hecho:

In [57]:
# Write the fully encoded DataFrame to disk.
ruta_salida = "../datos/dataframes/df_nonulls_encoded.pkl"
df_ohe_target.to_pickle(ruta_salida)