# **Predicción** 

Ahora, con nuestro modelo ya creado, podemos darle los datos de un trabajador ficticio y comprobar qué predicción nos da.

In [84]:
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import sys

sys.path.append('../')
from src import funciones_prediccion as fp

# Show every column when a DataFrame is rendered (no truncation of wide frames)
pd.options.display.max_columns = None

Nos inventamos unos datos para un empleado con todos los parámetros necesarios para el modelo.

In [None]:
# Single made-up employee, expressed as one record (column name -> value).
# Key order defines the column order of the resulting one-row frame.
new_employee = pd.DataFrame([{
    'EnvironmentSatisfaction': 1,
    'JobSatisfaction': 2,
    'WorkLifeBalance': 3,
    'Age': 24,
    'BusinessTravel': "Travel_Frequently",
    'Department': "Research & Development",
    'DistanceFromHome': 10,
    'Education': 4,
    'EducationField': "Technical Degree",
    'Gender': "Female",
    'JobLevel': 1,
    'JobRole': "Laboratory Technician",
    'MaritalStatus': "Single",
    'MonthlyIncome': 25000,
    'NumCompaniesWorked': 1,
    'PercentSalaryHike': 11,
    'StockOptionLevel': 0,
    'TotalWorkingYears': 1,
    'TrainingTimesLastYear': 1,
    'YearsAtCompany': 1,
    'YearsSinceLastPromotion': 0,
    'YearsWithCurrManager': 1,
    'JobInvolvement': 1,
    'PerformanceRating': 3,
}])

# Frame that the rest of the notebook feeds through the preprocessing steps
df_pred = pd.DataFrame(new_employee)
df_pred.shape

(1, 24)

In [86]:
# Load the trained model and the preprocessing transformers from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file, so these
# paths must only ever point at artefacts produced by this project.
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


model = _load_pickle('../datos/modelos/modelo_prediccion_final.pkl')
robust_scaler = _load_pickle('../datos/preprocesamiento/robust_scaler.pkl')
target_encoder = _load_pickle('../datos/preprocesamiento/target_encoder.pkl')
one_hot_encoder = _load_pickle('../datos/preprocesamiento/one_hot_encoder.pkl')

### **1. Encoding**

El primer paso es hacer el encoding en el mismo orden y con las mismas columnas que cuando entrenamos al modelo.

In [87]:
# Columns grouped by the encoding applied to them; per the notebook text these
# must be the same columns, in the same order, as when the model was trained.
diccionario_encoding = {
    "onehot": ["Gender", 'JobRole'],
    "target": [
        'EnvironmentSatisfaction',
        'JobSatisfaction',
        'WorkLifeBalance',
        'BusinessTravel',
        'Department',
        'EducationField',
        'MaritalStatus',
    ],
}

# Convenience aliases for the two column groups
col_one_hot, col_target = diccionario_encoding["onehot"], diccionario_encoding["target"]


Primero hacemos el one-hot encoding:

In [88]:
# One-hot encode the nominal columns with the encoder loaded from disk.
encoded_matrix = one_hot_encoder.transform(df_pred[col_one_hot])

# The encoder yields a SciPy sparse matrix by default, but a plain ndarray when
# it was configured for dense output; only densify when the result supports it
# so the cell works with either configuration.
encoded_dense = encoded_matrix.toarray() if hasattr(encoded_matrix, "toarray") else encoded_matrix

df_ohe = pd.DataFrame(
    encoded_dense,
    columns=one_hot_encoder.get_feature_names_out(col_one_hot)  # indicator column names
)

# Replace the original categorical columns with their indicator columns.
# drop() returns a new frame, so df_pred itself is left untouched and the cell
# can be re-run safely.
df_encoded = pd.concat(
    [df_pred.drop(columns=col_one_hot).reset_index(drop=True), df_ohe.reset_index(drop=True)],
    axis=1,
)

print(df_encoded.shape)
df_encoded

(1, 33)


Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,JobLevel,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,1,2,3,24,Travel_Frequently,Research & Development,10,4,Technical Degree,1,Single,25000,1,11,0,1,1,1,0,1,1,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Después hacemos el target encoding:

In [89]:
# Run the whole frame through the pickled target encoder.
# NOTE: df_encoded is rebound to its own transformed version, so re-running this
# cell without first re-running the one-hot cell feeds already-transformed data
# through the encoder a second time.
df_encoded = target_encoder.transform(df_encoded)
df_encoded

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,JobLevel,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,0.247458,0.16263,0.138979,24,0.254296,0.15415,10,4,0.107143,1,0.256619,25000,1,11,0,1,1,1,0,1,1,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### **2. Estandarización**

In [90]:
# Numeric column labels; kept because later cells outside this view may use it.
col_num = df_encoded.select_dtypes(include = np.number).columns

# The scaler transforms every column of df_encoded, so the result is labelled
# with that same full column list (not the numeric-only subset, whose length
# could differ) and keeps the original row index.
df_encoded_estand = pd.DataFrame(
    robust_scaler.transform(df_encoded),
    columns=df_encoded.columns,
    index=df_encoded.index,
)
df_encoded_estand

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,JobLevel,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,12.5028,0.0,0.0,-0.916667,0.108443,0.0,0.25,0.5,-7.514956,-0.5,1.0,-0.441996,-0.333333,-0.5,-1.0,-0.944444,-2.0,-0.666667,-0.5,-0.4,-2.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### **3. Predicción**

In [91]:
# Predicted class label for the single employee row
prediccion = model.predict(df_encoded_estand)[0]

# predict_proba columns are ordered like model.classes_, so locate the predicted
# label there instead of assuming the label value doubles as a column position.
indice_clase = list(model.classes_).index(prediccion)
probabilidad = round(model.predict_proba(df_encoded_estand)[0][indice_clase]*100, 2)

# Label 0 is reported as "no" (stays), anything else as "si" (leaves)
attrition = "no" if prediccion==0 else "si"
print(f"El empleado {attrition.upper()} se va de la empresa con una probabilidad del {probabilidad}%")

El empleado SI se va de la empresa con una probabilidad del 97.97%
