In [41]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from feature_engine import encoding

# Adiciona o caminho da pasta 'py_functions' ao sys.path
sys.path.append(os.path.abspath('../../py_functions'))
from functions import iv_woe

### Leitura das Bases de Dados

In [42]:
dados_turnover_jan = pd.read_csv("../Dados/Turnover_Funcionarios_Jan_23.txt",sep = "\t")

### Análise Exploratória

In [43]:
dados_turnover_jan.describe()

Unnamed: 0,ID_respondente,ult_avaliacao_clima,qtde_projetos_3m,qtde_projetos_6m,qtde_projetos_12m,qtde_projetos_24m,media_horas_trabalho_3m,media_horas_trabalho_6m,media_horas_trabalho_12m,tempo_empresa,flag_promocao_3m,flag_promocao_6m,flag_promocao_12m,flag_promocao_vida,turnover
count,3869.0,3652.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0,3869.0
mean,1935.0,0.621766,1.88214,3.792453,5.310416,7.828638,166.064099,166.182735,166.240889,3.529853,0.034634,0.031016,0.029982,0.459809,0.175239
std,1117.028424,0.299004,1.495793,1.19508,1.626292,2.346166,13.741916,12.795058,14.097675,1.737699,0.182875,0.173383,0.170559,0.498446,0.380221
min,1.0,0.0,0.0,2.0,2.0,2.0,139.0,149.0,139.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,968.0,0.39,1.0,3.0,4.0,6.0,155.0,156.0,155.0,3.0,0.0,0.0,0.0,0.0,0.0
50%,1935.0,0.69,2.0,4.0,5.0,8.0,162.0,160.0,164.0,3.0,0.0,0.0,0.0,0.0,0.0
75%,2902.0,0.88,3.0,5.0,6.0,10.0,177.0,177.0,176.0,4.0,0.0,0.0,0.0,1.0,0.0
max,3869.0,1.0,7.0,7.0,10.0,15.0,214.0,204.0,215.0,11.0,1.0,1.0,1.0,1.0,1.0


In [4]:
qtd = dados_turnover_jan['departamento'].value_counts(dropna=False)
freq = qtd/len(dados_turnover_jan)

tab = pd.DataFrame({
    'Quantidade': qtd,
    'Proporção': freq
})

tab

Unnamed: 0_level_0,Quantidade,Proporção
departamento,Unnamed: 1_level_1,Unnamed: 2_level_1
vendas,1015,0.262342
dados,886,0.229
TI/suporte,863,0.223055
marketing/produto,697,0.18015
RH/financeiro,408,0.105454


In [5]:
qtd = dados_turnover_jan['patamar_salario'].value_counts(dropna=False)
freq = qtd/len(dados_turnover_jan)

tab = pd.DataFrame({
    'Quantidade': qtd,
    'Proporção': freq
})

tab

Unnamed: 0_level_0,Quantidade,Proporção
patamar_salario,Unnamed: 1_level_1,Unnamed: 2_level_1
01_menor_igual_media,3220,0.832256
02_acima_media,649,0.167744


### Análise do poder preditivo das variáveis (IV)

In [6]:
iv, woe = iv_woe(data = dados_turnover_jan, target = 'turnover')

Information value of ID_respondente is 0.032663
Information value of ult_avaliacao_clima is 0.094439
Information value of qtde_projetos_3m is 0.009395
Information value of qtde_projetos_6m is 0.03194
Information value of qtde_projetos_12m is 0.012572
Information value of qtde_projetos_24m is 0.014719
Information value of media_horas_trabalho_3m is 0.004882
Information value of media_horas_trabalho_6m is 0.013801
Information value of media_horas_trabalho_12m is 0.019898
Information value of tempo_empresa is 0.088353
Information value of flag_promocao_3m is 0.023954
Information value of flag_promocao_6m is 0.000115
Information value of flag_promocao_12m is 4.9e-05
Information value of flag_promocao_vida is 0.008593
Information value of departamento is 0.271602
Information value of patamar_salario is 0.168649


  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
  d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})


In [7]:
iv.sort_values(['IV'],ascending=False)

Unnamed: 0,Variable,IV
0,departamento,0.271602
0,patamar_salario,0.168649
0,ult_avaliacao_clima,0.094439
0,tempo_empresa,0.088353
0,ID_respondente,0.032663
0,qtde_projetos_6m,0.03194
0,flag_promocao_3m,0.023954
0,media_horas_trabalho_12m,0.019898
0,qtde_projetos_24m,0.014719
0,media_horas_trabalho_6m,0.013801


### Categorização de variáveis com valores *missing* (safra de jan/23)

In [8]:
def freq_table_mult_col(df,var_categorica,var_resultado,quantil):
    var = f'{var_categorica}_{quantil}'
    df[var] = pd.qcut(df[var_categorica],quantil)
    cross_tab = pd.crosstab(df[var], df[var_resultado], margins=False, dropna=False)
    prop_table = cross_tab.div(cross_tab.sum(axis=1), axis=0)
    df.drop([var], axis=1, inplace=True)
    return prop_table
    

In [9]:
freq_table_mult_col(dados_turnover_jan,'ult_avaliacao_clima','turnover',3)

turnover,0,1
ult_avaliacao_clima_3,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.824885,0.175115
"(-0.001, 0.51]",0.782114,0.217886
"(0.51, 0.83]",0.82215,0.17785
"(0.83, 1.0]",0.87173,0.12827


In [10]:
freq_table_mult_col(dados_turnover_jan,'ult_avaliacao_clima','turnover',4)

turnover,0,1
ult_avaliacao_clima_4,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.824885,0.175115
"(-0.001, 0.39]",0.774403,0.225597
"(0.39, 0.69]",0.81105,0.18895
"(0.69, 0.88]",0.842623,0.157377
"(0.88, 1.0]",0.871429,0.128571


In [11]:
freq_table_mult_col(dados_turnover_jan,'ult_avaliacao_clima','turnover',5)

turnover,0,1
ult_avaliacao_clima_5,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.824885,0.175115
"(-0.001, 0.31]",0.772,0.228
"(0.31, 0.59]",0.799163,0.200837
"(0.59, 0.78]",0.814765,0.185235
"(0.78, 0.92]",0.877763,0.122237
"(0.92, 1.0]",0.861401,0.138599


In [12]:
freq_table_mult_col(dados_turnover_jan,'ult_avaliacao_clima','turnover',6)

turnover,0,1
ult_avaliacao_clima_6,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.824885,0.175115
"(-0.001, 0.245]",0.763547,0.236453
"(0.245, 0.51]",0.800322,0.199678
"(0.51, 0.69]",0.81407,0.18593
"(0.69, 0.83]",0.829688,0.170313
"(0.83, 0.93]",0.878007,0.121993
"(0.93, 1.0]",0.865672,0.134328


In [13]:
# Criação de variável 'ult_avaliacao_clima_cat' com cenário escolhido de categorização
dados_turnover_jan['ult_avaliacao_clima_cat'] = pd.qcut(dados_turnover_jan['ult_avaliacao_clima'], q=3)

#Criação de categoria 'Vazio' na nova variável 'ult_avaliacao_clima_cat'
dados_turnover_jan['ult_avaliacao_clima_cat'] = dados_turnover_jan['ult_avaliacao_clima_cat'].cat.add_categories("Vazio")
dados_turnover_jan['ult_avaliacao_clima_cat'] = dados_turnover_jan['ult_avaliacao_clima_cat'].fillna("Vazio")

### Separação de conjuntos de treino e teste

In [14]:
# Aletorizando a ordem das linhas da base
dados_turnover_jan = dados_turnover_jan.sample(frac=1, random_state=12345).reset_index(drop=True)

In [15]:
# Separando os conjuntos de treino e teste
dados_turnover_treino, dados_turnover_teste = train_test_split(dados_turnover_jan, test_size=0.3, random_state=12345)

### Modelo de regressão logística múltipla

#### Treino Inicial

In [16]:
# Definindo as variáveis preditoras e a variável resposta
features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat',
'tempo_empresa',
'media_horas_trabalho_6m',
'qtde_projetos_24m',
'flag_promocao_3m',
'turnover'
]

cat_features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat'
]

df_predictors = dados_turnover_treino[features]


In [17]:
#OneHot

onehot = encoding.OneHotEncoder(variables=cat_features)
onehot.fit(df_predictors)
df_predictors = onehot.transform(df_predictors)

In [18]:
predictors = df_predictors.columns.tolist()
predictors.remove('turnover')  # Removendo a variável resposta
response = 'turnover'

In [19]:
modelo = sm.GLM(df_predictors[response], 
                sm.add_constant(df_predictors[predictors]),
                family=sm.families.Binomial())
resultado = modelo.fit()

In [20]:
# print(resultado.summary())
# print(resultado.summary2())

In [21]:
summary_table = resultado.summary2().tables[1]

# Ajustar a precisão dos p-values
summary_table['P>|z|'] = summary_table['P>|z|'].apply(lambda x: '{:.10f}'.format(x))

# Exibir o resumo
summary_table

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.73947,0.359449,-2.057232,0.0396638893,-1.443978,-0.034963
tempo_empresa,-0.243407,0.038271,-6.360163,2e-10,-0.318416,-0.168398
media_horas_trabalho_6m,1.8e-05,0.004293,0.00418,0.9966651056,-0.008396,0.008432
qtde_projetos_24m,0.020641,0.022881,0.902136,0.3669843719,-0.024204,0.065487
flag_promocao_3m,-0.777403,0.408678,-1.902236,0.0571402926,-1.578398,0.023592
departamento_vendas,0.493466,0.111167,4.438981,9.0386e-06,0.275584,0.711349
departamento_RH/financeiro,0.176868,0.148869,1.188077,0.2348030432,-0.11491,0.468646
departamento_marketing/produto,-0.222676,0.138642,-1.606131,0.1082451581,-0.494409,0.049056
departamento_dados,-1.095134,0.153037,-7.156018,0.0,-1.395081,-0.795188
departamento_TI/suporte,-0.091994,0.124989,-0.736012,0.4617235536,-0.336968,0.152981


#### Retirando media_horas_trabalho_6m

In [22]:
# Definindo as variáveis preditoras e a variável resposta
features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat',
'tempo_empresa',
'qtde_projetos_24m',
'flag_promocao_3m',
'turnover'
]

cat_features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat'
]

df_predictors = dados_turnover_treino[features]


In [23]:
#OneHot

onehot = encoding.OneHotEncoder(variables=cat_features)
onehot.fit(df_predictors)
df_predictors = onehot.transform(df_predictors)

In [24]:
predictors = df_predictors.columns.tolist()
predictors.remove('turnover')  # Removendo a variável resposta
response = 'turnover'

In [25]:
modelo = sm.GLM(df_predictors[response], 
                sm.add_constant(df_predictors[predictors]),
                family=sm.families.Binomial())
resultado = modelo.fit()

In [26]:
summary_table = resultado.summary2().tables[1]

# Ajustar a precisão dos p-values
summary_table['P>|z|'] = summary_table['P>|z|'].apply(lambda x: '{:.10f}'.format(x))

# Exibir o resumo
summary_table

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.738053,0.118944,-6.205041,5e-10,-0.971179,-0.504927
tempo_empresa,-0.24339,0.038044,-6.397615,2e-10,-0.317954,-0.168825
qtde_projetos_24m,0.020661,0.022411,0.921911,0.3565749783,-0.023264,0.064585
flag_promocao_3m,-0.777356,0.408523,-1.902847,0.0570605512,-1.578046,0.023334
departamento_vendas,0.493731,0.091287,5.408554,6.35e-08,0.314812,0.672651
departamento_RH/financeiro,0.177163,0.13102,1.352187,0.1763155986,-0.079631,0.433958
departamento_marketing/produto,-0.222416,0.123817,-1.796322,0.0724432756,-0.465093,0.020262
departamento_dados,-1.094836,0.135293,-8.092321,0.0,-1.360005,-0.829666
departamento_TI/suporte,-0.091696,0.102744,-0.892468,0.3721419432,-0.293072,0.109679
patamar_salario_01_menor_igual_media,0.233549,0.095778,2.438445,0.0147505956,0.045828,0.42127


#### Retirando 0.3565749783

In [27]:
# Definindo as variáveis preditoras e a variável resposta
features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat',
'tempo_empresa',
'flag_promocao_3m',
'turnover'
]

cat_features = [
'departamento',
'patamar_salario',
'ult_avaliacao_clima_cat'
]

df_predictors = dados_turnover_treino[features]


In [28]:
#OneHot

onehot = encoding.OneHotEncoder(variables=cat_features)
onehot.fit(df_predictors)
df_predictors = onehot.transform(df_predictors)

In [29]:
predictors = df_predictors.columns.tolist()
predictors.remove('turnover')  # Removendo a variável resposta
response = 'turnover'

In [30]:
modelo = sm.GLM(df_predictors[response], 
                sm.add_constant(df_predictors[predictors]),
                family=sm.families.Binomial())
resultado = modelo.fit()

In [37]:
modelo

<statsmodels.genmod.generalized_linear_model.GLM at 0x7fd40ae4eeb0>

In [31]:
summary_table = resultado.summary2().tables[1]

# Ajustar a precisão dos p-values
summary_table['P>|z|'] = summary_table['P>|z|'].apply(lambda x: '{:.10f}'.format(x))

# Exibir o resumo
summary_table

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.659836,0.083043,-7.945701,0.0,-0.822597,-0.497074
tempo_empresa,-0.240635,0.037882,-6.352227,2e-10,-0.314882,-0.166387
flag_promocao_3m,-0.774519,0.408531,-1.895865,0.0579779416,-1.575224,0.026187
departamento_vendas,0.507484,0.090092,5.632982,1.77e-08,0.330908,0.68406
departamento_RH/financeiro,0.191111,0.130166,1.468213,0.1420463481,-0.064009,0.44623
departamento_marketing/produto,-0.205266,0.122396,-1.677063,0.0935300918,-0.445157,0.034626
departamento_dados,-1.077472,0.133915,-8.045922,0.0,-1.339941,-0.815003
departamento_TI/suporte,-0.075693,0.101247,-0.747612,0.4546940438,-0.274133,0.122746
patamar_salario_01_menor_igual_media,0.271375,0.086491,3.137607,0.001703332,0.101856,0.440895
patamar_salario_02_acima_media,-0.931211,0.124722,-7.466314,0.0,-1.175661,-0.686761


In [32]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

In [40]:
len(df_predictors[predictors].columns)

13

In [39]:
df_predictors[predictors].shape[1]

13

In [33]:
 from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif_data = pd.DataFrame() 
vif_data["feature"] = df_predictors[predictors].columns

vif_data["VIF"] = [variance_inflation_factor(df_predictors[predictors].values, i) 
                          for i in range(len(df_predictors[predictors].columns))] 
  
print(vif_data)

                                   feature       VIF
0                            tempo_empresa  1.025463
1                         flag_promocao_3m  1.021470
2                      departamento_vendas       inf
3               departamento_RH/financeiro       inf
4           departamento_marketing/produto       inf
5                       departamento_dados       inf
6                  departamento_TI/suporte       inf
7     patamar_salario_01_menor_igual_media       inf
8           patamar_salario_02_acima_media       inf
9     ult_avaliacao_clima_cat_(0.51, 0.83]       inf
10  ult_avaliacao_clima_cat_(-0.001, 0.51]       inf
11     ult_avaliacao_clima_cat_(0.83, 1.0]       inf
12           ult_avaliacao_clima_cat_Vazio       inf


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
