# **Desbalanceo**

In [1]:
import pandas as pd
from collections import Counter
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split

In [9]:
# Load the input frame. Judging by the file name it is presumably the output of
# earlier null-handling / encoding / outlier / scaling steps -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
ruta_entrada = "../datos/dataframes/df_nonulls_encoded_outliers_scaled.pkl"
df = pd.read_pickle(ruta_entrada)

# Quick sanity check: dimensions first, then a two-row preview.
print(df.shape)
df.head(2)

(1520, 34)


Unnamed: 0,Attrition,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,...,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,0,0.0,-0.94495,1.0,1.333333,0.0,-1.0,-0.083333,-0.5,0.415932,...,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,1.041064,-0.333333,0.108443,0.0,0.25,-1.0,0.415932,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Class distribution of the target: absolute counts first, then the same
# counts expressed as a percentage of all rows.
conteo_attrition = df["Attrition"].value_counts()
display(conteo_attrition)
conteo_attrition / len(df) * 100

Attrition
0    1278
1     242
Name: count, dtype: int64

Attrition
0    84.078947
1    15.921053
Name: count, dtype: float64

Vemos que los porcentajes son 84-16, lo cual indica un gran desbalanceo; vamos a ajustarlo a un 70-30.

In [11]:
# Separate the target vector (y) from the predictor matrix (X).
y = df["Attrition"]
X = df.drop(columns=["Attrition"])

Primero aplicamos Tomek Links para bajar la proporción de la clase mayoritaria (en la celda siguiente se aplica una única pasada).

### 1. Tomek Links

In [12]:
# Under-sample with Tomek links using the library defaults.
# In the recorded run only class-0 rows were removed (1278 -> 1212).
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X, y)

# Rebuild one frame holding the resampled predictors plus the target column.
df_tomet = pd.DataFrame(X_resampled, columns=X.columns).assign(Attrition=y_resampled)

# Class counts before and after the under-sampling step.
print(f"Original dataset shape {Counter(y)}")
print(f"Resampled dataset shape {Counter(y_resampled)}")

Original dataset shape Counter({0: 1278, 1: 242})
Resampled dataset shape Counter({0: 1212, 1: 242})


In [13]:
# Class percentages after the Tomek-links step.
df_tomet["Attrition"].value_counts().div(len(df_tomet)).mul(100)

Attrition
0    83.356259
1    16.643741
Name: count, dtype: float64

Hemos pasado de un 84-16 a un 83-17. Como con esta estrategia no conseguimos bajar más el porcentaje de la clase mayoritaria, vamos a probar con SMOTENC para aumentar el porcentaje de la clase minoritaria.

### 2. SMOTENC

In [14]:
# Over-sample the minority class with SMOTENC, starting from the frame produced
# by the Tomek-links step (df_tomet).
# NOTE(review): resampling is applied to the full dataset; if a train/test split
# is done downstream, synthetic rows may leak into the test set -- confirm.
X_smote = df_tomet.drop(columns="Attrition")
y_smote = df_tomet["Attrition"]

# Columns handed to SMOTENC as categorical; every predictor not listed here is
# treated as continuous by the sampler.
# NOTE(review): the one-hot groups (Gender_*, JobRole_*) are listed as separate
# categorical features, so synthetic rows are presumably not guaranteed to keep
# exactly one active column per group -- verify if that matters downstream.
cols_cat = ['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'BusinessTravel', 'Department', 'Education',
       'EducationField', 'JobLevel', 'MaritalStatus',
       'StockOptionLevel', 'TrainingTimesLastYear', 'JobInvolvement',
       'PerformanceRating', "NumCompaniesWorked", "PercentSalaryHike",
       "TotalWorkingYears", "YearsAtCompany", "YearsSinceLastPromotion",
       "YearsWithCurrManager", 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative']

# Fail fast, with a readable message, if a listed column is absent from the
# predictors (e.g. after an upstream rename) instead of erroring inside the sampler.
columnas_ausentes = [col for col in cols_cat if col not in X_smote.columns]
assert not columnas_ausentes, f"Categorical columns missing from X_smote: {columnas_ausentes}"

# sampling_strategy=0.6 requests a minority/majority ratio of 0.6 after
# resampling (recorded run: 727 / 1212), i.e. roughly a 62.5-37.5 class split.
smotenc = SMOTENC(
    categorical_features=cols_cat,
    sampling_strategy=0.6,
    k_neighbors=5,
    random_state=42,
)
X_resampled, y_resampled = smotenc.fit_resample(X_smote, y_smote)

# Show the balanced dataset. Column labels come from this cell's own input
# (X_smote) so the cell does not depend on the X defined in an earlier cell.
df_tomet_smote = pd.DataFrame(X_resampled, columns=X_smote.columns)
df_tomet_smote['Attrition'] = y_resampled
df_tomet_smote.head(1)


Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,JobLevel,...,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,Attrition
0,0.0,-0.94495,1.0,1.333333,0.0,-1.0,-0.083333,-0.5,0.415932,-0.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
# Class counts of the untouched target versus the latest resampled target.
# y_resampled holds whatever the most recently executed resampling cell
# produced (the SMOTENC result when the notebook is run top to bottom).
print(f"Original dataset shape {Counter(y)}")
print(f"Resampled dataset shape {Counter(y_resampled)}")

Original dataset shape Counter({0: 1278, 1: 242})
Resampled dataset shape Counter({0: 1212, 1: 727})


In [16]:
# Class percentages after the SMOTENC over-sampling step.
df_tomet_smote["Attrition"].value_counts().div(len(df_tomet_smote)).mul(100)

Attrition
0    62.506447
1    37.493553
Name: count, dtype: float64

Hemos pasado de un 84-16 a un 83-17 y ahora a un 62,5-37,5 (algo más equilibrado que el 70-30 que nos habíamos propuesto), el cual es un balanceo bastante aceptable dada la proporción original de los datos.

In [18]:
# Persist the balanced frame next to the input file so later notebooks can load it.
ruta_salida = "../datos/dataframes/df_nonulls_encoded_outliers_scaled_balanced.pkl"
df_tomet_smote.to_pickle(ruta_salida)