In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import bootcampviztools as bt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, RobustScaler

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [3]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv("../data_sample/Absenteeism_at_work.csv", sep=";")

In [5]:
#Paso 1: Codificación
def recode_features(X):
    X = X.copy()
    X["absent"] = X["Absenteeism time in hours"].apply(lambda x: 1 if x >= 3 else 0)
    X["Education"] = X["Education"].apply(lambda x: 2 if x > 1 else 1)
    X["Has_pet"] = X["Pet"].apply(lambda x: 1 if x > 0 else 0)
    X["Has_son"] = X["Son"].apply(lambda x: 1 if x > 0 else 0)
    return X


In [6]:
df=recode_features(df)

In [7]:
target= "absent"

In [8]:
X= df.drop(columns=[target])
y = df[target]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [10]:
# Paso 2: Columnas a excluir
columns_to_exclude = ["ID", "Height", "Weight", "Disciplinary failure",
                      "Pet", "Son", "Absenteeism time in hours"]

def func_exclude_columns(X, exclude_cols = columns_to_exclude):
    # Skip missing values from colum
    X_temp = X[[col for col in X.columns if col not in exclude_cols]].copy()
    return X_temp

In [11]:
X_train= func_exclude_columns(X_train, exclude_cols=columns_to_exclude)
X_test= func_exclude_columns(X_test, exclude_cols=columns_to_exclude)

In [12]:
# Paso 3: Transformación numéricas
# Transformación logarítmica para las más sesgadas
log_feats = ['Transportation expense', 'Work load Average/day ', 'Body mass index']

log_transformer = Pipeline([
    ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ('scaler', RobustScaler())
])

log_only =Pipeline([
    ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one'))])

# Para las demás
std_feats = ['Distance from Residence to Work', 'Service time', 'Age', 'Hit target']

std_transformer = StandardScaler()

In [13]:
#Paso 4: One Hot Encoding
cat_to_encode=["Reason for absence", "Month of absence", "Day of the week", "Seasons"]

In [14]:


def map_categoricals(X):
    X = X.copy()

    razones = {
    0:"UNK", 1:'Infectious', 2:'Neoplasms', 3:'Blood', 4:'Endocrine',
    5:'Mental', 6:'Nervous_system', 7:'Eye', 8:'Ear',
    9:'Circulatory', 10:'Respiratory', 11:'Digestive', 12:'Skin',
    13:'Musculoskeletal', 14:'Genitourinary', 15: 'Pregnancy', 16:'Perinatal', 
    17:'Congenital', 18:'Symptons', 19:'Injuries', 20:'External causes', 
    21:'healthstatus', 22:'Follow-up', 23:'Consultation', 24:'Blood-donation',
    25:'Lab', 26:'Unjustified', 27:'Physio', 28:'Dentist'}

    meses = {
    1: 'Jan', 2: 'Febr', 3: 'Mar', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec', 0: 'UNK'}
    
    dias = {
    2: 'Monday', 3: 'Tuesday', 4: 'Wednesday',
    5: 'Thursday', 6: 'Friday'}
    
    estaciones = {
    1: 'Summer', 2: 'Autumn', 3: 'Winter', 4: 'Spring'}

    X["Reason for absence"] = X["Reason for absence"].map(razones)
    X["Month of absence"] = X["Month of absence"].map(meses)
    X["Day of the week"] = X["Day of the week"].map(dias)
    X["Seasons"] = X["Seasons"].map(estaciones)
    return X

In [15]:
categorical_pipeline = Pipeline([
    ('map_cats', FunctionTransformer(map_categoricals, feature_names_out='one-to-one')),
    ('onehot', ColumnTransformer([
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_to_encode)
    ], remainder='passthrough'))
])



In [16]:
# Pipeline de preprocesamiento completo
preprocessing = ColumnTransformer(
    transformers=[
        ("Transform_Log", log_transformer, log_feats),
        ("Transform_Std", std_transformer, std_feats),
        ("Process_Categorical", categorical_pipeline, cat_to_encode),
    ],
    remainder="passthrough"
)

In [17]:
preprocessing

In [18]:
pipe_preprocessed = preprocessing.fit_transform(X_train)
pipe_preprocessed

array([[-1.72624978, -0.57868182,  0.44235298, ...,  0.        ,
         0.        ,  0.        ],
       [-0.61248781,  0.91456467, -1.06280556, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.07914797, -0.15887831, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 0.11652598,  0.00840431,  1.64248959, ...,  0.        ,
         0.        ,  1.        ],
       [-0.61248781,  1.13231099,  0.84112169, ...,  0.        ,
         0.        ,  0.        ],
       [-0.61248781, -0.94094385,  0.84112169, ...,  0.        ,
         0.        ,  0.        ]], shape=(592, 61))

In [19]:
preprocessing.get_feature_names_out()

array(['Transform_Log__Transportation expense',
       'Transform_Log__Work load Average/day ',
       'Transform_Log__Body mass index',
       'Transform_Std__Distance from Residence to Work',
       'Transform_Std__Service time', 'Transform_Std__Age',
       'Transform_Std__Hit target',
       'Process_Categorical__onehot__Reason for absence_Blood-donation',
       'Process_Categorical__onehot__Reason for absence_Circulatory',
       'Process_Categorical__onehot__Reason for absence_Congenital',
       'Process_Categorical__onehot__Reason for absence_Consultation',
       'Process_Categorical__onehot__Reason for absence_Dentist',
       'Process_Categorical__onehot__Reason for absence_Digestive',
       'Process_Categorical__onehot__Reason for absence_Ear',
       'Process_Categorical__onehot__Reason for absence_Endocrine',
       'Process_Categorical__onehot__Reason for absence_Eye',
       'Process_Categorical__onehot__Reason for absence_Follow-up',
       'Process_Categorical__oneh

## Modelos

In [20]:
modelos_escalados = {
    "Logistic": LogisticRegression(max_iter=2000, class_weight="balanced")
}

modelos_no_escalados = {
    "RandomF": RandomForestClassifier(max_depth=10, random_state=42, class_weight="balanced"),
    "XGB": XGBClassifier(max_depth=10, random_state=42, n_jobs=-1),
    "LGB": LGBMClassifier(max_depth=10, random_state=42, verbose=-100, class_weight="balanced", n_jobs=-1)
}


In [21]:
pipelines={}

for nombre, modelo in modelos_escalados.items():
    pipe = Pipeline([
        ("preprocessing", preprocessing),  #ColumnTransformer con escalado
        ("classifier", modelo)
    ])
    pipelines[f"{nombre}_scaled"] = pipe

# Para modelos que NO necesitan escalado
for nombre, modelo in modelos_no_escalados.items():
    pipe = Pipeline([
        ("log", log_only),
        ("categoricals", categorical_pipeline),  # mapea y one-hot, las numéricas no las escalamos
        ("classifier", modelo)
    ])
    pipelines[f"{nombre}_no_scaled"] = pipe

In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [23]:
def evaluate_model(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensibilidad = tp / (tp + fn)
    especificidad = tn / (tn + fp)
    precision = tp / (tp + fp)
    print(f"✔️ Sensibilidad: {sensibilidad:.2f}")
    print(f"✔️ Especificidad: {especificidad:.2f}")
    print(f"✔️ Precisión: {precision:.2f}")


In [24]:
for nombre, pipeline in pipelines.items():
    print(f"Evaluando modelo: {nombre}")
    pipeline.fit(X_train, y_train)
    pred_test = pipeline.predict(X_test)
    evaluate_model(y_test, pred_test)
    print(classification_report(y_test, pred_test))
    print("-" * 60)

Evaluando modelo: Logistic_scaled
✔️ Sensibilidad: 0.63
✔️ Especificidad: 0.81
✔️ Precisión: 0.84
              precision    recall  f1-score   support

           0       0.59      0.81      0.68        58
           1       0.84      0.63      0.72        90

    accuracy                           0.70       148
   macro avg       0.71      0.72      0.70       148
weighted avg       0.74      0.70      0.71       148

------------------------------------------------------------
Evaluando modelo: RandomF_no_scaled
✔️ Sensibilidad: 0.79
✔️ Especificidad: 0.48
✔️ Precisión: 0.70
              precision    recall  f1-score   support

           0       0.60      0.48      0.53        58
           1       0.70      0.79      0.74        90

    accuracy                           0.67       148
   macro avg       0.65      0.64      0.64       148
weighted avg       0.66      0.67      0.66       148

------------------------------------------------------------
Evaluando modelo: XGB_no_s



In [26]:
from sklearn.model_selection import GridSearchCV

# Definimos sus hiperparametros
reg_log_param = {
    "penalty": ["l1","l2"], # Regularizaciones L1 y L2.
    "C": [0.1, 0.5, 1.0, 5.0], 
    "max_iter": [50,100,500],
    "solver": ["liblinear"],  # Suele ser el más rápido
    "class_weight": ["balanced", None]
    }

rand_forest_param = {
    'n_estimators': [100, 200, 400],
    "max_depth": [3,4,5,6,10,15,17],
    'min_samples_leaf': [1,10,20,100],
    'class_weight':['balanced', None],
    "max_features": ["sqrt", 3, 4]
}

xgb_param = {
    "n_estimators": [10, 100, 200, 400],
    "max_depth": [1,2,4,8],
    "learning_rate": [0.1,0.2,0.5,1.0],
    "scale_pos_weight": [len(y_train[y_train == 0])/len(y_train[y_train == 1]), 1]
}

lgb_param = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.1, 0.3, 0.6, 1], 
    'max_depth': [1, 6, 10, -1],  
    'min_child_samples': [1, 10, 20, 100], 
    'scale_pos_weight': [
        len(df[df[target]==0]) / len(df[df[target]==1]),
        1],
    'colsample_bytree': [0.5, 1]
}

cv = 5

gs_reg_log = GridSearchCV(LogisticRegression(),
                          reg_log_param,
                          cv=cv,
                          scoring="balanced_accuracy",
                          verbose=1,
                          n_jobs=-1
                          )


gs_rand_forest = GridSearchCV(RandomForestClassifier(),
                              rand_forest_param,
                              cv=cv,
                              scoring="balanced_accuracy",
                              verbose=1,
                              n_jobs=-1)

gs_xgb = GridSearchCV(XGBClassifier(),
                      xgb_param,
                      cv=cv,
                      scoring="balanced_accuracy",
                      verbose=1,
                      n_jobs=-1)

gs_lgb = GridSearchCV(LGBMClassifier(),
                      lgb_param,
                      cv=cv,
                      scoring="balanced_accuracy",
                      verbose=1,
                      n_jobs=-1)


pipe_grids = {"gs_reg_log":gs_reg_log,
         "gs_rand_forest":gs_rand_forest,
         "gs_xgb":gs_xgb,
         "gs_lgb":gs_lgb}

In [27]:
for nombre, grid_search in pipe_grids.items():
    grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




Fitting 5 folds for each of 504 candidates, totalling 2520 fits
Fitting 5 folds for each of 128 candidates, totalling 640 fits
Fitting 5 folds for each of 768 candidates, totalling 3840 fits


In [28]:
best_grids = [(i, j.best_score_) for i, j in pipe_grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
2,gs_xgb,0.781264
3,gs_lgb,0.779452
1,gs_rand_forest,0.767939
0,gs_reg_log,0.595168


In [29]:
best_model = pipe_grids[best_grids.iloc[0,0]]
best_model

In [None]:
import joblib

In [None]:
joblib.dump(best_model, 'modelo_pipeline.joblib')

['modelo_pipeline.joblib']

In [30]:
best_model.best_estimator_

In [34]:
clf = best_model.best_estimator_
importances = clf.feature_importances_

# Suponiendo que conoces los nombres de las features
import pandas as pd

feature_names = X_train.columns  # X es el DataFrame con tus features
importances_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

In [32]:
importances_df

Unnamed: 0,feature,importance
0,Reason for absence,0.18241
14,Has_pet,0.106406
13,Body mass index,0.086606
4,Transportation expense,0.064727
7,Age,0.063506
11,Social drinker,0.062565
8,Work load Average/day,0.055049
10,Education,0.051742
5,Distance from Residence to Work,0.050385
3,Seasons,0.043669
