In [76]:
# Importar las librerías necesarias
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from lazypredict.Supervised import LazyClassifier

import test_coink.utils.paths as path
pd.options.display.float_format = '{:,.2f}'.format

In [77]:
# Cargar los datos
data = pd.read_csv(path.data_processed_dir('info_satisfaccion_trabajo_clean.csv'))
data.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
979,34,No,Travel_Rarely,285,Research & Development,29,Bachelor,Medical,1377,Medium,...,Excellent,Low,2,10,1,Better,8,7,7,7
413,42,No,Travel_Frequently,1368,Research & Development,28,Master,Technical Degree,551,Very High,...,Excellent,Very High,3,7,4,Best,6,5,0,4
212,27,No,Travel_Frequently,1242,Sales,20,Bachelor,Life Sciences,293,Very High,...,Excellent,Very High,0,7,2,Better,7,7,0,7
672,42,No,Travel_Rarely,462,Sales,14,College,Medical,936,High,...,Excellent,Low,0,10,6,Better,5,4,0,3
1098,40,No,Non-Travel,1142,Research & Development,8,College,Life Sciences,1552,Very High,...,Excellent,High,0,8,2,Better,2,2,2,2


In [78]:
# fiajmos un orde estrictopar el paso de la prediccion
columns_order = ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']
data = data[columns_order]

In [79]:
# Crear un objeto ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeNumber', 'HourlyRate','JobLevel', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                                    'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears','TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
                                    'YearsSinceLastPromotion', 'YearsWithCurrManager']),

        ('num_pca', PCA(n_components=3), ['JobLevel','MonthlyIncome','YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion','YearsWithCurrManager']),

        ('cat', OneHotEncoder(), ['Attrition', 'BusinessTravel', 'Department', 'Education','EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
        'JobRole', 'MaritalStatus', 'Over18', 'OverTime','PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance'])
    ],
    remainder= 'passthrough'
    )

In [93]:
# Crear el pipeline integrado
pipe = Pipeline([
    ('preprocessor', preprocessor)
])

# Dividir los datos en variables de entrada y salida
X = data.drop('JobSatisfaction', axis=1)
y = data['JobSatisfaction']

X_transform = pipe.fit_transform(X)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Entrenar el pipeline integrado
# pipe.fit_transform(X_train, y_train)

clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)


100%|██████████| 29/29 [00:06<00:00,  4.77it/s]


In [94]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.28,0.27,,0.28,0.06
SVC,0.33,0.26,,0.28,0.15
NuSVC,0.28,0.26,,0.28,0.18
SGDClassifier,0.3,0.26,,0.28,0.1
ExtraTreesClassifier,0.31,0.26,,0.28,0.27
ExtraTreeClassifier,0.27,0.26,,0.27,0.02
LabelPropagation,0.26,0.26,,0.27,0.19
LabelSpreading,0.26,0.26,,0.27,0.2
BaggingClassifier,0.28,0.26,,0.28,0.22
XGBClassifier,0.27,0.25,,0.26,0.83


In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm

rs = {'random_state': 42}

# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    # Crear un objeto ColumnTransformer
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeNumber', 'HourlyRate','JobLevel', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                                    'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears','TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
                                    'YearsSinceLastPromotion', 'YearsWithCurrManager']),

        ('num_pca', PCA(n_components=3), ['JobLevel','MonthlyIncome','YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion','YearsWithCurrManager']),

        ('cat', OneHotEncoder(), ['Attrition', 'BusinessTravel', 'Department', 'Education','EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
        'JobRole', 'MaritalStatus', 'Over18', 'OverTime','PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance'])
    ],
    remainder= 'passthrough'
    )

    clfs = [
            ('Logistic Regression', log_reg), 
            ('Naive Bayes', nb),
            ('K-Nearest Neighbors', knn), 
            ('SVM', svm), 
            ('MLP', mlp), 
            ('Decision Tree', dt), 
            ('Extra Trees', et), 
            ('Random Forest', rf), 
            ('XGBoost', xgb)
            ]


    pipelines = []

    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])


    for clf_name, clf in clfs:

        pipeline = Pipeline(steps=[
                                   ('preprocessor', preprocessor),
                                   ('classifier', clf)
                                   ]
                            )
        pipeline.fit(X_train, y_train)


        y_pred = pipeline.predict(X_test)
        # F1-Score
        fscore = skm.f1_score(y_test, y_pred,average='weighted')
        # Precision
        pres = skm.precision_score(y_test, y_pred,average='weighted')
        # Recall
        rcall = skm.recall_score(y_test, y_pred,average='weighted')
        # Accuracy
        accu = skm.accuracy_score(y_test, y_pred)
        # ROC_AUC
        # roc_auc = skm.roc_auc_score(y_test, y_pred,multi_class='ovr' )#average='weighted'


        pipelines.append(pipeline)

        scores_df = scores_df.append({
                                      'Model' : clf_name, 
                                      'F1_Score' : fscore,
                                      'Precision' : pres,
                                      'Recall' : rcall,
                                      'Accuracy' : accu
                                    #   'ROC_AUC' : roc_auc
                                      }, 
                                     ignore_index=True)
        
    return pipelines, scores_df

In [95]:
pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)

In [96]:
scores_df

Unnamed: 0,Model,F1_Score,Precision,Recall,Accuracy,ROC_AUC
0,Logistic Regression,0.24,0.23,0.27,0.27,
1,Naive Bayes,0.27,0.27,0.27,0.27,
2,K-Nearest Neighbors,0.24,0.24,0.24,0.24,
3,SVM,0.18,0.18,0.31,0.31,
4,MLP,0.29,0.3,0.3,0.3,
5,Decision Tree,0.27,0.27,0.27,0.27,
6,Extra Trees,0.25,0.24,0.26,0.26,
7,Random Forest,0.25,0.26,0.28,0.28,
8,XGBoost,0.24,0.23,0.24,0.24,


Esta tabla muestra los resultados de 9 modelos diferentes de ML en un problema de clasificación. Las métricas incluyen F1 Score, Precision, Recall y Accuracy. Estas métricas permiten evaluar la capacidad de los modelos para clasificar correctamente las etiquetas de la clase objetivo. 
F1 Score es una métrica combinada de Precision y Recall. 
Precision mide la cantidad de etiquetas positivas correctas en comparación con las etiquetas positivas totales predichas. 
Recall mide la cantidad de etiquetas positivas correctas en comparación con las etiquetas positivas totales reales. 
Accuracy mide la proporción de etiquetas clasificadas correctamente. 

En general, la tabla muestra que ninguno de los modelos tuvo un desempeño sobresaliente en todas las métricas, lo que sugiere que podría ser necesario realizar más investigaciones y ajustes en los modelos.

### En menos 150 palabras explique cómo monitorearía este modelo en caso en caso de que estuviera desplegado para uso recurrente de la empresa.
Plataformas de monitoreo en tiempo real, como Datadog o New Relic, son muy útiles para monitorear el rendimiento de modelos de ML. Estas plataformas permiten visualizar métricas en tiempo real y detectar problemas de forma temprana. Además, permiten recibir alertas en caso de que se produzcan desviaciones importantes en el rendimiento. También proporcionan informes detallados y análisis para entender la causa de los problemas. Es importante elegir la plataforma adecuada en función de los requisitos específicos del proyecto y de las necesidades de la empresa. Con una buena plataforma de monitoreo, la empresa puede estar segura de que su modelo de clasificación está funcionando de manera óptima y de que puede detectar problemas de forma temprana.