In [1]:
# CategoricalTransformer
# Transformaciones personalizadas

In [16]:
import re
from datetime import datetime

import nltk
import pandas as pd
from data_analysis_octopus import *
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, r2_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from unidecode import unidecode
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler)




def freq_discrete(df, features):
    for feature in features:
        print(f"Feature: {feature}")
        abs_ = df[feature].value_counts(dropna=False).to_frame().rename(columns={"count": "Absolute frequency"})
        rel_ = df[feature].value_counts(dropna=False, normalize= True).to_frame().rename(columns={"proportion": "Relative frequency"})
        freq = abs_.join(rel_)
        freq["Accumulated frequency"] = freq["Absolute frequency"].cumsum()
        freq["Accumulated %"] = freq["Relative frequency"].cumsum()
        freq["Absolute frequency"] = freq["Absolute frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Relative frequency"] = freq["Relative frequency"].map(lambda x: "{:,.2%}".format(x))
        freq["Accumulated frequency"] = freq["Accumulated frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Accumulated %"] = freq["Accumulated %"].map(lambda x: "{:,.2%}".format(x))
        display(freq)


def generate_param_grid(model_name, param_grid):

    renamed_param_grid = {}

    for key, value in param_grid.items():
        if key == "scaler":
            renamed_param_grid[key] = value
        else:
            renamed_param_grid[model_name + "__" + key] = value

    return renamed_param_grid


def train_classifier_model(X, y, model, param_grid=None):
    model_name = model.__name__.lower()
    renamed_param_grid = generate_param_grid(model_name, param_grid)
    # Dividir los datos en conjuntos de entrenamiento y prueba
    squeezed_y = y.squeeze()
    X_train, X_test, y_train, y_test = train_test_split(
        X, squeezed_y, test_size=0.2, random_state=42, stratify=squeezed_y)
    # Crear el clasificador base
    if 'n_jobs' in model.__dict__:
        model_to_train = model(n_jobs=-1)
    else:
        model_to_train = model()
    # Crando el pipeline
    pipeline = Pipeline([
        # Este es solo un valor por defecto, se cambiará en GridSearchCV
        ("scaler", StandardScaler()),  
        (model_name, model_to_train)
    ])
    # Si se proporciona un grid de parámetros, realizar GridSearchCV
    if param_grid:
        grid_search = GridSearchCV(
            pipeline,
            renamed_param_grid,
            cv=5,
            scoring="f1_macro",
            n_jobs=-1,
            error_score=-1
        )
        grid_search.fit(X_train, y_train)

        # Obtener los mejores hiperparámetros y el mejor modelo
        best_model = grid_search.best_estimator_
        
        print("Mejores hiperparámetros encontrados GridSearchCV:")
        print(grid_search.best_params_)
    else:
        best_model = model_to_train

    
    # Evaluación del modelo con validación cruzada
    cv_scores = cross_val_score(
        best_model,
        X_train,
        y_train,
        cv=10,
        scoring='f1_macro',
    )
    train_mean_score = cv_scores.mean()
    std_dev = np.std(cv_scores)

    # Predicción y evaluación en el conjunto de prueba
    y_pred = best_model.predict(X_test)
    print("\nReporte de Clasificación en el conjunto de prueba:")
    print(classification_report(y_test, y_pred))

    # Evaluación del modelo en el conjunto de prueba
    test_score = f1_score(y_test, y_pred, average='macro')

    print("F1 Macro Score de validación cruzada (train):", train_mean_score)
    print("Standar deviation F1 Macro Scores(train):", std_dev)

    print("F1 Macro Score en el conjunto de prueba (test):", test_score)

    if hasattr(best_model.named_steps[model_name], 'coef_'):
    # Obtener los coeficientes del modelo
        feature_importances = best_model.named_steps[model_name].coef_[0]
        abs_coef = np.abs(feature_importances)
        coef_abs_tuples = [(feature_importances[i], abs_coef[i]) for i in range(len(feature_importances))]

        # Ordenar la lista de tuplas por el valor absoluto en orden descendente
        sorted_coef_abs_tuples = sorted(coef_abs_tuples, key=lambda x: x[1], reverse=True)
        feature_importances = [coef[0] for coef in sorted_coef_abs_tuples]

    else:
        feature_importances = best_model.named_steps[model_name].feature_importances_

    feature_names = X_train.columns
    feature_importances_df = pd.DataFrame({
        'Característica': feature_names,
        'Importancia': feature_importances
    }).sort_values('Importancia', ascending=False)

    return feature_importances_df

In [3]:
filename = "../data/train_p3.csv"
df = pd.read_csv(filename)

df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,18125,220000,2,3,2,32,0,0,0,0,...,171391,178742,164793,7000,15000,6000,10000,10000,7000,0
1,3983,220000,2,2,1,41,2,0,0,2,...,29579,48933,97187,1465,3009,0,20000,50000,1240,0
2,19251,80000,2,1,2,27,-1,-1,-1,0,...,917,702,3099,1325,917,0,702,3099,0,0
3,4024,20000,2,2,2,38,1,2,2,4,...,14190,13721,13848,2500,2501,0,0,441,1,0
4,20610,100000,2,1,2,28,0,0,0,0,...,100999,101188,99328,4600,4000,4000,4000,4000,4400,0


In [4]:
mask = [
    "ID",
]

continuas = [
    "LIMIT_BAL",
    "AGE",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4"


]

mask += continuas

columns = df.columns[df.columns.isin(mask) == False]
freq_discrete(df, columns)
    

Feature: SEX


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,12709,60.52%,12709,60.52%
1,8291,39.48%,21000,100.00%


Feature: EDUCATION


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
EDUCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,9785,46.60%,9785,46.60%
1,7429,35.38%,17214,81.97%
3,3468,16.51%,20682,98.49%
5,189,0.90%,20871,99.39%
4,82,0.39%,20953,99.78%
6,37,0.18%,20990,99.95%
0,10,0.05%,21000,100.00%


Feature: MARRIAGE


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
MARRIAGE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,11188,53.28%,11188,53.28%
1,9551,45.48%,20739,98.76%
3,224,1.07%,20963,99.82%
0,37,0.18%,21000,100.00%


Feature: PAY_0


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10313,49.11%,10313,49.11%
-1,3943,18.78%,14256,67.89%
1,2600,12.38%,16856,80.27%
-2,1957,9.32%,18813,89.59%
2,1873,8.92%,20686,98.50%
3,212,1.01%,20898,99.51%
4,54,0.26%,20952,99.77%
5,21,0.10%,20973,99.87%
8,13,0.06%,20986,99.93%
6,7,0.03%,20993,99.97%


Feature: PAY_2


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10980,52.29%,10980,52.29%
-1,4254,20.26%,15234,72.54%
2,2745,13.07%,17979,85.61%
-2,2660,12.67%,20639,98.28%
3,223,1.06%,20862,99.34%
4,77,0.37%,20939,99.71%
5,19,0.09%,20958,99.80%
1,17,0.08%,20975,99.88%
7,14,0.07%,20989,99.95%
6,10,0.05%,20999,100.00%


Feature: PAY_3


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,11062,52.68%,11062,52.68%
-1,4131,19.67%,15193,72.35%
-2,2884,13.73%,18077,86.08%
2,2634,12.54%,20711,98.62%
3,171,0.81%,20882,99.44%
4,60,0.29%,20942,99.72%
7,18,0.09%,20960,99.81%
6,17,0.08%,20977,99.89%
5,16,0.08%,20993,99.97%
1,4,0.02%,20997,99.99%


Feature: PAY_4


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,11497,54.75%,11497,54.75%
-1,3951,18.81%,15448,73.56%
-2,3089,14.71%,18537,88.27%
2,2208,10.51%,20745,98.79%
3,129,0.61%,20874,99.40%
4,52,0.25%,20926,99.65%
7,39,0.19%,20965,99.83%
5,27,0.13%,20992,99.96%
6,4,0.02%,20996,99.98%
1,2,0.01%,20998,99.99%


Feature: PAY_5


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,11883,56.59%,11883,56.59%
-1,3848,18.32%,15731,74.91%
-2,3202,15.25%,18933,90.16%
2,1816,8.65%,20749,98.80%
3,129,0.61%,20878,99.42%
4,64,0.30%,20942,99.72%
7,40,0.19%,20982,99.91%
5,14,0.07%,20996,99.98%
6,3,0.01%,20999,100.00%
8,1,0.00%,21000,100.00%


Feature: PAY_6


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,11410,54.33%,11410,54.33%
-1,3983,18.97%,15393,73.30%
-2,3445,16.40%,18838,89.70%
2,1930,9.19%,20768,98.90%
3,135,0.64%,20903,99.54%
4,40,0.19%,20943,99.73%
7,29,0.14%,20972,99.87%
6,17,0.08%,20989,99.95%
5,9,0.04%,20998,99.99%
8,2,0.01%,21000,100.00%


Feature: PAY_AMT5


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_AMT5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4698,22.37%,4698,22.37%
1000,926,4.41%,5624,26.78%
2000,918,4.37%,6542,31.15%
3000,670,3.19%,7212,34.34%
5000,586,2.79%,7798,37.13%
...,...,...,...,...
83034,1,0.00%,20996,99.98%
7009,1,0.00%,20997,99.99%
49,1,0.00%,20998,99.99%
9513,1,0.00%,20999,100.00%


Feature: PAY_AMT6


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_AMT6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5026,23.93%,5026,23.93%
2000,901,4.29%,5927,28.22%
1000,888,4.23%,6815,32.45%
3000,614,2.92%,7429,35.38%
5000,593,2.82%,8022,38.20%
...,...,...,...,...
39268,1,0.00%,20996,99.98%
7006,1,0.00%,20997,99.99%
2595,1,0.00%,20998,99.99%
902,1,0.00%,20999,100.00%


Feature: default payment next month


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
default payment next month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16445,78.31%,16445,78.31%
1,4555,21.69%,21000,100.00%


In [5]:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [6]:
mask = [
    "ID",
    "default payment next month"
]

X = df[df.columns[df.columns.isin(mask) == False]]
y = df[["default payment next month"]]

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)
# warnings.filterwarnings("ignore", category=UserWarning, message="Solver lbfgs supports only 'l2' or None penalties, got l1 penalty.")
# warnings.filterwarnings("ignore", category=UserWarning, message="Solver newton-cg supports only 'l2' or None penalties, got l1 penalty.")

param_grid = {
    "scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
    # 'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    # 'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    # 'max_iter': [100, 200, 300, 400, 500]
}

# Llamada a la función con los parámetros y datos
feature_importances_df = train_classifier_model(X, y, model=LogisticRegression, param_grid=param_grid)
feature_importances_df



Mejores hiperparámetros encontrados GridSearchCV:
{'logisticregression__solver': 'sag', 'scaler': RobustScaler()}
