In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
from data_analysis_octopus import *
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split, StratifiedKFold)
from sklearn.feature_selection import RFECV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler)
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

import pickle



class GroupNumericalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.bins_dict = {}  # Diccionario para almacenar los bins calculados por columna

    def fit(self, X):
        self.bins_dict = {}
        self.numerical_columns = X.select_dtypes(include=["number"]).columns  # Obtener columnas numéricas
        
        for col in self.numerical_columns:
            # Calcular bins basados en cuantiles de la columna especificada
            quantiles = np.unique(
                X[col].quantile([0, 0.2, 0.4, 0.6, 0.8, 1]).values.astype(int)
            )
            bins = list(quantiles[:-1]) + [quantiles[-1] + 1]
            self.bins_dict[col] = bins
        return self

    def transform(self, X):
        X_transformed = pd.DataFrame(index=X.index)
        
        for col in self.bins_dict:
            bins = self.bins_dict[col]
            labels = [f"{bins[i]}_a_{bins[i+1]}" for i in range(len(bins)-1)]
            X_transformed[f"cat_{col}"] = pd.cut(X[col], bins=bins, labels=labels, right=False)
        
        return X_transformed

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)


def freq_discrete(df, features):
    for feature in features:
        print(f"Feature: {feature}")
        abs_ = df[feature].value_counts(dropna=False).to_frame().rename(columns={"count": "Absolute frequency"})
        rel_ = df[feature].value_counts(dropna=False, normalize= True).to_frame().rename(columns={"proportion": "Relative frequency"})
        freq = abs_.join(rel_)
        freq["Accumulated frequency"] = freq["Absolute frequency"].cumsum()
        freq["Accumulated %"] = freq["Relative frequency"].cumsum()
        freq["Absolute frequency"] = freq["Absolute frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Relative frequency"] = freq["Relative frequency"].map(lambda x: "{:,.2%}".format(x))
        freq["Accumulated frequency"] = freq["Accumulated frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Accumulated %"] = freq["Accumulated %"].map(lambda x: "{:,.2%}".format(x))
        display(freq)


def generate_param_grid(model_name, param_grid):

    renamed_param_grid = {}

    for key, value in param_grid.items():
        if key == "scaler":
            renamed_param_grid["preprocessor__num__scaler"] = value
        else:
            renamed_param_grid[model_name + "__" + key] = value

    return renamed_param_grid


def _set_n_jobs_config(model):
    if "n_jobs" in model.__dict__:
        model_to_train = model(n_jobs=-1)
    else:
        model_to_train = model()
    return model_to_train

def perform_grid_search(X_train, y_train, model, param_grid, cv=3, verbose=True):

    if verbose:
        verbose = 3
    else:
        verbose = 1

    model_to_train = _set_n_jobs_config(model)
    
    grid_search = GridSearchCV(
        model_to_train, param_grid, cv=StratifiedKFold(n_splits=cv),
        scoring="f1_micro", n_jobs=-1, error_score=-1, verbose=verbose
    )
    # Entrenamiento
    grid_search.fit(X_train, y_train)
    # Obtener los mejores hiperparámetros y el mejor modelo
    best_model = grid_search.best_estimator_
    print("Mejores hiperparámetros encontrados GridSearchCV:")
    print(grid_search.best_params_)

    return best_model, grid_search.best_params_


def cross_validation_report(model, X_train, y_train, verbose):
    cv_scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=10,
            scoring="f1_micro",
        )

    f1_mean_score_train = cv_scores.mean()
    std_dev_train = round(np.std(cv_scores), 4)

    if verbose:
        print(">>> F1 Macro Score de validación cruzada (train):", f1_mean_score_train)
        print(">>> Standar deviation (train):", std_dev_train)

    return f1_mean_score_train, std_dev_train


def test_report(model, X_test, y_test, verbose):
    # Predicción y evaluación en el conjunto de prueba
    y_pred = model.predict(X_test)
    # Evaluación del modelo en el conjunto de prueba
    f1_score_test = f1_score(y_test, y_pred)

    if verbose:
        print(">>> F1 Macro Score en el conjunto de prueba (test):", f1_score_test)
        print("\nReporte de Clasificación en el conjunto de prueba:")
        print(classification_report(y_test, y_pred))

    return f1_score_test


def train_classifier_model(X_train, X_test, y_train, y_test, model, param_grid=None, verbose: bool = True):
    model_name = model.__name__.lower()

    if model_name in ["svc"]:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    # Si se proporciona un grid de parámetros, realizar GridSearchCV
    if param_grid:
        best_model, best_params = perform_grid_search(
            X_train, y_train, model, param_grid, verbose=verbose)
    else:
        best_params = ""
        model_to_train = _set_n_jobs_config(model)
        best_model = model_to_train
        best_model.fit(X_train, y_train)

    f1_mean_score_train, std_dev_train = cross_validation_report(
        best_model, X_train, y_train, verbose)
    f1_score_test = test_report(best_model, X_test, y_test, verbose)
    
    return best_model, best_params, f1_mean_score_train, std_dev_train, f1_score_test


def get_feature_importances(model, X):
    coefficients = pd.Series(model.coef_.flatten())
    features_df = pd.DataFrame(
        {
            "Características": pd.Series(X.columns),
            "Coeficientes": coefficients
        }
    )

    features_df["Importancia"] = features_df["Coeficientes"].abs()
    features_df = features_df.sort_values(by="Importancia", ascending=False)
    return features_df.reset_index(drop=True)


def get_feature_importances_from_pipeline(pipeline, categorical_features):

    model_name = list(pipeline.named_steps.keys())[1]

    if hasattr(pipeline.named_steps[model_name], "coef_"):
    # Obtener los coeficientes del modelo
        feature_importances = pipeline.named_steps[model_name].coef_[0]
    else:
        feature_importances = pipeline.named_steps[model_name].feature_importances_

    preprocessor = pipeline.named_steps["preprocessor"]
    onehot_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]

    # Obtener los nombres de las columnas después de aplicar OneHotEncoder
    categorical_features = onehot_encoder.get_feature_names_out(input_features=categorical_features)
    numerical_features = list(preprocessor.named_transformers_["num"].get_feature_names_out())

    feature_names =  numerical_features + list(categorical_features)

    feature_importances_df = pd.DataFrame({
        "Característica": feature_names,
        "Coef": feature_importances,
        "Importancia": np.abs(feature_importances)
    }).sort_values("Importancia", ascending=False)

    return feature_importances_df.reset_index(drop=True)


def transform_data(X_train, X_test, numerical_features=None,
                   categorical_features=None):
    
    numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),  
    ])

    categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
    ])

    if numerical_features:
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numerical_transformer, numerical_features),
                ("cat", categorical_transformer, categorical_features)
        ])
    else:
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", categorical_transformer, categorical_features)
        ])

    # Ajustar y transformar los datos de entrenamiento
    X_train_transformed_data = preprocessor.fit_transform(X_train)
    X_test_transformed_data = preprocessor.transform(X_test)
    # Obtener los nombres de las columnas después de aplicar OneHotEncoder
    onehot_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    categorical_features_encoded = onehot_encoder.get_feature_names_out(
        input_features=categorical_features)

    if numerical_features:
        feature_names =  numerical_features + list(categorical_features_encoded)
    else:
        feature_names = list(categorical_features)
    # Convertir los datos transformados a DataFrames de Pandas
    try:
        X_train_transformed_df = pd.DataFrame.sparse.from_spmatrix(X_train_transformed_data, columns=feature_names)
        X_test_transformed_df = pd.DataFrame.sparse.from_spmatrix(X_test_transformed_data, columns=feature_names)

    except:
        X_train_transformed_df = pd.DataFrame(X_train_transformed_data, columns=feature_names)
        X_test_transformed_df = pd.DataFrame(X_test_transformed_data, columns=feature_names)


    return X_train_transformed_df, X_test_transformed_df, preprocessor


def evaluate_models(params_dict, X_train, X_test, y_train, y_test):
    results_list = []
    for model_name, model in params_dict.items():
        results = train_classifier_model(
            X_train, X_test, y_train, y_test, model=model, verbose=False
        )

        _, _, f1_mean_score_train, std_dev_train, f1_score_test = results

        results_list.append([
            model_name, f1_mean_score_train, std_dev_train, f1_score_test
        ])
        
    return pd.DataFrame(results_list, columns=["model", "f1-score-train", "std-dev", "f1-score-test"])

In [2]:
filename = "https://raw.githubusercontent.com/cuauhtemocbe/Diplomado-Ciencia-Datos/main/data/train_p3.csv"
raw_df = pd.read_csv(filename)

print(len(raw_df))
raw_df.head()

21000


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,18125,220000,2,3,2,32,0,0,0,0,...,171391,178742,164793,7000,15000,6000,10000,10000,7000,0
1,3983,220000,2,2,1,41,2,0,0,2,...,29579,48933,97187,1465,3009,0,20000,50000,1240,0
2,19251,80000,2,1,2,27,-1,-1,-1,0,...,917,702,3099,1325,917,0,702,3099,0,0
3,4024,20000,2,2,2,38,1,2,2,4,...,14190,13721,13848,2500,2501,0,0,441,1,0
4,20610,100000,2,1,2,28,0,0,0,0,...,100999,101188,99328,4600,4000,4000,4000,4000,4400,0


In [3]:
# Ya que la target se encuentra muy desbalanceado
# Realizaré un muestro de la categoria 0
target = "default payment next month"
raw_df[target].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,proportion
default payment next month,Unnamed: 1_level_1
0,0.783095
1,0.216905


In [4]:
df_0 = raw_df[raw_df[target] == 0].sample(frac=0.35, random_state=0)
df_1 = raw_df[raw_df[target] == 1]

df = pd.concat([df_0, df_1])

print(len(df))
df[target].value_counts(normalize=True).to_frame()

10311


Unnamed: 0_level_0,proportion
default payment next month,Unnamed: 1_level_1
0,0.558239
1,0.441761


## Train and test

In [36]:
mask = [
    "ID",
    "default payment next month"
]

X = df[df.columns[df.columns.isin(mask) == False]]
y = df[["default payment next month"]]

squeezed_y = y.squeeze()
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, squeezed_y, test_size=0.25, random_state=42, stratify=squeezed_y)

In [37]:
numerical_features = [
    "LIMIT_BAL",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4",
    "PAY_AMT5",
    "PAY_AMT6",
    "AGE"
]

categorical_features = [
    "SEX",
    "EDUCATION",
    "MARRIAGE", 
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]

freq_discrete(X, categorical_features)
    

Feature: SEX


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,6131,59.46%,6131,59.46%
1,4180,40.54%,10311,100.00%


Feature: EDUCATION


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
EDUCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,4947,47.98%,4947,47.98%
1,3505,33.99%,8452,81.97%
3,1739,16.87%,10191,98.84%
5,71,0.69%,10262,99.52%
4,32,0.31%,10294,99.84%
6,16,0.16%,10310,99.99%
0,1,0.01%,10311,100.00%


Feature: MARRIAGE


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
MARRIAGE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,5449,52.85%,5449,52.85%
1,4735,45.92%,10184,98.77%
3,119,1.15%,10303,99.92%
0,8,0.08%,10311,100.00%


Feature: PAY_0


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4468,43.33%,4468,43.33%
-1,1759,17.06%,6227,60.39%
2,1496,14.51%,7723,74.90%
1,1490,14.45%,9213,89.35%
-2,841,8.16%,10054,97.51%
3,181,1.76%,10235,99.26%
4,44,0.43%,10279,99.69%
5,14,0.14%,10293,99.83%
8,8,0.08%,10301,99.90%
7,6,0.06%,10307,99.96%


Feature: PAY_2


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4976,48.26%,4976,48.26%
2,1967,19.08%,6943,67.34%
-1,1879,18.22%,8822,85.56%
-2,1238,12.01%,10060,97.57%
3,163,1.58%,10223,99.15%
4,50,0.48%,10273,99.63%
5,13,0.13%,10286,99.76%
7,9,0.09%,10295,99.84%
6,8,0.08%,10303,99.92%
1,7,0.07%,10310,99.99%


Feature: PAY_3


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5134,49.79%,5134,49.79%
-1,1841,17.85%,6975,67.65%
2,1796,17.42%,8771,85.06%
-2,1329,12.89%,10100,97.95%
3,125,1.21%,10225,99.17%
4,43,0.42%,10268,99.58%
7,17,0.16%,10285,99.75%
6,12,0.12%,10297,99.86%
5,11,0.11%,10308,99.97%
8,2,0.02%,10310,99.99%


Feature: PAY_4


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5364,52.02%,5364,52.02%
-1,1756,17.03%,7120,69.05%
2,1554,15.07%,8674,84.12%
-2,1441,13.98%,10115,98.10%
3,98,0.95%,10213,99.05%
4,40,0.39%,10253,99.44%
7,35,0.34%,10288,99.78%
5,17,0.16%,10305,99.94%
6,4,0.04%,10309,99.98%
1,1,0.01%,10310,99.99%


Feature: PAY_5


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5581,54.13%,5581,54.13%
-1,1767,17.14%,7348,71.26%
-2,1487,14.42%,8835,85.69%
2,1281,12.42%,10116,98.11%
3,100,0.97%,10216,99.08%
4,45,0.44%,10261,99.52%
7,36,0.35%,10297,99.86%
5,11,0.11%,10308,99.97%
6,2,0.02%,10310,99.99%
8,1,0.01%,10311,100.00%


Feature: PAY_6


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
PAY_6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5394,52.31%,5394,52.31%
-1,1806,17.52%,7200,69.83%
-2,1620,15.71%,8820,85.54%
2,1315,12.75%,10135,98.29%
3,95,0.92%,10230,99.21%
4,32,0.31%,10262,99.52%
7,27,0.26%,10289,99.79%
6,14,0.14%,10303,99.92%
5,6,0.06%,10309,99.98%
8,2,0.02%,10311,100.00%


In [7]:
# NOTE: Probando con tasa de crecimiento
# df[["BILL_AMT1", "BILL_AMT2", "bill_grow_rate_1"]][np.isinf(df["bill_grow_rate_1"])]
# for n, (final_period, initial_period) in enumerate(zip(bill_columns, bill_columns[1:]), 1):
#     df[f"bill_grow_rate_{n}"] = round((df[final_period] - df[initial_period] ) / df[initial_period] * 100)
#     df[f"bill_grow_rate_{n}"] = df[f"bill_grow_rate_{n}"].fillna(0)
#     df[f"bill_grow_rate_{n}"] = df[f"bill_grow_rate_{n}"].replace([np.inf, -np.inf], np.nan)
#     # df[f"bill_grow_rate_{n}"] = df[f"bill_grow_rate_{n}"].astype("int64")

## Generación de features

In [38]:
grouper = GroupNumericalFeatures()
data_grouped_train = grouper.fit_transform(X_train_raw[numerical_features])
new_categorical_features = data_grouped_train.columns.to_list()
X_train = pd.concat([X_train_raw, data_grouped_train], axis=1)

data_grouped_test = grouper.transform(X_test_raw)
X_test = pd.concat([X_test_raw, data_grouped_test], axis=1)

X_train, X_test, preprocessor = transform_data(
    X_train, X_test, numerical_features=numerical_features,
    categorical_features=categorical_features + new_categorical_features)

# X_train, X_test, preprocessor = transform_data(
#     X_train_raw, X_test_raw, numerical_features=numerical_features,
#     categorical_features=categorical_features)



In [39]:
with open("../results/7-Clasificación-clientes/grouper.pickle", "wb") as f:
    pickle.dump(grouper, f)

with open("../results/7-Clasificación-clientes/preprocessor.pickle", "wb") as f:
    pickle.dump(preprocessor, f)

In [9]:
print(X_train.shape)
print(", ".join(X_train.columns))

(7733, 128)
LIMIT_BAL, BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6, PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6, AGE, SEX_2, EDUCATION_2, EDUCATION_3, EDUCATION_4, EDUCATION_5, EDUCATION_6, MARRIAGE_1, MARRIAGE_2, MARRIAGE_3, PAY_0_-1, PAY_0_0, PAY_0_1, PAY_0_2, PAY_0_3, PAY_0_4, PAY_0_5, PAY_0_6, PAY_0_7, PAY_0_8, PAY_2_-1, PAY_2_0, PAY_2_1, PAY_2_2, PAY_2_3, PAY_2_4, PAY_2_5, PAY_2_6, PAY_2_7, PAY_3_-1, PAY_3_0, PAY_3_1, PAY_3_2, PAY_3_3, PAY_3_4, PAY_3_5, PAY_3_6, PAY_3_7, PAY_3_8, PAY_4_-1, PAY_4_0, PAY_4_1, PAY_4_2, PAY_4_3, PAY_4_4, PAY_4_5, PAY_4_6, PAY_4_7, PAY_5_-1, PAY_5_0, PAY_5_2, PAY_5_3, PAY_5_4, PAY_5_5, PAY_5_6, PAY_5_7, PAY_6_-1, PAY_6_0, PAY_6_2, PAY_6_3, PAY_6_4, PAY_6_5, PAY_6_6, PAY_6_7, PAY_6_8, cat_LIMIT_BAL_160000_a_250000, cat_LIMIT_BAL_250000_a_760001, cat_LIMIT_BAL_50000_a_90000, cat_LIMIT_BAL_90000_a_160000, cat_BILL_AMT1_13888_a_35106, cat_BILL_AMT1_1843_a_13888, cat_BILL_AMT1_35106_a_82219, cat_BILL_AMT1_82219_a_613861, 

## Detección y remoción de variables univaridadas
cuando el 90% de un valor se concentre en una clase

In [10]:
dfs_list = [ ]
for c in X_train.columns:
    tmp_count = count_percentage(X_train, c)
    tmp_df = tmp_count[tmp_count["porcentaje"] >= 90]

    if not tmp_df.empty:
        dfs_list.append(create_feature_dataframe(tmp_df, c))

unit_vars = pd.concat(dfs_list).reset_index(drop=True)
unit_vars

Unnamed: 0,feature,category,conteo,porcentaje
0,EDUCATION_4,0.0,7704,99.62
1,EDUCATION_5,0.0,7677,99.28
2,EDUCATION_6,0.0,7721,99.84
3,MARRIAGE_3,0.0,7648,98.9
4,PAY_0_3,0.0,7596,98.23
5,PAY_0_4,0.0,7692,99.47
6,PAY_0_5,0.0,7723,99.87
7,PAY_0_6,0.0,7732,99.99
8,PAY_0_7,0.0,7728,99.94
9,PAY_0_8,0.0,7728,99.94


In [11]:
remove_unit_columns = unit_vars["feature"].to_list()
X_train = X_train.drop(columns=remove_unit_columns)
X_test = X_test.drop(columns=remove_unit_columns)
print(X_train.shape)
print(X_test.shape)

(7733, 88)
(2578, 88)


# Modelos base

In [12]:
models_dict = {
    "knn": KNeighborsClassifier,
    "sdg": SGDClassifier,
    "svc": SVC,
    "logistic_regression": LogisticRegression,
}

results_df = evaluate_models(models_dict, X_train, X_test, y_train, y_test)
results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,model,f1-score-train,std-dev,f1-score-test
0,knn,0.660673,0.0134,0.588669
1,sdg,0.678,0.0204,0.58906
2,svc,0.715502,0.0131,0.604577
3,logistic_regression,0.721191,0.0113,0.63745


# Selección de features

In [13]:
def get_best_features_rfecv(X, y, model, scoring):
    
    rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(5), scoring=scoring)
    rfecv.fit(X, y)

    return X.columns[rfecv.support_].tolist()

In [14]:
model = LogisticRegression(n_jobs=-1)
best_features_f1 = get_best_features_rfecv(X_train, y_train, model, scoring="f1")
len(best_features_f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

66

In [15]:
best_features_recall = get_best_features_rfecv(X_train, y_train, model, scoring="recall")
len(best_features_recall)

3

In [16]:
best_features_micro = get_best_features_rfecv(X_train, y_train, model, scoring="f1_micro")
len(best_features_micro)

21

In [17]:
best_features_roc_auc = get_best_features_rfecv(X_train, y_train, model, scoring="roc_auc")
len(best_features_roc_auc)

48

In [18]:
kbest_df = get_kbest(X_train, pd.DataFrame(y_train), k=25)
features_kbest = kbest_df["Feature"].head(25).values
len(features_kbest)

25

## Logistic Regression con top features

In [19]:
features_dict = {
    "f1": best_features_f1,
    "recall": best_features_recall,
    "f1_micro": best_features_micro,
    "roc_auc": best_features_roc_auc,
    "kbest": features_kbest
}

results_list = []
for score_type, top_features in features_dict.items():
    results =  train_classifier_model(
        X_train[top_features], X_test[top_features],
        y_train, y_test, model=LogisticRegression, verbose=False
    )
    _, _, f1_mean_score_train, std_dev_train, f1_score_test  = results

    results_list.append([
        score_type, f1_mean_score_train, std_dev_train, f1_score_test])

pd.DataFrame(results_list, columns=["score_type", "f1-score-train", "std-dev", "f1-score-test"])

Unnamed: 0,score_type,f1-score-train,std-dev,f1-score-test
0,f1,0.72326,0.012,0.633516
1,recall,0.706836,0.019,0.641162
2,f1_micro,0.72158,0.012,0.620101
3,roc_auc,0.722873,0.0112,0.635908
4,kbest,0.719249,0.0166,0.615946


## LogisticRegression GridSearch

In [22]:
param_grid = {
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "solver": ["newton-cg", "newton-cholesky", "lbfgs", "liblinear", "sag", "saga"],
    "max_iter": [100, 200, 300, 400, 500]
}

features_dict = {
    "f1": best_features_f1,
    "kbest": features_kbest,
    "roc_auc": best_features_roc_auc
}

results_list = []
for score_type, top_features in features_dict.items():
    results =  train_classifier_model(
        X_train[top_features], X_test[top_features], y_train, y_test,
        model=LogisticRegression, param_grid=param_grid, verbose=False
    )
    _, best_params, f1_mean_score_train, std_dev_train, f1_score_test  = results

    results_list.append([score_type, best_params, f1_mean_score_train,
                         std_dev_train, f1_score_test])

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


1440 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to -1.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Mejores hiperparámetros encontrados GridSearchCV:
{'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}




Fitting 3 folds for each of 720 candidates, totalling 2160 fits


1440 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to -1.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Mejores hiperparámetros encontrados GridSearchCV:
{'C': 0.1, 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}
Fitting 3 folds for each of 720 candidates, totalling 2160 fits


1440 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to -1.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Mejores hiperparámetros encontrados GridSearchCV:
{'C': 0.1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'sag'}




In [24]:
pd.DataFrame(results_list, columns=["score_type", "best_params", "f1-score-train", "std-dev", "f1-score-test"]).to_dict(orient="records")#

[{'score_type': 'f1',
  'best_params': {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'},
  'f1-score-train': 0.7236479236238555,
  'std-dev': 0.011,
  'f1-score-test': 0.6328358208955224},
 {'score_type': 'kbest',
  'best_params': {'C': 0.1,
   'max_iter': 200,
   'penalty': 'l1',
   'solver': 'liblinear'},
  'f1-score-train': 0.7196364043576655,
  'std-dev': 0.0179,
  'f1-score-test': 0.6149035956227201},
 {'score_type': 'roc_auc',
  'best_params': {'C': 0.1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'sag'},
  'f1-score-train': 0.7217072648929804,
  'std-dev': 0.0134,
  'f1-score-test': 0.6340725806451613}]

In [61]:
# Definir los modelos base
kwargs_f1 = {"C": 10, "max_iter": 500, "penalty": "l2", "solver": "sag"}
clf1 = LogisticRegression(**kwargs_f1, random_state=42)
clf1.fit(X_train[best_features_f1], y_train)
_ = test_report(clf1, X_test[best_features_f1], y_test, verbose=True)

>>> F1 Macro Score en el conjunto de prueba (test): 0.6328358208955224

Reporte de Clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.71      0.84      0.77      1439
           1       0.73      0.56      0.63      1139

    accuracy                           0.71      2578
   macro avg       0.72      0.70      0.70      2578
weighted avg       0.72      0.71      0.71      2578





In [62]:
with open("../results/7-Clasificación-clientes/model.pickle", "wb") as f:
    pickle.dump(clf1, f)

with open("../results/7-Clasificación-clientes/best_features.pickle", "wb") as f:
    pickle.dump(best_features_f1, f)

In [25]:
# # Definir los modelos base
# kwargs_f1 = {"C": 0.1,
#    "max_iter": 100,
#    "penalty": "l2",
#    "solver": "newton-cholesky"}

# clf1 = LogisticRegression(**kwargs_f1, random_state=42)
# clf1.fit(X_train[best_features_f1], y_train)
# _ = test_report(clf1, X_test[best_features_f1], y_test, verbose=True)

# kwargs_micro = {"C": 1,
#    "max_iter": 100,
#    "penalty": "l2",
#    "solver": "newton-cholesky"
#    }
# clf2 = LogisticRegression(**kwargs_micro, random_state=42)
# clf2.fit(X_train[best_features_micro], y_train)
# _ = test_report(clf2, X_test[best_features_micro], y_test, verbose=True)

# kwargs_roc_auc = {"C": 1, "max_iter": 200, "penalty": "l1", "solver": "saga"}

# clf3 = LogisticRegression(**kwargs_roc_auc, random_state=42)
# clf3.fit(X_train[best_features_roc_auc], y_train)
# _ = test_report(clf3, X_test[best_features_roc_auc], y_test, verbose=True)

In [26]:
# y_predict_1 = clf1.predict(X_train[best_features_f1])
# y_predict_2 = clf2.predict(X_train[best_features_micro])
# y_predict_3 = clf3.predict(X_train[best_features_roc_auc])

# predict_df = pd.DataFrame({
#     "f1": y_predict_1,
#     "roc-auc": y_predict_2,
#     "kbest": y_predict_3
# })

# predict_df["y_pred"] = predict_df.mode(axis=1)

# f1_score(y_train, predict_df["y_pred"], average="macro")

In [27]:
# y_predict_1 = clf1.predict(X_test[best_features_f1])
# y_predict_2 = clf2.predict(X_test[best_features_micro])
# y_predict_3 = clf3.predict(X_test[best_features_roc_auc])

# predict_df = pd.DataFrame({
#     "f1": y_predict_1,
#     "roc-auc": y_predict_2,
#     "kbest": y_predict_3
# })

# predict_df["y_pred"] = predict_df.mode(axis=1)

# f1_score(y_test, predict_df["y_pred"], average="binary")

## KNN

In [28]:
results_list = []
for score_type, top_features in features_dict.items():
    results =  train_classifier_model(
        X_train[top_features], X_test[top_features],
        y_train, y_test, model=KNeighborsClassifier, verbose=False
    )
    _, _, f1_mean_score_train, std_dev_train, f1_score_test  = results

    results_list.append([
        score_type, f1_mean_score_train, std_dev_train, f1_score_test])

pd.DataFrame(results_list, columns=["score_type", "f1-score-train", "std-dev", "f1-score-test"])

Unnamed: 0,score_type,f1-score-train,std-dev,f1-score-test
0,f1,0.668173,0.0089,0.606143
1,kbest,0.675154,0.0136,0.618231
2,roc_auc,0.667782,0.0164,0.592051


## SVC

In [29]:
fresults_list = []
for score_type, top_features in features_dict.items():
    results =  train_classifier_model(
        X_train[top_features], X_test[top_features],
        y_train, y_test, model=SVC, verbose=False
    )
    _, _, f1_mean_score_train, std_dev_train, f1_score_test  = results

    results_list.append([
        score_type, f1_mean_score_train, std_dev_train, f1_score_test])

pd.DataFrame(results_list, columns=["score_type", "f1-score-train", "std-dev", "f1-score-test"])

Unnamed: 0,score_type,f1-score-train,std-dev,f1-score-test
0,f1,0.668173,0.0089,0.606143
1,kbest,0.675154,0.0136,0.618231
2,roc_auc,0.667782,0.0164,0.592051
3,f1,0.722095,0.0134,0.637506
4,kbest,0.714983,0.0148,0.601279
5,roc_auc,0.725459,0.0143,0.637186


## Predicción sobre nuevos datos

In [4]:
numerical_features = [
    "LIMIT_BAL",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4",
    "PAY_AMT5",
    "PAY_AMT6",
    "AGE"
]

categorical_features = [
    "SEX",
    "EDUCATION",
    "MARRIAGE", 
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]

with open("../results/7-Clasificación-clientes/grouper.pickle", "rb") as f:
    grouper = pickle.load(f)

with open("../results/7-Clasificación-clientes/preprocessor.pickle", "rb") as f:
    preprocessor = pickle.load(f)

with open("../results/7-Clasificación-clientes/model.pickle", "rb") as f:
    model = pickle.load(f)

with open("../results/7-Clasificación-clientes/best_features.pickle", "rb") as f:
    best_features = pickle.load(f)

filename = "../data/test_p3.csv"
raw_data = pd.read_csv(filename)
predictions_df = raw_data[["ID"]].copy()

data_grouped = grouper.transform(raw_data[numerical_features])
new_categorical_features = data_grouped.columns.to_list()
X = pd.concat([raw_data, data_grouped], axis=1)

transformed_X = preprocessor.transform(X)

categorical_features = categorical_features + new_categorical_features

onehot_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
categorical_features_encoded = onehot_encoder.get_feature_names_out(
        input_features=categorical_features)
feature_names =  numerical_features + list(categorical_features_encoded)

X_transformed_df = pd.DataFrame.sparse.from_spmatrix(transformed_X, columns=feature_names)
X_transformed_df.head()



Unnamed: 0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,...,cat_PAY_AMT5_2000_a_5000,cat_PAY_AMT5_5000_a_332001,cat_PAY_AMT5_930_a_2000,cat_PAY_AMT6_2000_a_4818,cat_PAY_AMT6_4818_a_443002,cat_PAY_AMT6_873_a_2000,cat_AGE_27_a_31,cat_AGE_31_a_37,cat_AGE_37_a_44,cat_AGE_44_a_76
0,1.176471,-0.258004,-0.223476,-0.352693,-0.374683,-0.366619,-0.373718,0.22686,-0.382998,-0.411719,...,0.0,0.0,0,0.0,1.0,0,0.0,0,0.0,1.0
1,1.529412,-0.046401,-0.003794,-0.3554,-0.332155,-0.28356,-0.258045,4.284029,-0.418572,0.155899,...,0.0,1.0,0,1.0,0.0,0,0.0,0,1.0,0.0
2,0.352941,-0.348216,-0.342801,-0.224951,-0.358457,-0.358188,-0.356195,-0.266561,1.29568,-0.196443,...,0.0,0.0,0,0.0,1.0,0,0.0,0,0.0,1.0
3,-0.058824,1.068604,1.177524,1.215529,1.332194,1.419641,1.410888,1.202359,0.4158,0.372482,...,1.0,0.0,0,1.0,0.0,0,1.0,0,0.0,0.0
4,-0.470588,0.231827,0.290938,0.32898,0.414281,0.454191,0.475882,0.22686,-0.0462,0.110908,...,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,1.0


In [9]:
predictions_df["y_hat"] = model.predict(X_transformed_df[best_features])

filename = "CDD_G24_M2_P3_Cuauhtemoc_Salvador_Bautista_Enciso.csv"
predictions_df.to_csv(filename, index=False)

predictions_df.head()

Unnamed: 0,ID,y_hat
0,21256,0
1,14589,0
2,19988,0
3,21287,1
4,12418,0


In [10]:
predictions_df["y_hat"].value_counts().to_frame()

Unnamed: 0_level_0,count
y_hat,Unnamed: 1_level_1
0,6788
1,2212
