In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LassoCV
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.metrics import precision_score, recall_score, f1_score

In [9]:

df = pd.read_csv(r"../data/final.csv")
features = df.drop('loan_status', axis=1)
target = df['loan_status'].squeeze()

In [3]:
features

Unnamed: 0,gender,married,dependents,education,self_employed,income,loan_amount
0,0,0,2,1,0,96000.0,2990.0
1,0,1,0,0,1,41000.0,1220.0
2,0,1,3,1,0,91000.0,2970.0
3,0,1,3,1,0,82000.0,3070.0
4,0,0,5,0,1,98000.0,2420.0
...,...,...,...,...,...,...,...
4785,1,0,0,1,0,34800.0,71.0
4786,0,1,3,1,0,49272.0,40.0
4787,0,1,1,1,0,96864.0,253.0
4788,0,1,2,1,0,90996.0,187.0


In [4]:
target

0       Approved
1       Rejected
2       Rejected
3       Rejected
4       Rejected
          ...   
4785    Approved
4786    Approved
4787    Approved
4788    Approved
4789    Rejected
Name: loan_status, Length: 4790, dtype: object

In [5]:



X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Función para eliminar outliers
def remove_outliers(df, columns, z_threshold=3):
    columns = [col for col in columns if col in df.columns]
    return df[(np.abs(stats.zscore(df[columns])) < z_threshold).all(axis=1)]

# Definir diferentes conjuntos de características
all_features = features.columns.tolist()
numeric_features = [col for col in ['dependents', 'income', 'loan_amount'] if col in features.columns]
categorical_features = [col for col in ['gender', 'married', 'education', 'self_employed'] if col in features.columns]

feature_sets = {
    'All Features': all_features,
    'Numeric Only': numeric_features,
    'Categorical Only': categorical_features,
    'No Family Size': [col for col in all_features if col != 'Family_Size']
}

# Definir modelos
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Preparar resultados en un DataFrame
results = []

# Iterar a través de diferentes escenarios
for outliers in ['With Outliers', 'Without Outliers']:
    for scaling in ['No Scaling', 'Standardization', 'Normalization']:
        for feature_set_name, features_list in feature_sets.items():
            # Preparar los datos
            X_subset = features[features_list]
            y_subset = target
            
            if outliers == 'Without Outliers':
                X_subset = remove_outliers(X_subset, numeric_features)
                y_subset = y_subset[X_subset.index]
            
            # Dividir los datos en train y test
            X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)
            
            # Aplicar escalado si es necesario
            if scaling == 'Standardization':
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
            elif scaling == 'Normalization':
                scaler = MinMaxScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
            else:
                X_train_scaled = X_train
                X_test_scaled = X_test
            
            # Entrenar y evaluar modelos
            for model_name, model in models.items():
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
                accuracy = accuracy_score(y_test, y_pred)
                
                results.append({
                    'Model': model_name,
                    'Outliers': outliers,
                    'Scaling': scaling,
                    'Feature Set': feature_set_name,
                    'Accuracy': accuracy
                })

# Convertir resultados a DataFrame
results_df = pd.DataFrame(results)

# Imprimir resumen
print(results_df)

# Encontrar el mejor modelo
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest performing model:")
print(best_model)

# Opcional: Guardar los resultados en un CSV
results_df.to_csv('../data/model_comparison_results.csv', index=False)

                   Model          Outliers        Scaling     Feature Set  \
0    Logistic Regression     With Outliers     No Scaling    All Features   
1          Decision Tree     With Outliers     No Scaling    All Features   
2          Random Forest     With Outliers     No Scaling    All Features   
3                    SVM     With Outliers     No Scaling    All Features   
4                    KNN     With Outliers     No Scaling    All Features   
..                   ...               ...            ...             ...   
115  Logistic Regression  Without Outliers  Normalization  No Family Size   
116        Decision Tree  Without Outliers  Normalization  No Family Size   
117        Random Forest  Without Outliers  Normalization  No Family Size   
118                  SVM  Without Outliers  Normalization  No Family Size   
119                  KNN  Without Outliers  Normalization  No Family Size   

     Accuracy  
0    0.616910  
1    0.535491  
2    0.551148  
3    0.6169

In [6]:
#display all rows
pd.set_option('display.max_rows', None)

results_df.sort_values(by = "Accuracy", ascending = False)

Unnamed: 0,Model,Outliers,Scaling,Feature Set,Accuracy
60,Logistic Regression,Without Outliers,No Scaling,All Features,0.635887
98,SVM,Without Outliers,Standardization,No Family Size,0.635887
63,SVM,Without Outliers,No Scaling,All Features,0.635887
65,Logistic Regression,Without Outliers,No Scaling,Numeric Only,0.635887
68,SVM,Without Outliers,No Scaling,Numeric Only,0.635887
75,Logistic Regression,Without Outliers,No Scaling,No Family Size,0.635887
78,SVM,Without Outliers,No Scaling,No Family Size,0.635887
80,Logistic Regression,Without Outliers,Standardization,All Features,0.635887
85,Logistic Regression,Without Outliers,Standardization,Numeric Only,0.635887
88,SVM,Without Outliers,Standardization,Numeric Only,0.635887


In [4]:

# Supongamos que tenemos una función para eliminar outliers
def remove_outliers(df, threshold=3):
    # Eliminar filas con valores fuera del rango del umbral de desviación estándar
    return df[(np.abs(df - df.mean()) / df.std()) < threshold].dropna()

# Definir los modelos
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Bagging with Decision Tree': BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(),
    'Pasting': BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, bootstrap=False, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, random_state=42)
}

# Definir escaladores
scalers = {
    'No Scaling': None,
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler()
}

# Definir diferentes conjuntos de características
feature_sets = {
    'All Features': features,
    'No Family Size': features.drop(columns=['dependents']) # Cambia esto según el nombre exacto de la columna
}

# Inicializar lista para almacenar los resultados
results = []

# Loop para entrenar modelos con diferentes combinaciones
for outliers_option in ['With Outliers', 'Without Outliers']:
    # Eliminar outliers si corresponde
    data = features.copy()
    if outliers_option == 'Without Outliers':
        data = remove_outliers(data)
    
    # Obtener las etiquetas (target) correspondientes
    target_data = target.loc[data.index]

    for scaling_option, scaler in scalers.items():
        # Escalar los datos si corresponde
        scaled_data = data.copy()
        if scaler:
            scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=data.columns)
        
        for feature_set_name, feature_set in feature_sets.items():
            # Usar las características correspondientes
            if feature_set_name == 'All Features':
                X = scaled_data
            else:
                X = scaled_data[feature_set.columns]

            # Dividir el conjunto de datos
            X_train, X_test, y_train, y_test = train_test_split(X, target_data, test_size=0.2, random_state=42)

            # Entrenar y evaluar cada modelo
            for model_name, model in models.items():
                # Entrenar el modelo
                model.fit(X_train, y_train)

                # Hacer predicciones
                y_pred = model.predict(X_test)

                # Calcular la precisión
                accuracy = accuracy_score(y_test, y_pred)

                # Almacenar los resultados
                results.append({
                    'Model': model_name,
                    'Outliers': outliers_option,
                    'Scaling': scaling_option,
                    'Feature Set': feature_set_name,
                    'Accuracy': accuracy
                })

# Convertir la lista de resultados a un DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
# Mostrar los resultados
results_df




Unnamed: 0,Model,Outliers,Scaling,Feature Set,Accuracy
0,Logistic Regression,Without Outliers,No Scaling,All Features,0.635887
1,SVM,Without Outliers,Standardization,No Family Size,0.635887
2,Logistic Regression,Without Outliers,No Scaling,No Family Size,0.635887
3,SVM,Without Outliers,No Scaling,No Family Size,0.635887
4,Logistic Regression,Without Outliers,Standardization,All Features,0.635887
...,...,...,...,...,...
91,Pasting,With Outliers,Standardization,All Features,0.528184
92,Random Forest,With Outliers,Standardization,No Family Size,0.527140
93,Pasting,Without Outliers,Normalization,No Family Size,0.517314
94,Pasting,Without Outliers,No Scaling,No Family Size,0.516264


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Definir el espacio de hiperparámetros para Logistic Regression
logistic_params = {
    'logistic__C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],  # Regularización
    'logistic__solver': ['liblinear', 'saga']  # Algoritmos de optimización
}

# Definir el espacio de hiperparámetros para SVM
svm_params = {
    'svm__C': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],  # Regularización
    'svm__kernel': ['linear', 'rbf']  # Tipos de kernel
}

# Definir los modelos en un Pipeline
logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Escalado
    ('logistic', LogisticRegression(max_iter=1000))
])

svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Escalado
    ('svm', SVC())
])

# Grid Search para Logistic Regression
logistic_grid = GridSearchCV(logistic_pipeline, logistic_params, cv=5, scoring='accuracy')
logistic_grid.fit(X_train, y_train)

# Imprimir los mejores parámetros y la precisión
print(f"Mejores parámetros para Logistic Regression: {logistic_grid.best_params_}")
print(f"Mejor accuracy para Logistic Regression: {logistic_grid.best_score_:.4f}")

# Grid Search para SVM
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=5, scoring='accuracy')
svm_grid.fit(X_train, y_train)

# Imprimir los mejores parámetros y la precisión
print(f"Mejores parámetros para SVM: {svm_grid.best_params_}")
print(f"Mejor accuracy para SVM: {svm_grid.best_score_:.4f}")

# Evaluar en el conjunto de prueba
logistic_best = logistic_grid.best_estimator_
svm_best = svm_grid.best_estimator_

y_pred_logistic = logistic_best.predict(X_test)
y_pred_svm = svm_best.predict(X_test)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(f"Accuracy en el conjunto de prueba para Logistic Regression: {accuracy_logistic:.4f}")
print(f"Accuracy en el conjunto de prueba para SVM: {accuracy_svm:.4f}")


Mejores parámetros para Logistic Regression: {'logistic__C': 0.1, 'logistic__solver': 'liblinear'}
Mejor accuracy para Logistic Regression: 0.6282
Mejores parámetros para SVM: {'svm__C': 0.05, 'svm__kernel': 'linear'}
Mejor accuracy para SVM: 0.6282
Accuracy en el conjunto de prueba para Logistic Regression: 0.6359
Accuracy en el conjunto de prueba para SVM: 0.6359


In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline

# Definir otros modelos para probar
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Entrenar y evaluar los nuevos modelos
results = []
for model_name, model in models.items():
    # Definir un pipeline con escalado
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = pipeline.predict(X_test)
    
    # Calcular la precisión
    accuracy = accuracy_score(y_test, y_pred)
    
    # Almacenar los resultados
    results.append({
        'Model': model_name,
        'Accuracy': accuracy
    })

# Ahora, usando Voting Classifier
voting_model = VotingClassifier(estimators=[
    ('logistic', LogisticRegression(C=0.1, solver='liblinear', max_iter=1000)),
    ('svm', SVC(C=0.1, kernel='linear')),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
], voting='hard')

# Entrenar el Voting Classifier
voting_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_voting = voting_model.predict(X_test)

# Calcular la precisión
voting_accuracy = accuracy_score(y_test, y_pred_voting)

# Almacenar los resultados del Voting Classifier
results.append({
    'Model': 'Voting Classifier',
    'Accuracy': voting_accuracy
})

# Convertir los resultados a DataFrame y ordenarlos
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)

# Mostrar los resultados
results_df

Unnamed: 0,Model,Accuracy
2,Voting Classifier,0.635887
1,Gradient Boosting,0.623295
0,Random Forest,0.558237


In [11]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Codificar la variable objetivo
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)  # target_data es tu variable objetivo original


# Dividir el conjunto de datos
X_train, X_test, y_train, y_test = train_test_split(features, y_encoded, test_size=0.2, random_state=42)

# Definir el modelo XGBoost
xgb_model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)

# Entrenar el modelo
xgb_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_xgb = xgb_model.predict(X_test)

# Calcular la precisión
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

# Mostrar la precisión
print(f'Accuracy de XGBoost: {xgb_accuracy:.4f}')


Accuracy de XGBoost: 0.5637


In [9]:
# Importar las librerías necesarias
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer  # Puedes reemplazar esto con tu propio conjunto de datos



# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Crear el modelo LightGBM con los hiperparámetros adecuados
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',  # Clasificación binaria
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    n_estimators=100
)

# Entrenar el modelo manualmente con parada temprana
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='binary_error'
)

# Predecir los valores del conjunto de prueba
y_pred = model.predict(X_test)

# Calcular la precisión (accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy del modelo LightGBM: {accuracy:.2f}')


[LightGBM] [Info] Number of positive: 1404, number of negative: 2428
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 524
[LightGBM] [Info] Number of data points in the train set: 3832, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.366388 -> initscore=-0.547743
[LightGBM] [Info] Start training from score -0.547743
Accuracy del modelo LightGBM: 0.57


In [3]:
# Importar las librerías necesarias
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Normalizar las características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Definir el modelo CatBoost
catboost_model = CatBoostClassifier(
    iterations=1000,  # Número máximo de iteraciones
    learning_rate=0.1,
    depth=6,  # Profundidad del árbol
    verbose=0,  # Silenciar el output de entrenamiento
    random_seed=42
)

# Definir los parámetros para Grid Search
catboost_params = {
    'iterations': [500, 1000],  # Puedes ajustar esto
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10]
}

# Grid Search para CatBoost
catboost_grid = GridSearchCV(catboost_model, catboost_params, cv=5, scoring='accuracy')
catboost_grid.fit(X_train, y_train)

# Imprimir los mejores parámetros y la precisión
print(f"Mejores parámetros para CatBoost: {catboost_grid.best_params_}")
print(f"Mejor accuracy para CatBoost: {catboost_grid.best_score_:.4f}")

# Evaluar en el conjunto de prueba
catboost_best = catboost_grid.best_estimator_
y_pred_catboost = catboost_best.predict(X_test)

# Calcular y mostrar la precisión
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"Accuracy en el conjunto de prueba para CatBoost: {accuracy_catboost:.4f}")

# Imprimir un informe de clasificación
print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred_catboost))


Mejores parámetros para CatBoost: {'depth': 6, 'iterations': 500, 'learning_rate': 0.01}
Mejor accuracy para CatBoost: 0.6370
Accuracy en el conjunto de prueba para CatBoost: 0.6096

Informe de clasificación:
              precision    recall  f1-score   support

    Approved       0.62      0.97      0.75       591
    Rejected       0.36      0.02      0.05       367

    accuracy                           0.61       958
   macro avg       0.49      0.50      0.40       958
weighted avg       0.52      0.61      0.48       958



In [6]:
# Supongamos que tenemos una función para eliminar outliers
def remove_outliers(df, threshold=3):
    # Eliminar filas con valores fuera del rango del umbral de desviación estándar
    return df[(np.abs(df - df.mean()) / df.std()) < threshold].dropna()

# Definir los modelos
models = CatBoostClassifier(
            depth=6,
            iterations=500,
            learning_rate=0.01,
            class_weights={0: 1, 1: 3},  # Ajusta según el desbalance en tus datos
            eval_metric='F1',
            verbose=False
        )

# Definir escaladores
scalers = {
    'No Scaling': None,
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler()
}

# Definir diferentes conjuntos de características
feature_sets = {
    'All Features': features,
    'No Family Size': features.drop(columns=['dependents']) # Cambia esto según el nombre exacto de la columna
}

# Inicializar lista para almacenar los resultados
results = []

# Loop para entrenar modelos con diferentes combinaciones
for outliers_option in ['With Outliers', 'Without Outliers']:
    # Eliminar outliers si corresponde
    data = features.copy()
    if outliers_option == 'Without Outliers':
        data = remove_outliers(data)
    
    # Obtener las etiquetas (target) correspondientes
    target_data = target.loc[data.index]

    for scaling_option, scaler in scalers.items():
        # Escalar los datos si corresponde
        scaled_data = data.copy()
        if scaler:
            scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=data.columns)
        
        for feature_set_name, feature_set in feature_sets.items():
            # Usar las características correspondientes
            if feature_set_name == 'All Features':
                X = scaled_data
            else:
                X = scaled_data[feature_set.columns]

            # Dividir el conjunto de datos
            X_train, X_test, y_train, y_test = train_test_split(X, target_data, test_size=0.2, random_state=42)

            # Entrenar y evaluar cada modelo
            for model_name, model in models.items():
                # Entrenar el modelo
                model.fit(X_train, y_train)

                # Hacer predicciones
                y_pred = model.predict(X_test)

                # Calcular la precisión
                accuracy = accuracy_score(y_test, y_pred)

                # Almacenar los resultados
                results.append({
                    'Model': model_name,
                    'Outliers': outliers_option,
                    'Scaling': scaling_option,
                    'Feature Set': feature_set_name,
                    'Accuracy': accuracy
                })

# Convertir la lista de resultados a un DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
# Mostrar los resultados
results_df

AttributeError: 'CatBoostClassifier' object has no attribute 'items'

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Supongamos que tenemos una función para eliminar outliers
def remove_outliers(df, threshold=3):
    return df[(np.abs(df - df.mean()) / df.std()) < threshold].dropna()

# Convertir etiquetas a numéricas
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)  # Asegúrate de que 'target' sea tu serie de etiquetas original

# Definir el modelo
model = CatBoostClassifier(
    iterations=1000,  # Número máximo de iteraciones
    learning_rate=0.1,
    depth=6,  # Profundidad del árbol
    verbose=0,  # Silenciar el output de entrenamiento
    random_seed=42
)

# Definir escaladores
scalers = {
    'No Scaling': None,
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler()
}

# Definir diferentes conjuntos de características
feature_sets = {
    'All Features': features,
    'No Family Size': features.drop(columns=['dependents'])  # Cambia esto según el nombre exacto de la columna
}

# Inicializar lista para almacenar los resultados
results = []

# Loop para entrenar modelos con diferentes combinaciones
for outliers_option in ['With Outliers', 'Without Outliers']:
    # Eliminar outliers si corresponde
    data = features.copy()
    if outliers_option == 'Without Outliers':
        data = remove_outliers(data)
    
    # Obtener las etiquetas (target) correspondientes
    target_data = target_encoded[features.index.intersection(data.index)]  # Asegúrate de que las etiquetas correspondan

    for scaling_option, scaler in scalers.items():
        # Escalar los datos si corresponde
        scaled_data = data.copy()
        if scaler:
            scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=data.columns)
        
        for feature_set_name, feature_set in feature_sets.items():
            # Usar las características correspondientes
            if feature_set_name == 'All Features':
                X = scaled_data
            else:
                X = scaled_data[feature_set.columns]

            # Dividir el conjunto de datos
            X_train, X_test, y_train, y_test = train_test_split(X, target_data, test_size=0.2, random_state=42)

            # Entrenar y evaluar el modelo
            model.fit(X_train, y_train)

            # Hacer predicciones
            y_pred = model.predict(X_test)

            # Calcular la precisión
            accuracy = accuracy_score(y_test, y_pred)

            # Almacenar los resultados
            results.append({
                'Model': 'CatBoost',
                'Outliers': outliers_option,
                'Scaling': scaling_option,
                'Feature Set': feature_set_name,
                'Accuracy': accuracy
            })

# Convertir la lista de resultados a un DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Mostrar los resultados
results_df

Unnamed: 0,Model,Outliers,Scaling,Feature Set,Accuracy
0,CatBoost,Without Outliers,No Scaling,All Features,0.572928
1,CatBoost,Without Outliers,Standardization,All Features,0.572928
2,CatBoost,Without Outliers,Normalization,All Features,0.572928
3,CatBoost,Without Outliers,No Scaling,No Family Size,0.567681
4,CatBoost,Without Outliers,Standardization,No Family Size,0.567681
5,CatBoost,Without Outliers,Normalization,No Family Size,0.567681
6,CatBoost,With Outliers,No Scaling,All Features,0.566806
7,CatBoost,With Outliers,Standardization,All Features,0.566806
8,CatBoost,With Outliers,Normalization,All Features,0.566806
9,CatBoost,With Outliers,No Scaling,No Family Size,0.556367


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Supongamos que tenemos una función para eliminar outliers
def remove_outliers(df, threshold=3):
    return df[(np.abs(df - df.mean()) / df.std()) < threshold].dropna()

# Convertir etiquetas a numéricas
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)  # Asegúrate de que 'target' sea tu serie de etiquetas original

# Definir el modelo base
model = CatBoostClassifier(eval_metric='F1', verbose=False)

# Definir el espacio de hiperparámetros para RandomizedSearchCV
param_dist = {
    'depth': [4, 6, 8, 10],
    'iterations': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 10],
    'class_weights': [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 3}]  # Ajusta según el desbalance en tus datos
}

# Definir escaladores
scalers = {
    'No Scaling': None,
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler()
}

# Definir diferentes conjuntos de características
feature_sets = {
    'All Features': features,
    'No Family Size': features.drop(columns=['dependents'])  # Cambia esto según el nombre exacto de la columna
}

# Inicializar lista para almacenar los resultados
results = []

# Loop para entrenar modelos con diferentes combinaciones
for outliers_option in ['With Outliers', 'Without Outliers']:
    # Eliminar outliers si corresponde
    data = features.copy()
    if outliers_option == 'Without Outliers':
        data = remove_outliers(data)

    # Obtener las etiquetas (target) correspondientes
    target_data = target_encoded[features.index.intersection(data.index)]  # Asegúrate de que las etiquetas correspondan

    for scaling_option, scaler in scalers.items():
        # Escalar los datos si corresponde
        scaled_data = data.copy()
        if scaler:
            scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=data.columns)

        for feature_set_name, feature_set in feature_sets.items():
            # Usar las características correspondientes
            if feature_set_name == 'All Features':
                X = scaled_data
            else:
                X = scaled_data[feature_set.columns]

            # Dividir el conjunto de datos
            X_train, X_test, y_train, y_test = train_test_split(X, target_data, test_size=0.2, random_state=42)

            # Configurar RandomizedSearchCV
            random_search = RandomizedSearchCV(
                model,
                param_distributions=param_dist,
                n_iter=50,  # Número de combinaciones a probar
                scoring='accuracy',
                cv=3,
                verbose=0,
                random_state=42
            )

            # Entrenar y evaluar el modelo
            random_search.fit(X_train, y_train)

            # Hacer predicciones
            y_pred = random_search.predict(X_test)

            # Calcular la precisión
            accuracy = accuracy_score(y_test, y_pred)

            # Almacenar los resultados
            results.append({
                'Model': 'CatBoost',
                'Outliers': outliers_option,
                'Scaling': scaling_option,
                'Feature Set': feature_set_name,
                'Best Parameters': random_search.best_params_,
                'Accuracy': accuracy
            })

# Convertir la lista de resultados a un DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Mostrar los resultados
results_df

KeyboardInterrupt: 