# Oversampling Models Hyperparameters

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Import data 

In [1]:
# Import
import inspect
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import skopt
import lightgbm as lgb
from tensorflow import keras
import tensorflow as tf

# Pre Processing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


# Balancing Classes
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import *
from imblearn.combine import SMOTEENN, SMOTETomek


# Metrics
from sklearn.metrics import *

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

# Ensemble
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

#Hyper para
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Warnings
from sklearn.utils._testing import ignore_warnings

import warnings
warnings.filterwarnings('ignore')


from skopt.space import Real, Integer
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
import wandb
from wandb.integration.xgboost import WandbCallback


# Notebook-wide pandas display settings: render up to 500 rows and 500
# columns, with a display width of 50 characters.
for _option, _value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 50),
):
    pd.set_option(_option, _value)


In [2]:
# df features dtypes
# Column -> dtype mapping passed to pd.read_csv. Every column listed in
# _TYPED_COLUMNS is read as "category" unless it has an entry in
# _DTYPE_OVERRIDES.
_DTYPE_OVERRIDES = {
    "AGE": "float64",
    "TARGET_LABEL_BAD=1": "object",
    "PERSONAL_MONTHLY_INCOME": "float",
}

_TYPED_COLUMNS = [
    "PAYMENT_DAY",
    "APPLICATION_SUBMISSION_TYPE",
    "POSTAL_ADDRESS_TYPE",
    "SEX",
    "MARITAL_STATUS",
    "STATE_OF_BIRTH",
    "CITY_OF_BIRTH",
    "NACIONALITY",
    "RESIDENCIAL_STATE",
    "RESIDENCIAL_CITY",
    "RESIDENCIAL_BOROUGH",
    "FLAG_RESIDENCIAL_PHONE",
    "RESIDENCIAL_PHONE_AREA_CODE",
    "RESIDENCE_TYPE",
    "FLAG_EMAIL",
    "FLAG_VISA",
    "FLAG_MASTERCARD",
    "FLAG_DINERS",
    "FLAG_AMERICAN_EXPRESS",
    "FLAG_OTHER_CARDS",
    "QUANT_BANKING_ACCOUNTS",
    "QUANT_SPECIAL_BANKING_ACCOUNTS",
    "QUANT_CARS",
    "COMPANY",
    "PROFESSIONAL_STATE",
    "FLAG_PROFESSIONAL_PHONE",
    "PROFESSIONAL_PHONE_AREA_CODE",
    "PROFESSION_CODE",
    "OCCUPATION_TYPE",
    "PRODUCT",
    "RESIDENCIAL_ZIP_3",
    "PROFESSIONAL_ZIP_3",
    "QUANT_DEPENDANTS",
    "AGE",
    "total_cards",
    "TARGET_LABEL_BAD=1",
    "MONTHS_IN_RESIDENCE",
    "PERSONAL_MONTHLY_INCOME",
    "OTHER_INCOMES",
    "MONTHS_IN_THE_JOB",
    "PERSONAL_ASSETS_VALUE",
]

df_dtypes = {
    column: _DTYPE_OVERRIDES.get(column, "category")
    for column in _TYPED_COLUMNS
}

In [8]:
# Import DF
# The CSV location can be overridden through the CREDIT_DATA_PATH environment
# variable so the notebook can run on another machine; when the variable is
# not set, the original hard-coded local path is used.
DATA_PATH = os.environ.get(
    "CREDIT_DATA_PATH",
    r"C:\Users\59898\Desktop\proyect\model\data\Clean_data\data01.csv",
)

# The first CSV column becomes the index; column dtypes come from df_dtypes.
df = pd.read_csv(
    DATA_PATH,
    encoding="ISO-8859-1",
    sep=",",
    dtype=df_dtypes,
    index_col=0,
)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Split Data and Pipelines

In [4]:
# Boolean masks over df's columns, selected by the dtype each column holds
column_dtypes = df.dtypes
numerical_mask = column_dtypes == "float64"
categorical_mask = column_dtypes == "category"

# Column-name lists for the numeric and the categorical features
numeric_cols = list(df.columns[numerical_mask])
categorical_cols = list(df.columns[categorical_mask])

In [5]:
# Random seed shared by both splits below
seed = 123

# Separate the label column (cast to int64) from the feature columns
target_col = "TARGET_LABEL_BAD=1"
y = df[target_col].astype("int64")
X = df.loc[:, df.columns != target_col]

# Train / test split: 25% of the rows are held out, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed,
    stratify=y,
)

# Train / validation split: 25% of the remaining training rows, again stratified
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=seed,
    stratify=y_train,
)

In [6]:
# Processing pipeline

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Use whichever keyword the
# installed version accepts so this cell runs on both older and newer releases.
_dense_output_kwarg = (
    "sparse_output"
    if "sparse_output" in inspect.signature(OneHotEncoder).parameters
    else "sparse"
)

# Numeric features: fill NaN with the column median, then apply RobustScaler.
num_transformer = Pipeline(
    steps=[
        ("inputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

# Categorical features: fill NaN with the most frequent value, then one-hot
# encode into a dense array. Binary features keep a single column, and
# categories not seen during fit are ignored at transform time.
cat_transformer = Pipeline(
    steps=[
        ("cat_inputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        (
            "encoder",
            OneHotEncoder(
                drop="if_binary",
                handle_unknown="ignore",
                **{_dense_output_kwarg: False},
            ),
        ),
    ]
)

# Ensemble Transformers: send each column list to its matching pipeline
pre_processor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_cols),
        ("cat", cat_transformer, categorical_cols),
    ],
    verbose_feature_names_out=False,
)

In [7]:
# Apply preprocessor to cat and num cols

# Train: the preprocessor is fitted on the training split only
X_pre_train = pre_processor.fit_transform(X_train)

# Validation and test: transformed with the already-fitted preprocessor
X_pre_val, X_pre_test = (
    pre_processor.transform(split) for split in (X_val, X_test)
)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## SMOTEENN 

We can see that SMOTEENN gives us the best values, so we test that method with our models.

In [8]:
# SMOTEENN X variables
# Resample only the preprocessed training split; validation and test are left untouched.
smoteenn = SMOTEENN(random_state=7)
resampled_train = smoteenn.fit_resample(X_pre_train, y_train)
X_over_smtee, y_over_smotee = resampled_train

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# MLP
# Seed TensorFlow so weight initialisation is repeatable between runs
# (uses the notebook-wide `seed` defined in the split cell).
tf.random.set_seed(seed)

# Take the input width from the resampled training matrix instead of
# hard-coding 352, so the network still builds if the one-hot encoding
# produces a different number of columns.
n_features = X_over_smtee.shape[1]

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(n_features,)),
    keras.layers.Dense(8, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['AUC'],
)

# Train on the SMOTEENN-resampled training data
model.fit(X_over_smtee, y_over_smotee, epochs=25, batch_size=16)

# evaluate() returns the loss followed by the compiled metric (AUC), and it is
# run on the validation split here, so the results are named accordingly.
val_loss, val_auc = model.evaluate(X_pre_val, y_val)

# Legacy names kept as aliases in case another cell still reads them. Despite
# the names, they hold the validation loss and the validation AUC.
test_loss, test_acc = val_loss, val_auc

##  Hyperparameters

In [10]:
# Log in to Weights & Biases. wandb is also imported in the first cell, so
# the import here is a repeat and is harmless.
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfederto[0m ([33manyone-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
# def train_model(): 
#       # Set default configurations (Defaults will be overwritten during sweep)
#   config_defaults = {
      
#       'max_depth': 3, 
#       'min_child_weight': 1,
#       'n_estimators': 1500,
#       'learning_rate' : 0.3,
#       'colsample_bytree' : 0.3,
#       "subsample" : 0.3,
#       "reg_alpha" : 0,
#       "reg_lambda" : 0,
#       "objective" : "binary:logistic",
#       "random_state" : 123
      
#   }

#   # Start W&B
#   wandb.init(config=config_defaults)
#   config = wandb.config

#   # Fit classification model on the resampled train set
#   model = xgb.XGBClassifier(
#       max_depth=config.max_depth, 
#       min_child_weight=config.min_child_weight,
#       n_estimators=config.n_estimators,
#       learning_rate=config.learning_rate,
#       colsample_bytree=config.colsample_bytree,
#       subsample=config.subsample,
#       reg_alpha=config.reg_alpha,
#       reg_lambda=config.reg_lambda,
#       objective=config.objective,
#       seed=config.random_state
#     )
  
#   model.fit(X_over_smtee, y_over_smotee)
#   # Predict on validation and test sets
#   y_pred = model.predict_proba(X_pre_val)
#   y_pred_t = model.predict_proba(X_pre_test)
#   # Evaluate predictions
#   auc_val = roc_auc_score(y_val,  y_pred[:, 1])
#   auc_test = roc_auc_score(y_test,  y_pred_t[:, 1])
#   print(f"AUC: {round(auc_val, 4)}")

#   # Log model performance metrics to W&B
#   wandb.log({"auc_val": auc_val, "auc_test": auc_test})

In [None]:
# def train_model():
#       # Set default configurations (Defaults will be overwritten during sweep)
#   config_defaults = {
      
#       'n_estimators': 1500,
#       'learning_rate' : 0.1,
#       "penalty" : 0,
#       "random_state" : 123
      
#   }

#   # Start W&B
#   wandb.init(config=config_defaults)
#   config = wandb.config

#   # Fit classification model on the resampled train set
#   model = SGDClassifier(
#                         random_state = config.random_state,
#                         loss = "log_loss",
#                         verbose = 0,
#                         max_iter  =config.n_estimators,
#                         penalty = config.penalty,
#                         alpha= config.learning_rate)
  
#   model.fit(X_over_smtee, y_over_smotee)
#   # Predict on validation and test sets
#   y_pred = model.predict_proba(X_pre_val)
#   y_pred_t = model.predict_proba(X_pre_test)
#   # Evaluate predictions
#   auc_val = roc_auc_score(y_val,  y_pred[:, 1])
#   auc_test = roc_auc_score(y_test,  y_pred_t[:, 1])
#   print(f"AUC: {round(auc_val, 4)}")

#   # Log model performance metrics to W&B
#   wandb.log({"auc_val": auc_val, "auc_test": auc_test})

In [12]:
# sweep_configs = { 
#     "method": "bayes",
#     "metric": {
#         "name": "auc_val",
#         "goal": "maximize"
#     },
#     "parameters": {
#         "max_depth": {
#             "values": [2,3,4]
#         },
#         "min_child_weight": {
#             "values": [0,3,5,7,10]
#         },
#         "n_estimators": {
#             "values": [5000,4000,3000]
        
#         },
#         'learning_rate' : {
#             "values" : [0.05, 0.025, 0.01, 0.005]
#         },
#         'colsample_bytree': {
#             "values" : [0.3, 0.5, 0.7, 0.9]
#         },
#         "subsample" : {
#             "values" : [0.3, 0.5, 0.7, 0.9]
#         },
#         "reg_alpha" : {
#             "values" : [0,1,2,3,4,5]
#         },
#         "reg_lambda" : {
#             "values" : [0,1,2,3,4,5]
#         },
#         "random_state"  : {
#             "values" : [123]
#         },
#         "objective" : {
#             "values" : ["binary:logistic"]
#         }
#     }}

In [14]:
# sweep_configs = {
#     "method": "bayes",
#     "metric": {
#         "name": "auc_val",
#         "goal": "maximize"
#     },
#     "parameters": {

#         "penalty": {
#             "values": ["l1", "l2"]
#         },
#         "n_estimators": {
#             "values": [3000, 2500, 2000, 1500]
        
#         },
#         'learning_rate' : {
#             "values" : [0.05, 0.025, 0.01, 0.005]
       
#         }
#     }}

In [None]:
# sweep_id = wandb.sweep(sweep=sweep_configs, project="credit")
# wandb.agent(sweep_id=sweep_id, function=train_model, count=150)