<a href="https://www.kaggle.com/code/cristobalchavez/xgboost-drop-columns-and-tuning-with-optuna?scriptVersionId=125561649" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error,  make_scorer, roc_auc_score
# Common prefix prepended to every path below (empty here, so the paths are absolute).
prepath = ""
# Competition train/test files, the original dataset, and the output folder
# (the trailing slash matters: file names are appended by plain concatenation).
trainpath = f"{prepath}/kaggle/input/playground-series-s3e11/train.csv"
testpath = f"{prepath}/kaggle/input/playground-series-s3e11/test.csv"
originalpath = f"{prepath}/kaggle/input/media-campaign-cost-prediction/train_dataset.csv"
outputpath = f"{prepath}/kaggle/working/playground-series-s3e11/"

In [None]:
# Load the training CSV and discard the "id" column in one chained expression.
data = pd.read_csv(trainpath).drop(columns=["id"])
# Last expression of the cell: shows the first rows in the notebook output.
data.head()

In [None]:
# All columns of the training frame; every column except the last one is
# treated as a feature (the last column is used as the target elsewhere).
features_target = data.columns
features = features_target[:-1].tolist()

# Pairwise correlation between all columns.
corr = data[features_target].corr()

# Boolean mask covering the upper triangle (diagonal included) so each
# pair is drawn only once in the heatmap.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

sns.heatmap(
    corr,
    mask=mask,
    cmap="coolwarm",
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)

In [None]:
# Load the original dataset (the CSV at `originalpath`); it is merged into
# the training rows in a later cell.
original_df = pd.read_csv(filepath_or_buffer=originalpath)
# original_df.head()

In [None]:
# Hold out 30% of the generated data; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[features], data["cost"], test_size=0.30, random_state=21)
# Re-attach the target to the training features so the original dataset
# (features + "cost") can be appended row-wise.
train = pd.concat([x_train, y_train], axis=1)
# ignore_index avoids duplicate index labels coming from the two sources.
train2 = pd.concat([train, original_df], ignore_index=True)
# Shuffle so original and generated rows are interleaved. The shuffle is
# seeded (same seed as the split) so that the merged training set, and
# everything tuned on it, is reproducible between runs.
train2 = train2.sample(frac=1, random_state=21)
x_train_2 = train2[features]
y_train_2 = train2["cost"]

In [None]:
# Baseline: XGBoost with library-default tree parameters. Up to 1000 trees,
# stopping once the RMSLE on the evaluation set has not improved for 20 rounds.
reg_default = xgb.XGBRegressor(
    verbosity=0,
    eval_metric="rmsle",
    early_stopping_rounds=20,
    n_estimators=1000,
)
# The hold-out split doubles as the early-stopping evaluation set.
reg_default.fit(x_train, y_train, eval_set=[(x_test, y_test)])
print(reg_default)

In [None]:
y_pred_test_default = reg_default.predict(x_test).flatten()
# RMSLE of the default-parameter model on the hold-out split.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while
# np.sqrt(MSLE) gives the same value on every version.
print(np.sqrt(mean_squared_log_error(y_test, y_pred_test_default)))
# xgb.plot_importance(reg_default, importance_type='weight')
xgb.plot_importance(reg_default, importance_type='gain')
# xgb.plot_importance(reg_default, importance_type='cover')

In [None]:
# Feature importance of the default model measured by "cover".
xgb.plot_importance(booster=reg_default, importance_type="cover")

In [None]:
# Hyper-parameters carried over from a previous notebook's search.
params = dict(
    n_estimators=10000,
    max_depth=11,
    learning_rate=0.01,
    gamma=22,
    min_child_weight=20,
    reg_lambda=0,
    eval_metric="rmsle",
    early_stopping_rounds=20,
    objective="reg:squarederror",
    verbosity=1,
    subsample=0.8,
)

# Fit on the generated training split; the hold-out split is the
# early-stopping evaluation set.
reg_prev = xgb.XGBRegressor(**params)
reg_prev.fit(x_train, y_train, eval_set=[(x_test, y_test)])
print(reg_prev)

In [None]:
y_pred_test_prev = reg_prev.predict(x_test).flatten()
# RMSLE of the previous-notebook parameters on the hold-out split.
# It must be printed: a notebook only displays a cell's *last* expression,
# which here is the importance plot, so a bare call would be discarded.
# np.sqrt(MSLE) replaces `squared=False` (removed in scikit-learn 1.6).
print(np.sqrt(mean_squared_log_error(y_test, y_pred_test_prev)))
# xgb.plot_importance(reg_prev, importance_type='weight')
xgb.plot_importance(reg_prev, importance_type='gain')

In [None]:
# Feature importance of the previous-parameters model measured by "cover".
xgb.plot_importance(booster=reg_prev, importance_type="cover")

In [None]:
# Features judged to have low average gain; they are removed from every
# feature frame used from here on.
dropping = [
    "low_fat", "recyclable_package", "units_per_case", "gross_weight",
    "store_sales(in millions)", "unit_sales(in millions)",
]
# `drop` returns new frames, so the un-dropped originals stay available.
x_train_dropped, x_test_dropped, x_train_2_dropped = (
    frame.drop(columns=dropping) for frame in (x_train, x_test, x_train_2)
)

In [None]:
# Find good parameters with optuna and original+generated data
import optuna
from sklearn.model_selection import cross_val_score
def _root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as sqrt(MSLE) rather than via `squared=False`, because that
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


# Scorer for cross-validation. greater_is_better=False makes the scorer
# return the *negated* RMSLE, so larger scores are better.
rmsle = make_scorer(_root_mean_squared_log_error, greater_is_better=False)
def objective(trial):
    """Optuna objective: mean 5-fold CV score on original+generated data.

    Samples one XGBoost configuration from `trial`, fits it on each of five
    consecutive (unshuffled) folds of `x_train_2_dropped` / `y_train_2`, and
    scores every fold with the module-level `rmsle` scorer.

    Early stopping is evaluated on the fold's own validation part. The
    hold-out split (`x_test_dropped`, `y_test`) is deliberately NOT used
    here, so it stays unseen by the hyper-parameter search. Running the
    folds by hand also avoids `cross_val_score(fit_params=...)`, which was
    removed in scikit-learn 1.6.

    Args:
        trial: the `optuna` trial used to sample hyper-parameters.

    Returns:
        Mean of the five fold scores as a float. The scorer negates RMSLE,
        so the value is <= 0 and the study must maximize it.
    """
    # Local import keeps this block self-contained.
    from sklearn.model_selection import KFold

    param = {
        "objective": 'reg:squarederror',
        "eval_metric": "rmsle",
        # NOTE(review): needs a CUDA GPU; XGBoost 2.0 deprecates 'gpu_hist'
        # in favour of tree_method='hist' with device='cuda'.
        'tree_method': 'gpu_hist',
        "n_estimators": 1000,
        "early_stopping_rounds": 10,
        "verbosity": 0,
        # Optuna names "lambda"/"alpha" end up in study.best_params; XGBoost
        # accepts them as aliases of reg_lambda/reg_alpha.
        "reg_lambda": trial.suggest_float("lambda", 1e-5, 100),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same distribution.
        "reg_alpha": trial.suggest_float("alpha", 1e-5, 100, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.5),
        "gamma": trial.suggest_float("gamma", 0, 30),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 150),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
    }

    fold_scores = []
    # KFold(n_splits=5) is what cross_val_score(cv=5) uses for a regressor.
    for fit_idx, valid_idx in KFold(n_splits=5).split(x_train_2_dropped):
        # Positional indexing: the shuffled frame's labels are not ordered.
        x_fit = x_train_2_dropped.iloc[fit_idx]
        y_fit = y_train_2.iloc[fit_idx]
        x_valid = x_train_2_dropped.iloc[valid_idx]
        y_valid = y_train_2.iloc[valid_idx]

        model = xgb.XGBRegressor(**param)
        model.fit(x_fit, y_fit, eval_set=[(x_valid, y_valid)], verbose=False)
        fold_scores.append(rmsle(model, x_valid, y_valid))

    return float(np.mean(fold_scores))
# The objective returns a score where larger is better, hence "maximize".
study = optuna.create_study(
    study_name="original_data_dropped_columns_1",
    direction="maximize",
)
study.optimize(objective, n_trials=300)


In [None]:
# Pull the winning trial's sampled parameters and its objective value
# out of the first study, then report both.
best_params, best_score = study.best_params, study.best_value
print(f"Best score: {best_score}\n")
print(f"Optimized parameters: {best_params}\n")

In [None]:
# Final fit with the tuned parameters on original+generated data.
# Fixed settings come first; the tuned values from `best_params` are
# unpacked last so they take precedence on any shared key.
params = {
    "objective": 'reg:squarederror',
    "eval_metric": "rmsle",
    # 'tree_method': 'gpu_hist',
    "n_estimators": 1000,
    "early_stopping_rounds": 20,
    **best_params,
}
reg = xgb.XGBRegressor(**params)
# The hold-out split is the early-stopping evaluation set.
reg.fit(
    x_train_2_dropped,
    y_train_2,
    eval_set=[(x_test_dropped, y_test)],
)
print(reg)

In [None]:
y_pred_test = reg.predict(x_test_dropped).flatten()
# Hold-out RMSLE of `reg`. np.sqrt(MSLE) replaces `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred_test))

In [None]:
# Same tuned parameters as above, but fitted on the generated training
# split only (no original-dataset rows).
# Fixed settings first; `best_params` is unpacked last so it wins on any
# shared key.
params = {
    "objective": 'reg:squarederror',
    "eval_metric": "rmsle",
    # 'tree_method': 'gpu_hist',
    "n_estimators": 1000,
    "early_stopping_rounds": 20,
    **best_params,
}
reg2 = xgb.XGBRegressor(**params)
# The hold-out split is the early-stopping evaluation set.
reg2.fit(
    x_train_dropped,
    y_train,
    eval_set=[(x_test_dropped, y_test)],
)
print(reg2)

In [None]:
y_pred_test_2 = reg2.predict(x_test_dropped).flatten()
# Hold-out RMSLE of `reg2`. np.sqrt(MSLE) replaces `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred_test_2))

In [None]:
# Load the rows that predictions will be written for.
submit_df = pd.read_csv(filepath_or_buffer=testpath)
# Last expression of the cell: shows the first five rows.
submit_df.head(n=5)

In [None]:
import os

# Keep the ids aside; they are written next to the predictions.
submit_id = submit_df["id"]
# Apply the same feature selection as in training: the feature columns,
# minus the dropped low-gain ones.
submit_dropped = submit_df[features].drop(columns=dropping)
# Predict the cost of the submission data with the model fitted on
# original+generated data.
y_pred_submit = reg.predict(submit_dropped).flatten()
# The predicted quantity is the "cost" target, so the column is named
# accordingly instead of "Class".
# NOTE(review): confirm the header against the competition's sample submission.
submit_final = pd.DataFrame({"id": submit_id, "cost": y_pred_submit})
# Save prediction; create the output folder first, because to_csv fails
# when the target directory does not exist.
os.makedirs(outputpath, exist_ok=True)
submit_final.to_csv(outputpath+"submission3.csv", index=False)

In [None]:
import os

# Predictions of the model fitted on generated data only.
y_pred_submit = reg2.predict(submit_dropped).flatten()
# The predicted quantity is the "cost" target, so the column is named
# accordingly instead of "Class".
# NOTE(review): confirm the header against the competition's sample submission.
submit_final = pd.DataFrame({"id": submit_id, "cost": y_pred_submit})
# Save prediction; create the output folder first, because to_csv fails
# when the target directory does not exist.
os.makedirs(outputpath, exist_ok=True)
submit_final.to_csv(outputpath+"submission4.csv", index=False)

In [None]:
# Find good parameters with optuna using only generated data
import optuna
from sklearn.model_selection import cross_val_score
def _root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as sqrt(MSLE) rather than via `squared=False`, because that
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


# Scorer for cross-validation. greater_is_better=False makes the scorer
# return the *negated* RMSLE, so larger scores are better.
rmsle = make_scorer(_root_mean_squared_log_error, greater_is_better=False)
def objective(trial):
    """Optuna objective: mean 5-fold CV score on generated data only.

    Samples one XGBoost configuration from `trial`, fits it on each of five
    consecutive (unshuffled) folds of `x_train_dropped` / `y_train`, and
    scores every fold with the module-level `rmsle` scorer.

    Early stopping is evaluated on the fold's own validation part. The
    hold-out split (`x_test_dropped`, `y_test`) is deliberately NOT used
    here, so it stays unseen by the hyper-parameter search. Running the
    folds by hand also avoids `cross_val_score(fit_params=...)`, which was
    removed in scikit-learn 1.6.

    Args:
        trial: the `optuna` trial used to sample hyper-parameters.

    Returns:
        Mean of the five fold scores as a float. The scorer negates RMSLE,
        so the value is <= 0 and the study must maximize it.
    """
    # Local import keeps this block self-contained.
    from sklearn.model_selection import KFold

    param = {
        "objective": 'reg:squarederror',
        "eval_metric": "rmsle",
        # NOTE(review): needs a CUDA GPU; XGBoost 2.0 deprecates 'gpu_hist'
        # in favour of tree_method='hist' with device='cuda'.
        'tree_method': 'gpu_hist',
        "n_estimators": 1000,
        "early_stopping_rounds": 10,
        "verbosity": 0,
        # Optuna names "lambda"/"alpha" end up in study2.best_params; XGBoost
        # accepts them as aliases of reg_lambda/reg_alpha.
        "reg_lambda": trial.suggest_float("lambda", 0, 100),
        "reg_alpha": trial.suggest_float("alpha", 0, 100),
        "max_depth": trial.suggest_int("max_depth", 8, 25),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.5, log=True),
        "gamma": trial.suggest_float("gamma", 0, 30),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
    }

    fold_scores = []
    # KFold(n_splits=5) is what cross_val_score(cv=5) uses for a regressor.
    for fit_idx, valid_idx in KFold(n_splits=5).split(x_train_dropped):
        # Positional indexing: the split frame's labels are not contiguous.
        x_fit = x_train_dropped.iloc[fit_idx]
        y_fit = y_train.iloc[fit_idx]
        x_valid = x_train_dropped.iloc[valid_idx]
        y_valid = y_train.iloc[valid_idx]

        model = xgb.XGBRegressor(**param)
        model.fit(x_fit, y_fit, eval_set=[(x_valid, y_valid)], verbose=False)
        fold_scores.append(rmsle(model, x_valid, y_valid))

    return float(np.mean(fold_scores))
# The objective returns a score where larger is better, hence "maximize".
study2 = optuna.create_study(
    study_name="dropped_columns_2",
    direction="maximize",
)
study2.optimize(objective, n_trials=300)


In [None]:
# Pull the winning trial's sampled parameters and its objective value
# out of the second study, then report both. This rebinds `best_params`
# and `best_score` from the first study.
best_params, best_score = study2.best_params, study2.best_value
print(f"Best score: {best_score}\n")
print(f"Optimized parameters: {best_params}\n")

In [None]:
# Fit on the generated training split with the current `best_params`.
# Fixed settings first; `best_params` is unpacked last so it wins on any
# shared key.
params = {
    "objective": 'reg:squarederror',
    "eval_metric": "rmsle",
    # 'tree_method': 'gpu_hist',
    "n_estimators": 1000,
    "early_stopping_rounds": 20,
    **best_params,
}
reg3 = xgb.XGBRegressor(**params)
# The hold-out split is the early-stopping evaluation set.
reg3.fit(
    x_train_dropped,
    y_train,
    eval_set=[(x_test_dropped, y_test)],
)
print(reg3)

In [None]:
y_pred_test_3 = reg3.predict(x_test_dropped).flatten()
# Hold-out RMSLE of `reg3`. np.sqrt(MSLE) replaces `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred_test_3))

In [None]:
import os

# Predictions of `reg3` for the submission rows.
y_pred_submit = reg3.predict(submit_dropped).flatten()
# The predicted quantity is the "cost" target, so the column is named
# accordingly instead of "Class".
# NOTE(review): confirm the header against the competition's sample submission.
submit_final = pd.DataFrame({"id": submit_id, "cost": y_pred_submit})
# Save prediction; create the output folder first, because to_csv fails
# when the target directory does not exist.
os.makedirs(outputpath, exist_ok=True)
submit_final.to_csv(outputpath+"submission5.csv", index=False)

In [None]:
# Fit on original+generated data with the current `best_params`.
# Fixed settings first; `best_params` is unpacked last so it wins on any
# shared key.
params = {
    "objective": 'reg:squarederror',
    "eval_metric": "rmsle",
    # 'tree_method': 'gpu_hist',
    "n_estimators": 1000,
    "early_stopping_rounds": 20,
    **best_params,
}
reg4 = xgb.XGBRegressor(**params)
# The hold-out split is the early-stopping evaluation set.
reg4.fit(
    x_train_2_dropped,
    y_train_2,
    eval_set=[(x_test_dropped, y_test)],
)
print(reg4)

In [None]:
y_pred_test_4 = reg4.predict(x_test_dropped).flatten()
# Hold-out RMSLE of `reg4`. np.sqrt(MSLE) replaces `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred_test_4))

In [None]:
import os

# Predictions of `reg4` for the submission rows.
y_pred_submit = reg4.predict(submit_dropped).flatten()
# The predicted quantity is the "cost" target, so the column is named
# accordingly instead of "Class".
# NOTE(review): confirm the header against the competition's sample submission.
submit_final = pd.DataFrame({"id": submit_id, "cost": y_pred_submit})
# Save prediction; create the output folder first, because to_csv fails
# when the target directory does not exist.
os.makedirs(outputpath, exist_ok=True)
submit_final.to_csv(outputpath+"submission6.csv", index=False)