In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error,  make_scorer, roc_auc_score
# Root folder that holds a local copy of the Kaggle directory layout.
prepath = "/home/cristobal"
# Competition train/test files, the original (non-synthetic) dataset,
# and the folder the submission file is written to.
trainpath = f"{prepath}/kaggle/input/playground-series-s3e11/train.csv"
testpath = f"{prepath}/kaggle/input/playground-series-s3e11/test.csv"
originalpath = f"{prepath}/kaggle/input/media-campaign-cost-prediction/train_dataset.csv"
outputpath = f"{prepath}/kaggle/working/playground-series-s3e11/"

In [None]:
# Load the competition training set; the row identifier is dropped so it
# cannot be used as a feature.
data = pd.read_csv(trainpath).drop(columns=["id"])
data.head()

In [None]:
# Every column except the last one is treated as a model feature.
features_target = data.columns
features = features_target[:-1].tolist()

# Pairwise correlations between all columns (features and last column).
corr = data[features_target].corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# correlation matrix is symmetric.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

sns.heatmap(
    corr,
    mask=mask,
    cmap="coolwarm",
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

In [None]:
# Load the original dataset and preview its first rows.
original_df = pd.read_csv(filepath_or_buffer=originalpath)
original_df.head(n=5)

In [None]:
# Hold out 30% of the competition data; the split is seeded so it is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data[features], data["cost"], test_size=0.30, random_state=21)

# Build a second, augmented training set: the training partition plus the
# rows of the original dataset. The hold-out partition is left untouched.
train = pd.concat([x_train, y_train], axis=1)
train2 = pd.concat([train, original_df])

# Shuffle the augmented set. The seed keeps the row order (and therefore any
# order-dependent downstream result) identical across kernel restarts.
train2 = train2.sample(frac=1, random_state=21)
x_train_2 = train2[features]
y_train_2 = train2["cost"]

In [None]:
# Count the missing entries per training column and print only the columns
# that actually contain some.
missing_val_count_by_column = x_train.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

In [None]:
# First randomized search: a coarse sweep over tree depth, learning rate,
# regularisation strength and the training objective.
params_grid = {
    "max_depth": [3, 6, 12, 18],
    "learning_rate": [0.3, 0.01, 0.001],
    "gamma": [0.1, 2, 10],
    "min_child_weight": [1, 10, 50],
    "reg_lambda": [0, 50, 100],
    "objective": ["reg:squaredlogerror", "reg:squarederror"],
}

# RMSLE scorer. The square root is taken explicitly instead of passing
# `squared=False` to mean_squared_log_error, because that keyword is
# deprecated and later removed in newer scikit-learn releases.
# greater_is_better=False makes the search maximise the negated error.
rmsle = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_log_error(y_true, y_pred)),
    greater_is_better=False,
)

reg_cv_1 = xgb.XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    eval_metric="rmsle",
    verbosity=0,
)
random_search = RandomizedSearchCV(
    estimator=reg_cv_1,
    param_distributions=params_grid,
    n_iter=100,
    scoring=rmsle,
    cv=5,
    verbose=3,
    random_state=21,  # fixed seed so the sampled candidates are reproducible
)
# NOTE(review): the hold-out split is passed as the early-stopping set to
# every fit of the search, so it influences model selection and is no longer
# a fully independent test set - confirm this is acceptable.
random_result = random_search.fit(
    x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

In [None]:
# Tabulate the cross-validation results of the first search.
grid_results = pd.DataFrame(random_result.cv_results_)
# The scorer is built with greater_is_better=False, so higher (less negative)
# scores are better.
# Pessimistic bound: mean score - 2 std
grid_results["worst_case_score"] = grid_results.mean_test_score - 2 * grid_results.std_test_score
# Optimistic bound: mean score + 2 std
grid_results["best_case_score"] = grid_results.mean_test_score + 2 * grid_results.std_test_score
# Keep the 10 best-ranked candidates and show them ordered by score spread
# (most stable first).
best20 = grid_results.sort_values('rank_test_score').head(10)
best20.sort_values('std_test_score')

In [None]:
# Second randomized search: narrower ranges, with the objective fixed to
# squared error.
params_grid = {
    "max_depth": [10, 12, 14],
    "learning_rate": [0.1, 0.01, 0.06],
    "gamma": [0.1, 2, 8, 15],
    "min_child_weight": [30, 50, 70],
    "reg_lambda": [0, 50, 100, 150, 200],
}

reg_cv_2 = xgb.XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    eval_metric="rmsle",
    objective='reg:squarederror',
    verbosity=0,
)
# `rmsle` is the scorer object created in an earlier cell.
random_search_2 = RandomizedSearchCV(
    estimator=reg_cv_2,
    param_distributions=params_grid,
    n_iter=100,
    scoring=rmsle,
    n_jobs=4,
    cv=5,
    verbose=3,
    random_state=21,  # fixed seed so the sampled candidates are reproducible
)
# NOTE(review): the hold-out split is passed as the early-stopping set to
# every fit of the search, so it influences model selection - confirm this
# is acceptable.
random_result_2 = random_search_2.fit(
    x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

In [None]:
# Tabulate the cross-validation results of the second search.
grid_results_2 = pd.DataFrame(random_result_2.cv_results_)
# The scorer is built with greater_is_better=False, so higher (less negative)
# scores are better.
# Pessimistic bound: mean score - 2 std
grid_results_2["worst_case_score"] = grid_results_2.mean_test_score - 2 * grid_results_2.std_test_score
# Optimistic bound: mean score + 2 std
grid_results_2["best_case_score"] = grid_results_2.mean_test_score + 2 * grid_results_2.std_test_score
# Keep the 10 best-ranked candidates and show them ordered by score spread
# (most stable first).
best20_2 = grid_results_2.sort_values('rank_test_score').head(10)
best20_2.sort_values('std_test_score')

In [None]:
# Third randomized search: depth narrowed to 11-13, no L2 regularisation,
# and a wider spread of min_child_weight.
params_grid = {
    "max_depth": [11, 12, 13],
    "learning_rate": [0.01, 0.03, 0.06],
    "gamma": [2, 8, 15],
    "min_child_weight": [40, 200],
    "reg_lambda": [0],
}

# RMSLE scorer. The square root is taken explicitly instead of passing
# `squared=False` to mean_squared_log_error, because that keyword is
# deprecated and later removed in newer scikit-learn releases.
# greater_is_better=False makes the search maximise the negated error.
rmsle = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_log_error(y_true, y_pred)),
    greater_is_better=False,
)

reg_cv_3 = xgb.XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    eval_metric="rmsle",
    objective='reg:squarederror',
    verbosity=0,
)
random_search_3 = RandomizedSearchCV(
    estimator=reg_cv_3,
    param_distributions=params_grid,
    n_iter=30,
    scoring=rmsle,
    n_jobs=4,
    cv=5,
    verbose=3,
    random_state=21,  # fixed seed so the sampled candidates are reproducible
)
# NOTE(review): the hold-out split is passed as the early-stopping set to
# every fit of the search, so it influences model selection - confirm this
# is acceptable.
random_result_3 = random_search_3.fit(
    x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

In [None]:
# Tabulate the cross-validation results of the third search.
grid_results_3 = pd.DataFrame(random_result_3.cv_results_)
# The scorer is built with greater_is_better=False, so higher (less negative)
# scores are better.
# Pessimistic bound: mean score - 2 std
grid_results_3["worst_case_score"] = grid_results_3.mean_test_score - 2 * grid_results_3.std_test_score
# Optimistic bound: mean score + 2 std
grid_results_3["best_case_score"] = grid_results_3.mean_test_score + 2 * grid_results_3.std_test_score
# Keep the 10 best-ranked candidates and show them ordered by score spread
# (most stable first).
best20_3 = grid_results_3.sort_values('rank_test_score').head(10)
best20_3.sort_values('std_test_score')

In [None]:
# Fourth randomized search: depth fixed at 11, finer grid over learning rate
# and gamma.
params_grid = {
    "max_depth": [11],
    "learning_rate": [0.008, 0.01, 0.03, 0.05, 0.07],
    "gamma": [1, 5, 15, 20, 25],
    "min_child_weight": [30, 70, 110],
    "reg_lambda": [0],
}

# RMSLE scorer. The square root is taken explicitly instead of passing
# `squared=False` to mean_squared_log_error, because that keyword is
# deprecated and later removed in newer scikit-learn releases.
# greater_is_better=False makes the search maximise the negated error.
rmsle = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_log_error(y_true, y_pred)),
    greater_is_better=False,
)

reg_cv_4 = xgb.XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    eval_metric="rmsle",
    objective='reg:squarederror',
    verbosity=0,
)
random_search_4 = RandomizedSearchCV(
    estimator=reg_cv_4,
    param_distributions=params_grid,
    n_iter=40,
    scoring=rmsle,
    n_jobs=4,
    cv=5,
    verbose=3,
    random_state=21,  # fixed seed so the sampled candidates are reproducible
)
# NOTE(review): the hold-out split is passed as the early-stopping set to
# every fit of the search, so it influences model selection - confirm this
# is acceptable.
random_result_4 = random_search_4.fit(
    x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

In [None]:
# Tabulate the cross-validation results of the fourth search.
grid_results_4 = pd.DataFrame(random_result_4.cv_results_)
# The scorer is built with greater_is_better=False, so higher (less negative)
# scores are better.
# Pessimistic bound: mean score - 2 std
grid_results_4["worst_case_score"] = grid_results_4.mean_test_score - 2 * grid_results_4.std_test_score
# Optimistic bound: mean score + 2 std
grid_results_4["best_case_score"] = grid_results_4.mean_test_score + 2 * grid_results_4.std_test_score
# Keep the 10 best-ranked candidates and show them ordered by score spread
# (most stable first).
best20_4 = grid_results_4.sort_values('rank_test_score').head(10)
best20_4.sort_values('std_test_score')

In [None]:
# Fit the final model on the competition training split with the chosen
# best parameters.
params = dict(
    n_estimators=10000,
    max_depth=11,
    learning_rate=0.01,
    gamma=25,
    min_child_weight=30,
    reg_lambda=0,
    eval_metric="rmsle",
    early_stopping_rounds=20,
    objective="reg:squarederror",
    verbosity=1,
)

reg = xgb.XGBRegressor(**params)
# The hold-out split is the evaluation set monitored for early stopping
# (20 rounds, RMSLE metric).
reg.fit(x_train, y_train, eval_set=[(x_test, y_test)])
print(reg)

In [None]:
# Predictions of the first model on the hold-out and training splits.
y_pred_test = reg.predict(x_test).flatten()
y_pred_train = reg.predict(x_train).flatten()
# Hold-out RMSLE. The root is taken explicitly rather than via
# `squared=False`, which newer scikit-learn releases deprecate and remove.
np.sqrt(mean_squared_log_error(y_test, y_pred_test))

In [None]:
# Fit a second model with the same settings on the training split augmented
# with the original dataset.
params = dict(
    n_estimators=10000,
    max_depth=11,
    learning_rate=0.01,
    gamma=25,
    min_child_weight=30,
    reg_lambda=0,
    eval_metric="rmsle",
    early_stopping_rounds=20,
    objective="reg:squarederror",
    verbosity=1,
)

reg2 = xgb.XGBRegressor(**params)
# The hold-out split is the evaluation set monitored for early stopping
# (20 rounds, RMSLE metric).
reg2.fit(x_train_2, y_train_2, eval_set=[(x_test, y_test)])
print(reg2)

In [None]:
# Predictions of the model trained with the additional original data.
y_pred_test_2 = reg2.predict(x_test).flatten()
y_pred_train_2 = reg2.predict(x_train).flatten()
# Hold-out RMSLE of the model trained with original data. The root is taken
# explicitly rather than via `squared=False`, which newer scikit-learn
# releases deprecate and remove.
np.sqrt(mean_squared_log_error(y_test, y_pred_test_2))

In [None]:
# Load the competition test set that predictions are submitted for.
submit_df = pd.read_csv(filepath_or_buffer=testpath)
submit_df.head(n=5)

In [None]:
# Keep the row identifiers so each prediction can be matched to its row.
submit_id = submit_df["id"]
# Predict the cost of the submission data with the model trained on the
# augmented training set.
y_pred_submit = reg2.predict(submit_df[features]).flatten()
# The prediction column carries the name of the target being predicted
# ("cost"), not "Class".
submit_final = pd.DataFrame({"id": submit_id, "cost": y_pred_submit})
# Save prediction
submit_final.to_csv(outputpath+"submission.csv", index=False)