In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection

from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import seaborn as sns

In [2]:
# Read the competition's training table, test table and submission template
df, df_test, sample_submission = map(
    pd.read_csv,
    (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    ),
)

# Show the first rows of the training table
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [3]:
# Distinct levels of the `cat8` column (used below to stratify the folds)
df["cat8"].unique()

array(['C', 'A', 'G', 'E', 'F', 'D', 'B'], dtype=object)

In [4]:
# Add a fold-assignment column, initialised to the placeholder -1 for every row;
# the fold-splitting cell that follows overwrites it with the row's validation fold.
df["kfold"] = -1

In [5]:
# Assign every row to exactly one of 7 validation folds. The split is shuffled
# with a fixed seed and stratified on `cat8`, so each fold receives a similar
# mix of `cat8` levels.
kf = model_selection.StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
for fold, (_, valid_idx) in enumerate(kf.split(df, df["cat8"])):
    # split() yields *positional* indices while .loc selects by *label*;
    # mapping through df.index keeps the assignment correct even when df
    # does not carry a default 0..n-1 index.
    df.loc[df.index[valid_idx], "kfold"] = fold

In [6]:
# Model inputs: every column except the row id, the label and the fold marker.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]

# Split the inputs by name prefix. A prefix test is used for both groups so a
# column that merely *contains* "cat" somewhere in its name is not treated as
# categorical by accident.
object_cols = [col for col in useful_features if col.startswith("cat")]
numerical_cols = [col for col in useful_features if col.startswith("cont")]

# Restrict the test table to the model inputs, in the same column order.
df_test = df_test[useful_features]

In [7]:
# Model hyperparameters, unpacked into XGBRegressor(**params) by the training
# loop. XGBRegressor itself is imported once, in the import cell at the top of
# the notebook.
params = {
    'learning_rate': 0.07853392035787837,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'max_depth': 3
}

In [8]:
# Cross-validated training: for each fold, fit the encoders and the model on
# the other folds, score on the held-out fold, and predict the test set.
final_predictions = []  # one array of test-set predictions per fold
scores = []             # held-out RMSE per fold

# Folds are numbered 0..n-1 by the fold-assignment cell; deriving the count
# from the data keeps this loop in sync with the splitter's n_splits.
n_folds = df["kfold"].nunique()

for fold in range(n_folds):
    train_fold = df[df.kfold != fold].reset_index(drop=True)
    valid_fold = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_fold.target
    yvalid = valid_fold.target

    # Explicit copies: the columns are overwritten below, and writing into a
    # frame derived by column selection triggers SettingWithCopyWarning.
    xtrain = train_fold[useful_features].copy()
    xvalid = valid_fold[useful_features].copy()
    xtest = df_test.copy()

    # Encode categoricals; the encoder is fitted on the training fold only and
    # then applied unchanged to the validation fold and the test set.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Standardise the continuous columns, again fitted on the training fold only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    # NOTE(review): `tree_method='gpu_hist'`, `gpu_id`, `predictor` and passing
    # `early_stopping_rounds` to fit() are reported as deprecated or removed in
    # newer xgboost releases — confirm against the installed version before
    # upgrading. They are kept as-is to match the environment this was run in.
    model = XGBRegressor(
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=5000,
        **params
    )

    # The held-out fold doubles as the early-stopping evaluation set.
    model.fit(xtrain,
              ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): same value as `squared=False`, without relying on a
    # keyword that scikit-learn has deprecated.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the held-out RMSE across folds.
print(np.mean(scores), np.std(scores))

[0]	validation_0-rmse:7.17331
[1000]	validation_0-rmse:0.71784
[2000]	validation_0-rmse:0.71623
[2953]	validation_0-rmse:0.71605
0 0.7159945261364977
[0]	validation_0-rmse:7.17406
[1000]	validation_0-rmse:0.71856
[2000]	validation_0-rmse:0.71694
[3000]	validation_0-rmse:0.71662
[3130]	validation_0-rmse:0.71662
1 0.716580882515304
[0]	validation_0-rmse:7.17244
[1000]	validation_0-rmse:0.72252
[2000]	validation_0-rmse:0.72097
[3000]	validation_0-rmse:0.72081
[3171]	validation_0-rmse:0.72077
2 0.7207157690534768
[0]	validation_0-rmse:7.17233
[1000]	validation_0-rmse:0.71811
[2000]	validation_0-rmse:0.71639
[3000]	validation_0-rmse:0.71606
[3262]	validation_0-rmse:0.71609
3 0.7160362521745589
[0]	validation_0-rmse:7.16753
[1000]	validation_0-rmse:0.71841
[2000]	validation_0-rmse:0.71678
[3000]	validation_0-rmse:0.71654
[3222]	validation_0-rmse:0.71655
4 0.7165076354124533
[0]	validation_0-rmse:7.17626
[1000]	validation_0-rmse:0.71845
[2000]	validation_0-rmse:0.71712
[2639]	validation_0-rms

In [9]:
# Blend the folds: place each fold's test predictions in its own column, then
# average across columns to get one prediction per test row.
preds = np.column_stack(final_predictions).mean(axis=1)

In [10]:
# Put the blended predictions into the submission template and write it out.
# Bracket assignment always sets a DataFrame column; attribute-style assignment
# would only set a plain Python attribute if the template had no `target`
# column, and the CSV would then be written without the predictions.
sample_submission["target"] = preds
sample_submission.to_csv("0831_KFold_XGBoost_submission.csv", index=False)