In [1]:
# inspired by https://www.kaggle.com/hamditarek/tabular-playground-series-xgboost-lightgbm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
from typing import Tuple

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, cross_validate

import xgboost as xgb


# read data
in_kaggle = False


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    train_path = ''
    test_path = ''
    sample_submission_path = ''

    if is_in_kaggle:
        # running in Kaggle, inside the competition
        train_path = '../input/tabular-playground-series-jan-2021/train.csv'
        test_path = '../input/tabular-playground-series-jan-2021/test.csv'
        sample_submission_path = '../input/tabular-playground-series-jan-2021/sample_submission.csv'
    else:
        # running locally
        train_path = 'data/train.csv'
        test_path = 'data/test.csv'
        sample_submission_path = 'data/sample_submission.csv'

    return train_path, test_path, sample_submission_path

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)

Bad key "text.kerning_factor" on line 4 in
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# main flow
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-01-29 18:52:29.741648


In [3]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

df_train = pd.read_csv(train_set_path)
df_test = pd.read_csv(test_set_path)

subm = pd.read_csv(sample_subm_path)

Wall time: 1.16 s


In [4]:
df_train.head()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [5]:

# target labels list
target = 'target'

# drop sig_id from train and test sets
df_train = df_train.drop(['id'], axis=1, errors='ignore')
df_test = df_test.drop(['id'], axis=1, errors='ignore')


In [6]:
# stratified continuous target split
# courtesy @tolgadincer and his contribution for https://www.kaggle.com/tolgadincer/continuous-target-stratification

def create_folds(df, n_s=5, n_grp=None):
    df['Fold'] = -1
    
    if n_grp is None:
        skf = KFold(n_splits=n_s, random_state=42, shuffle=True)
        target = df.target
    else:
        skf = StratifiedKFold(n_splits=n_s, random_state=42, shuffle=False)
        df['grp'] = pd.cut(df.target, n_grp, labels=False)
        target = df.grp
    
    for fold_no, (t, v) in enumerate(skf.split(target, target)):
        df.loc[v, 'Fold'] = fold_no
    return df

In [7]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb # LightGBM Model


#Additional scklearn functions
from sklearn import metrics   
from sklearn.model_selection import cross_val_score, KFold, cross_validate


# Setting stratified kfold upon the continuous target for future use
kfolds = 5
df_train = create_folds(df_train, n_s=kfolds) #, n_grp=1000
np.random.seed(1)


# set a hideout fold 

y = df_train['target']
X_train, X_hideout, y_train, y_hideout = train_test_split(df_train, y, test_size=0.3, random_state=42, shuffle=True)

X_hideout = X_hideout.drop(['target', 'Fold', 'grp'], axis=1, errors='ignore').copy()

def print_lightgbm_feature_importance(X, y, estimator):
    #Print Feature Importance:
    estimator.fit(X, y)
    predictors = X.columns

    feat_imp = pd.Series(estimator.feature_importances_, predictors).sort_values(ascending=False)
    feat_imp.nlargest(30).plot(kind='barh', title='Feature Importances', figsize=(8,10))
    plt.ylabel('Feature Importance Score')
    df = feat_imp.to_frame().reset_index()
    df = df.rename(columns={'index': 'predictor', 0: "fi_score"})
    return df



def model_check(estimator, model_name, model_description):
    model_table = pd.DataFrame()

    for i in range(0, kfolds):
        fold_fit_start_time = dt.datetime.now()
        
        X_valid = X_train[X_train['Fold'] == i]
        y_valid = X_valid['target']
        X_valid = X_valid.drop(['target', 'Fold', 'grp'], axis=1, errors='ignore')
        
        X_tr = X_train[X_train['Fold'] != i]
        y_tr = X_tr['target']
        X_tr = X_tr.drop(['target', 'Fold', 'grp'], axis=1, errors='ignore')
    

        fit_model = estimator.fit(X_tr, y_tr)
        
        pred_tr = estimator.predict(X_tr)
        pred_val = estimator.predict(X_valid)

        train_score = estimator.score(X_tr, y_tr.values.ravel())
        validation_score = estimator.score(X_valid, y_valid.values.ravel())

        print('Accuracy of the Regressor on the training set, fold {}: {:.4f}'.format(i, train_score))
        print('Accuracy of the Regressor on the validation set, fold {}: {:.4f}'.format(i, validation_score))
        
        X_hide = X_hideout.copy()
        pred_hideout = estimator.predict(X_hide)

        hideout_score = estimator.score(X_hide, y_hideout.values.ravel())
        print('Accuracy of the Regressor on the hide-out set, fold {}: {:.4f}'.format(i, hideout_score))
        
        rmse = mean_squared_error(y_hideout, pred_hideout, squared=False)
        print('RMSE of the Regressor on the hideout set, fold {}: {:.4f}'.format(i, rmse))

        fold_fit_end_time = dt.datetime.now()

        fit_time = fold_fit_end_time - fold_fit_start_time

        cv_attributes = {
            'train_score': round(train_score, 4),
            'validation_score': round(validation_score, 4),
            'test_score': round(hideout_score, 4),
            'test_rmse': round(rmse, 4),
            'fit_time': fit_time,
        }

        if i == 0:
            # the initial fold, just initializing the results dataframe
            cv_results = pd.DataFrame(data=[cv_attributes])
        else:
            # appending the results dataframe
            fold_result = pd.DataFrame(data=[cv_attributes])
            cv_results = pd.concat([cv_results, fold_result])

        del X_tr, X_valid, X_hide, y_tr, y_valid

    train_score = cv_results['train_score'].mean()
    validation_score = cv_results['validation_score'].mean()
    test_score = cv_results['test_score'].mean()
    test_rmse = cv_results['test_rmse'].mean()
    test_std = cv_results['test_score'].std()
    fit_time = cv_results['fit_time'].mean()

    attributes = {
        'model_name': model_name,
        'train_score': train_score,
        'validation_score': validation_score,
        'test_score': test_score,
        'test_std': test_std,
        'test_rmse': test_rmse,
        'fit_time': fit_time,
        'description': model_description,
    }
    
    model_table = pd.DataFrame(data=[attributes])
    return model_table

In [None]:
# params for XGB are taked from this great kernel https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna 
# by Hamza Ghanmi

model = xgb.XGBRegressor(
                 colsample_bytree=0.5,
                 alpha=0.01563,
                 #gamma=0.0,
                 learning_rate=0.01,
                 max_depth=15,
                 min_child_weight=257,
                 n_estimators=4000,                                                                  
                 #reg_alpha=0.9,
                 reg_lambda=0.003,
                 subsample=0.7,
                 random_state=2020,
                 metric_period=100,
                 silent=1)

# fit the baseline model with the training data
result_df = model_check(model, "Initial model", "Initial baseline")
display(result_df.head())

Parameters: { metric_period, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [None]:
subm.info()

In [None]:
# training and predicting with the best model
best_model = model

X_train_final = X_train.drop(['target', 'Fold', 'grp'], axis=1, errors='ignore').copy()

best_model.fit(X_train_final, y_train)
# output the feature importance of the best model
feat_imp = print_lightgbm_feature_importance(X_hideout, y_hideout, best_model)


In [None]:
# predicting with the best model on the hideout set
pred_hideout = best_model.predict(X_hideout)
hideout_score = best_model.score(X_hideout, y_hideout.ravel())

print('Score of the Regressor on the hideout set: {:.4f}'.format(hideout_score))

In [None]:
# in scikit-learn >= 0.22.0 
# https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python
rmse = mean_squared_error(y_hideout, pred_hideout, squared=False)
print('RMSE of the Regressor on the hideout set: {:.4f}'.format(rmse))



# Results

## Submission 1



In [None]:
# predicting on the test dataset
pred_test = best_model.predict(df_test)

In [None]:
%%time
#save sumbmission to a file
subm['target'] = pred_test
subm.to_csv('submission.csv', index=False)

In [None]:
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)