![Banner](https://raw.githubusercontent.com/crunchdao/quickstarters/refs/heads/master/competitions/datacrunch-legacy/assets/banner.webp)

# Import Libraries

In [29]:
# Lib & Dependencies
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn import metrics

from scipy.stats import spearmanr

import xgboost as xgb

# Load data

In [2]:
train_datalink_X = 'https://tournament.datacrunch.com/data/X_train.csv'
train_datalink_y = 'https://tournament.datacrunch.com/data/y_train.csv'
hackathon_data_link = 'https://tournament.datacrunch.com/data/X_test.csv'

In [5]:
train_data = pd.read_csv(train_datalink_X)
test_data = pd.read_csv(hackathon_data_link)
train_targets = pd.read_csv(train_datalink_y)

In [6]:
train = train_data.merge(train_targets, left_index=True, right_index=True, how='inner')

In [7]:
train.head()

Unnamed: 0,Moons,id,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,Feature_12,Feature_13,target_r,target_g,target_b
0,0,0x5c5369f3e1687b61,0.5,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,1.0,0.0
1,0,0x3874689d0b4888b8,0.5,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0
2,0,0x8a3fda97cd9219c8,0.75,1.0,0.75,0.0,0.0,0.0,0.0,0.5,0.5,1.0,0.75,0.75,1.0,0.5,1.0,0.5
3,0,0x6be36f219426c022,0.5,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.75,0.5,0.25,0.0,0.0
4,0,0xd4584a6f7e1f2b6a,0.75,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.75,1.0


In [8]:
test_data.head()

Unnamed: 0,Moons,id,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,Feature_12,Feature_13
0,0,0x33aa5dd041631fa0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0xbd9da03cd0267e56,0.25,0.75,1.0,0.0,0.0,0.0,0.0,0.25,0.25,0.75,0.5,0.5,0.75
2,0,0xb95b58e7e25a02cf,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.75,0.75,0.75,1.0,0.5,0.75
3,0,0xad8af9c07a5770d3,0.25,0.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0xcdc024f2efb69bc5,0.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.25,0.25,0.5


# Set features

In [9]:
# Feature columns
features = train.columns[train.columns.str.startswith('Feature')]
features

Index(['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10',
       'Feature_11', 'Feature_12', 'Feature_13'],
      dtype='object')

# Set Targets

In [24]:
# Targets columns
targets = train.columns[train.columns.str.startswith('target')]
targets

Index(['target_r', 'target_g', 'target_b'], dtype='object')

In [25]:
# Pick target_r
target = 'target_r'

# Set Time group (Moons)

In [14]:
moons = train.Moons
moons

0        0
1        0
2        0
3        0
4        0
        ..
6533    13
6534    13
6535    13
6536    13
6537    13
Name: Moons, Length: 6538, dtype: int64

# Standard Cross Validations

In [13]:
crossvalidators = [
    model_selection.KFold(3),
    model_selection.KFold(3, shuffle=True),
    model_selection.GroupKFold(3),
    model_selection.TimeSeriesSplit(3)
]

# Metric Spearman Rank Correlation

In [15]:
def spearman(y_true, y_pred): 
    return spearmanr(y_pred, y_true).correlation

# Set Model

In [19]:
model = xgb.XGBRegressor(objective="reg:squarederror", max_depth=5, learning_rate=0.01, n_estimators=200, n_jobs=-1, colsample_bytree=0.5)

# Calculate Cross Validations Scores

In [28]:
for cv in crossvalidators:
    print(cv)
    print(np.mean(
            model_selection.cross_val_score(
            model,
            train[features],
            train[target],
            cv=cv,
            n_jobs=1,
            groups=moons,
            scoring=metrics.make_scorer(spearman, greater_is_better=True)
        )))
    print()

KFold(n_splits=3, random_state=None, shuffle=False)
0.003664552146971723

KFold(n_splits=3, random_state=None, shuffle=True)
0.05660923891616019

GroupKFold(n_splits=3)
0.013858792104565748

TimeSeriesSplit(max_train_size=None, n_splits=3)
0.00540922191225974



# Here is a more elaborated Time-Series CV

In [30]:
class TimeSeriesSplitGroups(_BaseKFold):
    def __init__(self, n_splits=5):
        super().__init__(n_splits, shuffle=False, random_state=None)

    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_list = np.unique(groups)
        n_groups = len(group_list)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of samples: {1}.").format(n_folds,
                                                             n_groups))
        indices = np.arange(n_samples)
        test_size = (n_groups // n_folds)
        test_starts = range(test_size + n_groups % n_folds,
                            n_groups, test_size)
        test_starts = list(test_starts)[::-1]
        for test_start in test_starts:
            
            yield (indices[groups.isin(group_list[:test_start])],
                   indices[groups.isin(group_list[test_start:test_start + test_size])])

In [31]:
print(np.mean(
        model_selection.cross_val_score(
        model,
        train[features],
        train[target],
        cv=TimeSeriesSplitGroups(3),
        n_jobs=1,
        groups=moons,
        scoring=metrics.make_scorer(spearman, greater_is_better=True)
    )))
print(cv)

0.012727819252278602
TimeSeriesSplit(max_train_size=None, n_splits=3)


# About
Last updated: 2021-02-25

Created by: [Jeremy Berros](https://github.com/jberros)

Greatly inspired by the works from: [Jon Taylor](https://github.com/jonrtaylor) and [Michael Oliver](https://github.com/the-moliver)