In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv
/kaggle/input/g-research-crypto-forecasting/asset_details.csv
/kaggle/input/g-research-crypto-forecasting/example_test.csv
/kaggle/input/g-research-crypto-forecasting/train.csv
/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/__init__.py


In [2]:
data_train = '/kaggle/input/g-research-crypto-forecasting/train.csv'
data_asset_details = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
data_supplemental_train = '/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv'

In [3]:
import random
import pandas as pd
import numpy as np

In [4]:
df_train = pd.read_csv(data_train, 
                       dtype={'Asset_ID': 'int8', 'Count': 'int32', 'row_id': 'int32', 'Count': 'int32', 
                              'Open': 'float64', 'High': 'float64', 'Low': 'float64', 'Close': 'float64', 
                              'Volume': 'float64', 'VWAP': 'float64'
                             }
                      )
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [5]:
import gresearch_crypto

In [6]:
df_assets = pd.read_csv(data_asset_details).sort_values(by='Asset_ID')
df_assets.head()

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin


In [7]:
import xgboost as xgb

def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

def get_features(df):
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)
    
    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"] 
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"] 
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"] 
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]
    
    df_feat['Mean'] = df_feat[['Open', 'High', 'Low', 'Close']].mean(axis=1)

    df_feat["Median"] = df_feat[["Open", "High", "Low", "Close"]].median(axis=1)
    return df_feat

## Some useful funcs and imports

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
from sklearn.metrics import f1_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from functools import partial

In [10]:
def m(x, w):
    return np.sum(x * w) / np.sum(w)

def cov(x, y, w):
    return np.sum(w * (x - m(x, w)) * (y - m(y, w))) / np.sum(w)

def wcc(x, y, w):
    return cov(x, y, w) / np.sqrt(cov(x, x, w) * cov(y, y, w))

## Parameters grid

In [11]:
space = [
    Real(0.5, 0.8, name="colsample_bytree"),
    Real(0.001, 0.5, name="learning_rate"),
    Integer(6, 15, name="max_depth"),
    Integer(100, 1000, name="n_estimators"),
    Real(0.5, 0.95, name="subsample"),
]

## Data preprocessing

In [12]:
train_ratio = 0.75

In [13]:
df_train['Asset_ID'].value_counts()

1     1956282
6     1956200
9     1956030
5     1955140
2     1953537
7     1951127
0     1942619
13    1874560
3     1791867
12    1778749
11    1701261
8     1592071
4     1156866
10     670497
Name: Asset_ID, dtype: int64

In [14]:
weight = df_assets[df_assets['Asset_ID'] == 12]['Weight'].values[0]

In [15]:
weight

2.079441541679836

In [16]:
df = df_train[df_train['Asset_ID'] == 12].reset_index(drop=True)

In [17]:
df

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1518825180,12,4,0.410000,0.410000,0.360000,0.360000,3779.362927,0.405237,
1,1518826560,12,1,0.410000,0.410000,0.410000,0.410000,800.000000,0.410000,
2,1518827820,12,3,0.470000,0.470000,0.470000,0.470000,400.000000,0.470000,
3,1518828660,12,1,0.469800,0.469800,0.469800,0.469800,450.000000,0.469800,0.022222
4,1518828720,12,1,0.450000,0.450000,0.450000,0.450000,312.978512,0.450000,
...,...,...,...,...,...,...,...,...,...,...
1778744,1632182160,12,122,0.282546,0.282800,0.281509,0.281798,142533.884179,0.282147,
1778745,1632182220,12,160,0.281595,0.281990,0.280675,0.281025,152181.736233,0.281339,
1778746,1632182280,12,187,0.280721,0.280911,0.280167,0.280435,92322.353887,0.280628,
1778747,1632182340,12,336,0.280721,0.282400,0.280200,0.282077,681270.709717,0.281377,


In [18]:
df_proc = get_features(df)
df_proc['y'] = df['Target']
df_proc = df_proc.dropna(how="any")
df_proc = df_proc.reset_index(drop=True)
df_proc = df_proc.drop(labels=np.where(np.isinf(df_proc))[0], axis=0)
df_proc = df_proc.reset_index(drop=True)

X = df_proc.drop("y", axis=1)
y = df_proc["y"]

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [19]:
df_proc

Unnamed: 0,Count,Open,High,Low,Close,Volume,VWAP,Upper_Shadow,Lower_Shadow,Close/Open,Close-Open,High-Low,High/Low,Mean,Median,y
0,1,0.469800,0.469800,0.469800,0.469800,450.000000,0.469800,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.469800,0.469800,0.022222
1,2,0.460010,0.460010,0.460000,0.460000,2308.215000,0.460003,0.000000,0.000000,0.999978,-0.000010,0.000010,1.000022,0.460005,0.460005,0.021739
2,1,0.474499,0.474499,0.474499,0.474499,1470.000000,0.474499,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.474499,0.474499,-0.009482
3,1,0.489900,0.489900,0.489900,0.489900,5000.000000,0.489900,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.489900,0.489900,0.010922
4,2,0.483995,0.484000,0.483995,0.484000,700.000000,0.483999,0.000000,0.000000,1.000010,0.000005,0.000005,1.000010,0.483997,0.483997,-0.006211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716928,273,0.280678,0.281100,0.280050,0.281027,650577.325780,0.280737,0.000073,0.000628,1.001244,0.000349,0.001050,1.003749,0.280714,0.280852,-0.000743
1716929,270,0.281107,0.282020,0.280737,0.281771,572937.878934,0.281488,0.000249,0.000370,1.002359,0.000663,0.001283,1.004570,0.281409,0.281439,-0.000860
1716930,189,0.281771,0.282027,0.281039,0.281559,368763.574739,0.281585,0.000256,0.000520,0.999247,-0.000212,0.000988,1.003516,0.281599,0.281665,-0.001822
1716931,159,0.281493,0.281903,0.280767,0.281143,264667.037879,0.281267,0.000410,0.000376,0.998756,-0.000350,0.001136,1.004046,0.281326,0.281318,-0.001340


In [20]:
col_names = df_proc.drop('y', axis=1).columns

In [21]:
col_names

Index(['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
       'Upper_Shadow', 'Lower_Shadow', 'Close/Open', 'Close-Open', 'High-Low',
       'High/Low', 'Mean', 'Median'],
      dtype='object')

In [22]:
X.shape

(1716933, 15)

In [23]:
y.shape

(1716933,)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, random_state=42)
target_col = "y"
df_train = pd.DataFrame(X_train, columns=col_names)
df_test = pd.DataFrame(X_test, columns=col_names)

In [25]:
df_train.shape

(1287699, 15)

In [26]:
df_test.shape

(429234, 15)

## Best params finding

In [27]:
def return_model_assessment(args, X_train, y_train, X_test, w):
    global models, train_scores, test_scores, curr_model_hyper_params
    params = {curr_model_hyper_params[i]: args[i] for i, j in enumerate(curr_model_hyper_params)}
    model = xgb.XGBRegressor(missing=-999, random_state=2022)
    model.set_params(**params)
    fitted_model = model.fit(X_train, y_train, sample_weight=None)
    models.append(fitted_model)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_score = wcc(train_predictions, y_train, w)
    test_score = wcc(test_predictions, y_test, w)
    train_scores.append(train_score)
    test_scores.append(test_score)
    return 1 - test_score

In [28]:
models = []
train_scores = []
test_scores = []
curr_model_hyper_params = ['colsample_bytree', 'learning_rate',
                        'max_depth', 'n_estimators', 'subsample']
objective_function = partial(return_model_assessment, X_train=X_train, y_train=y_train, X_test=X_test, w=weight)

n_calls = 10
results = gp_minimize(objective_function, space, base_estimator=None, n_calls=10, n_random_starts=n_calls-1, random_state=42)

In [29]:
print(results)

          fun: 3.4964724904273936e-07
    func_vals: array([5.09197503e-07, 3.49647249e-07, 5.24876577e-07, 4.15854499e-07,
       1.99999957e+00, 1.55364300e-06, 5.63418384e-07, 1.99999981e+00,
       5.92755124e-07, 5.69431716e-07])
       models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                         n_restarts_optimizer=2, noise='gaussian',
                         normalize_y=True, random_state=1608637542), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                         n_restarts_optimizer=2, noise='gaussian',
                         normalize_y=True, random_state=1608637542)]
 random_state: RandomState(MT19937) at 0x7F98E4C6B160
        space: Space([Real(low=0.5, high=0.8, prior='uniform', transform='normalize'),
       Real(low=0.001, high=0.5, prior='uniform', transform='normalize'),
       Integer(low=6, high=15, pri

In [31]:
print(results.x)

[0.5299924747454009, 0.23016519709096778, 9, 229, 0.7928998128269837]


## The results of hyperparameter matching for XGBoost for each coin

### Binance Coin     (ID=0 ) 
* colsample_bytree - 0.6156249507619749
* learning_rate - 0.008967159857886885
* max_depth - 8
* n_estimators - 317
* subsample - 0.8074685834714562

### Bitcoin          (ID=1 )
* colsample_bytree - 0.73896289605807
* learning_rate - 0.09253396014321574
* max_depth - 13
* n_estimators - 637
* subsample - 0.700624738784116

### Bitcoin Cash     (ID=2 )
* colsample_bytree - 0.6829989973347863
* learning_rate - 0.4167642609563461
* max_depth - 8
* n_estimators - 452
* subsample - 0.582006239504628

### Cardano          (ID=3 )
* colsample_bytree - 0.6829989973347863
* learning_rate - 0.4167642609563461
* max_depth - 8
* n_estimators - 452
* subsample - 0.582006239504628

### Dogecoin         (ID=4 )
* colsample_bytree - 0.7266084230952958
* learning_rate - 0.05918488024797159
* max_depth - 9
* n_estimators - 471
* subsample - 0.7693819367697938

### EOS.IO           (ID=5 )
* colsample_bytree - 0.6829989973347863
* learning_rate - 0.4167642609563461
* max_depth - 8
* n_estimators - 452
* subsample - 0.582006239504628

### Ethereum         (ID=6 )
* colsample_bytree - 0.6951993738259458
* learning_rate - 0.4202769113980745
* max_depth - 8
* n_estimators - 476
* subsample - 0.5563315209270074

### Ethereum Classic (ID=7 )
* colsample_bytree - 0.6855158027999262
* learning_rate - 0.19184853364231427
* max_depth - 15
* n_estimators - 520
* subsample - 0.8869731830313443

### IOTA             (ID=8 )
* colsample_bytree - 0.7266084230952958
* learning_rate - 0.05918488024797159
* max_depth - 9
* n_estimators - 471
* subsample - 0.7693819367697938

### Litecoin         (ID=9 )
* colsample_bytree - 0.5299924747454009
* learning_rate - 0.23016519709096778
* max_depth - 9
* n_estimators - 229
* subsample - 0.7928998128269837

### Maker            (ID=10)
* colsample_bytree - 0.5299924747454009
* learning_rate - 0.23016519709096778
* max_depth - 9
* n_estimators - 229
* subsample - 0.7928998128269837

### Monero           (ID=11)
* colsample_bytree - 0.5299924747454009
* learning_rate - 0.23016519709096778
* max_depth - 9
* n_estimators - 229
* subsample - 0.7928998128269837

### Stellar          (ID=12)
* colsample_bytree - 0.5299924747454009
* learning_rate - 0.23016519709096778
* max_depth - 9
* n_estimators - 229
* subsample - 0.7928998128269837

### TRON             (ID=13)
* colsample_bytree - 0.685244452888315
* learning_rate - 0.3062149270836522
* max_depth - 6
* n_estimators - 121
* subsample - 0.7361485971162751