## Data loading

In [1]:
# Locations of the competition input files.
# Price/volume history: the main training set and its supplemental extension.
data_train = '/kaggle/input/g-research-crypto-forecasting/train.csv'
data_supplemental_train = '/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv'
# Per-asset metadata table.
data_asset_details = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

In [2]:
from catboost import Pool, cv, CatBoostRegressor

In [3]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import datetime
import plotly.graph_objects as go

In [4]:
# Load the training set with explicit column dtypes instead of letting pandas
# infer them. (The original dict listed 'Count' twice; a duplicate key in a
# dict literal is silently overwritten, so it is listed once here.)
df_train = pd.read_csv(
    data_train,
    dtype={
        'Asset_ID': 'int8',
        'Count': 'int32',
        'row_id': 'int32',
        'Open': 'float64',
        'High': 'float64',
        'Low': 'float64',
        'Close': 'float64',
        'Volume': 'float64',
        'VWAP': 'float64',
    },
)
df_train.head()

In [5]:
import gresearch_crypto

In [6]:
# Read the asset metadata table, then order its rows by Asset_ID.
df_assets = pd.read_csv(data_asset_details)
df_assets = df_assets.sort_values('Asset_ID')
df_assets.head()

## Data preprocessing

In [7]:
def upper_shadow(df):
    """Return High minus the larger of Close and Open, element-wise.

    `df` must provide 'High', 'Close' and 'Open' columns.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open minus Low, element-wise.

    `df` must provide 'Low', 'Close' and 'Open' columns.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def get_features(df):
    """Build the model feature frame from a frame of raw candle columns.

    Copies the columns Count, Open, High, Low, Close, Volume and VWAP from
    `df` and appends derived columns: the two candle shadows, ratio and
    difference of Close vs. Open and of High vs. Low, and the row-wise mean
    and median of the four price columns. `df` itself is not modified.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[raw_columns].copy()

    # Candle shadows (computed on the copied frame).
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    # Relative and absolute moves within the candle.
    open_px = df_feat["Open"]
    close_px = df_feat["Close"]
    high_px = df_feat["High"]
    low_px = df_feat["Low"]
    df_feat["Close/Open"] = close_px / open_px
    df_feat["Close-Open"] = close_px - open_px
    df_feat["High-Low"] = high_px - low_px
    df_feat["High/Low"] = high_px / low_px

    # Row-wise summary statistics over the four price columns.
    ohlc = df_feat[['Open', 'High', 'Low', 'Close']]
    df_feat['Mean'] = ohlc.mean(axis=1)
    df_feat["Median"] = ohlc.median(axis=1)

    return df_feat

In [8]:
from sklearn.preprocessing import StandardScaler

## Model training

In [9]:
def get_Xy_and_model_for_asset(df_train, asset_id):
    """Prepare the training data for one asset and fit a CatBoost regressor on it.

    Parameters
    ----------
    df_train : DataFrame
        Frame with an 'Asset_ID' column, a 'Target' column and the raw
        columns that `get_features` reads.
    asset_id : int
        Value of 'Asset_ID' whose rows are used.

    Returns
    -------
    X : ndarray
        Standardised feature matrix (output of `StandardScaler.fit_transform`).
    y : Series
        Target values for the rows kept in `X`.
    model : CatBoostRegressor
        Regressor fitted on `(X, y)`.

    Notes
    -----
    The fitted scaler is not returned, so the same scaling cannot be
    re-applied to new data by the caller.
    """
    df = df_train[df_train["Asset_ID"] == asset_id]

    df_proc = get_features(df)
    df_proc['y'] = df['Target']
    df_proc = df_proc.dropna(how="any")

    # Ratio features can be +/-inf when a denominator is zero; drop those rows.
    has_inf = np.isinf(df_proc.to_numpy(dtype="float64")).any(axis=1)
    df_proc = df_proc.loc[~has_inf].reset_index(drop=True)

    X = df_proc.drop("y", axis=1)
    y = df_proc["y"]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Only one logging switch is passed: CatBoost rejects `verbose` and
    # `silent` being set together, so the redundant `silent=True` is removed.
    model = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=10,
        random_seed=42,
        verbose=0,
        task_type="GPU",
    )
    model.fit(X, y)

    return X, y, model

In [10]:
from sklearn.metrics import f1_score
import pandas as pd
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from functools import partial

In [11]:
def m(x, w):
    """Weighted mean of `x` with weights `w`.

    `w` is either an array of per-observation weights or a single scalar
    applied to every observation. A scalar is expanded to one weight per
    element of `x`; otherwise `np.sum(w)` would be the scalar itself and the
    result would be the plain sum of `x` instead of its mean.
    """
    if np.ndim(w) == 0:
        w = np.full(np.shape(x), w, dtype=float)
    return np.sum(x * w) / np.sum(w)
def cov(x, y, w):
    """Weighted covariance of `x` and `y` with weights `w`.

    `w` is either an array of per-observation weights or a single scalar
    applied to every observation. A scalar is expanded to one weight per
    element of `x` so that the normalisation by `np.sum(w)` divides by the
    total weight rather than by the scalar itself.
    """
    if np.ndim(w) == 0:
        w = np.full(np.shape(x), w, dtype=float)
    return np.sum(w * (x - m(x, w)) * (y - m(y, w))) / np.sum(w)

def wcc(x, y, w):
    """Weighted correlation coefficient of `x` and `y` with weights `w`.

    Computed as the weighted covariance of the pair divided by the square
    root of the product of the two weighted variances.
    """
    variance_product = cov(x, x, w) * cov(y, y, w)
    return cov(x, y, w) / np.sqrt(variance_product)

## Hyperparameter search space

In [12]:
space = [
    Real(0.001, 0.5, name="learning_rate"),
    Integer(6, 15, name="max_depth"),
    Integer(100, 1000, name="n_estimators"),
    Real(0.5, 0.95, name="subsample"),
]

## Hyperparameter search

In [13]:
train_ratio = 0.75

In [14]:
def return_model_assessment(args, X_train, y_train, X_test, w, y_test=None):
    """Objective for the hyperparameter search: fit a model and score it.

    Parameters
    ----------
    args : sequence
        Hyperparameter values, in the same order as the names in the
        notebook-level list `curr_model_hyper_params`.
    X_train, y_train : array-like
        Training features and targets.
    X_test : array-like
        Held-out features.
    w : scalar or array-like
        Weight(s) forwarded to `wcc` for both the train and the test score.
    y_test : array-like, optional
        Held-out targets. When omitted, the notebook-level variable `y_test`
        is used, which is what the original implementation always did.

    Returns
    -------
    float
        `1 - test_score`, where `test_score` is the weighted correlation
        between the held-out predictions and `y_test`.

    Side effects
    ------------
    Appends the fitted model to the notebook-level list `models` and the two
    scores to `train_scores` / `test_scores`.
    """
    global models, train_scores, test_scores, curr_model_hyper_params
    if y_test is None:
        # Backward-compatible fallback to the notebook-level variable.
        y_test = globals()["y_test"]

    params = dict(zip(curr_model_hyper_params, args))
    model = CatBoostRegressor(random_state=2022, silent=True)
    model.set_params(**params)
    fitted_model = model.fit(X_train, y_train, sample_weight=None)
    models.append(fitted_model)

    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_score = wcc(train_predictions, y_train, w)
    test_score = wcc(test_predictions, y_test, w)
    train_scores.append(train_score)
    test_scores.append(test_score)

    return 1 - test_score

In [None]:
# Best hyperparameters found for each asset, keyed by Asset_ID.
best_params_by_asset = {}

for i in range(14):
    print(i, "started:")
    weight = df_assets[df_assets['Asset_ID'] == i]['Weight'].values[0]
    df = df_train[df_train['Asset_ID'] == i].reset_index(drop=True)

    df_proc = get_features(df)
    df_proc['y'] = df['Target']
    df_proc = df_proc.dropna(how="any")

    # Ratio features can be +/-inf when a denominator is zero; drop those rows.
    has_inf = np.isinf(df_proc.to_numpy(dtype="float64")).any(axis=1)
    df_proc = df_proc.loc[~has_inf].reset_index(drop=True)

    features = df_proc.drop("y", axis=1)
    y = df_proc["y"]
    col_names = features.columns
    target_col = "y"

    # Split BEFORE scaling so the scaler statistics come from the training
    # rows only; fitting it on all rows leaks held-out information.
    # NOTE(review): the split is a random shuffle; for time-ordered data a
    # chronological split may be more appropriate - confirm.
    features_train, features_test, y_train, y_test = train_test_split(
        features, y, test_size=1 - train_ratio, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(features_train)
    X_test = scaler.transform(features_test)
    # X keeps its previous meaning (all rows, scaled), now using the
    # train-fitted scaler.
    X = scaler.transform(features)

    # State that return_model_assessment reads and appends to.
    models = []
    train_scores = []
    test_scores = []
    curr_model_hyper_params = ['learning_rate',
                            'max_depth', 'n_estimators', 'subsample']
    objective_function = partial(return_model_assessment, X_train=X_train, y_train=y_train, X_test=X_test, w=weight)

    n_calls = 5
    results = gp_minimize(objective_function, space, base_estimator=None, n_calls=n_calls, n_random_starts=n_calls-1, random_state=42)

    best_params_by_asset[i] = results.x
    print(results.x)

In [None]:
# First [0.07229054214304846, 12, 151, 0.8248994475200712]