## Data loading

In [1]:
# Directory holding the competition's input files; defined once so the
# individual CSV paths below cannot drift apart.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'

# Paths to the competition CSV files.
data_train = f'{INPUT_DIR}/train.csv'
data_asset_details = f'{INPUT_DIR}/asset_details.csv'
data_supplemental_train = f'{INPUT_DIR}/supplemental_train.csv'

In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
# Load the training set with explicit dtypes so the identifier and count
# columns use compact integer types instead of pandas' default int64.
df_train = pd.read_csv(
    data_train,
    dtype={
        'Asset_ID': 'int8',
        'Count': 'int32',  # was listed twice in the original dict; one entry suffices
        # NOTE(review): 'row_id' does not appear among the columns shown by
        # head() for this file -- confirm whether this entry is still needed.
        'row_id': 'int32',
        'Open': 'float64',
        'High': 'float64',
        'Low': 'float64',
        'Close': 'float64',
        'Volume': 'float64',
        'VWAP': 'float64',
    },
)
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [4]:
import gresearch_crypto

In [5]:
# Read the asset details table, then order its rows by asset identifier.
df_assets = pd.read_csv(data_asset_details)
df_assets = df_assets.sort_values(by='Asset_ID')
df_assets.head()

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin


## Data preprocessing

In [6]:
# For each asset present in the training data (ascending Asset_ID, which is
# the order groupby yields), print the percentage of rows whose Target is
# not NaN. Grouping once avoids hard-coding the number of assets and avoids
# re-scanning the whole frame for every asset.
for _, asset_target in df_train.groupby('Asset_ID')['Target']:
    missing_fraction = asset_target.isnull().sum() / len(asset_target)
    print('Percentage of values not nan', (1 - missing_fraction) * 100)
  

Percentage of values not nan 99.3094374141301
Percentage of values not nan 99.9844603180932
Percentage of values not nan 99.75116928934543
Percentage of values not nan 98.95466572016784
Percentage of values not nan 87.46838441098623
Percentage of values not nan 99.88225907096167
Percentage of values not nan 99.9826193640732
Percentage of values not nan 99.52201983776557
Percentage of values not nan 87.87076707006158
Percentage of values not nan 99.97336441670117
Percentage of values not nan 94.03054748939965
Percentage of values not nan 85.91615278314144
Percentage of values not nan 96.52474857329506
Percentage of values not nan 98.87899026971662


In [7]:
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    ``df`` only needs to support lookup of the keys ``'High'``, ``'Close'``
    and ``'Open'``; the arithmetic is element-wise.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    shadow = df['High'] - body_top
    return shadow

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    ``df`` only needs to support lookup of the keys ``'Close'``, ``'Open'``
    and ``'Low'``; the arithmetic is element-wise.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    shadow = body_bottom - df['Low']
    return shadow

def get_features(df):
    """Build the model's feature table from raw price/volume columns.

    A copy of the columns ``Count, Open, High, Low, Close, Volume, VWAP`` is
    taken from ``df`` and extended, in this order, with ``Upper_Shadow``,
    ``Lower_Shadow``, ``Close/Open``, ``Close-Open``, ``High-Low``,
    ``High/Low``, ``Mean`` and ``Median`` (the last two are row-wise over
    Open/High/Low/Close). ``df`` itself is not modified.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    price_columns = ['Open', 'High', 'Low', 'Close']

    features = df[raw_columns].copy()
    open_, high, low, close = (features[name] for name in price_columns)

    # Shadow features are delegated to the dedicated helpers.
    features['Upper_Shadow'] = upper_shadow(features)
    features['Lower_Shadow'] = lower_shadow(features)

    # Ratios and differences between the price columns.
    features["Close/Open"] = close / open_
    features["Close-Open"] = close - open_
    features["High-Low"] = high - low
    features["High/Low"] = high / low

    # Row-wise summary statistics of the four price columns.
    prices = features[price_columns]
    features['Mean'] = prices.mean(axis=1)
    features["Median"] = prices.median(axis=1)

    return features

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet

In [9]:
def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit an ElasticNet model on the rows of one asset.

    Rows of ``df_train`` whose ``Asset_ID`` equals ``asset_id`` are converted
    to features with ``get_features`` and the ``Target`` column is attached as
    ``y``. Any row containing an infinite or missing value (in a feature or
    in ``y``) is discarded. The remaining features are standardised with a
    freshly fitted ``StandardScaler`` and an
    ``ElasticNet(alpha=0.2, l1_ratio=0.0, selection='cyclic')`` is fitted.

    Returns only the fitted model. The scaler fitted here is local to this
    function and is not returned.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    table = get_features(asset_rows)
    table['y'] = asset_rows['Target']

    # Keep only rows that are finite and complete in every column.
    has_inf = np.isinf(table).any(axis=1)
    table = table[~has_inf].dropna(how="any").reset_index(drop=True)

    target = table["y"]
    features = StandardScaler().fit_transform(table.drop("y", axis=1))

    model = ElasticNet(alpha=0.2, l1_ratio=0.0, selection='cyclic')
    model.fit(features, target)
    return model

## Training

In [10]:
%%time
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_assets['Asset_ID'], df_assets['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    try:
        model = get_Xy_and_model_for_asset(df_train, asset_id)
        models[asset_id] = model
    except:         
        models[asset_id] = None

Training model for Binance Coin     (ID=0 )


  positive)


Training model for Bitcoin          (ID=1 )


  positive)


Training model for Bitcoin Cash     (ID=2 )


  positive)


Training model for Cardano          (ID=3 )


  positive)


Training model for Dogecoin         (ID=4 )


  positive)


Training model for EOS.IO           (ID=5 )


  positive)


Training model for Ethereum         (ID=6 )


  positive)


Training model for Ethereum Classic (ID=7 )


  positive)


Training model for IOTA             (ID=8 )


  positive)


Training model for Litecoin         (ID=9 )


  positive)


Training model for Maker            (ID=10)


  positive)


Training model for Monero           (ID=11)


  positive)


Training model for Stellar          (ID=12)


  positive)


Training model for TRON             (ID=13)
CPU times: user 41min 10s, sys: 1min 54s, total: 43min 4s
Wall time: 11min 21s


  positive)


In [11]:
import traceback

In [12]:
# Display the Asset_ID -> model mapping built by the training loop
# (an entry is None when training for that asset raised).
models

{0: ElasticNet(alpha=0.2, l1_ratio=0.0),
 1: ElasticNet(alpha=0.2, l1_ratio=0.0),
 2: ElasticNet(alpha=0.2, l1_ratio=0.0),
 3: ElasticNet(alpha=0.2, l1_ratio=0.0),
 4: ElasticNet(alpha=0.2, l1_ratio=0.0),
 5: ElasticNet(alpha=0.2, l1_ratio=0.0),
 6: ElasticNet(alpha=0.2, l1_ratio=0.0),
 7: ElasticNet(alpha=0.2, l1_ratio=0.0),
 8: ElasticNet(alpha=0.2, l1_ratio=0.0),
 9: ElasticNet(alpha=0.2, l1_ratio=0.0),
 10: ElasticNet(alpha=0.2, l1_ratio=0.0),
 11: ElasticNet(alpha=0.2, l1_ratio=0.0),
 12: ElasticNet(alpha=0.2, l1_ratio=0.0),
 13: ElasticNet(alpha=0.2, l1_ratio=0.0)}

## Kaggle submission

In [13]:
%%time
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
scaler = StandardScaler()
for i, (df_test, df_pred) in enumerate(iter_test):
    for j , row in df_test.iterrows():
        if models[row['Asset_ID']] is not None:
            try:
                model = models[row['Asset_ID']]
                x_test = get_features(pd.DataFrame([row]))
                x_test = scaler.fit_transform(x_test)
                y_pred = model.predict(x_test)[0]
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
            except:
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
                traceback.print_exc()
        else: 
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
CPU times: user 593 ms, sys: 4.99 ms, total: 598 ms
Wall time: 611 ms


<br>

## Public score

-0.0020