In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
 
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Imports**

In [None]:
#all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# **Read the datasets**

In [None]:
# --- Load every competition file first -------------------------------------
# Train set
train_set = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/train.csv")
# Supplemental train set
sup_train_set = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv")
# Example Test set
test = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/example_test.csv")
# Asset details
assets = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/asset_details.csv")
# Example sample submission
sample_submission = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv")

# --- Then preview the first rows of each, in loading order ------------------
for preview in (train_set, sup_train_set, test, assets, sample_submission):
    print(preview.head())

# **Preprocessing the data**

In [None]:
# General info.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() only adds a stray "None" line.
train_set.info(show_counts=True)

# Identifying null values: show the per-column NaN counts, drop every row that
# contains a NaN, then show the counts again to confirm nothing is left.
print(train_set.isna().sum())
train_set = train_set.dropna()
print(train_set.isna().sum())

# Same treatment for the supplemental train set.
print(sup_train_set.isna().sum())
sup_train_set = sup_train_set.dropna()
print(sup_train_set.isna().sum())

In [None]:
# Convert the unix 'timestamp' column into a datetime column on both frames.
for _frame in (train_set, sup_train_set):
    _frame['readable_timestamp'] = _frame['timestamp'].astype('datetime64[s]')

# Bare expressions kept from this cell; with default notebook settings only the
# last one (sup_train_set) is rendered.
train_set
sup_train_set

# **Visualizations**

In [None]:
# Attach the asset details (e.g. Asset_Name) to every training row; a left join
# on Asset_ID keeps all rows of train_set.
train_set_join = train_set.merge(assets, how='left', on='Asset_ID')
print(train_set_join)

# Time range for each crypto: earliest and latest timestamp per asset name.
train_set_join.groupby('Asset_Name')['readable_timestamp'].agg(['min', 'max'])

In [None]:
import matplotlib.dates as mdates
def get_line_graph(df, title, labels=None, ylabel='Price ($)'):
    """Draw a line chart of one or more series with a tick every month.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data handed to ``ax.plot``; a DataFrame produces one line per column.
    title : str
        Figure title.
    labels : sequence of str, optional
        Legend labels, one per plotted line. When omitted they are taken from
        the DataFrame's column names, or from the Series' name (falling back
        to ``title`` if the Series is unnamed).
    ylabel : str, optional
        Y-axis label; defaults to ``'Price ($)'``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    fig, ax = plt.subplots(1, figsize=(15, 5))
    lines = ax.plot(df, linewidth=3)

    # A fixed two-entry label list only fits two-column input; derive the labels
    # from the data so a single Series (e.g. a Close price) is labelled correctly.
    if labels is None:
        if hasattr(df, 'columns'):
            labels = [str(column) for column in df.columns]
        else:
            series_name = getattr(df, 'name', None)
            labels = [str(series_name) if series_name is not None else title]
    for line, label in zip(lines, labels):
        line.set_label(label)

    ax.set_ylabel(ylabel, fontsize=14)
    ax.set_title(title, fontsize=16)
    ax.legend(loc='best', fontsize=16)
    # One major tick per month on the x axis.
    month_locator = mdates.MonthLocator(interval=1)
    ax.xaxis.set_major_locator(month_locator)
    fig.autofmt_xdate()

In [None]:
# Bitcoin graph: select the rows with Asset_ID == 1, index them by time and
# plot the Close column.
is_bitcoin = train_set_join['Asset_ID'] == 1
df = train_set_join.loc[is_bitcoin].set_index('readable_timestamp')
get_line_graph(df['Close'], "Bitcoin Trend")

# **Feature Engineering**

In [None]:
def _add_calendar_features(frame):
    """Add calendar columns derived from frame['readable_timestamp'], in place.

    Columns are created in this order: year, day_of_year, weekday,
    week_of_year, day_of_month, quarter, hour, minute.
    """
    stamps = frame['readable_timestamp'].dt
    frame['year'] = stamps.year
    frame['day_of_year'] = stamps.dayofyear
    frame['weekday'] = stamps.weekday
    # The ISO week number is explicitly cast to int64.
    frame['week_of_year'] = stamps.isocalendar().week.astype('int64')
    frame['day_of_month'] = stamps.day
    frame['quarter'] = stamps.quarter
    frame['hour'] = stamps.hour
    frame['minute'] = stamps.minute


# Extracting the datetime features for the train and val sets
_add_calendar_features(train_set)
print(train_set.head())

_add_calendar_features(sup_train_set)
sup_train_set.head()

In [None]:
#Adding new features
# import pandas_ta as pta
#1 MACD: Moving Average Convergence Divergence
def macd(df):
    """Return the MACD line of df['Close'].

    The result is the span-12 exponential moving average minus the span-26
    exponential moving average (both with ``adjust=False``), as a Series
    aligned with ``df``. ``df`` itself is not modified.
    """
    close = df['Close']
    fast_ema, slow_ema = (
        close.ewm(span=window, adjust=False).mean() for window in (12, 26)
    )
    return fast_ema - slow_ema

# def rsi(df):
#     pta.rsi(df['Close'], length = 14)
    
def kaufmann_efficiency(df):
    """Return direction / volatility of df['Close'] over a 3-row window.

    Side effect: two helper columns are written into ``df`` itself:
      * 'direction'  -- absolute change of Close versus 3 rows earlier
      * 'volatility' -- 3-row rolling mean of the absolute row-to-row change

    The returned Series is 'direction' divided by 'volatility'; the first
    rows are NaN until both windows are filled.

    NOTE(review): the textbook Kaufman efficiency ratio divides by the *sum*
    of absolute changes, whereas this uses their rolling mean -- confirm the
    scaling is intended.
    """
    close = df['Close']
    df['direction'] = close.diff(periods=3).abs()
    df['volatility'] = close.diff().abs().rolling(window=3).mean()
    return df['direction'].div(df['volatility'])
    

In [None]:
def get_features(df):
    """Return a copy of ``df`` with the engineered indicator columns added.

    The input frame is left untouched. On the copy, 'macd' is added first and
    'kaufmann' last; kaufmann_efficiency() also writes its 'direction' and
    'volatility' helper columns into the copy, so they are part of the result.
    (An RSI feature is currently disabled.)
    """
    enriched = df.copy()
    enriched['macd'] = macd(enriched)
    enriched['kaufmann'] = kaufmann_efficiency(enriched)
    return enriched

In [None]:
def get_asset_features(df_train, asset_id):
    """Return the engineered-feature frame for a single asset.

    Keeps only the rows of ``df_train`` whose 'Asset_ID' equals ``asset_id``
    and passes them through get_features().
    """
    asset_rows = df_train.loc[df_train["Asset_ID"] == asset_id]
    return get_features(asset_rows)

# **CatBoost Model**

In [None]:
# CatBoost Model training function
from catboost import CatBoostRegressor
def catboost_model(X_train, y_train):
    """Fit a CatBoostRegressor on the given features/target and return it.

    The regressor is configured for GPU training with 200 iterations,
    depth 10, learning rate 0.05 and an RMSE loss.
    """
    hyperparams = {
        'task_type': "GPU",
        'iterations': 200,
        'depth': 10,
        'learning_rate': 0.05,
        'loss_function': 'RMSE',
    }
    regressor = CatBoostRegressor(**hyperparams)
    regressor.fit(X=X_train, y=y_train)
    return regressor

In [None]:
#Create different training models for each asset

from scipy.stats import pearsonr
import joblib
from sklearn import metrics
import math
# Per-asset training and validation.
# For every Asset_ID in 0..13 this loop fits one CatBoost model on train_set,
# scores it on sup_train_set (used here as the validation set) and plots the
# validation targets against the predictions.
pearson_list = []    # validation Pearson correlation, one entry per asset
crypto_models={}     # Asset_ID -> fitted model
rmse_list = []       # validation RMSE, one entry per asset
for i in range(0,14):
    
    temp_train_df = get_asset_features(train_set,i)
    # NOTE(review): only 'timestamp' and 'Target' are dropped, so every other
    # column (including the datetime 'readable_timestamp' and the constant
    # 'Asset_ID') is passed to the model as a feature -- confirm this is intended.
    X_train = temp_train_df.drop(['timestamp','Target'], axis=1)
    y_train = temp_train_df['Target']
    model = catboost_model(X_train,y_train)
    # Also publish the fitted model as a notebook-level global named model<i>
    # (model0, model1, ...); crypto_models below holds the very same object.
#     comment this line ->
    globals()[f"model{i}"] = model
    
    crypto_models.update({i: model})    
    
    # get val set features and calculate validation accuracy for each asset

    val_df = get_asset_features(sup_train_set,i)
    X_val = val_df.drop(['timestamp','Target'], axis=1)
    y_val = val_df['Target']
    val_predictions = model.predict(X_val)
    # Pearson's r lies in [-1, 1]; values close to 1 or -1 indicate a strong
    # linear relationship between the predictions and the target.
    print("Pearson's coeff for asset "+ str(i) + ":")
    pr = pearsonr(val_predictions, y_val)[0]
    pearson_list.append(pr)
    print("Pearson's Corr: " + str(pr))
    
    # Root mean squared error on the validation rows.
    mse = metrics.mean_squared_error(y_val, val_predictions)
    rmse = math.sqrt(mse)
    rmse_list.append(rmse)
    print("RMSE: " + str(rmse))
    
    # Line Forecast graph: Target vs. predictions over time for this asset.
    val_df['predictions'] = val_predictions
    df = val_df[['Target','predictions','readable_timestamp']]
    df = df.set_index('readable_timestamp')
    get_line_graph(df,"Trend")
    
# Unweighted averages of the per-asset metrics.
avg = sum(pearson_list)/len(pearson_list)
print("Average Pearson's Coefficient: "+ str(avg))
avg_rmse = sum(rmse_list)/len(rmse_list)
print("Average RMSE: " + str(avg_rmse))

# Saving all the models: the whole Asset_ID -> model dict goes into one joblib
# file, written relative to the current working directory.
joblib.dump(crypto_models, 'crypto_models.joblib')

# **XGBoost Model**

In [None]:
from xgboost import XGBRegressor
def xgboost_model(X_train,y_train):
    """Fit an XGBRegressor with 1000 estimators on the data and return it.

    Training is run with ``verbose=False``.
    """
    settings = {'n_estimators': 1000}
    booster = XGBRegressor(**settings)
    booster.fit(X_train, y_train, verbose=False)
    return booster

In [None]:
# #Create different training models for each asset

# from scipy.stats import pearsonr
# import joblib
# from sklearn import metrics
# import math
# pearson_list = []
# crypto_models_xgboost={}
# rmse_list = []
# for i in range(0,14):
    
#     temp_train_df = get_asset_features(train_set,i)
#     X_train = temp_train_df.drop(['readable_timestamp','timestamp','Target'], axis=1)
#     y_train = temp_train_df['Target']
#     model = xgboost_model(X_train,y_train)
#     globals()[f"model{i}"] = model
    
#     crypto_models_xgboost.update({i: model})    
    
#     # get val set features and calculate validation accuracy for each asset

#     val_df = get_asset_features(sup_train_set,i)
#     X_val = val_df.drop(['readable_timestamp','timestamp','Target'], axis=1)
#     y_val = val_df['Target']
#     val_predictions = model.predict(X_val)
#     #has to close to 1 or -1 as it signifies the extent of linear relation with the target
#     print("Pearson's coeff for asset "+ str(i) + ":")
#     pr = pearsonr(val_predictions, y_val)[0]
#     pearson_list.append(pr)
#     print("Pearson's Corr: " + str(pr))
    
#     mse = metrics.mean_squared_error(y_val, val_predictions)
#     rmse = math.sqrt(mse)
#     rmse_list.append(rmse)
#     print("RMSE: " + str(rmse))
    
# avg = sum(pearson_list)/len(pearson_list)
# print("Average Pearson's Coefficient: "+ str(avg))
# avg_rmse = sum(rmse_list)/len(rmse_list)
# print("Average RMSE: " + str(avg_rmse))

# # Saving all the models
# joblib.dump(crypto_models_xgboost, 'crypto_models.joblib')

In [None]:
#Calculating the predictions for the test set received from the G-research API

import gresearch_crypto
import pandas as pd

#my test
env = gresearch_crypto.make_env()

# You can only iterate through a result from `env.iter_test()` once
# so be careful not to lose it once you start iterating.
iter_test = env.iter_test()

# test_df =pd.read_csv("actual_test_df.csv")
# sample_prediction_df = pd.read_csv("sample_prediction_df.csv")
for (test_df, sample_prediction_df) in iter_test:

    my_temp_df = test_df

    # Calendar features, created in the same order as for the training frames.
    my_temp_df['readable_timestamp'] = my_temp_df['timestamp'].astype('datetime64[s]')
    stamps = my_temp_df['readable_timestamp'].dt
    my_temp_df['year'] = stamps.year.astype('int64')
    my_temp_df['day_of_year'] = stamps.dayofyear.astype('int64')
    my_temp_df['weekday'] = stamps.weekday.astype('int64')
    my_temp_df['week_of_year'] = stamps.isocalendar().week.astype('int64')
    my_temp_df['day_of_month'] = stamps.day.astype('int64')
    my_temp_df['quarter'] = stamps.quarter.astype('int64')
    my_temp_df['hour'] = stamps.hour.astype('int64')
    my_temp_df['minute'] = stamps.minute.astype('int64')

    # Data type conversions
    my_temp_df['Asset_ID'] = my_temp_df['Asset_ID'].astype('int64')
    for numeric_col in ('Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP'):
        my_temp_df[numeric_col] = my_temp_df[numeric_col].astype('float64')

    my_temp_df = my_temp_df.drop(['timestamp'], axis=1)
    my_temp_df = my_temp_df.set_index('row_id')
    sample_prediction_df = sample_prediction_df.set_index('row_id')

    # crypto_models (filled by the training cell) maps Asset_ID -> fitted model,
    # so the model is looked up by id instead of through one branch per asset.
    asset_frames = []
    for i in range(0,14):
        temp_df = get_asset_features(my_temp_df,i)
        if temp_df.empty:
            # This batch has no row for asset i; nothing to predict.
            continue
        temp_df['predictions'] = crypto_models[i].predict(temp_df)
        asset_frames.append(temp_df[temp_df['predictions'].notna()])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and growing a frame inside a loop is quadratic anyway.
    if asset_frames:
        predictions_df = pd.concat(asset_frames)
    else:
        predictions_df = pd.DataFrame({'predictions': pd.Series(dtype='float64')})
#     print(predictions_df)

    # Assignment aligns on row_id. Rows without a prediction would otherwise be
    # submitted as NaN, so they fall back to 0.0 as a neutral value.
    sample_prediction_df['Target'] = predictions_df['predictions']
    sample_prediction_df['Target'] = sample_prediction_df['Target'].fillna(0.0)
    sample_prediction_df = sample_prediction_df.reset_index()
    print(sample_prediction_df)

    env.predict(sample_prediction_df)   # register your predictions

In [None]:
# Write the submission frame left over from the last test batch to a CSV file
# in the current working directory.
# NOTE(review): the frame had reset_index() applied in the loop, so the index
# written here (to_csv default) is only a positional counter -- consider
# index=False; confirm nothing reads that extra column.
sample_prediction_df.to_csv("sample_prediction_df.csv")

In [None]:
# from sklearn.preprocessing import StandardScaler
# # simple preprocessing of the data 
# scaler = StandardScaler()

# X_btc_train_scaled = scaler.fit_transform(X_btc_train)
# X_btc_test_scaled = scaler.transform(X_btc_test)

# X_eth_train_scaled = scaler.fit_transform(X_eth_train)
# X_eth_test_scaled = scaler.transform(X_eth_test)

In [None]:
# binance_coin = train_set[train_set['Asset_ID']==0]
# bitcoin = train_set[train_set['Asset_ID']==1]
# bitcoin_cash = train_set[train_set['Asset_ID']==2]
# cardano = train_set[train_set['Asset_ID']==3]
# dogecoin = train_set[train_set['Asset_ID']==4]
# eosio = train_set[train_set['Asset_ID']==5]
# ethereum = train_set[train_set['Asset_ID']==6]
# ethereum_classic = train_set[train_set['Asset_ID']==7]
# iota = train_set[train_set['Asset_ID']==8]
# litecoin = train_set[train_set['Asset_ID']==9]
# maker = train_set[train_set['Asset_ID']==10]
# monero = train_set[train_set['Asset_ID']==11]
# stellar = train_set[train_set['Asset_ID']==12]
# tron = train_set[train_set['Asset_ID']==13]

# (task_type="GPU", iterations=100, learning_rate=0.05, depth=10, random_seed=42, verbose = 0)
# model.fit(train_set,train_labels, eval_set=(X_val,y_val),plot=True);
# model.fit(train_df,train_labels);