### G-Research Crypto Forecasting
 you'll use your machine learning expertise to forecast short term returns in 14 popular cryptocurrencies. 
 
 a dataset of millions of rows of high-frequency market data dating back to 2018

#### Import dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
path = '/content/drive/My Drive/Colab Notebooks/g-research-crypto/input/'
crypto_df = pd.read_csv(path + 'train.csv')
asset_details = pd.read_csv(path + 'asset_details.csv')

In [3]:
crypto_df.head(5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [4]:
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24236806 entries, 0 to 24236805
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  int64  
 1   Asset_ID   int64  
 2   Count      float64
 3   Open       float64
 4   High       float64
 5   Low        float64
 6   Close      float64
 7   Volume     float64
 8   VWAP       float64
 9   Target     float64
dtypes: float64(8), int64(2)
memory usage: 1.8 GB


### Data features
The data set includes these ten features and our job is to predict target which is residual log-return for the asset.
- **timestamp**: All timestamps are returned as second Unix timestamps (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiple of 60, indicating minute-by-minute data.
- **Asset_ID**: The asset ID corresponding to one of the crytocurrencies (e.g. Asset_ID = 1 for Bitcoin). The mapping from Asset_ID to crypto asset is contained in asset_details.csv.
- **Count**: Total number of trades in the time interval (last minute).
- **Open**: Opening price of the time interval (in USD).
- **High**: Highest price reached during time interval (in USD).
- **Low**: Lowest price reached during time interval (in USD).
- **Close**: Closing price of the time interval (in USD).
- **Volume**: Quantity of asset bought or sold, displayed in base currency USD.
- **VWAP**: The average price of the asset over the time interval, weighted by  volume. VWAP is an aggregated form of trade data.
- **Target**: Residual log-returns for the asset over a 15 minute horizon.

With the given asset details file, we can view the asset information including asset_id, name and the weight of each asset used to weigh their relative importance in the evaluation metric. 

In [5]:
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


### Pre-processing
#### Dealing with missing data  



In [6]:
# Check whether there is missing data in each column
crypto_df.isnull().sum()

timestamp         0
Asset_ID          0
Count             0
Open              0
High              0
Low               0
Close             0
Volume            0
VWAP              9
Target       750338
dtype: int64

In [15]:
crypto_df[crypto_df['Asset_ID'] == 1]['Target'].mean()

-1.5599925745642943e-06

In [26]:
asset_id_list = list(crypto_df.Asset_ID.unique()) # unique asset_id

for n in asset_id_list:
  asset_mean = crypto_df[crypto_df['Asset_ID'] == n]['Target'].mean() # Get mean of target for each asset_id
  print(asset_mean)
  #crypto_df['Target'] = crypto_df.apply(lambda x: x['Target'] = asset_mean if x['Asset_ID'] == n)['Target']
  #crypto_df.apply(lambda row: row['Target'] = asset_mean if row['Asset_ID'] == n)
  crypto_df['Target'].apply(lambda row: asset_mean if row['Asset_ID'] == n)


SyntaxError: ignored

In [12]:
def asset_target_mean(Asset_ID, Target):
  asset_id_list = list(crypto_df.Asset_ID.unique()) # unique asset_id
  asset_mean_dict = {} # create a dictionary with asset_id and its target mean value
  for n in asset_id_list:
    asset_mean_dict[n] = crypto_df[crypto_df['Asset_ID'] == n]['Target'].mean()

  if Target == None:
    if Asset_ID == 2:
      asset_mean = asset_mean_dict[2]
  
    elif Asset_ID == 0:
      asset_mean = asset_mean_dict[0]
  
    elif Asset_ID == 1:
      asset_mean = asset_mean_dict[1]

    elif Asset_ID == 5:
      asset_mean = asset_mean_dict[5]

    elif Asset_ID == 7:
      asset_mean = asset_mean_dict[7]

    elif Asset_ID == 6:
      asset_mean = asset_mean_dict[6]

    elif Asset_ID == 9:
      asset_mean = asset_mean_dict[9]

    elif Asset_ID == 11:
      asset_mean = asset_mean_dict[11]

    elif Asset_ID == 13:
      asset_mean = asset_mean_dict[13]

    elif Asset_ID == 12:
      asset_mean = asset_mean_dict[12]
  
    elif Asset_ID == 3:
      asset_mean = asset_mean_dict[3]
  
    elif Asset_ID == 8:
      asset_mean = asset_mean_dict[8]
  
    elif Asset_ID == 10:
      asset_mean = asset_mean_dict[10]

    elif Asset_ID == 4:
      asset_mean = asset_mean_dict[4]
  
  return asset_mean


In [13]:
mong = crypto_df[crypto_df['Asset_ID'] == 10]
mong['Target'] = mong[['Asset_ID', 'Target']].apply(lambda x : asset_target_mean(*x), axis=1)

UnboundLocalError: ignored

In [34]:
crypto_df['Target'] = crypto_df[['Asset_ID', 'Target']].apply(lambda x : asset_target_mean(*x) if x.isna() = True, axis=1)

SyntaxError: ignored

In [29]:
asset_mean_dict

{0: 3.1839348870061344e-05,
 1: -1.5599925745642943e-06,
 2: -4.636348354151073e-06,
 3: 1.7660349100923204e-06,
 4: 4.23621783365305e-05,
 5: -1.235017783129087e-06,
 6: -1.5690491770938035e-06,
 7: 3.745600992189751e-05,
 8: 1.6199659292060184e-05,
 9: -1.381605575862578e-05,
 10: -2.475019116877511e-06,
 11: 7.652282638036138e-06,
 12: -1.0203574027402607e-05,
 13: 9.131855947045975e-06}

### EDA
#### Data distribution

In [5]:
crypto_df.groupby('Asset_ID').count()

Unnamed: 0_level_0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target
Asset_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1942619,1942619,1942619,1942619,1942619,1942619,1942619,1942619,1929204
1,1956282,1956282,1956282,1956282,1956282,1956282,1956282,1956282,1955978
2,1953537,1953537,1953537,1953537,1953537,1953537,1953537,1953537,1948676
3,1791867,1791867,1791867,1791867,1791867,1791867,1791867,1791867,1773136
4,1156866,1156866,1156866,1156866,1156866,1156866,1156866,1156866,1011892
5,1955140,1955140,1955140,1955140,1955140,1955140,1955140,1955140,1952838
6,1956200,1956200,1956200,1956200,1956200,1956200,1956200,1956200,1955860
7,1951127,1951127,1951127,1951127,1951127,1951127,1951127,1951127,1941801
8,1592071,1592071,1592071,1592071,1592071,1592071,1592071,1592071,1398965
9,1956030,1956030,1956030,1956030,1956030,1956030,1956030,1956030,1955509


#### Correlation between assets