In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime

In [2]:
# Load the competition training set into memory in one go and preview the
# first rows. No dtypes are specified, so pandas infers them from the CSV.
train = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [3]:
# Summarise column dtypes, row count and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24236806 entries, 0 to 24236805
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  int64  
 1   Asset_ID   int64  
 2   Count      float64
 3   Open       float64
 4   High       float64
 5   Low        float64
 6   Close      float64
 7   Volume     float64
 8   VWAP       float64
 9   Target     float64
dtypes: float64(8), int64(2)
memory usage: 1.8 GB


In [4]:
# Per-asset metadata file; the displayed columns are Asset_ID, Weight and Asset_Name.
asset_details = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv")
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


### Sorting the asset_details dataframe and adding Asset tickers

In [5]:
# Ticker symbol for every Asset_ID. Keying on the ID (instead of relying on the
# row order of a positional list) keeps the labels correct even if the rows of
# asset_details are reordered, filtered or extended.
ASSET_TICKERS = {
    0: 'BNB',    # Binance Coin
    1: 'BTC',    # Bitcoin
    2: 'BCH',    # Bitcoin Cash
    3: 'ADA',    # Cardano
    4: 'DOGE',   # Dogecoin
    5: 'EOS',    # EOS.IO
    6: 'ETH',    # Ethereum
    7: 'ETC',    # Ethereum Classic
    8: 'MIOTA',  # IOTA
    9: 'LTC',    # Litecoin
    10: 'MKR',   # Maker
    11: 'XMR',   # Monero
    12: 'XLM',   # Stellar
    13: 'TRX',   # TRON
}

# Only move Asset_ID into the index when it is still a column, so this cell can
# be re-run on an already-indexed frame without raising a KeyError.
if 'Asset_ID' in asset_details.columns:
    asset_details = asset_details.set_index('Asset_ID')
asset_details = asset_details.sort_index()

# IDs missing from ASSET_TICKERS get NaN instead of shifting the other labels.
asset_details['Asset_Ticker'] = asset_details.index.map(ASSET_TICKERS)
asset_details

Unnamed: 0_level_0,Weight,Asset_Name,Asset_Ticker
Asset_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4.304065,Binance Coin,BNB
1,6.779922,Bitcoin,BTC
2,2.397895,Bitcoin Cash,BCH
3,4.406719,Cardano,ADA
4,3.555348,Dogecoin,DOGE
5,1.386294,EOS.IO,EOS
6,5.894403,Ethereum,ETH
7,2.079442,Ethereum Classic,ETC
8,1.098612,IOTA,MIOTA
9,2.397895,Litecoin,LTC


Explore the data in the following steps:
1. Make dataframe for each asset
2. Clean up dataframe for each asset. Fill the gaps
3. Visualize Candlesticks for each asset and comment briefly
4. Decompose the data into seasonal, trend, remainder and comment on each component briefly
5. Plot AR, MA, ARMA, ARIMA, etc. using statsmodels.
6. Apply regression models on each individual asset

### Creating training dataframe for each asset

In [6]:
# set_index already returns a new DataFrame and leaves `train` untouched, so an
# extra .copy() beforehand would only duplicate the whole training set in memory.
train_copy = train.set_index("timestamp")

# Asset_ID of each ticker, listed in ascending ID order. Asset_list below is
# built in this same order.
ASSET_ID_BY_TICKER = {
    'BNB': 0,     # Binance Coin
    'BTC': 1,     # Bitcoin
    'BCH': 2,     # Bitcoin Cash
    'ADA': 3,     # Cardano
    'DOGE': 4,    # Dogecoin
    'EOS': 5,     # EOS.IO
    'ETH': 6,     # Ethereum (Ether)
    'ETC': 7,     # Ethereum Classic
    'MIOTA': 8,   # IOTA
    'LTC': 9,     # Litecoin
    'MKR': 10,    # Maker
    'XMR': 11,    # Monero
    'XLM': 12,    # Stellar
    'TRX': 13,    # TRON
}

# One timestamp-indexed frame per asset, selected by Asset_ID.
train_by_ticker = {
    ticker: train_copy[train_copy['Asset_ID'] == asset_id]
    for ticker, asset_id in ASSET_ID_BY_TICKER.items()
}

# Keep the individual names so the rest of the notebook can refer to them.
train_BNB = train_by_ticker['BNB']
train_BTC = train_by_ticker['BTC']
train_BCH = train_by_ticker['BCH']
train_ADA = train_by_ticker['ADA']
train_DOGE = train_by_ticker['DOGE']
train_EOS = train_by_ticker['EOS']
train_ETH = train_by_ticker['ETH']
train_ETC = train_by_ticker['ETC']
train_MIOTA = train_by_ticker['MIOTA']
train_LTC = train_by_ticker['LTC']
train_MKR = train_by_ticker['MKR']
train_XMR = train_by_ticker['XMR']
train_XLM = train_by_ticker['XLM']
train_TRX = train_by_ticker['TRX']

# dicts preserve insertion order, so this is the same BNB ... TRX ordering.
Asset_list = list(train_by_ticker.values())

### Creating function to explore timeline, missing data, and to impute missing data for each asset

In [7]:
##Explore function with asset dataframe as input
def explore(Asset):
    """Print the time span and the most frequent timestamp gaps of one asset.

    Parameters
    ----------
    Asset : pandas.DataFrame
        Frame with an ``Asset_ID`` column and an integer index of timestamps
        (the index values are rendered as ``datetime64[s]``). The asset is
        identified from the first row only.

    Returns
    -------
    None
        Everything is reported through ``print``. The asset name is looked up
        in the notebook-level ``asset_details`` frame by its index.
    """
    asset_id = int(Asset.Asset_ID.values[0])

    # Display name of the asset whose index entry in asset_details equals the ID
    name = asset_details.loc[asset_details.index == asset_id, 'Asset_Name'].values[0]

    stamps = Asset.index
    first, last = (stamps[pos].astype('datetime64[s]') for pos in (0, -1))
    print(f"The timeline for {name} is from \n{first} to {last}\n")

    # Difference between neighbouring timestamps; any value other than the
    # regular spacing (60 in the data shown) marks missing rows for this asset.
    gaps = (stamps[1:] - stamps[:-1]).value_counts().head(10)
    print(f"The gap values for {name} are\n", gaps,"\n")
    

##Impute function with asset dataframe as input and imputed asset dataframe as output
def impute(Asset, step=60):
    """Fill timestamp gaps of one asset by forward-filling onto a regular grid.

    The frame is reindexed onto every ``step``-spaced timestamp between its
    first and last index value (both inclusive). Timestamps that were missing
    take the values of the closest earlier row. This applies to every column of
    the frame, not only the price columns.

    Parameters
    ----------
    Asset : pandas.DataFrame
        Frame with an ``Asset_ID`` column and an integer timestamp index.
        ``reindex(..., method='pad')`` needs that index to be sorted ascending.
    step : int, default 60
        Spacing of the regular grid, in the same unit as the index.

    Returns
    -------
    pandas.DataFrame
        The gap-free frame. ``Asset`` itself is not modified.
    """
    #Asset ID
    ID = int(Asset.Asset_ID.values[0])

    #tickername (looked up in the notebook-level asset_details frame)
    name = asset_details[asset_details.index==ID]['Asset_Name'].values[0]

    #DataFrame after imputation; "+ step" makes the range include the last timestamp
    after = Asset.reindex(range(Asset.index[0], Asset.index[-1] + step, step), method='pad')

    #Gaps before imputation (Asset is only read here, so no defensive copy is needed)
    before_gaps = (Asset.index[1:] - Asset.index[:-1]).value_counts().head(10)

    #Gaps after imputation
    after_gaps = (after.index[1:] - after.index[:-1]).value_counts().head(10)

    print(f"Gaps before imputation for {name}:\n", before_gaps, "\n")
    print(f"Gaps after imputation for {name}:\n", after_gaps, "\n")

    return after

In [8]:
# Rebinding the loop variable would throw each imputed frame away, so collect
# the results instead. Asset_list_imputed has the same order as Asset_list,
# and the raw frames in Asset_list are left untouched.
Asset_list_imputed = []
for Asset in Asset_list:
    explore(Asset)
    Asset_list_imputed.append(impute(Asset))

The timeline for Binance Coin is from 
2018-01-01T00:01:00 to 2021-09-21T00:00:00

The gap values for Binance Coin are
 60     1936018
120       5774
180        510
240        117
300         50
360         25
420         18
600         13
480         13
720         10
Name: timestamp, dtype: int64 

Gaps before imputation for Binance Coin:
 60     1936018
120       5774
180        510
240        117
300         50
360         25
420         18
600         13
480         13
720         10
Name: timestamp, dtype: int64 

Gaps after imputation for Binance Coin:
 60    1956959
Name: timestamp, dtype: int64 

The timeline for Bitcoin is from 
2018-01-01T00:01:00 to 2021-09-21T00:00:00

The gap values for Bitcoin are
 60     1956136
120         78
180         12
240         11
420          9
360          4
600          4
480          3
720          3
840          3
Name: timestamp, dtype: int64 

Gaps before imputation for Bitcoin:
 60     1956136
120         78
180         12
240         1

### Plotting Candlesticks

In [9]:
import plotly.graph_objects as go

def candlesticks(Asset, start: int, end: int, title: str = None):
    """Show an interactive candlestick chart for a positional slice of an asset.

    Parameters
    ----------
    Asset : pandas.DataFrame
        Frame with ``Open``, ``High``, ``Low`` and ``Close`` columns whose index
        holds timestamps in seconds (converted with ``pd.to_datetime(unit='s')``).
    start, end : int
        Positional bounds passed to ``Asset.iloc[start:end]``. ``end`` is
        exclusive, and negative values count from the end of the frame.
    title : str, optional
        Figure title. No title is set when omitted.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    window = Asset.iloc[start: end]
    timeline = pd.to_datetime(window.index, unit='s')
    fig = go.Figure(data=[go.Candlestick(x=timeline, open=window['Open'], high=window['High'], low=window['Low'], close=window['Close'])])
    # Label the axes so the chart can be read on its own when skimming the notebook.
    fig.update_layout(xaxis_title="Time", yaxis_title="Price")
    if title is not None:
        fig.update_layout(title=title)
    fig.show()

In [10]:
# Plot the most recent 1000 rows. The end bound of iloc is exclusive, so an end
# of -1 would drop the latest row; len(train_BTC) keeps it.
candlesticks(train_BTC, -1000, len(train_BTC))