# Getting Data

## Loading Libraries

In [1]:
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup # for web scraping
import requests    # for requesting html
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import os
import shutil
from tqdm.auto import tqdm
import re
from csv import reader

## Scraping data source 1

In [2]:
# Names of the raw series pulled from source 1.
# source1: https://bitinfocharts.com/bitcoin/
raw_values = np.array([
    'transactions', 'size', 'sentbyaddress', 'difficulty', 'hashrate',
    'mining_profitability', 'sentinusd', 'transactionfees',
    'median_transaction_fee', 'confirmationtime', 'transactionvalue',
    'marketcap', 'tweets', 'google_trends', 'mediantransactionvalue',
    'activeaddresses', 'top100cap', 'fee_to_reward', 'price',
])

# Short codes of the technical indicators derived from the raw values for each period.
technical_indicators = np.array('sma ema wma std mom var trx rsi roc'.split())

# Look-back windows, in days, kept as strings.
period_in_days = np.array([str(days) for days in (3, 7, 14, 30, 90)])

In [3]:
# One bitinfocharts comparison page per raw series; the series name is
# also used as the column name later on.
URLs = [
    'https://bitinfocharts.com/comparison/' + 'bitcoin' + '-' + feature + '.html#alltime'
    for feature in raw_values
]
feature_names = list(raw_values)


In [4]:
# Pair every feature name with the URL it is scraped from.
details = dict(Features=feature_names, URLs=URLs)
details_df = pd.DataFrame(details)

# Empty frame with one column per feature; the loop below appends a
# '<feature>Date' column for each of the original columns.
# NOTE(review): `features` is not referenced again in the visible notebook.
features = pd.DataFrame(columns=details_df['Features'])
print('Building URLs ...')
for feature in tqdm(list(features.columns)):
    date = feature + 'Date'
    features[date] = date

Building URLs ...


  0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
# Download every source-1 series and left-merge them, one column per feature,
# onto the 'Date' column of the first series.
df_merge = 0
print('Requesting data..............')

# A single session is shared by all requests so the retry policy is configured once.
s = requests.Session()
retry = Retry(connect=10, backoff_factor=3)
#https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
adapter = HTTPAdapter(max_retries=retry)
# Every scraped URL is https://, so the adapter must be mounted on that prefix;
# mounting it on 'http://' alone left the retry policy unused.
s.mount('https://', adapter)
s.mount('http://', adapter)
#https://www.kite.com/python/docs/requests.adapters.HTTPAdapter

for i in tqdm(range(len(details_df))):
    url = details_df.URLs[i]
    # Fail fast on a hung connection or an HTTP error instead of parsing an error page.
    page = s.get(url, timeout=60)
    page.raise_for_status()

    # The series is read from the fifth <script> element of the page.
    soup = BeautifulSoup(page.content, 'html').find_all('script')[4].string

    match = re.search(r'd = new Dygraph\(document.getElementById\(\"container\"\)\,(.*)', soup)
    # https://stackoverflow.com/questions/65403953/webscraping-js-elements-with-soup-isnt-working
    # https://stackoverflow.com/questions/27881366/regular-expressions-and777
    if match is None:
        raise ValueError('Dygraph data block not found in ' + url)

    # Keep everything before the '{labels:' options object, minus its last two
    # characters, then turn each '[new Date("<date>"),<value>]' record into
    # '<date>;<value>'.
    data = match.group(1).split('{labels:')[0]
    data = data[0:len(data)-2]
    data = re.sub(r'\[new Date\(\"', '', data)
    data = re.sub(r'\"\)\,', ';', data)
    data = re.sub(r'\]\]', '', data)
    data = re.sub(r'\[', '', data).split('],')

    df = pd.DataFrame(list(reader(data)))
    df.columns = ['name']
    col = details_df.Features[i]
    # `n` must be passed by keyword: recent pandas releases reject it positionally.
    df[['Date', col]] = df.name.str.split(';', n=1, expand=True)
    df.drop(['name'], axis=1, inplace=True)

    # 'Date' holds 'YYYY/MM/DD' strings, so this is a label slice on string keys.
    # https://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates
    df = df.set_index(['Date'])
    df_new = df.loc['2010/07/17':'2022/07/17'].reset_index()

    if i == 0:
        df_merge = df_new
    else:
        df_merge = pd.merge(df_merge, df_new, on='Date', how='left')

df_merged = pd.DataFrame(df_merge)

Requesting data..............


  0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Persist the merged source-1 frame as a comma-separated file, without the index.
df_merged.to_csv('btc_data_raw.csv', index=False, sep=',')

In [7]:
# Load the file written by the previous cell back into a fresh frame.
data_raw = pd.read_csv(filepath_or_buffer='btc_data_raw.csv')

In [8]:
# The scraped 'price' column is referred to as 'avg_price' from here on.
data_raw = data_raw.rename(columns={'price': 'avg_price'})

## Scraping data source 2

In [9]:
# Fear and Greed Index
# source2 (background article): https://www.businessinsider.in/cryptocurrency/news/crypto-fear-and-greed-index-shows-extreme-fear-is-ruling-the-markets-is-this-an-opportunity-or-warning-sign-for-investors/articleshow/88890883.cms
# The values themselves are downloaded from the alternative.me endpoint below.
# BeautifulSoup, requests, pandas and csv.reader are already imported in the
# first cell of the notebook, so they are not imported again here.
url = 'https://api.alternative.me/fng/?limit=1500&format=csv&date_format=us'

# Fail fast on a hung connection or an HTTP error instead of parsing an error body.
fng_response = requests.get(url, timeout=60)
fng_response.raise_for_status()

# Name the parser explicitly: the next cell reads `soup.p`, which relies on the
# lxml parser wrapping the plain-text response in a <p> element. Leaving the
# choice to BeautifulSoup makes the result depend on which parsers are installed.
soup = BeautifulSoup(fng_response.content, 'lxml')

In [10]:
from datetime import datetime

# Split the response text into lines and discard the first four and last five,
# keeping only the CSV records.
fng_rows = soup.p.text.split('\n')[4:-5]
df_fg = pd.DataFrame(list(reader(fng_rows)))
df_fg.columns = ['Date', 'Value', 'Classification']

# Convert 'MM-DD-YYYY' dates to the 'YYYY/MM/DD' strings used by the other
# sources, then order the rows by that date string.
df_fg['Date'] = pd.to_datetime(df_fg['Date'], format='%m-%d-%Y').dt.strftime('%Y/%m/%d')
df_fg = df_fg.sort_values(by=['Date'])
data2 = df_fg

In [11]:
# Wrap the Fear and Greed frame in a new DataFrame object under the name used
# by the following cells.
data_raw_2 = pd.DataFrame(data2)

In [12]:
# Only the numeric index value is kept; the text label is discarded.
data_raw_2.drop(columns='Classification', inplace=True)

In [13]:
# Preview the first two rows.
data_raw_2.iloc[:2]

Unnamed: 0,Date,Value
1499,2018/03/29,18
1498,2018/03/30,12


In [14]:
# Join on 'Date' with merge's default inner join, so only dates present in
# both frames are kept.
data_raw = data_raw.merge(data_raw_2, on='Date')

In [15]:
# Give the Fear and Greed value a descriptive column name.
# NOTE(review): 'fear_gear_index' looks like a typo for "fear_greed_index"; it is
# kept unchanged because later consumers of the CSV may rely on this name.
data_raw = data_raw.rename(columns={'Value': 'fear_gear_index'})

## Scraping data source 3

In [16]:
# source3 : https://investpy.readthedocs.io/_api/crypto.html
# %pip installs into the environment of the running kernel. The version is
# pinned to the one this notebook was last run with so re-runs are reproducible.
%pip install investpy==1.0.8

Collecting investpy
  Downloading investpy-1.0.8.tar.gz (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.6 MB/s 
[?25hCollecting Unidecode>=1.1.1
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 60.6 MB/s 
Collecting lxml>=4.4.1
  Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 45.4 MB/s 
Building wheels for collected packages: investpy
  Building wheel for investpy (setup.py) ... [?25l[?25hdone
  Created wheel for investpy: filename=investpy-1.0.8-py3-none-any.whl size=4481592 sha256=9dc9245f1c2aef0dcc463f88162136e614c3ffbe7e998bf5fddfbabc115da319
  Stored in directory: /root/.cache/pip/wheels/96/a8/a5/0d33c72eaf00b41df7b9dc1e15d2b7c7154b3f1379ed350211
Successfully built investpy
Installing collected packages: Unidecode, lxml, investpy
  Attempting uninstall: lxml
    Found existing installation: lxml 

In [17]:
import investpy

# Daily Bitcoin history from investpy: move the index into a regular column and
# discard the 'Volume' and 'Currency' columns.
data_raw_3 = (
    investpy.crypto.get_crypto_historical_data(
        crypto='bitcoin',
        from_date='01/01/2010',
        to_date='31/05/2022',
    )
    .reset_index()
    .drop(columns=['Volume', 'Currency'])
)

In [18]:
# Pass data_raw_3 through the DataFrame constructor again; the name is unchanged.
# NOTE(review): presumably redundant if investpy already returns a DataFrame — confirm.
data_raw_3 = pd.DataFrame(data_raw_3)

In [19]:
import datetime as dt

# Render 'Date' as 'YYYY/MM/DD' strings so it matches the merge key of data_raw.
# (`.dt` below is the pandas datetime accessor, not the `dt` module alias.)
data_raw_3 = data_raw_3.assign(Date=data_raw_3['Date'].dt.strftime('%Y/%m/%d'))

In [20]:
# Join on 'Date' with merge's default inner join, so only dates present in
# both frames are kept.
data_raw = data_raw.merge(data_raw_3, on='Date')

## Scraping data source 4

In [21]:
#source4:  https://data.nasdaq.com/data/BCHAIN/MIREV-bitcoin-miners-revenue
# %pip installs into the environment of the running kernel. The version is
# pinned to the one this notebook was last run with so re-runs are reproducible.
%pip install Quandl==3.7.0

Collecting Quandl
  Downloading Quandl-3.7.0-py2.py3-none-any.whl (26 kB)
Collecting inflection>=0.3.1
  Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection, Quandl
Successfully installed Quandl-3.7.0 inflection-0.5.1


In [22]:
import quandl

# The API key is read from the environment so that it is never stored in the
# notebook. The key that used to be hardcoded here has been shared with every
# copy of this file and should be revoked.
quandl_api_key = os.environ.get('QUANDL_API_KEY')
if not quandl_api_key:
    raise RuntimeError('Set the QUANDL_API_KEY environment variable before running this cell.')

# Download the BCHAIN/MIREV dataset and expose its index as a 'Date' column
# formatted as 'YYYY/MM/DD', the merge key used by data_raw.
data_raw_4 = quandl.get("BCHAIN/MIREV", authtoken=quandl_api_key)
data_raw_4.reset_index(inplace=True)
data_raw_4['Date'] = data_raw_4['Date'].dt.strftime('%Y/%m/%d')

In [23]:
# Give the dataset's generic 'Value' column a descriptive name.
data_raw_4 = data_raw_4.rename(columns={'Value': 'miners_revenue'})

In [24]:
# Pass data_raw_4 through the DataFrame constructor again; the name is unchanged.
# NOTE(review): presumably redundant if quandl already returns a DataFrame — confirm.
data_raw_4 = pd.DataFrame(data_raw_4)

In [25]:
# Join on 'Date' with merge's default inner join, so only dates present in
# both frames are kept.
data_raw = data_raw.merge(data_raw_4, on='Date')

## Other features

In [26]:
# Coins in supply = market capitalisation divided by the average price.
data_raw['coins_in_supply'] = data_raw['marketcap'].div(data_raw['avg_price'])

In [27]:
# 'marketcap' is dropped now that 'coins_in_supply' has been derived from it.
data_raw = data_raw.drop(columns=['marketcap'])

### Note:
Market capitalization is the product of the average Bitcoin price and the number of BTC coins in supply. The new feature `coins_in_supply` is therefore derived by dividing `marketcap` by `avg_price`, and the `marketcap` column is then dropped from the dataset.

## Target Shifting

In [28]:
# Start the target column as a copy of 'Close'; the next cell shifts it.
data_raw = data_raw.assign(next_day_BTC_price=data_raw['Close'])

In [29]:
# Move every value up one row so each row holds the following row's 'Close'.
# The last row has no successor and receives the placeholder value 1.
shifted_close = data_raw['next_day_BTC_price'].shift(periods=-1, fill_value=1)
data_raw['next_day_BTC_price'] = shifted_close.to_numpy()

In [30]:
# Remove the final row, whose target is only the fill placeholder.
data_raw = data_raw.iloc[:-1]

In [31]:
# Display the merged frame, including the shifted 'next_day_BTC_price' target.
data_raw

Unnamed: 0,Date,transactions,size,sentbyaddress,difficulty,hashrate,mining_profitability,sentinusd,transactionfees,median_transaction_fee,...,fee_to_reward,avg_price,fear_gear_index,Open,High,Low,Close,miners_revenue,coins_in_supply,next_day_BTC_price
0,2018/03/29,192259,564549.0,287598,3.462542e+12,2.436379e+19,0.597,4.772992e+09,1.068,0.146,...,1.413,7548.0,18,7955.2,7980.7,6954.9,7129.2,1.362733e+07,1.693143e+07,6853.7
1,2018/03/30,185778,524121.0,291035,3.462542e+12,2.779461e+19,0.510,5.500617e+09,0.958,0.129,...,1.257,6908.0,12,7132.4,7288.1,6603.8,6853.7,1.393713e+07,1.693312e+07,6938.2
2,2018/03/31,148489,555531.0,220797,3.462542e+12,2.457113e+19,0.463,3.468629e+09,0.950,0.127,...,1.240,7025.0,16,6856.5,7221.5,6804.9,6938.2,1.149622e+07,1.693489e+07,6825.2
3,2018/04/01,134967,375247.0,206387,3.494289e+12,2.531043e+19,0.551,3.626222e+09,0.925,0.128,...,0.896,6826.0,16,6939.1,7047.7,6460.1,6825.2,1.375806e+07,1.693711e+07,7068.4
4,2018/04/02,168022,474973.0,246937,3.511061e+12,2.752811e+19,0.502,6.054217e+09,0.892,0.120,...,1.086,7004.0,11,6825.4,7121.3,6787.6,7068.4,1.371990e+07,1.693851e+07,7424.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1493,2022/05/03,277745,596713.0,498779,2.979441e+13,2.216073e+20,0.164,7.698587e+10,1.332,0.434,...,1.021,38264.0,27,38515.0,38647.0,37513.0,37718.0,3.626107e+07,1.902964e+07,39688.0
1494,2022/05/04,232899,663673.0,433541,2.979441e+13,2.261539e+20,0.159,1.135850e+11,2.102,0.598,...,1.365,38809.0,21,37717.0,40021.0,37660.0,39688.0,3.587458e+07,1.903026e+07,36544.0
1495,2022/05/05,315602,858315.0,500341,2.979441e+13,2.193657e+20,0.176,2.037304e+11,2.722,0.896,...,2.227,38452.0,27,39686.0,39833.0,36183.0,36544.0,3.864103e+07,1.903134e+07,36009.0
1496,2022/05/06,286745,779808.0,437045,2.979441e+13,2.208980e+20,0.148,1.853635e+11,2.139,0.822,...,1.876,36159.0,22,36540.0,36646.0,35267.0,36009.0,3.318364e+07,1.903244e+07,35468.0


## Saving data to file

In [32]:
# Write the final dataset as a comma-separated file, without the index.
data_raw.to_csv('btc_raw_1498_27.csv', index=False, sep=',')