In [1]:
# PROJECT/IMPLEMENTATION IDEAS
# 1. Historical market cap, % bitcoin (requires crawling https://coinmarketcap.com/historical/)
# 2. Twitter sentiment analysis of Bitcoin vs price
# 3. Bitcoin vs. altcoin market caps
# 4. Transaction fees, user/address count(?), other features(?) vs bitcoin price
# 5. Usage by country
# 6. Bitcoin trading by exchange
# 7. Bitcoin trading by currency
# 8. Increase in leverage?

In [2]:
# Standard library
import datetime
import json
import os
import re
import sys
import time
from pprint import pprint

# Third-party
import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import requests
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup
from pandas.core import datetools
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

%matplotlib inline

# APIs
import quandl
# import coinmarketcap

  from pandas.core import datetools


In [3]:
# Quandl API calls
# FIXME(security): the literal fallback below is an API key committed to the
# notebook. Rotate it and supply the key through the QUANDL_API_KEY environment
# variable; the fallback is kept only so existing runs behave as before.
QUANDL_API_KEY = os.environ.get('QUANDL_API_KEY', 'pvPBMBW8afR_HqVfio9o')
QUANDL_URL = 'https://www.quandl.com/api/v3/datasets/{code}.csv?api_key={key}'


def read_quandl(code):
    """Download one Quandl dataset and return it as a DataFrame.

    Parameters
    ----------
    code : str
        Dataset path as it appears in the Quandl URL, e.g. 'BCHAIN/DIFF'.

    Returns
    -------
    pandas.DataFrame
        The dataset's CSV export as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(QUANDL_URL.format(code=code, key=QUANDL_API_KEY))


def read_nasdaq_index(code, value_name):
    """Download a NASDAQOMX index and reduce it to the columns used downstream.

    Renames 'Trade Date' to 'Date' and 'Index Value' to ``value_name``, then
    drops the 'High', 'Low', 'Total Market Value' and 'Dividend Market Value'
    columns.
    """
    unused = ['High', 'Low', 'Total Market Value', 'Dividend Market Value']
    return (
        read_quandl(code)
        .rename(columns={'Trade Date': 'Date', 'Index Value': value_name})
        # `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
        .drop(unused, axis=1)
    )


df_price = read_quandl('BNC3/GWA_BTC')    # Price, volume
df_eth = read_quandl('GDAX/ETH_USD')      # ETH Price, volume
df_fees = read_quandl('BCHAIN/TRFUS')     # Txn fees
df_cost = read_quandl('BCHAIN/CPTRA')     # cost per txn
df_no = read_quandl('BCHAIN/NTRAN')       # num txns
df_noblk = read_quandl('BCHAIN/NTRBL')    # txns per block
df_blksz = read_quandl('BCHAIN/AVBLS')    # blk size
df_unq = read_quandl('BCHAIN/NADDU')      # unique addys
df_hash = read_quandl('BCHAIN/HRATE')     # hash rate
df_diff = read_quandl('BCHAIN/DIFF')      # difficulty

df_nasdaq = read_nasdaq_index('NASDAQOMX/COMP', 'Nasdaq')  # NASDAQ Composite
df_gold = read_nasdaq_index('NASDAQOMX/QGLD', 'Gold')      # Nasdaq GOLD Index

In [4]:
# Display the path of the Python interpreter this kernel is running on.
sys.executable

'/Users/brianmcmahon/anaconda3/envs/tensorflow1.4/bin/python'

In [5]:
# Helper functions
def to_currency(int):
    """Format a number as whole millions, e.g. 1234567890 -> '$1,234'.

    The value is floor-divided by 1,000,000 and rendered with a leading '$',
    thousands separators and no decimals.
    """
    # NOTE: the parameter name shadows the builtin `int`; it is kept so that
    # keyword callers are not broken.
    millions = int // 1000000
    return "$" + format(millions, ",.0f")

def to_date(datestring):
    """Parse a date/time string with ``dateutil.parser.parse`` and return the result."""
    return dateutil.parser.parse(datestring)

def list_to_average(list):
    """Return ``list[0] / list[1]``, or 0 when that cannot be computed.

    Parameters
    ----------
    list : sequence
        Two-element container; element 0 is divided by element 1.
        (The name shadows the builtin `list`; kept for compatibility.)

    Returns
    -------
    The quotient, or 0 if the input is not subscriptable, is too short,
    holds non-numeric values, or the division fails.
    """
    try:
        return list[0] / list[1]
    # Only swallow the failures indexing and division can produce; a bare
    # `except:` would also hide KeyboardInterrupt and SystemExit.
    except (TypeError, LookupError, ArithmeticError):
        return 0

def to_log(num):
    """Return the natural logarithm of ``num`` as computed by ``np.log``."""
    natural_log = np.log(num)
    return natural_log

def to_int(num):
    """Convert ``num`` with the builtin ``int`` and return the result."""
    as_integer = int(num)
    return as_integer

def to_month(dt):
    """Return the month number (1-12) of a date string.

    ``dt`` is parsed with ``to_date``. The previous implementation indexed the
    parsed value with ``[1]``, which raises ``TypeError`` because ``datetime``
    objects are not subscriptable; the month is read from ``.month`` instead.
    """
    return to_date(dt).month


In [6]:
# Build one daily frame keyed on 'Date' from the individual Quandl downloads.
# Every merge keeps the newly added frame on the LEFT, so the row order of the
# result follows the last frame merged in (df_gold).

# BTC price/volume. `axis` is passed by keyword because the positional form
# (`drop(labels, 1)`) was removed in pandas 2.0.
df = (
    df_price
    .drop(['Open', 'High', 'Low'], axis=1)
    .rename(columns={'Close': 'BTCPrice', 'Volume': 'BTCVol'})
)

# ETH: its 'Open' column is used as the ETH price.
df = (
    df_eth
    .merge(df, how='inner', on='Date')
    .rename(columns={'Open': 'ETHPrice'})
    .drop(['High', 'Low'], axis=1)
)

# Frames that each contribute a single 'Value' column, paired with the name
# that column takes in the combined frame.
value_frames = [
    (df_fees, 'TxFees'),
    (df_cost, 'CostperTxn'),
    (df_no, 'NoTxns'),
    (df_noblk, 'NoperBlock'),
    (df_blksz, 'AvgBlkSz'),
    (df_unq, 'UniqueAddresses'),
    (df_hash, 'HashRate'),
    (df_diff, 'Difficulty'),
]
for value_frame, column_name in value_frames:
    df = (
        value_frame
        .merge(df, how='inner', on='Date')
        .rename(columns={'Value': column_name})
    )

df = df_nasdaq.merge(df, how='inner', on='Date')
df = df_gold.merge(df, how='inner', on='Date')

# Row counter running from len(df) - 1 in the first row down to 0 in the last.
# In the recorded output rows are newest-first, so 0 is the oldest day.
df['DateNum'] = list(reversed(range(len(df))))

# pd.to_datetime parses the whole column at once; a per-row dateutil pass
# beforehand is unnecessary.
df['Date'] = pd.to_datetime(df['Date'])
# Index by date under the name 'Date2' while keeping 'Date' as a column.
df['Date2'] = df['Date']
df = df.set_index('Date2')

# Calendar features
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter
df['Weekday'] = df['Date'].dt.weekday

df = df[['BTCPrice', 'ETHPrice', 'BTCVol', 'TxFees', 'CostperTxn', 'NoTxns', 'NoperBlock', 'AvgBlkSz',
         'UniqueAddresses', 'HashRate', 'Difficulty', 'Nasdaq', 'Gold', 'DateNum', 'Date', 'Month',
         'Quarter', 'Weekday']]
df_hist = df

print(df_hist.shape)
df_hist.info()

(416, 18)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 416 entries, 2018-01-30 to 2016-05-26
Data columns (total 18 columns):
BTCPrice           416 non-null float64
ETHPrice           416 non-null float64
BTCVol             416 non-null float64
TxFees             416 non-null float64
CostperTxn         416 non-null float64
NoTxns             416 non-null float64
NoperBlock         416 non-null float64
AvgBlkSz           416 non-null float64
UniqueAddresses    416 non-null float64
HashRate           416 non-null float64
Difficulty         416 non-null float64
Nasdaq             416 non-null float64
Gold               416 non-null float64
DateNum            416 non-null int64
Date               416 non-null datetime64[ns]
Month              416 non-null int64
Quarter            416 non-null int64
Weekday            416 non-null int64
dtypes: datetime64[ns](1), float64(13), int64(4)
memory usage: 61.8 KB


In [7]:
# Show the first two rows of df_hist.
df_hist.head(2)

Unnamed: 0_level_0,BTCPrice,ETHPrice,BTCVol,TxFees,CostperTxn,NoTxns,NoperBlock,AvgBlkSz,UniqueAddresses,HashRate,Difficulty,Nasdaq,Gold,DateNum,Date,Month,Quarter,Weekday
Date2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-30,10078.060655,1150.54,447111.190799,1634257.0,96.382352,232176.0,1568.756757,1.018174,532630.0,19151150.0,2603077000000.0,7402.48,179.23,415,2018-01-30,1,1,1
2018-01-29,11335.426909,1209.41,259565.518843,1827743.0,117.430262,213446.0,1350.924051,1.031009,492738.0,20445150.0,2603077000000.0,7466.51,181.04,414,2018-01-29,1,1,0


In [8]:
# Show the last two rows of df_hist.
df_hist.tail(2)

Unnamed: 0_level_0,BTCPrice,ETHPrice,BTCVol,TxFees,CostperTxn,NoTxns,NoperBlock,AvgBlkSz,UniqueAddresses,HashRate,Difficulty,Nasdaq,Gold,DateNum,Date,Month,Quarter,Weekday
Date2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-05-27,486.962703,12.47,762209.731197,25184.016948,7.157532,220093.0,1630.318519,0.823084,364137.0,1357376.0,199312100000.0,4933.5,168.45,1,2016-05-27,5,2,4
2016-05-26,458.279956,12.61,477365.253604,26820.960997,6.389729,247844.0,1795.971014,0.871457,397464.0,1377192.0,199312100000.0,4901.77,172.67,0,2016-05-26,5,2,3


In [9]:
# Weekly market-cap snapshots scraped from coinmarketcap.
# run coinmarketcap_hist.py weekly to generate .json file
# !python coinmarketcap_hist.py
mkt_cap = pd.read_json('data/coinmarketcap_hist2.json').T
mkt_cap['Date'] = mkt_cap.index
# Parse the YYYYMMDD labels strictly. `errors='ignore'` (deprecated in pandas)
# would hand back the unparsed values on failure, and this column becomes the
# index that later cells align on, so a bad label should fail here.
mkt_cap['Date'] = pd.to_datetime(mkt_cap['Date'], format='%Y%m%d')
mkt_cap = mkt_cap.set_index('Date')
mkt_cap = mkt_cap[['BTC', 'ETH', 'Crypto Market Cap']]
mkt_cap.tail()

Unnamed: 0_level_0,BTC,ETH,Crypto Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-12-31,220903949498,69767510695,572573375760
2018-01-07,287582315011,106276577298,822534709108
2018-01-14,236679147035,132809805708,724968825266
2018-01-21,207286525403,107468021289,597208421609
2018-01-28,197709522984,113010221225,578768082167


In [10]:
# Google Trends "bitcoin" interest over time, exported from
# https://trends.google.com/trends/explore?date=2013-04-28%202018-01-31&q=bitcoin
df_goog = (
    pd.read_csv('data/20180131_GoogleTrendsSearchInterest.csv')
    .iloc[2:]  # skip the first two rows of the export
    .rename(columns={'Category: All categories': 'Interest'})
    # The row labels carry the dates; parse them and use them as the index.
    .assign(Date2=lambda frame: pd.to_datetime(frame.index))
    .set_index('Date2')
)

In [11]:
# Put market caps and search interest side by side, aligned on their indexes.
df_mc = pd.concat([mkt_cap, df_goog], axis='columns')
df_mc.tail()

Unnamed: 0,BTC,ETH,Crypto Market Cap,Interest
2017-12-31,220903949498,69767510695,572573375760,48
2018-01-07,287582315011,106276577298,822534709108,45
2018-01-14,236679147035,132809805708,724968825266,56
2018-01-21,207286525403,107468021289,597208421609,38
2018-01-28,197709522984,113010221225,578768082167,30


In [12]:
# Combine the daily frame with the market-cap/search-interest frame.
df_all = pd.concat([df_hist, df_mc], axis=1)
# Carry the last known value forward into the gaps left by the alignment.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df_all = df_all.ffill()
# Discard the first 200 rows.
# NOTE(review): presumably the span before the daily series begin - confirm.
df_all = df_all.iloc[200:, :]
df_all.head()

Unnamed: 0,BTCPrice,ETHPrice,BTCVol,TxFees,CostperTxn,NoTxns,NoperBlock,AvgBlkSz,UniqueAddresses,HashRate,...,Gold,DateNum,Date,Month,Quarter,Weekday,BTC,ETH,Crypto Market Cap,Interest
2016-07-13,656.669626,10.55,871225.091442,34693.7,5.69,219034.0,1531.7,0.77,379937.0,1516960.89,...,227.19,32.0,2016-07-13,7.0,3.0,2.0,10201700000.0,888582000.0,12254600000.0,3
2016-07-14,656.13206,10.49,872834.40947,35233.5,5.06,218093.0,1677.63,0.81,367252.0,1379055.35,...,226.88,33.0,2016-07-14,7.0,3.0,3.0,10201700000.0,888582000.0,12254600000.0,3
2016-07-15,663.841255,11.5,756558.62807,38613.99,5.49,220655.0,1553.9,0.79,367953.0,1506352.77,...,224.0,34.0,2016-07-15,7.0,3.0,4.0,10201700000.0,888582000.0,12254600000.0,3
2016-07-17,663.841255,11.5,756558.62807,38613.99,5.49,220655.0,1553.9,0.79,367953.0,1506352.77,...,224.0,34.0,2016-07-15,7.0,3.0,4.0,10473000000.0,935944000.0,12882700000.0,2
2016-07-18,680.14957,11.35,752528.34601,32557.41,7.59,171013.0,1147.73,0.7,330330.0,1580609.59,...,224.43,35.0,2016-07-18,7.0,3.0,0.0,10473000000.0,935944000.0,12882700000.0,2


In [13]:
# Keep the modelling columns, in this order, and cast every one to float64.
model_columns = [
    'BTCPrice', 'ETHPrice', 'BTCVol', 'Crypto Market Cap', 'CostperTxn', 'TxFees',
    'NoTxns', 'AvgBlkSz', 'UniqueAddresses', 'HashRate', 'Difficulty', 'Nasdaq',
    'Gold', 'Interest', 'DateNum', 'Quarter', 'Month', 'Weekday',
]
df_all = df_all[model_columns].astype(np.float64)


In [14]:
# Add a natural-log copy of each listed column, named 'log' + <column>.
# The list order fixes the order in which the new columns are appended.
log_sources = [
    'BTCPrice', 'Nasdaq', 'ETHPrice', 'Gold', 'Crypto Market Cap', 'Interest',
    'CostperTxn', 'TxFees', 'NoTxns', 'AvgBlkSz', 'UniqueAddresses', 'HashRate',
    'BTCVol', 'Difficulty',
]
for source_column in log_sources:
    df_all['log' + source_column] = df_all[source_column].apply(to_log)

df_all.columns

Index(['BTCPrice', 'ETHPrice', 'BTCVol', 'Crypto Market Cap', 'CostperTxn',
       'TxFees', 'NoTxns', 'AvgBlkSz', 'UniqueAddresses', 'HashRate',
       'Difficulty', 'Nasdaq', 'Gold', 'Interest', 'DateNum', 'Quarter',
       'Month', 'Weekday', 'logBTCPrice', 'logNasdaq', 'logETHPrice',
       'logGold', 'logCrypto Market Cap', 'logInterest', 'logCostperTxn',
       'logTxFees', 'logNoTxns', 'logAvgBlkSz', 'logUniqueAddresses',
       'logHashRate', 'logBTCVol', 'logDifficulty'],
      dtype='object')

In [15]:
# Save df_all to disk as a pickle.
df_all.to_pickle('data/benson_btcsentiment_df.pkl')

In [16]:
# Coinmarketcap: Current
# Scrape the first <table> of the "all coins" page into df_curr.
url = 'https://coinmarketcap.com/all/views/all/'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

tables = soup.find_all("table")

rows = [row for row in tables[0].find_all('tr')]
df_curr = pd.read_html(tables[0].prettify())[0]
# Fix: pickle the frame scraped in this cell. The cell previously pickled `df`,
# a different frame built by an earlier cell, into the 'dfcurr' file.
df_curr.to_pickle('data/benson_btcsentiment_dfcurr.pkl')
df_curr.head()

Unnamed: 0,#,Name,Symbol,Market Cap,Price,Circulating Supply,Volume (24h),% 1h,% 24h,% 7d
0,1,BTC Bitcoin,BTC,"$173,107,347,346",$10280.90,16837762,"$7,568,130,000",0.11%,2.99%,-10.28%
1,2,ETH Ethereum,ETH,"$110,122,993,960",$1131.39,97334247,"$3,609,040,000",0.35%,6.72%,5.84%
2,3,XRP Ripple,XRP,"$44,960,649,146",$1.16,"38,739,142,811 *","$1,050,490,000",0.38%,4.77%,-14.63%
3,4,BCH Bitcoin Cash,BCH,"$25,449,310,034",$1502.09,16942600,"$706,446,000",0.25%,3.93%,-9.88%
4,5,ADA Cardano,ADA,"$13,818,324,858",$0.532969,"25,927,070,538 *","$464,623,000",1.26%,5.75%,-17.06%


In [17]:
# Coinmarketcap scraping: Bitcoin by time period
# Fetch the historical-data page, take its first <table>, index it by parsed
# date, and save it as a pickle.
url = 'https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end=20180129'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")
tables = soup.find_all("table")

rows = list(tables[0].find_all('tr'))
df = pd.read_html(tables[0].prettify())[0]
df['Date'] = df['Date'].apply(to_date)  # date strings -> datetime
df = df.set_index('Date')
df.to_pickle('data/benson_btcsentiment_dfts.pkl')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-29,11755.5,11875.6,11179.2,11296.4,7107360000,197871000000
2018-01-28,11475.3,12040.3,11475.3,11786.3,8350360000,193133000000
2018-01-27,11174.9,11614.9,10989.2,11440.7,7583270000,188054000000
2018-01-26,11256.0,11656.7,10470.3,11171.4,9746200000,189398000000
2018-01-25,11421.7,11785.7,11057.4,11259.4,8873170000,192163000000


In [18]:
# Show the last five rows of df.
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-05-02,116.38,125.6,92.28,105.21,-,1292190000
2013-05-01,139.0,139.89,107.72,116.99,-,1542820000
2013-04-30,144.0,146.93,134.05,139.0,-,1597780000
2013-04-29,134.44,147.49,134.0,144.54,-,1491160000
2013-04-28,135.3,135.98,132.1,134.21,-,1500520000


In [19]:
# Coinmarketcap scraping: Ethereum
# Fetch the historical-data page, take its first <table>, index it by parsed
# date, and save it as a pickle.
# NOTE: this rebinds `df_eth`, the name the Quandl cell also assigns.
url = 'https://coinmarketcap.com/currencies/ethereum/historical-data/?start=20130428&end=20180129'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")
tables = soup.find_all("table")

rows = list(tables[0].find_all('tr'))
df_eth = pd.read_html(tables[0].prettify())[0]
df_eth['Date'] = df_eth['Date'].apply(to_date)  # date strings -> datetime
df_eth = df_eth.set_index('Date')
df_eth.to_pickle('data/benson_btcsentiment_dftseth.pkl')
df_eth.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-29,1246.7,1256.7,1169.08,1182.36,3715080000,121269000000
2018-01-28,1111.78,1257.77,1111.78,1246.01,5372330000,108122000000
2018-01-27,1055.75,1121.98,1042.12,1107.07,3002870000,102651000000
2018-01-26,1052.7,1080.6,996.78,1055.17,3617690000,102334000000
2018-01-25,1063.22,1104.66,1034.74,1056.03,4050190000,103335000000


In [20]:
# Load the sentiment/market-cap log (connect to most recent active file).
btcsa = pd.read_json('data/btcpricesentiment_mc7.json', convert_axes=False).T

# The row labels are timestamps; expose them as a parsed 'Date' column.
btcsa['Date'] = pd.to_datetime(btcsa.index)

# Collapse each raw '<topic>_S' entry to element 0 divided by element 1
# (0 when that cannot be computed), stored under the capitalised name.
for topic in ('bitcoin', 'ethereum', 'blockchain'):
    btcsa[topic.capitalize() + '_S'] = btcsa[topic + '_S'].apply(list_to_average)

kept_columns = ['Date', 'Bitcoin_S', 'Ethereum_S', 'Blockchain_S',
                'BTC', 'ETH', 'LTC', 'ADA', 'EOS', 'Crypto Market Cap']
btcsa = btcsa[kept_columns].sort_values('Date').set_index('Date')
btcsa.to_pickle('data/benson_btcsentiment_dfs.pkl')
print(btcsa.shape)
btcsa.head(2)

(286, 9)


Unnamed: 0_level_0,Bitcoin_S,Ethereum_S,Blockchain_S,BTC,ETH,LTC,ADA,EOS,Crypto Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-28 18:13:00,0.21875,-0.173106,0.437179,"$193,540,756,417","$116,129,045,529","$10,144,386,235","$16,631,593,500","$9,254,599,532","$576,359,759,424"
2018-01-28 18:14:00,0.472064,0.472606,0.395227,"$193,540,756,417","$116,129,045,529","$10,144,386,235","$16,631,593,500","$9,254,599,532","$576,359,759,424"


In [21]:
# Bitcoin futures scraping
# Read the first <table> of the CME Bitcoin futures page.
url = 'http://www.cmegroup.com/trading/equity-index/us-index/bitcoin.html'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

tables = soup.find_all("table")

rows = list(tables[0].find_all('tr'))
# Keep the first five rows only.
# NOTE(review): in the recorded output the fifth row is the table's legend,
# not a contract month - confirm whether four rows were intended.
df = pd.read_html(tables[0].prettify())[0].head(5)
df

Unnamed: 0,Month,Charts,Last,Change,Prior Settle,Open,High,Low,Volume,Hi / Low Limit,Updated
0,FEB 2018,Show Price Chart,10145,+145,10000.0,10020,10175,10020,32.0,10700 / 9300,19:16:49 CT 31 Jan 2018
1,MAR 2018,Show Price Chart,-,-,10010.0,-,-,-,1.0,10750 / 9350,18:07:00 CT 31 Jan 2018
2,APR 2018,Show Price Chart,-,-,10045.0,-,-,-,0.0,10790 / 9390,16:45:00 CT 31 Jan 2018
3,JUN 2018,Show Price Chart,-,-,10105.0,-,-,-,0.0,10850 / 9450,16:45:00 CT 31 Jan 2018
4,Legend: Options Price Chart About This Report,,,,,,,,,,


In [22]:
# Resources
# https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
# http://cs229.stanford.edu/proj2015/029_report.pdf
# http://text-processing.com/ API
# https://trends.google.com/trends/explore?q=bitcoin,ethereum