In [110]:
# Importing dependencies
import pandas as pd
import numpy as np
import OpenBlender
import datetime
import time
#from sqlalchemy import create_engine
#from config import db_password


In [156]:
#df2 = pd.read_csv('bitcoin_data.csv')
#df2.columns

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
       'Volume_(Currency)', 'Weighted_Price'],
      dtype='object')

In [146]:

# Read in files
df = pd.read_csv('bitcoin_data.csv')

# Display initial data
df.head(10)

# keep timestamp, high, low, weighted_price
df = df[['Timestamp', 'High', 'Low', 'Volume_(BTC)', 'Weighted_Price', 'Open', 'Close', 'Volume_(BTC)']].reset_index(drop=True)

# preprocessing notes-- consider omitting data prior to Jan 1st, 2012 due to higher volatility 
# earlier in bitcoin's lifecycle

# converting Timestamp column to str datatype
df['str_timestamp'] = [str(timestamp) for timestamp in df['Timestamp']]
df['int_timestamp'] = df['str_timestamp'].astype('int32', copy=True)

# filtering df to only include rows after 01/01/2012
df_cleaned_filtered = df.loc[df['int_timestamp'] >= 1325391360]
df_cleaned_filtered.head(2)

# context manager to suppress 1 time SettingWithCopyWarning; alternatively call .loc after timestamp conversion to avoid error
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    df_cleaned_filtered['UTC_time'] = df_cleaned_filtered['int_timestamp'].apply(lambda x: datetime.datetime.utcfromtimestamp(x))

# dropping all null values, followed by evaluation of which time periods had the greatest data loss
final_df = df_cleaned_filtered.dropna().reset_index(drop=True)

# finding percentage of data loss (null values)
btotalLength = (len(df_cleaned_filtered))
bpartialLength = (len(final_df))
bValuesDropped = btotalLength - bpartialLength
bitstampDiff = (btotalLength - bpartialLength) / btotalLength * 100 
totalDays = round(btotalLength/1440,2)
daysDropped = round(bValuesDropped/1440,2)

print(f"Percentage of total Bitstamp data lost due to dropping rows with NaN values: {bitstampDiff:.2f}%")
print(f"Ratio of Bitstamp rows dropped: {bValuesDropped:,}/{btotalLength:,}")
print(f"Total dataset contains {totalDays} days worth of data. Due to data loss, we had to drop {daysDropped} days worth of data")

# Machine learning section
# possible implementation ideas:
# 1. time series forecasting bitcoin price with ARIMA(Auto Regressive Integrated Moving Average)
# Rational for using time series: the 4 components of time series are trends, seasonality (consistent trends that span 1 calender year), noise/irregularity, and cyclicity(trends that can be observed in <1 or >1 year); this is perfect for tradeable assets 
# Qualifying criteria: the dataset must be stationary in order to more accurately fit a time series model. Stationary means that there should be a constant mean with constant std deviation or variance
# We can test if a time series is stationary by plotting the moving avg and/or movning std dev. to see if it changes over time. These results can be supported with an ADCF test
# if testing reveals our time series is NOT stationary, we can perform a variety of transformations to change it (log transform, sq rt, exponential decay etc..)
# SARIMA is ARIMA with a seasonality component. 


# ### PGAdmin SQL Schema
# db schema thoughts: can create multiple tables for each currency (1 for btc, 1 ethereum, 1 xrp)
# can also add in additional tables down the line to support our analysis (table with google analytics, table with twitter search results for sentiment analysis)

final_df.head(5)

# Rename columns to be sql friendly
final_df.rename({'Timestamp':'timestamp',
                  'High':'high',
                  'Low':'low',
                  'Volume_(BTC)':'volume',
                  'Weighted_Price':'weightedPrice',
                  'str_timestamp':'timestampSTR',
                  'int_timestamp':'timestampINT',
                  'UTC_time':'timeUTC',
                  'time_delta':'timeDelta'
                 }, axis='columns', inplace=True)


# Checking datatypes
print(final_df.dtypes)
final_df.head(2)

Percentage of total Bitstamp data lost due to dropping rows with NaN values: 27.14%
Ratio of Bitstamp rows dropped: 1,240,496/4,571,033
Total dataset contains 3174.33 days worth of data. Due to data loss, we had to drop 861.46 days worth of data
timestamp                 int64
high                    float64
low                     float64
volume                  float64
weightedPrice           float64
Open                    float64
Close                   float64
timestampSTR             object
timestampINT              int32
timeUTC          datetime64[ns]
dtype: object


Unnamed: 0,timestamp,high,low,volume,weightedPrice,Open,Close,timestampSTR,timestampINT,timeUTC
0,1325391360,4.58,4.58,1.502,4.58,4.58,4.58,1325391360,1325391360,2012-01-01 04:16:00
1,1325431680,4.84,4.84,10.0,4.84,4.84,4.84,1325431680,1325431680,2012-01-01 15:28:00


In [147]:
final_df['normalised_date'] = final_df['timeUTC'].dt.normalize()
final_df.head()

Unnamed: 0,timestamp,high,low,volume,weightedPrice,Open,Close,timestampSTR,timestampINT,timeUTC,normalised_date
0,1325391360,4.58,4.58,1.502,4.58,4.58,4.58,1325391360,1325391360,2012-01-01 04:16:00,2012-01-01
1,1325431680,4.84,4.84,10.0,4.84,4.84,4.84,1325431680,1325431680,2012-01-01 15:28:00,2012-01-01
2,1325457900,5.0,5.0,10.1,5.0,5.0,5.0,1325457900,1325457900,2012-01-01 22:45:00,2012-01-01
3,1325534640,5.0,5.0,19.048,5.0,5.0,5.0,1325534640,1325534640,2012-01-02 20:04:00,2012-01-02
4,1325591100,5.32,5.32,2.419173,5.32,5.32,5.32,1325591100,1325591100,2012-01-03 11:45:00,2012-01-03


In [148]:
final_df['date_only'] = final_df['normalised_date']
# aggregated_df = final_df.groupby(final_df.normalised_date.dt.date, as_index=False).mean()

# use this line to keep dates as Series
#aggregated_df = final_df.groupby(final_df.normalised_date.dt.date, as_index=True).mean().reset_index()

# dataframe with Time Series index
aggregated_df = final_df.groupby(final_df.normalised_date.dt.date, as_index=True).mean()

In [149]:
aggregated_df

Unnamed: 0_level_0,timestamp,high,low,volume,weightedPrice,Open,Close,timestampINT
normalised_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,1.325427e+09,4.806667,4.806667,7.200667,4.806667,4.806667,4.806667,1.325427e+09
2012-01-02,1.325535e+09,5.000000,5.000000,19.048000,5.000000,5.000000,5.000000,1.325535e+09
2012-01-03,1.325605e+09,5.252500,5.252500,11.004660,5.252500,5.252500,5.252500,1.325605e+09
2012-01-04,1.325682e+09,5.223333,5.200000,11.914807,5.208159,5.200000,5.223333,1.325682e+09
2012-01-05,1.325771e+09,6.286190,6.281429,4.514373,6.284127,6.281429,6.286190,1.325771e+09
...,...,...,...,...,...,...,...,...
2020-09-10,1.599739e+09,10343.558552,10334.291588,7.119214,10339.359896,10338.759708,10338.760864,1.599739e+09
2020-09-11,1.599826e+09,10306.071361,10299.283936,3.660672,10303.242947,10302.414327,10302.451926,1.599826e+09
2020-09-12,1.599912e+09,10375.469839,10370.388713,1.754713,10373.259995,10372.770434,10372.583252,1.599912e+09
2020-09-13,1.599999e+09,10396.386007,10390.076996,3.056612,10393.662602,10393.230099,10392.960134,1.599999e+09


In [115]:
date_price_df = aggregated_df[['weightedPrice']]
daily_deltas = date_price_df.pct_change(periods=1)
# monthly_deltas = date_price_df.pct_change(freq='M')
#aggregated_df['pct_change'] = aggregated_df.pct_change('weightedPrice', periods=1, index='normalised_date')

In [116]:
date_price_df['daily_pct_change'] = daily_deltas
date_price_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_price_df['daily_pct_change'] = daily_deltas


Unnamed: 0_level_0,weightedPrice,daily_pct_change
normalised_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01,4.806667,
2012-01-02,5.000000,0.040222
2012-01-03,5.252500,0.050500
2012-01-04,5.208159,-0.008442
2012-01-05,6.284127,0.206593
...,...,...
2020-09-10,10339.359896,0.014840
2020-09-11,10303.242947,-0.003493
2020-09-12,10373.259995,0.006796
2020-09-13,10393.662602,0.001967


In [117]:
len(date_price_df)

3177

In [118]:
# Creating the time-series index
n=len(date_price_df)
index = pd.date_range('2012-01-01', periods = n,freq='D')
index

DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03', '2012-01-04',
               '2012-01-05', '2012-01-06', '2012-01-07', '2012-01-08',
               '2012-01-09', '2012-01-10',
               ...
               '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-05',
               '2020-09-06', '2020-09-07', '2020-09-08', '2020-09-09',
               '2020-09-10', '2020-09-11'],
              dtype='datetime64[ns]', length=3177, freq='D')

In [119]:
date_price_df = date_price_df.set_index(index)

In [120]:
monthly

Unnamed: 0,weightedPrice,monthly_pct_change
2012-01-31,5.614045,0.012452
2012-02-29,4.952722,-0.007795
2012-03-30,4.836740,0.001873
2012-04-30,4.950282,0.009644
2012-05-31,5.136571,0.004594
...,...,...
2020-05-29,9585.242823,0.004041
2020-06-30,9093.863232,-0.007879
2020-07-31,11255.501070,-0.006904
2020-08-31,11039.063579,-0.046404


In [130]:
# finding monthly % change
monthly = date_price_df.resample('BM').apply(lambda x: x[-1])
monthly.drop(columns=['daily_pct_change'], inplace=True)
monthly['monthly_pct_change'] = monthly.pct_change()
monthly

Unnamed: 0,weightedPrice,monthly_pct_change
2012-01-31,5.614045,
2012-02-29,4.952722,-0.117798
2012-03-30,4.836740,-0.023418
2012-04-30,4.950282,0.023475
2012-05-31,5.136571,0.037632
...,...,...
2020-05-29,9585.242823,0.067412
2020-06-30,9093.863232,-0.051264
2020-07-31,11255.501070,0.237703
2020-08-31,11039.063579,-0.019229


In [129]:
three_months= date_price_df.resample('3M').apply(lambda x: x[-1])
three_months.drop(columns=['daily_pct_change'], inplace=True)
#three_months['quarterly_pct_change'] = three_months['daily_pct_change']

three_months['quarterly_pct_change'] = three_months.pct_change()
three_months

Unnamed: 0,weightedPrice,quarterly_pct_change
2012-01-31,5.614045,
2012-04-30,4.950282,-0.118233
2012-07-31,9.199746,0.858429
2012-10-31,10.826328,0.176807
2013-01-31,20.230478,0.868637
2013-04-30,141.614016,6.000033
2013-07-31,98.923727,-0.301455
2013-10-31,200.809802,1.029946
2014-01-31,801.904549,2.993354
2014-04-30,445.661972,-0.444246


In [131]:
yearly = date_price_df.resample('Y').apply(lambda x: x[-1])
yearly.drop(columns=['daily_pct_change'], inplace=True)
# yearly['yearly_pct_change'] = yearly['daily_pct_change']
yearly['yearly_pct_change'] = yearly.pct_change()
yearly

Unnamed: 0,weightedPrice,yearly_pct_change
2012-12-31,13.196513,
2013-12-31,728.728912,54.221323
2014-12-31,314.514811,-0.568406
2015-12-31,429.069676,0.364227
2016-12-31,1019.992995,1.37722
2017-12-31,14962.379078,13.6691
2018-12-31,3829.104391,-0.744085
2019-12-31,7218.040867,0.885047
2020-12-31,10332.429402,0.431473


In [132]:
date_price_merged = pd.merge(left=date_price_df, right=monthly, how='left')
date_price_merged['dt_index'] = index
date_price_merged

Unnamed: 0,weightedPrice,daily_pct_change,monthly_pct_change,dt_index
0,4.806667,,,2012-01-01
1,5.000000,0.040222,,2012-01-02
2,5.252500,0.050500,,2012-01-03
3,5.208159,-0.008442,,2012-01-04
4,6.284127,0.206593,,2012-01-05
...,...,...,...,...
3172,10339.359896,0.014840,,2020-09-07
3173,10303.242947,-0.003493,,2020-09-08
3174,10373.259995,0.006796,,2020-09-09
3175,10393.662602,0.001967,,2020-09-10


In [133]:
date_price_merged = pd.merge(left=date_price_merged, right=yearly, how='left')
date_price_merged

Unnamed: 0,weightedPrice,daily_pct_change,monthly_pct_change,dt_index,yearly_pct_change
0,4.806667,,,2012-01-01,
1,5.000000,0.040222,,2012-01-02,
2,5.252500,0.050500,,2012-01-03,
3,5.208159,-0.008442,,2012-01-04,
4,6.284127,0.206593,,2012-01-05,
...,...,...,...,...,...
3172,10339.359896,0.014840,,2020-09-07,
3173,10303.242947,-0.003493,,2020-09-08,
3174,10373.259995,0.006796,,2020-09-09,
3175,10393.662602,0.001967,,2020-09-10,


In [135]:
date_price_merged.iloc[59]

weightedPrice                     4.95272
daily_pct_change              -0.00779485
monthly_pct_change              -0.117798
dt_index              2012-02-29 00:00:00
yearly_pct_change                     NaN
Name: 59, dtype: object

In [151]:
log_diffs

Unnamed: 0_level_0,Open,Close
normalised_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01,4.806667,4.806667
2012-01-02,5.000000,5.000000
2012-01-03,5.252500,5.252500
2012-01-04,5.200000,5.223333
2012-01-05,6.281429,6.286190
...,...,...
2020-09-10,10338.759708,10338.760864
2020-09-11,10302.414327,10302.451926
2020-09-12,10372.770434,10372.583252
2020-09-13,10393.230099,10392.960134


In [152]:
log_diffs = aggregated_df[['Open','Close']]
# date_price_merged['daily_log_diff'] = np.log(log_diffs['Close']) - np.log(log_diffs['Open'])
date_price_merged['daily_log_diff'] = [np.log(x)- np.log(y) for x,y in zip(log_diffs['Close'], log_diffs['Open'])]

In [153]:
# log diff can be interpreted as an approximation of "percentage change"
date_price_merged

# for visualization purposes, might be interesting to plot normal weightedPrice vs Time, daily_log_diff vs Time, and a log_diff histogram
# when presenting, can filter by month/year to better observe changes within each year

Unnamed: 0,weightedPrice,daily_pct_change,monthly_pct_change,dt_index,yearly_pct_change,daily_log_diff
0,4.806667,,,2012-01-01,,0.000000e+00
1,5.000000,0.040222,,2012-01-02,,0.000000e+00
2,5.252500,0.050500,,2012-01-03,,0.000000e+00
3,5.208159,-0.008442,,2012-01-04,,4.477142e-03
4,6.284127,0.206593,,2012-01-05,,7.578054e-04
...,...,...,...,...,...,...
3172,10339.359896,0.014840,,2020-09-07,,1.118112e-07
3173,10303.242947,-0.003493,,2020-09-08,,3.649569e-06
3174,10373.259995,0.006796,,2020-09-09,,-1.804566e-05
3175,10393.662602,0.001967,,2020-09-10,,-2.597539e-05


In [155]:
# creating our targets (1 for net pct increase, 0 for no change/decrease)
# our target predicts whether or not there is positive performance
date_price_merged['target'] = [1 if x > 0 else 0 for x in date_price_merged['daily_log_diff']]
date_price_merged

Unnamed: 0,weightedPrice,daily_pct_change,monthly_pct_change,dt_index,yearly_pct_change,daily_log_diff,target
0,4.806667,,,2012-01-01,,0.000000e+00,0
1,5.000000,0.040222,,2012-01-02,,0.000000e+00,0
2,5.252500,0.050500,,2012-01-03,,0.000000e+00,0
3,5.208159,-0.008442,,2012-01-04,,4.477142e-03,1
4,6.284127,0.206593,,2012-01-05,,7.578054e-04,1
...,...,...,...,...,...,...,...
3172,10339.359896,0.014840,,2020-09-07,,1.118112e-07,1
3173,10303.242947,-0.003493,,2020-09-08,,3.649569e-06,1
3174,10373.259995,0.006796,,2020-09-09,,-1.804566e-05,0
3175,10393.662602,0.001967,,2020-09-10,,-2.597539e-05,0


In [157]:
# can use OpenBlender (NLP) to supplement findings with news articles, vectorize input parameters as 1 and 0

In [175]:
# applying ML model and observing results
# importing dependencies
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [169]:
# temporarily dropping monthly/annual log calculations since they contain NaN values
# also dropping very 1st value (cannot calculate deltas from nothing)
date_price_merged2 = date_price_merged.drop(columns=['monthly_pct_change','yearly_pct_change'])
date_price_merged2 = date_price_merged2.iloc[1:]
date_price_merged2

Unnamed: 0,weightedPrice,daily_pct_change,dt_index,daily_log_diff,target
1,5.000000,0.040222,2012-01-02,0.000000e+00,0
2,5.252500,0.050500,2012-01-03,0.000000e+00,0
3,5.208159,-0.008442,2012-01-04,4.477142e-03,1
4,6.284127,0.206593,2012-01-05,7.578054e-04,1
5,6.438999,0.024645,2012-01-06,1.552795e-03,1
...,...,...,...,...,...
3172,10339.359896,0.014840,2020-09-07,1.118112e-07,1
3173,10303.242947,-0.003493,2020-09-08,3.649569e-06,1
3174,10373.259995,0.006796,2020-09-09,-1.804566e-05,0
3175,10393.662602,0.001967,2020-09-10,-2.597539e-05,0


In [172]:
# creating training and testing datasets
df = date_price_merged2
X = df.loc[:,df.columns != 'target'].select_dtypes(include=[np.number]).values
y = df.loc[:,['target']].values

div = int(round(len(X) * 0.2)) # allocating 20% of dataset for testing
X_train = X[:div]
y_train = y[:div]
X_test = X[div:]
y_test= y[div:]

In [173]:
# creating random forest classifier
cls = RandomForestRegressor(n_estimators=1000)
cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)

df_residuals = pd.DataFrame({'y_test':y_test[:, 0], 'y_pred':y_pred})
df_residuals

  cls.fit(X_train, y_train)


Unnamed: 0,y_test,y_pred
0,1,1.0
1,1,1.0
2,1,1.0
3,1,1.0
4,1,1.0
...,...,...
2536,1,0.0
2537,1,1.0
2538,0,0.0
2539,0,0.0


In [177]:
# calculating model's accuracy
threshold = 0.5
predictions = [1 if val > threshold else 0 for val in df_residuals['y_pred']]
print(confusion_matrix(predictions, df_residuals['y_test']))

print('Accuracy score: ')
print(accuracy_score(predictions, df_residuals['y_test']))

print('Precision score: ')
print(precision_score(predictions, df_residuals['y_test']))

[[1209    3]
 [   0 1329]]
Accuracy score: 
0.9988193624557261
Precision score: 
0.9977477477477478


In [178]:
# based on the confusion matrix, there were 1329 total daily price increase events
# our model predicted 1329 + 3 = 1,332 total price increase events, where 3 were False Positives
# there were a total of 1,209 + 3 = 1,212 total price DECREASE events, our model detected 1,209 and missed 3 events

# precision: high precision indicates low false positive rate, also known as ratio of correctly predicted positive observations to all observations in total class
# accuracy: high accuracy indicates the ratio of correct predictions made out of total observations