This notebook takes the raw data from bitcoin_dataset.csv, constructs the $X$ and $y$ on which we train our models, and saves them as a new dataset in selected_features.csv.

Each row of our data corresponds to the prediction of one day's bitcoin price. Consider row $n$. The columns of $X$ hold each of our selected features for days $n$ through $n + optimal\_days - 1$, i.e. a window of $optimal\_days$ consecutive days. The output $y_n$ is the price of bitcoin on day $n + optimal\_days$, the day immediately after that window.

In [1]:
#Pkg.add("DataStructures")
using DataFrames
import DataStructures.OrderedDict

In [2]:
# Enter the data directory only when we are not already inside it. An
# unconditional cd("./data") fails when this cell is re-run in the same kernel,
# because the working directory is then already ./data and has no nested "data".
basename(pwd()) == "data" || cd("./data")
# Load the raw dataset; the path is relative to the data directory entered above.
data = readtable("bitcoin_dataset.csv")
# Preview the first rows to confirm the columns were read as expected.
head(data)

Unnamed: 0,Date,btc_market_price,btc_total_bitcoins,btc_market_cap,btc_trade_volume,btc_blocks_size,btc_avg_block_size,btc_n_orphaned_blocks,btc_n_transactions_per_block,btc_median_confirmation_time,btc_hash_rate,btc_difficulty,btc_miners_revenue,btc_transaction_fees,btc_cost_per_transaction_percent,btc_cost_per_transaction,btc_n_unique_addresses,btc_n_transactions,btc_n_transactions_total,btc_n_transactions_excluding_popular,btc_n_transactions_excluding_chains_longer_than_100,btc_output_volume,btc_estimated_transaction_volume,btc_estimated_transaction_volume_usd
1,2009-01-03 00:00:00,0.0,50.0,0.0,0.0,0.0,0.000285,0.0,1.0,0.0,4.97102696296e-08,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,50.0,0.0,0.0
2,2009-01-05 00:00:00,0.0,50.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2009-01-07 00:00:00,0.0,50.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2009-01-09 00:00:00,0.0,750.0,0.0,0.0,0.0,0.000215,0.0,1.0,0.0,6.95943774815e-07,1.0,0.0,0.0,0.0,0.0,14.0,14.0,15.0,14.0,14.0,700.0,0.0,0.0
5,2009-01-11 00:00:00,0.0,7600.0,0.0,0.0,0.0,0.000215386792453,0.0,1.0,0.0,5.26928858074e-06,1.0,0.0,0.0,0.0,0.0,106.0,106.0,152.0,106.0,106.0,5300.0,0.0,0.0
6,2009-01-13 00:00:00,0.0,17800.0,0.0,0.0,0.0,0.00021807826087,0.0,1.0,0.0,5.71668100741e-06,1.0,0.0,0.0,57500.0,0.0,117.0,116.0,363.0,116.0,116.0,5778.0,10.0,0.0


In [3]:
"""
    historical(feature, days)

Build a matrix of sliding windows over column `feature` of the global `data` table.

Row `i` contains the `days` consecutive values at records `i` through
`i + days - 1`. Window starts run over `1:(length - days)`, so the result has
`length - days` rows and `days` columns.
"""
function historical(feature, days)
    # Underlying `.data` array of the requested column.
    series = data[feature].data
    n_windows = length(series) - days
    return [series[start + offset] for start in 1:n_windows, offset in 0:(days - 1)]
end

historical

In [10]:
# Columns of the raw dataset that are used as model inputs.
selected_features = [
    :btc_market_price, :btc_total_bitcoins, :btc_trade_volume,
    # :btc_blocks_size,   (currently commented out)
    :btc_hash_rate, :btc_n_unique_addresses
];
# Number of consecutive historical records per feature in each training row.
optimal_days = 14;

In [11]:
# One window matrix per selected feature, placed side by side so each feature
# contributes optimal_days adjacent columns.
X = hcat(map(feature -> historical(feature, optimal_days), selected_features)...)

1567×70 Array{Float64,2}:
    0.0      0.0      0.0      0.0      0.0   …     195.0     105.0     122.0
    0.0      0.0      0.0      0.0      0.0         105.0     122.0     129.0
    0.0      0.0      0.0      0.0      0.0         122.0     129.0     127.0
    0.0      0.0      0.0      0.0      0.0         129.0     127.0     129.0
    0.0      0.0      0.0      0.0      0.0         127.0     129.0     129.0
    0.0      0.0      0.0      0.0      0.0   …     129.0     129.0     135.0
    0.0      0.0      0.0      0.0      0.0         129.0     135.0     127.0
    0.0      0.0      0.0      0.0      0.0         135.0     127.0     119.0
    0.0      0.0      0.0      0.0      0.0         127.0     119.0     141.0
    0.0      0.0      0.0      0.0      0.0         119.0     141.0     123.0
    0.0      0.0      0.0      0.0      0.0   …     141.0     123.0     124.0
    0.0      0.0      0.0      0.0      0.0         123.0     124.0     129.0
    0.0      0.0      0.0      0.0    

In [12]:
# Target vector: the market price from record optimal_days + 1 onward, so that
# y[i] is the price right after the window stored in row i of X.
y = data[:btc_market_price].data[(optimal_days + 1):end]

1567-element Array{Float64,1}:
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    0.0 
    ⋮   
 3407.23
 3357.33
 3632.51
 4125.55
 4217.03
 4328.73
 4222.66
 4043.72
 4174.95
 4363.05
 4354.31
 4607.99

In [13]:
# Column names "<feature without its first four characters>_<day index>".
# Every selected feature starts with "btc_", which is what [5:end] strips.
# The day index varies down the rows of the 2-D comprehension, so vec (column-major)
# lists all days of one feature before moving on to the next feature.
headers = vec(["$(string(feature)[5:end])_$(i)" for i in 1:optimal_days, feature in selected_features])

70-element Array{String,1}:
 "market_price_1"       
 "market_price_2"       
 "market_price_3"       
 "market_price_4"       
 "market_price_5"       
 "market_price_6"       
 "market_price_7"       
 "market_price_8"       
 "market_price_9"       
 "market_price_10"      
 "market_price_11"      
 "market_price_12"      
 "market_price_13"      
 ⋮                      
 "n_unique_addresses_3" 
 "n_unique_addresses_4" 
 "n_unique_addresses_5" 
 "n_unique_addresses_6" 
 "n_unique_addresses_7" 
 "n_unique_addresses_8" 
 "n_unique_addresses_9" 
 "n_unique_addresses_10"
 "n_unique_addresses_11"
 "n_unique_addresses_12"
 "n_unique_addresses_13"
 "n_unique_addresses_14"

In [14]:
# Build one named column per header from the matching column of X.
# OrderedDict is presumably used so the DataFrame keeps the columns in header
# order rather than an arbitrary dictionary order — TODO confirm.
df = DataFrame(OrderedDict(zip(headers, (X[:,i] for i in 1:size(X,2)))))
# Append the prediction target y as an extra column named output_price.
df[:output_price] = y
# Preview the first rows of the assembled table.
head(df)

Unnamed: 0,market_price_1,market_price_2,market_price_3,market_price_4,market_price_5,market_price_6,market_price_7,market_price_8,market_price_9,market_price_10,market_price_11,market_price_12,market_price_13,market_price_14,total_bitcoins_1,total_bitcoins_2,total_bitcoins_3,total_bitcoins_4,total_bitcoins_5,total_bitcoins_6,total_bitcoins_7,total_bitcoins_8,total_bitcoins_9,total_bitcoins_10,total_bitcoins_11,total_bitcoins_12,total_bitcoins_13,total_bitcoins_14,trade_volume_1,trade_volume_2,trade_volume_3,trade_volume_4,trade_volume_5,trade_volume_6,trade_volume_7,trade_volume_8,trade_volume_9,trade_volume_10,trade_volume_11,trade_volume_12,trade_volume_13,trade_volume_14,hash_rate_1,hash_rate_2,hash_rate_3,hash_rate_4,hash_rate_5,hash_rate_6,hash_rate_7,hash_rate_8,hash_rate_9,hash_rate_10,hash_rate_11,hash_rate_12,hash_rate_13,hash_rate_14,n_unique_addresses_1,n_unique_addresses_2,n_unique_addresses_3,n_unique_addresses_4,n_unique_addresses_5,n_unique_addresses_6,n_unique_addresses_7,n_unique_addresses_8,n_unique_addresses_9,n_unique_addresses_10,n_unique_addresses_11,n_unique_addresses_12,n_unique_addresses_13,n_unique_addresses_14,output_price
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,750.0,7600.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.97102696296e-08,0.0,0.0,6.95943774815e-07,5.26928858074e-06,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,1.0,0.0,0.0,14.0,106.0,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,750.0,7600.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,125550.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.95943774815e-07,5.26928858074e-06,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,6.41262478222e-06,0.0,0.0,14.0,106.0,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,129.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,750.0,7600.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,125550.0,137700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.95943774815e-07,5.26928858074e-06,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,6.41262478222e-06,6.31320424296e-06,0.0,14.0,106.0,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,129.0,127.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750.0,7600.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,125550.0,137700.0,150650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.95943774815e-07,5.26928858074e-06,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,6.41262478222e-06,6.31320424296e-06,6.11436316444e-06,14.0,106.0,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,129.0,127.0,129.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7600.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,125550.0,137700.0,150650.0,163150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26928858074e-06,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,6.41262478222e-06,6.31320424296e-06,6.11436316444e-06,6.41262478222e-06,106.0,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,129.0,127.0,129.0,129.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17800.0,30450.0,41650.0,52650.0,64100.0,71200.0,90850.0,101800.0,113050.0,125550.0,137700.0,150650.0,163150.0,176150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.71668100741e-06,6.31320424296e-06,5.36870912e-06,5.91552208593e-06,5.61726046815e-06,3.33058806519e-06,9.64379230815e-06,5.12015777185e-06,6.06465289481e-06,6.41262478222e-06,6.31320424296e-06,6.11436316444e-06,6.41262478222e-06,6.66117613037e-06,117.0,136.0,109.0,120.0,115.0,68.0,195.0,105.0,122.0,129.0,127.0,129.0,129.0,135.0,0.0


In [15]:
# Save the assembled table as CSV. The path is relative, so the file is written
# to the current working directory, which the data-loading cell set to ./data.
writetable("selected_features.csv", df)