In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-merged dataset that the rest of the notebook builds on.
merged_csv_path = "../data/merged_datasets.csv"
bitcoin = pd.read_csv(merged_csv_path)

In [3]:
# Discard the saved index column plus the gold / S&P 500 / trend / sentiment
# columns listed below.
columns_to_discard = [
    "Unnamed: 0",
    "Adj Close",
    "gold_close",
    "gold_label",
    "sp500_close",
    "sp500_label",
    "isPartial",
    "neg",
    "neu",
    "pos",
]
bitcoin = bitcoin.drop(columns=columns_to_discard)

In [4]:
# Relative daily move: the day's change expressed as a fraction of the open
# (the value is a ratio, it is not multiplied by 100).
bitcoin['Daily_Change_Perc'] = bitcoin["Daily_Change"].div(bitcoin["Open"])

In [5]:
# Rebuild the gold features from the raw gold price file.
gold = pd.read_csv("../data/gold_modified.csv").assign(
    # Close-minus-open move for the day.
    Daily_Change=lambda frame: frame["Close/Last"] - frame["Open"],
    # Same move as a fraction of the open.
    Daily_Change_Perc=lambda frame: frame["Daily_Change"] / frame["Open"],
    # 1 when the day closed above its open, otherwise 0.
    Increased=lambda frame: np.where(frame["Daily_Change"] > 0, 1, 0),
    # Normalise dates to 'YYYY-MM-DD' strings so they can be joined on "Date".
    Date=lambda frame: pd.to_datetime(frame["Date"]).dt.strftime('%Y-%m-%d'),
)
gold_columns_to_keep = ["Date", "Close/Last", "Open", "Daily_Change", "Daily_Change_Perc", "Increased"]
gold_needed_columns = gold.loc[:, gold_columns_to_keep]

In [6]:
# Join the gold features onto the bitcoin frame by date. merge() defaults to an
# inner join, so dates missing from either frame are dropped. Only column names
# that exist in both frames receive the "_Gold" suffix.
bitcoin = pd.merge(
    bitcoin,
    gold_needed_columns,
    on="Date",
    suffixes=("", "_Gold"),
)

In [7]:
# Rebuild the S&P 500 features from the raw index file.
sp500 = pd.read_csv("../data/sp_500.csv")
sp500_open = sp500["Open"]
sp500_move = sp500["Close/Last"] - sp500_open
sp500['Daily_Change'] = sp500_move
# NOTE(review): a zero "Open" yields +/-inf here; the notebook later counts and
# drops inf values in Daily_Change_Perc_SP500 - presumably this is their origin.
sp500['Daily_Change_Perc'] = sp500_move / sp500_open
# 1 when the day closed above its open, otherwise 0.
sp500["Increased"] = np.where(sp500_move > 0, 1, 0)
# Normalise dates to 'YYYY-MM-DD' strings so they can be joined on "Date".
sp500["Date"] = pd.to_datetime(sp500["Date"]).dt.strftime('%Y-%m-%d')
sp_columns_to_keep = ["Date", "Close/Last", "Open", "Daily_Change", "Daily_Change_Perc", "Increased"]
sp_needed_columns = sp500[sp_columns_to_keep]

In [8]:
# Join the S&P 500 features by date (inner join by default); column names that
# already exist in the bitcoin frame get the "_SP500" suffix on the right side.
bitcoin = pd.merge(
    bitcoin,
    sp_needed_columns,
    on="Date",
    suffixes=("", "_SP500"),
)

In [13]:
# Preview the first five rows of the merged frame.
bitcoin.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,PROC_3,...,Close/Last_Gold,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500
0,2014-10-13,377.92099,397.226013,368.897003,390.414001,35221400,11.86499,1.0,-15.70184,0.079798,...,1230.0,1224.8,5.2,0.004246,1,1874.74,1905.65,-30.91,-0.01622,0
1,2014-10-14,391.691986,411.697998,391.324005,400.869995,38491500,10.455994,1.0,-11.895282,0.106462,...,1234.3,1237.3,-3.0,-0.002425,0,1877.7,1877.11,0.59,0.000314,1
2,2014-10-15,400.954987,402.22699,388.765991,394.77301,25267100,-6.096985,0.0,-9.263747,0.042858,...,1244.8,1233.3,11.5,0.009325,1,1862.49,1874.18,-11.69,-0.006237,0
3,2014-10-17,382.756012,385.477997,375.389008,383.757996,13600700,1.201996,1.0,-6.948665,-0.042687,...,1239.0,1239.5,-0.5,-0.000403,0,1886.76,1864.91,21.85,0.011716,1
4,2014-10-20,389.230988,390.084015,378.252014,382.845001,16419000,-6.700989,0.0,-3.828066,-0.002379,...,1244.7,1239.1,5.6,0.004519,1,1904.01,1885.62,18.39,0.009753,1


In [12]:
# Give the two columns that reached the frame without a suffix their "_Gold"
# names (presumably they did not collide with any bitcoin column when the gold
# data was merged, so no suffix was applied to them).
gold_column_names = {'Close/Last': 'Close/Last_Gold', 'Increased': "Increased_Gold"}
bitcoin = bitcoin.rename(columns=gold_column_names)

In [14]:
# set y label
# The target for each row is the direction indicator of the *following* row, so
# that the features of one row are used to predict the next move.
# shift(-1) pulls the next row's value up into the current row; the previous
# shift(1) did the opposite and labelled every row with the direction of the
# row before it (i.e. a value from the past).
# The final row has no following row and therefore gets NaN.
bitcoin["label"] = bitcoin["Daily_Change_Ind"].shift(-1)

In [19]:
# Report, for every column, whether it contains missing or infinite values.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for columnName, columnData in bitcoin.items():
    print('Column Name : ', columnName)
    print('Any NAs : ', columnData.isnull().any())
    try:
        print('Any INFs:', np.isinf(columnData).any())
    except TypeError:
        # np.isinf is not defined for non-numeric data (e.g. the string "Date"
        # column); only that failure is tolerated, anything else should surface.
        print("Couldn't check")

Column Name :  Date
Any NAs :  False
Couldn't check
Column Name :  Open
Any NAs :  False
Any INFs: False
Column Name :  High
Any NAs :  False
Any INFs: False
Column Name :  Low
Any NAs :  False
Any INFs: False
Column Name :  Close
Any NAs :  False
Any INFs: False
Column Name :  Volume
Any NAs :  False
Any INFs: False
Column Name :  Daily_Change
Any NAs :  False
Any INFs: False
Column Name :  Daily_Change_Ind
Any NAs :  False
Any INFs: False
Column Name :  MACD
Any NAs :  False
Any INFs: False
Column Name :  PROC_3
Any NAs :  False
Any INFs: False
Column Name :  PROC_5
Any NAs :  False
Any INFs: False
Column Name :  PROC_10
Any NAs :  False
Any INFs: False
Column Name :  wpr
Any NAs :  False
Any INFs: False
Column Name :  sto_os
Any NAs :  False
Any INFs: False
Column Name :  goog_trend_score
Any NAs :  False
Any INFs: False
Column Name :  count
Any NAs :  False
Any INFs: False
Column Name :  compound
Any NAs :  False
Any INFs: False
Column Name :  retweets_count
Any NAs :  False
Any IN

In [20]:
# Replace missing values in compound_weighted_replies with 0.
replies_filled = bitcoin['compound_weighted_replies'].fillna(0)
bitcoin['compound_weighted_replies'] = replies_filled

In [21]:
# Count the missing values left in compound_weighted_replies.
bitcoin['compound_weighted_replies'].isna().sum()

0

In [22]:
# Drop every row that still has a missing value, printing the row count
# before and after so the number of removed rows is visible.
rows_before = bitcoin.shape[0]
print(rows_before)
bitcoin = bitcoin.dropna()
rows_after = bitcoin.shape[0]
print(rows_after)

1578
1577


In [28]:
# Count how many S&P 500 relative-change values are +inf or -inf.
bitcoin['Daily_Change_Perc_SP500'].isin([np.inf, -np.inf]).sum()

10

In [32]:
# Keep only the rows whose S&P 500 relative change is finite or NaN, i.e.
# remove the rows where it is +/-inf.
sp500_change_is_inf = np.isinf(bitcoin['Daily_Change_Perc_SP500'])
bitcoin = bitcoin[~sp500_change_is_inf]

In [33]:
# Split the cleaned frame into an 80% training set and a 20% test set.
# NOTE(review): train_test_split shuffles rows by default, so the split is not
# chronological - confirm that is intended for this date-ordered data.
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in each set on every run.
random_state = 12345

train_set, test_set = train_test_split(
    bitcoin,
    test_size=0.2,
    random_state=random_state,
)

In [34]:
# Number of rows in the training set.
train_set.shape[0]

1253

In [35]:
# Number of rows in the test set.
test_set.shape[0]

314

In [36]:
# Persist both splits as CSV files. The DataFrame index is written as well
# (to_csv default), so it appears as the first, unnamed column in each file.
split_outputs = (
    (train_set, "../models/bitcoin_train.csv"),
    (test_set, "../models/bitcoin_test.csv"),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)