In [1]:
import pandas as pd
import pymongo
from config import mongodb
import pendulum
import numpy as np

In [2]:
HOUR = 3600
DAY = HOUR * 24
CURRENCIES=[
    'USD-BTC',
    'USD-ETH',
    'USD-XRP',
]

TIMEFRAMES = {
    '-12h': -HOUR * 8,
    '12h': HOUR * 12,
    '1d': DAY,
    '2d': DAY * 2,
    '7d': DAY * 7,
    '14d': DAY * 14,
}

In [3]:
client = mongodb()
collection = client['market']['coinmarketcap']

In [4]:
datasets = []
CURRENCY = CURRENCIES[0]

In [None]:
query = collection[CURRENCY]['1h']\
    .find({}, {'_id': 0, 'timestamp': 1, 'close': 1})\
    .sort([('timestamp', pymongo.ASCENDING)])
market = pd.DataFrame(list(query))
max_date = pendulum.from_timestamp(market.iloc[-1].timestamp)
min_date = pendulum.from_timestamp(market.iloc[0].timestamp)

In [None]:
data = pd.read_parquet('../../data/bitcoin_twitter_raw/part_5.parquet')
data = data.iloc[:10000][['created_at']]
data = data[data['created_at'] < np.datetime64(max_date.subtract(seconds=TIMEFRAMES['14d']))]
data = data[data['created_at'] > np.datetime64(min_date.subtract(seconds=TIMEFRAMES['-12h']))]
datasets.append(data)

In [None]:
row_timestamps = data['created_at'].astype('int64') // 10**9
current_value = np.array(market['close'].iloc[market['timestamp'].searchsorted(row_timestamps)])

labels = {}
for label, delta in TIMEFRAMES.items():
    print(f'Creating labels: {label}')
    new_value = market['close'].iloc[market['timestamp'].searchsorted(row_timestamps + delta)]
    labels[f'{CURRENCY}-change-{label}'] = np.array(new_value - current_value) / current_value

labels = pd.DataFrame(labels)
datasets.append(labels)

In [None]:
data = pd.concat(datasets, axis=1)

In [9]:
labels.describe()

Unnamed: 0,change--12h,change-12h,change-1d,change-2d,change-7d,change-14d
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-0.000548,1.1e-05,-0.000794,-2.8e-05,-0.024366,-0.05287
std,0.023778,0.01595,0.028994,0.030907,0.048262,0.032849
min,-0.042448,-0.046056,-0.069669,-0.080646,-0.106304,-0.116021
25%,-0.01725,-0.010952,-0.026258,-0.011708,-0.062167,-0.084019
50%,0.000139,-0.002329,0.000627,0.00404,-0.044692,-0.036021
75%,0.012422,0.007509,0.014943,0.024559,0.013391,-0.022828
max,0.073684,0.044453,0.063548,0.048383,0.078625,0.026692
