In [41]:
import os
import pathlib
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [42]:
from utils.datasets import ensure_dataset
# Time units, expressed in seconds.
HOUR = 60 * 60
DAY = 24 * HOUR

# Currency pairs whose price-change features get normalized.
CURRENCIES = [
    'USD-BTC',
    'USD-ETH',
    # Currently disabled pairs:
    # 'USD-XRP',
    # 'USD-LINK',
]

# Timeframe label -> offset in seconds. Within this block only the labels are
# consumed (to build the feature column names below); the offsets are not read.
# NOTE(review): '-12h' is mapped to -8 hours, not -12 hours. The value is kept
# as-is here; confirm against the code that produced the labeled dataset
# whether the label or the offset is the intended one.
TIMEFRAMES = {
    '-12h': -8 * HOUR,
    '12h': 12 * HOUR,
    '1d': DAY,
    '2d': 2 * DAY,
    '7d': 7 * DAY,
    '14d': 14 * DAY,
}

# Feature column names, ordered by timeframe first and currency second.
FEATURES = [
    f'feat-{currency}-change-{label}'
    for label in TIMEFRAMES
    for currency in CURRENCIES
]

# Directory that receives the normalized parquet parts.
OUTPUT_PATH = '../../data/bitcoin_twitter_labeled_normalized/'


In [43]:
# NOTE(review): ensure_dataset is defined in utils.datasets (not shown here).
# Presumably it creates OUTPUT_PATH and, with delete=True, clears any previous
# contents -- confirm before re-running this cell on data you want to keep.
ensure_dataset(OUTPUT_PATH, delete=True)

In [44]:
# One StandardScaler per feature column. Each scaler is fitted incrementally
# with partial_fit, one part file at a time, so only a single part is held in
# memory at once.
scalers = {feat: StandardScaler() for feat in FEATURES}

# sorted(): glob yields entries in filesystem-dependent order; sorting makes
# the fitting order (and therefore the accumulated statistics) reproducible.
files = sorted(pathlib.Path("../../data/bitcoin_twitter_labeled/").glob("part_*.parquet"))
for file in files:
    # Only the feature columns are needed to accumulate mean/variance.
    data = pd.read_parquet(file, columns=FEATURES)
    for feature in FEATURES:
        # Missing values are excluded per feature, independently of the others.
        values = data[feature].dropna().to_numpy()
        if values.size == 0:
            # Nothing to learn from this part for this feature.
            continue
        scalers[feature].partial_fit(values.reshape(-1, 1))

In [45]:
# Report the per-feature statistics accumulated by the fitted scalers.
for feature in FEATURES:
    scaler = scalers[feature]
    print(f'{feature}: mean={scaler.mean_}, var={scaler.var_}')

feat-USD-BTC-change--12h: mean=[-0.00063452], var=[0.00064957]
feat-USD-ETH-change--12h: mean=[-0.00064972], var=[0.00143441]
feat-USD-BTC-change-12h: mean=[0.00100798], var=[0.00083298]
feat-USD-ETH-change-12h: mean=[0.00186295], var=[0.00167703]
feat-USD-BTC-change-1d: mean=[0.00265909], var=[0.0017914]
feat-USD-ETH-change-1d: mean=[0.00495193], var=[0.00385489]
feat-USD-BTC-change-2d: mean=[0.00526322], var=[0.00364077]
feat-USD-ETH-change-2d: mean=[0.01021734], var=[0.00828368]
feat-USD-BTC-change-7d: mean=[0.02043608], var=[0.01416145]
feat-USD-ETH-change-7d: mean=[0.03836635], var=[0.03703305]
feat-USD-BTC-change-14d: mean=[0.04383856], var=[0.03605598]
feat-USD-ETH-change-14d: mean=[0.08239824], var=[0.08814765]


In [46]:
# Apply the fitted scalers to every part file and write the normalized result
# to OUTPUT_PATH.
#
# sorted(): glob yields entries in filesystem-dependent order; sorting makes
# the input -> output numbering reproducible across runs. The sort is
# lexicographic (e.g. part_10 sorts before part_2), so `chunk` is a stable
# running index, not necessarily the input file's own part number.
files = sorted(pathlib.Path("../../data/bitcoin_twitter_labeled/").glob("part_*.parquet"))
for chunk, file in enumerate(files):
    data = pd.read_parquet(file)
    print(f'Processing chunk {chunk}')

    # Keep only rows in which every feature is present. The explicit copy
    # guarantees the column assignments below write to an independent frame
    # rather than to a filtered view of the loaded one.
    data = data.dropna(subset=FEATURES).copy()
    if data.empty:
        # No complete rows in this part: nothing is written for this index.
        continue

    for feature in FEATURES:
        # The scalers operate on 2-D input, so reshape to a single column and
        # flatten the result back before storing it.
        values = data[feature].to_numpy().reshape(-1, 1)
        data[feature] = scalers[feature].transform(values).reshape(-1)

    data.to_parquet(os.path.join(OUTPUT_PATH, f"part_{chunk}.parquet"))

Processing chunk 0
Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
