In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm
import tensorflow.keras.backend as K
import os
import time
import pandas as pd
import numpy as np
import psutil
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Ask TensorFlow which GPUs it can see; an empty list selects the CPU branch.
gpu_devices = tf.config.list_physical_devices('GPU')

if not gpu_devices:
    print('Using CPU')
    # No GPU found: turn on the optimizer's JIT compilation instead.
    tf.config.optimizer.set_jit(True)
else:
    print('Using GPU')
    # Enable memory growth on every visible GPU.
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

Using GPU


In [69]:
# Report host RAM usage. A single snapshot is taken so that both figures
# describe the same instant (psutil is already imported in the first cell).
vm_stats = psutil.virtual_memory()
print('used: {}% free: {:.2f}GB'.format(vm_stats.percent, vm_stats.free / 1024**3))

used: 25.9% free: 9.76GB


In [4]:
from pathlib import Path
# Relative path to an HDF5 file, resolved against the current working directory.
# NOTE(review): presumably the local store for prepared model data; it is not
# used in the cells visible here, so confirm against the rest of the notebook.
DATA_STORE = Path('model_data.h5')

In [6]:
# Get News headlines
def get_news():
    """Load every per-ticker news table from S3 into one cleaned DataFrame.

    Reads ``<key>/table`` for each top-level key of
    ``charlanguagemodeldata/news_db.h5``, keeps the creation time, headline
    text and ticker, decodes byte strings to ``str`` and concatenates the
    per-ticker frames.

    AWS credentials are resolved by the default credential chain
    (environment variables, shared credentials file, instance role) rather
    than being embedded in the notebook.

    Returns:
        pandas.DataFrame: indexed by ``time`` (timezone-naive, truncated to
        whole seconds, sorted ascending) with columns ``ticker`` and
        ``headline``. Rows sharing the same ticker and microsecond timestamp
        are collapsed to the first occurrence *before* the index is truncated
        to seconds, so (ticker, second) pairs may still repeat. Each headline
        is wrapped in the start/end markers. If the file holds no tables an
        empty frame with the same layout is returned.
    """
    import h5py
    import s3fs

    s3 = s3fs.S3FileSystem(anon=False, client_kwargs={'region_name': 'us-west-2'})

    frames = []
    # Nested context managers close both the S3 stream and the HDF5 handle.
    with s3.open("charlanguagemodeldata/news_db.h5", 'rb') as remote:
        # NOTE(review): h5py documents this option as `libver`; `lib_version`
        # is kept exactly as before -- confirm it has the intended effect.
        with h5py.File(remote, 'r', lib_version='latest') as f:
            for ticker_key in tqdm(list(f.keys())):
                # `Dataset.value` was removed in h5py 3.0; `[()]` reads the
                # whole dataset into memory.
                temp_df = pd.DataFrame(f[ticker_key + '/table'][()])
                temp_df = temp_df[['versionCreated', 'text', 'ticker']]
                temp_df = temp_df.rename(columns={'versionCreated': 'time', 'text': 'headline'})
                # Object columns hold byte strings; decode them column by column.
                for col in temp_df.columns[temp_df.dtypes == object]:
                    temp_df[col] = temp_df[col].str.decode('utf-8')
                temp_df['time'] = pd.to_datetime(temp_df['time'], unit='ns')
                frames.append(temp_df.set_index('time'))

    if not frames:
        return pd.DataFrame(
            {'ticker': pd.Series(dtype=object), 'headline': pd.Series(dtype=object)},
            index=pd.DatetimeIndex([], name='time'),
        )

    # One concat instead of DataFrame.append per ticker (quadratic, and
    # removed in pandas 2.0).
    data = pd.concat(frames)

    # Truncate to microseconds (what the former '%f' string round trip did)
    # so duplicates are detected on the full recorded time.
    data.index = data.index.floor('us')
    data = (
        data.sort_index()
            .reset_index()
            .drop_duplicates(subset=['ticker', 'time'])  # keep first per (ticker, time)
            .set_index('time')[['ticker', 'headline']]
    )
    # Final index is kept at whole-second resolution.
    data.index = data.index.floor('s')

    # Wrap each headline in start/end markers.
    # NOTE(review): the end marker is the three characters '<', '\', 's' --
    # it looks like '</s>' was intended. The emitted value is unchanged here
    # because later cells may depend on it; confirm before changing.
    data['headline'] = '<s>' + data['headline'] + '<\\s'
    return data

def get_prices(interval, start='2019'):
    """Load the Open/Close price table for one bar interval from S3.

    Reads ``prices/<interval>/table`` from
    ``charlanguagemodeldata/universe.h5``.

    AWS credentials are resolved by the default credential chain
    (environment variables, shared credentials file, instance role).
    Never embed access keys in the notebook.

    Args:
        interval: name of the sub-group under ``prices/`` to read.
        start: lower bound (inclusive) for the ``time`` level, given as a
            timestamp or partial date string. Defaults to ``'2019'``, the
            cutoff this function always used.

    Returns:
        pandas.DataFrame: columns ``Open`` and ``Close`` indexed by
        ``(ticker, time)``, fully sorted, restricted to ``time >= start``
        and with duplicated index entries dropped (first one kept).

    Raises:
        KeyError: if ``prices/<interval>/table`` is not present in the file;
            the message lists the intervals that are available.
    """
    import h5py
    import s3fs

    idx = pd.IndexSlice
    s3 = s3fs.S3FileSystem(anon=False, client_kwargs={'region_name': 'us-west-2'})
    get_columns = ['time', 'ticker', 'Open', 'Close']
    table_path = 'prices/' + interval + '/table'

    # Nested context managers close both the S3 stream and the HDF5 handle.
    with s3.open("charlanguagemodeldata/universe.h5", 'rb') as remote:
        # NOTE(review): h5py documents this option as `libver`; `lib_version`
        # is kept exactly as before -- confirm it has the intended effect.
        with h5py.File(remote, 'r', lib_version='latest') as f:
            if table_path not in f:
                available = list(f['prices'].keys()) if 'prices' in f else []
                raise KeyError(
                    "{!r} not found; available price intervals: {}".format(table_path, available)
                )
            # `Dataset.value` was removed in h5py 3.0; `[()]` reads the whole
            # dataset into memory, so the file can be closed afterwards.
            data = pd.DataFrame(f[table_path][()], columns=get_columns)

    # Object columns hold byte strings; decode them column by column.
    for col in data.columns[data.dtypes == object]:
        data[col] = data[col].str.decode('utf-8')
    data['time'] = pd.to_datetime(data['time'], unit='ns')

    # Slicing on the second index level needs a fully lexsorted MultiIndex,
    # so sort on both levels before applying the time cutoff.
    data = data.set_index(['ticker', 'time']).sort_index()
    data = data.loc[idx[:, start:], :]
    return data[~data.index.duplicated()]

In [7]:
news = get_news()
# Preview the first and last rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
pd.concat([news.head(), news.tail()])

  0%|          | 1/532 [00:03<26:50,  3.03s/it]




  0%|          | 2/532 [00:07<30:45,  3.48s/it]




  1%|          | 3/532 [00:08<23:54,  2.71s/it]




  1%|          | 4/532 [00:12<26:53,  3.06s/it]




  1%|          | 5/532 [00:15<26:16,  2.99s/it]




  1%|          | 6/532 [00:16<21:52,  2.50s/it]




  1%|▏         | 7/532 [00:16<16:23,  1.87s/it]




  2%|▏         | 8/532 [00:18<16:41,  1.91s/it]




  2%|▏         | 9/532 [00:20<16:57,  1.94s/it]




  2%|▏         | 10/532 [00:21<14:30,  1.67s/it]




  2%|▏         | 11/532 [00:23<13:34,  1.56s/it]




  2%|▏         | 12/532 [00:24<12:41,  1.46s/it]




  2%|▏         | 13/532 [00:25<11:02,  1.28s/it]




  3%|▎         | 14/532 [00:26<09:56,  1.15s/it]




  3%|▎         | 15/532 [00:27<09:30,  1.10s/it]




  3%|▎         | 16/532 [00:28<10:26,  1.21s/it]




  3%|▎         | 17/532 [00:30<10:59,  1.28s/it]




  3%|▎         | 18/532 [00:31<10:01,  1.17s/it]




  4%|▎         | 19/532 [00:32<11:14,  1.31s/it]




  4%|▍         | 20/532 [00:33<09:45,  1.14s/it]




  4%|▍         | 21/532 [00:33<07:46,  1.10it/s]




  4%|▍         | 22/532 [00:34<07:52,  1.08it/s]




  4%|▍         | 23/532 [00:35<07:50,  1.08it/s]




  5%|▍         | 24/532 [00:36<08:10,  1.04it/s]




  5%|▍         | 25/532 [00:37<07:46,  1.09it/s]




  5%|▍         | 26/532 [00:38<08:31,  1.01s/it]




  5%|▌         | 27/532 [00:39<08:45,  1.04s/it]




  5%|▌         | 28/532 [00:40<07:58,  1.05it/s]




  5%|▌         | 29/532 [00:41<07:51,  1.07it/s]




  6%|▌         | 30/532 [00:42<07:41,  1.09it/s]




  6%|▌         | 31/532 [00:43<07:29,  1.11it/s]




  6%|▌         | 32/532 [00:44<08:44,  1.05s/it]




  6%|▌         | 33/532 [00:45<07:44,  1.07it/s]




  6%|▋         | 34/532 [00:46<09:17,  1.12s/it]




  7%|▋         | 35/532 [00:47<08:45,  1.06s/it]




  7%|▋         | 36/532 [00:48<08:03,  1.02it/s]




  7%|▋         | 37/532 [00:51<13:54,  1.69s/it]




  7%|▋         | 38/532 [00:52<11:44,  1.43s/it]




  7%|▋         | 39/532 [00:53<10:04,  1.23s/it]




  8%|▊         | 40/532 [00:54<09:56,  1.21s/it]




  8%|▊         | 41/532 [00:56<11:29,  1.40s/it]




  8%|▊         | 42/532 [00:57<10:25,  1.28s/it]




  8%|▊         | 43/532 [00:59<11:02,  1.36s/it]




  8%|▊         | 44/532 [01:00<10:49,  1.33s/it]




  8%|▊         | 45/532 [01:01<09:31,  1.17s/it]




  9%|▊         | 46/532 [01:02<08:59,  1.11s/it]




  9%|▉         | 47/532 [01:02<07:59,  1.01it/s]




  9%|▉         | 48/532 [01:03<08:10,  1.01s/it]




  9%|▉         | 49/532 [01:04<07:44,  1.04it/s]




  9%|▉         | 50/532 [01:06<08:39,  1.08s/it]




 10%|▉         | 51/532 [01:06<07:25,  1.08it/s]




 10%|▉         | 52/532 [01:07<08:11,  1.02s/it]




 10%|▉         | 53/532 [01:08<07:12,  1.11it/s]




 10%|█         | 54/532 [01:09<07:37,  1.04it/s]




 10%|█         | 55/532 [01:10<08:33,  1.08s/it]




 11%|█         | 56/532 [01:11<06:30,  1.22it/s]




 11%|█         | 57/532 [01:14<12:41,  1.60s/it]




 11%|█         | 58/532 [01:17<15:24,  1.95s/it]




 11%|█         | 59/532 [01:18<14:17,  1.81s/it]




 11%|█▏        | 60/532 [01:20<14:57,  1.90s/it]




 11%|█▏        | 61/532 [01:22<13:44,  1.75s/it]




 12%|█▏        | 62/532 [01:24<14:34,  1.86s/it]




 12%|█▏        | 63/532 [01:25<11:41,  1.50s/it]




 12%|█▏        | 64/532 [01:26<11:56,  1.53s/it]




 12%|█▏        | 65/532 [01:28<11:43,  1.51s/it]




 12%|█▏        | 66/532 [01:28<09:47,  1.26s/it]




 13%|█▎        | 67/532 [01:31<12:46,  1.65s/it]




 13%|█▎        | 68/532 [01:32<11:57,  1.55s/it]




 13%|█▎        | 69/532 [01:35<14:03,  1.82s/it]




 13%|█▎        | 70/532 [01:38<17:29,  2.27s/it]




 13%|█▎        | 71/532 [01:39<14:46,  1.92s/it]




 14%|█▎        | 72/532 [01:43<19:10,  2.50s/it]




 14%|█▎        | 73/532 [01:49<27:33,  3.60s/it]




 14%|█▍        | 74/532 [01:53<27:47,  3.64s/it]




 14%|█▍        | 75/532 [01:56<25:54,  3.40s/it]




 14%|█▍        | 76/532 [01:58<22:22,  2.95s/it]




 14%|█▍        | 77/532 [01:59<17:58,  2.37s/it]




 15%|█▍        | 78/532 [02:00<15:03,  1.99s/it]




 15%|█▍        | 79/532 [02:02<16:31,  2.19s/it]




 15%|█▌        | 80/532 [02:04<14:31,  1.93s/it]




 15%|█▌        | 81/532 [02:05<12:50,  1.71s/it]




 15%|█▌        | 82/532 [02:05<10:13,  1.36s/it]




 16%|█▌        | 83/532 [02:07<11:10,  1.49s/it]




 16%|█▌        | 84/532 [02:08<10:15,  1.37s/it]




 16%|█▌        | 85/532 [02:10<10:57,  1.47s/it]




 16%|█▌        | 86/532 [02:11<10:29,  1.41s/it]




 16%|█▋        | 87/532 [02:12<09:36,  1.29s/it]




 17%|█▋        | 88/532 [02:14<10:41,  1.44s/it]




 17%|█▋        | 89/532 [02:15<09:28,  1.28s/it]




 17%|█▋        | 90/532 [02:16<08:14,  1.12s/it]




 17%|█▋        | 91/532 [02:17<07:45,  1.06s/it]




 17%|█▋        | 92/532 [02:17<06:54,  1.06it/s]




 17%|█▋        | 93/532 [02:18<06:20,  1.15it/s]




 18%|█▊        | 94/532 [02:19<07:03,  1.03it/s]




 18%|█▊        | 95/532 [02:21<07:39,  1.05s/it]




 18%|█▊        | 96/532 [02:21<07:22,  1.02s/it]




 18%|█▊        | 97/532 [02:22<06:22,  1.14it/s]




 18%|█▊        | 98/532 [02:23<07:13,  1.00it/s]




 19%|█▊        | 99/532 [02:25<07:50,  1.09s/it]




 19%|█▉        | 100/532 [02:25<07:16,  1.01s/it]




 19%|█▉        | 101/532 [02:26<07:16,  1.01s/it]




 19%|█▉        | 102/532 [02:27<07:08,  1.00it/s]




 19%|█▉        | 103/532 [02:28<06:51,  1.04it/s]




 20%|█▉        | 104/532 [02:30<08:35,  1.21s/it]




 20%|█▉        | 105/532 [02:31<08:37,  1.21s/it]




 20%|█▉        | 106/532 [02:33<08:49,  1.24s/it]




 20%|██        | 107/532 [02:34<09:44,  1.37s/it]




 20%|██        | 108/532 [02:35<08:47,  1.24s/it]




 20%|██        | 109/532 [02:36<08:47,  1.25s/it]




 21%|██        | 110/532 [02:38<09:02,  1.28s/it]




 21%|██        | 111/532 [02:39<09:18,  1.33s/it]




 21%|██        | 113/532 [02:40<06:20,  1.10it/s]





 21%|██▏       | 114/532 [02:42<08:03,  1.16s/it]




 22%|██▏       | 115/532 [02:44<09:03,  1.30s/it]




 22%|██▏       | 116/532 [02:44<06:57,  1.00s/it]




 22%|██▏       | 118/532 [02:45<05:16,  1.31it/s]





 22%|██▏       | 119/532 [02:48<08:08,  1.18s/it]




 23%|██▎       | 120/532 [02:50<10:33,  1.54s/it]




 23%|██▎       | 121/532 [02:51<09:22,  1.37s/it]




 23%|██▎       | 122/532 [02:52<07:55,  1.16s/it]




 23%|██▎       | 123/532 [02:52<06:06,  1.11it/s]




 23%|██▎       | 124/532 [02:53<06:40,  1.02it/s]




 23%|██▎       | 125/532 [02:54<07:41,  1.13s/it]




 24%|██▎       | 126/532 [02:56<07:36,  1.12s/it]




 24%|██▍       | 127/532 [02:57<07:21,  1.09s/it]




 24%|██▍       | 128/532 [02:58<08:45,  1.30s/it]




 24%|██▍       | 129/532 [03:00<10:07,  1.51s/it]




 24%|██▍       | 130/532 [03:01<08:48,  1.31s/it]




 25%|██▍       | 131/532 [03:03<09:42,  1.45s/it]




 25%|██▍       | 132/532 [03:05<11:05,  1.66s/it]




 25%|██▌       | 133/532 [03:07<10:31,  1.58s/it]




 25%|██▌       | 134/532 [03:08<09:44,  1.47s/it]




 25%|██▌       | 135/532 [03:09<08:39,  1.31s/it]




 26%|██▌       | 136/532 [03:10<07:48,  1.18s/it]




 26%|██▌       | 137/532 [03:11<07:54,  1.20s/it]




 26%|██▌       | 138/532 [03:12<08:36,  1.31s/it]




 26%|██▌       | 139/532 [03:14<09:10,  1.40s/it]




 26%|██▋       | 140/532 [03:16<11:17,  1.73s/it]




 27%|██▋       | 141/532 [03:17<09:44,  1.50s/it]




 27%|██▋       | 142/532 [03:18<08:50,  1.36s/it]




 27%|██▋       | 143/532 [03:20<08:30,  1.31s/it]




 27%|██▋       | 144/532 [03:21<07:58,  1.23s/it]




 27%|██▋       | 145/532 [03:22<07:44,  1.20s/it]




 27%|██▋       | 146/532 [03:23<06:51,  1.07s/it]




 28%|██▊       | 147/532 [03:23<06:29,  1.01s/it]




 28%|██▊       | 148/532 [03:25<07:26,  1.16s/it]




 28%|██▊       | 149/532 [03:26<07:37,  1.19s/it]




 28%|██▊       | 150/532 [03:27<07:03,  1.11s/it]




 28%|██▊       | 151/532 [03:28<06:49,  1.08s/it]




 29%|██▊       | 152/532 [03:29<07:04,  1.12s/it]




 29%|██▉       | 153/532 [03:31<07:51,  1.24s/it]




 29%|██▉       | 154/532 [03:32<07:38,  1.21s/it]




 29%|██▉       | 155/532 [03:33<07:56,  1.26s/it]




 29%|██▉       | 156/532 [03:35<07:51,  1.25s/it]




 30%|██▉       | 157/532 [03:36<07:03,  1.13s/it]




 30%|██▉       | 158/532 [03:37<07:16,  1.17s/it]




 30%|██▉       | 159/532 [03:39<08:42,  1.40s/it]




 30%|███       | 160/532 [03:40<08:31,  1.37s/it]




 30%|███       | 161/532 [03:41<08:04,  1.31s/it]




 30%|███       | 162/532 [03:42<07:44,  1.25s/it]




 31%|███       | 163/532 [03:43<07:13,  1.17s/it]




 31%|███       | 164/532 [03:45<07:17,  1.19s/it]




 31%|███       | 165/532 [03:46<07:02,  1.15s/it]




 31%|███       | 166/532 [03:47<06:46,  1.11s/it]




 31%|███▏      | 167/532 [03:48<06:43,  1.11s/it]




 32%|███▏      | 168/532 [03:49<06:31,  1.08s/it]




 32%|███▏      | 169/532 [03:49<05:40,  1.06it/s]




 32%|███▏      | 170/532 [03:50<05:19,  1.13it/s]




 32%|███▏      | 171/532 [03:51<05:10,  1.16it/s]




 32%|███▏      | 172/532 [03:52<05:04,  1.18it/s]




 33%|███▎      | 173/532 [03:53<05:21,  1.12it/s]




 33%|███▎      | 174/532 [03:54<06:25,  1.08s/it]




 33%|███▎      | 175/532 [03:55<05:28,  1.09it/s]




 33%|███▎      | 176/532 [03:56<05:42,  1.04it/s]




 33%|███▎      | 177/532 [03:57<05:51,  1.01it/s]




 33%|███▎      | 178/532 [03:58<06:49,  1.16s/it]





 34%|███▍      | 180/532 [04:00<05:41,  1.03it/s]




 34%|███▍      | 181/532 [04:00<05:03,  1.16it/s]




 34%|███▍      | 182/532 [04:03<08:00,  1.37s/it]




 34%|███▍      | 183/532 [04:04<07:41,  1.32s/it]




 35%|███▍      | 184/532 [04:05<06:13,  1.07s/it]




 35%|███▍      | 185/532 [04:08<09:34,  1.65s/it]




 35%|███▍      | 186/532 [04:09<08:14,  1.43s/it]




 35%|███▌      | 187/532 [04:10<08:52,  1.54s/it]




 35%|███▌      | 188/532 [04:12<09:09,  1.60s/it]




 36%|███▌      | 189/532 [04:14<08:49,  1.54s/it]




 36%|███▌      | 190/532 [04:15<07:45,  1.36s/it]




 36%|███▌      | 191/532 [04:16<07:30,  1.32s/it]




 36%|███▌      | 192/532 [04:17<07:01,  1.24s/it]




 36%|███▋      | 193/532 [04:18<06:46,  1.20s/it]




 36%|███▋      | 194/532 [04:19<05:50,  1.04s/it]




 37%|███▋      | 195/532 [04:19<05:38,  1.00s/it]




 37%|███▋      | 196/532 [04:20<05:09,  1.08it/s]




 37%|███▋      | 197/532 [04:21<05:28,  1.02it/s]




 37%|███▋      | 198/532 [04:22<05:34,  1.00s/it]




 37%|███▋      | 199/532 [04:23<05:32,  1.00it/s]




 38%|███▊      | 200/532 [04:24<05:34,  1.01s/it]




 38%|███▊      | 201/532 [04:25<04:51,  1.13it/s]




 38%|███▊      | 202/532 [04:26<05:52,  1.07s/it]




 38%|███▊      | 203/532 [04:28<06:15,  1.14s/it]




 38%|███▊      | 204/532 [04:29<05:47,  1.06s/it]




 39%|███▊      | 205/532 [04:30<06:40,  1.23s/it]




 39%|███▊      | 206/532 [04:32<08:08,  1.50s/it]




 39%|███▉      | 207/532 [04:35<09:20,  1.72s/it]




 39%|███▉      | 208/532 [04:36<08:37,  1.60s/it]




 39%|███▉      | 209/532 [04:37<07:05,  1.32s/it]




 39%|███▉      | 210/532 [04:38<06:51,  1.28s/it]




 40%|███▉      | 211/532 [04:40<08:21,  1.56s/it]




 40%|███▉      | 212/532 [04:43<10:30,  1.97s/it]




 40%|████      | 213/532 [04:46<12:00,  2.26s/it]




 40%|████      | 214/532 [04:47<09:39,  1.82s/it]




 40%|████      | 215/532 [04:48<08:41,  1.64s/it]




 41%|████      | 216/532 [04:49<08:25,  1.60s/it]




 41%|████      | 217/532 [04:50<07:25,  1.42s/it]




 41%|████      | 218/532 [04:53<09:45,  1.86s/it]




 41%|████      | 219/532 [04:54<07:52,  1.51s/it]




 41%|████▏     | 220/532 [04:56<08:45,  1.69s/it]




 42%|████▏     | 221/532 [04:57<07:52,  1.52s/it]




 42%|████▏     | 222/532 [04:58<06:41,  1.29s/it]




 42%|████▏     | 223/532 [04:59<06:20,  1.23s/it]




 42%|████▏     | 224/532 [05:00<05:51,  1.14s/it]




 42%|████▏     | 225/532 [05:02<07:06,  1.39s/it]




 42%|████▏     | 226/532 [05:03<06:40,  1.31s/it]




 43%|████▎     | 227/532 [05:04<06:30,  1.28s/it]




 43%|████▎     | 228/532 [05:06<06:34,  1.30s/it]




 43%|████▎     | 229/532 [05:07<06:34,  1.30s/it]




 43%|████▎     | 230/532 [05:08<06:39,  1.32s/it]




 43%|████▎     | 231/532 [05:09<05:54,  1.18s/it]




 44%|████▎     | 232/532 [05:11<06:54,  1.38s/it]




 44%|████▍     | 233/532 [05:13<07:36,  1.53s/it]




 44%|████▍     | 234/532 [05:15<08:51,  1.78s/it]




 44%|████▍     | 235/532 [05:16<06:53,  1.39s/it]




 44%|████▍     | 236/532 [05:17<06:44,  1.37s/it]




 45%|████▍     | 237/532 [05:18<05:56,  1.21s/it]




 45%|████▍     | 238/532 [05:19<05:38,  1.15s/it]




 45%|████▍     | 239/532 [05:20<05:29,  1.13s/it]




 45%|████▌     | 240/532 [05:22<06:17,  1.29s/it]




 45%|████▌     | 241/532 [05:23<06:07,  1.26s/it]




 45%|████▌     | 242/532 [05:25<07:36,  1.58s/it]




 46%|████▌     | 243/532 [05:27<08:24,  1.75s/it]




 46%|████▌     | 244/532 [05:28<06:33,  1.37s/it]




 46%|████▌     | 245/532 [05:29<06:15,  1.31s/it]




 46%|████▌     | 246/532 [05:31<07:24,  1.55s/it]




 46%|████▋     | 247/532 [05:32<07:09,  1.51s/it]




 47%|████▋     | 248/532 [05:33<06:25,  1.36s/it]




 47%|████▋     | 249/532 [05:35<07:07,  1.51s/it]




 47%|████▋     | 250/532 [05:38<08:01,  1.71s/it]




 47%|████▋     | 251/532 [05:39<07:08,  1.52s/it]




 47%|████▋     | 252/532 [05:39<06:09,  1.32s/it]




 48%|████▊     | 253/532 [05:40<05:32,  1.19s/it]




 48%|████▊     | 254/532 [05:41<04:38,  1.00s/it]




 48%|████▊     | 255/532 [05:42<04:14,  1.09it/s]




 48%|████▊     | 256/532 [05:43<04:44,  1.03s/it]




 48%|████▊     | 257/532 [05:44<04:29,  1.02it/s]




 48%|████▊     | 258/532 [05:44<04:00,  1.14it/s]




 49%|████▊     | 259/532 [05:46<04:14,  1.07it/s]




 49%|████▉     | 260/532 [05:47<04:19,  1.05it/s]




 49%|████▉     | 261/532 [05:49<06:12,  1.38s/it]




 49%|████▉     | 262/532 [05:50<05:51,  1.30s/it]




 49%|████▉     | 263/532 [05:51<05:20,  1.19s/it]




 50%|████▉     | 264/532 [05:52<05:11,  1.16s/it]




 50%|████▉     | 265/532 [05:53<05:32,  1.24s/it]




 50%|█████     | 266/532 [05:54<04:34,  1.03s/it]




 50%|█████     | 267/532 [05:56<06:08,  1.39s/it]




 50%|█████     | 268/532 [05:57<05:51,  1.33s/it]




 51%|█████     | 269/532 [06:00<07:29,  1.71s/it]




 51%|█████     | 270/532 [06:02<07:16,  1.67s/it]




 51%|█████     | 271/532 [06:03<07:03,  1.62s/it]




 51%|█████     | 272/532 [06:05<06:46,  1.56s/it]




 51%|█████▏    | 273/532 [06:06<06:41,  1.55s/it]




 52%|█████▏    | 274/532 [06:07<05:51,  1.36s/it]




 52%|█████▏    | 275/532 [06:08<05:10,  1.21s/it]




 52%|█████▏    | 276/532 [06:09<05:24,  1.27s/it]




 52%|█████▏    | 277/532 [06:12<06:41,  1.57s/it]




 52%|█████▏    | 278/532 [06:12<05:06,  1.21s/it]




 52%|█████▏    | 279/532 [06:14<06:00,  1.43s/it]




 53%|█████▎    | 280/532 [06:16<06:22,  1.52s/it]




 53%|█████▎    | 281/532 [06:16<04:55,  1.18s/it]




 53%|█████▎    | 282/532 [06:17<04:42,  1.13s/it]




 53%|█████▎    | 283/532 [06:18<04:48,  1.16s/it]




 53%|█████▎    | 284/532 [06:20<05:01,  1.21s/it]




 54%|█████▎    | 285/532 [06:21<05:12,  1.27s/it]




 54%|█████▍    | 286/532 [06:22<05:21,  1.31s/it]




 54%|█████▍    | 287/532 [06:24<05:16,  1.29s/it]




 54%|█████▍    | 288/532 [06:25<05:04,  1.25s/it]




 54%|█████▍    | 289/532 [06:26<04:50,  1.19s/it]




 55%|█████▍    | 290/532 [06:27<04:38,  1.15s/it]




 55%|█████▍    | 291/532 [06:28<04:20,  1.08s/it]




 55%|█████▍    | 292/532 [06:29<05:00,  1.25s/it]




 55%|█████▌    | 293/532 [06:30<04:04,  1.02s/it]




 55%|█████▌    | 294/532 [06:32<05:07,  1.29s/it]




 55%|█████▌    | 295/532 [06:34<05:55,  1.50s/it]




 56%|█████▌    | 296/532 [06:35<05:40,  1.44s/it]




 56%|█████▌    | 297/532 [06:36<04:56,  1.26s/it]




 56%|█████▌    | 298/532 [06:37<05:16,  1.35s/it]




 56%|█████▌    | 299/532 [06:39<04:53,  1.26s/it]




 56%|█████▋    | 300/532 [06:40<05:09,  1.34s/it]




 57%|█████▋    | 301/532 [06:42<05:22,  1.40s/it]




 57%|█████▋    | 302/532 [06:44<06:00,  1.57s/it]




 57%|█████▋    | 303/532 [06:45<05:45,  1.51s/it]




 57%|█████▋    | 304/532 [06:46<04:50,  1.28s/it]




 57%|█████▋    | 305/532 [06:47<05:22,  1.42s/it]




 58%|█████▊    | 306/532 [06:48<04:57,  1.31s/it]




 58%|█████▊    | 307/532 [06:51<06:06,  1.63s/it]




 58%|█████▊    | 308/532 [06:51<04:54,  1.31s/it]




 58%|█████▊    | 309/532 [06:53<05:25,  1.46s/it]




 58%|█████▊    | 310/532 [06:54<05:07,  1.38s/it]




 58%|█████▊    | 311/532 [06:57<05:51,  1.59s/it]




 59%|█████▊    | 312/532 [06:58<05:49,  1.59s/it]




 59%|█████▉    | 313/532 [07:00<05:36,  1.54s/it]




 59%|█████▉    | 314/532 [07:02<06:57,  1.92s/it]




 59%|█████▉    | 315/532 [07:04<06:21,  1.76s/it]




 59%|█████▉    | 316/532 [07:06<06:45,  1.88s/it]




 60%|█████▉    | 317/532 [07:06<05:19,  1.49s/it]




 60%|█████▉    | 318/532 [07:09<06:01,  1.69s/it]




 60%|█████▉    | 319/532 [07:11<06:20,  1.79s/it]




 60%|██████    | 320/532 [07:11<05:02,  1.43s/it]




 60%|██████    | 321/532 [07:12<04:43,  1.34s/it]




 61%|██████    | 322/532 [07:13<04:08,  1.18s/it]




 61%|██████    | 323/532 [07:14<03:23,  1.03it/s]




 61%|██████    | 324/532 [07:15<03:35,  1.04s/it]




 61%|██████    | 325/532 [07:17<04:36,  1.34s/it]




 61%|██████▏   | 326/532 [07:18<04:00,  1.17s/it]




 61%|██████▏   | 327/532 [07:19<04:26,  1.30s/it]




 62%|██████▏   | 328/532 [07:20<04:10,  1.23s/it]




 62%|██████▏   | 329/532 [07:22<04:40,  1.38s/it]




 62%|██████▏   | 330/532 [07:24<05:26,  1.62s/it]




 62%|██████▏   | 331/532 [07:26<05:18,  1.58s/it]




 62%|██████▏   | 332/532 [07:27<04:54,  1.47s/it]




 63%|██████▎   | 333/532 [07:30<06:34,  1.98s/it]




 63%|██████▎   | 334/532 [07:32<06:47,  2.06s/it]




 63%|██████▎   | 335/532 [07:35<07:43,  2.35s/it]




 63%|██████▎   | 336/532 [07:37<07:23,  2.26s/it]




 63%|██████▎   | 337/532 [07:39<06:15,  1.93s/it]




 64%|██████▎   | 338/532 [07:40<05:35,  1.73s/it]




 64%|██████▎   | 339/532 [07:42<05:44,  1.78s/it]




 64%|██████▍   | 340/532 [07:43<04:55,  1.54s/it]




 64%|██████▍   | 341/532 [07:44<04:54,  1.54s/it]




 64%|██████▍   | 342/532 [07:46<05:02,  1.59s/it]




 64%|██████▍   | 343/532 [07:48<05:37,  1.78s/it]




 65%|██████▍   | 344/532 [07:50<05:55,  1.89s/it]




 65%|██████▍   | 345/532 [07:52<05:36,  1.80s/it]




 65%|██████▌   | 346/532 [07:54<05:44,  1.85s/it]




 65%|██████▌   | 347/532 [07:56<06:23,  2.07s/it]




 65%|██████▌   | 348/532 [07:57<05:16,  1.72s/it]




 66%|██████▌   | 349/532 [07:59<05:35,  1.84s/it]




 66%|██████▌   | 350/532 [08:00<04:44,  1.56s/it]




 66%|██████▌   | 351/532 [08:02<04:28,  1.48s/it]




 66%|██████▌   | 352/532 [08:03<04:34,  1.53s/it]




 66%|██████▋   | 353/532 [08:05<04:17,  1.44s/it]




 67%|██████▋   | 354/532 [08:06<04:27,  1.50s/it]




 67%|██████▋   | 355/532 [08:07<03:57,  1.34s/it]




 67%|██████▋   | 356/532 [08:08<03:52,  1.32s/it]




 67%|██████▋   | 357/532 [08:10<03:44,  1.28s/it]




 67%|██████▋   | 358/532 [08:10<03:18,  1.14s/it]




 67%|██████▋   | 359/532 [08:12<03:19,  1.15s/it]




 68%|██████▊   | 360/532 [08:13<03:13,  1.12s/it]




 68%|██████▊   | 361/532 [08:15<04:13,  1.48s/it]




 68%|██████▊   | 362/532 [08:15<03:11,  1.13s/it]




 68%|██████▊   | 363/532 [08:16<02:59,  1.06s/it]




 68%|██████▊   | 364/532 [08:18<03:41,  1.32s/it]




 69%|██████▊   | 365/532 [08:20<04:00,  1.44s/it]




 69%|██████▉   | 366/532 [08:21<03:46,  1.36s/it]




 69%|██████▉   | 367/532 [08:22<03:20,  1.21s/it]




 69%|██████▉   | 368/532 [08:22<02:39,  1.03it/s]




 69%|██████▉   | 369/532 [08:24<02:50,  1.05s/it]




 70%|██████▉   | 370/532 [08:25<02:47,  1.03s/it]




 70%|██████▉   | 371/532 [08:27<03:33,  1.33s/it]




 70%|██████▉   | 372/532 [08:27<03:10,  1.19s/it]




 70%|███████   | 373/532 [08:28<02:35,  1.02it/s]




 70%|███████   | 374/532 [08:30<03:10,  1.21s/it]




 70%|███████   | 375/532 [08:30<02:24,  1.09it/s]




 71%|███████   | 376/532 [08:31<02:17,  1.14it/s]




 71%|███████   | 377/532 [08:32<02:15,  1.14it/s]




 71%|███████   | 378/532 [08:33<02:18,  1.11it/s]




 71%|███████   | 379/532 [08:33<01:59,  1.28it/s]




 71%|███████▏  | 380/532 [08:34<02:01,  1.25it/s]




 72%|███████▏  | 381/532 [08:35<02:13,  1.13it/s]




 72%|███████▏  | 382/532 [08:37<03:07,  1.25s/it]




 72%|███████▏  | 383/532 [08:40<04:01,  1.62s/it]




 72%|███████▏  | 384/532 [08:40<03:22,  1.37s/it]




 72%|███████▏  | 385/532 [08:42<03:47,  1.55s/it]




 73%|███████▎  | 386/532 [08:43<03:25,  1.41s/it]




 73%|███████▎  | 387/532 [08:44<03:00,  1.25s/it]




 73%|███████▎  | 388/532 [08:45<02:47,  1.16s/it]




 73%|███████▎  | 389/532 [08:46<02:22,  1.00it/s]




 73%|███████▎  | 390/532 [08:47<02:13,  1.07it/s]




 73%|███████▎  | 391/532 [08:48<02:28,  1.05s/it]




 74%|███████▎  | 392/532 [08:49<02:46,  1.19s/it]




 74%|███████▍  | 393/532 [08:51<03:07,  1.35s/it]




 74%|███████▍  | 394/532 [08:52<02:40,  1.16s/it]




 74%|███████▍  | 395/532 [08:53<02:20,  1.03s/it]




 74%|███████▍  | 396/532 [08:53<01:53,  1.20it/s]




 75%|███████▍  | 397/532 [08:54<02:16,  1.01s/it]




 75%|███████▍  | 398/532 [08:56<02:27,  1.10s/it]




 75%|███████▌  | 399/532 [08:57<02:28,  1.12s/it]




 75%|███████▌  | 400/532 [08:58<02:40,  1.21s/it]




 75%|███████▌  | 401/532 [08:59<02:22,  1.09s/it]




 76%|███████▌  | 402/532 [09:01<03:11,  1.48s/it]




 76%|███████▌  | 403/532 [09:02<02:51,  1.33s/it]




 76%|███████▌  | 404/532 [09:03<02:34,  1.21s/it]




 76%|███████▌  | 405/532 [09:04<02:29,  1.18s/it]




 76%|███████▋  | 406/532 [09:06<02:50,  1.35s/it]




 77%|███████▋  | 407/532 [09:08<03:13,  1.55s/it]




 77%|███████▋  | 408/532 [09:09<03:00,  1.45s/it]




 77%|███████▋  | 409/532 [09:11<03:00,  1.47s/it]




 77%|███████▋  | 410/532 [09:12<02:25,  1.20s/it]




 77%|███████▋  | 411/532 [09:12<02:06,  1.05s/it]




 77%|███████▋  | 412/532 [09:14<02:30,  1.25s/it]




 78%|███████▊  | 413/532 [09:16<02:47,  1.41s/it]




 78%|███████▊  | 414/532 [09:17<02:24,  1.23s/it]




 78%|███████▊  | 415/532 [09:18<02:29,  1.28s/it]




 78%|███████▊  | 416/532 [09:19<02:25,  1.26s/it]




 78%|███████▊  | 417/532 [09:20<02:17,  1.19s/it]




 79%|███████▊  | 418/532 [09:21<02:16,  1.20s/it]




 79%|███████▉  | 419/532 [09:22<02:04,  1.10s/it]




 79%|███████▉  | 420/532 [09:23<02:02,  1.09s/it]




 79%|███████▉  | 421/532 [09:24<01:56,  1.05s/it]




 79%|███████▉  | 422/532 [09:25<01:52,  1.03s/it]




 80%|███████▉  | 423/532 [09:27<02:22,  1.31s/it]




 80%|███████▉  | 424/532 [09:28<02:00,  1.12s/it]




 80%|███████▉  | 425/532 [09:30<02:31,  1.42s/it]




 80%|████████  | 426/532 [09:32<02:38,  1.50s/it]




 80%|████████  | 427/532 [09:33<02:25,  1.39s/it]




 80%|████████  | 428/532 [09:34<02:19,  1.34s/it]




 81%|████████  | 429/532 [09:35<02:02,  1.19s/it]




 81%|████████  | 430/532 [09:36<02:00,  1.18s/it]




 81%|████████  | 431/532 [09:37<01:55,  1.14s/it]




 81%|████████  | 432/532 [09:38<01:51,  1.11s/it]




 81%|████████▏ | 433/532 [09:40<02:09,  1.31s/it]




 82%|████████▏ | 434/532 [09:41<01:50,  1.13s/it]




 82%|████████▏ | 435/532 [09:41<01:37,  1.00s/it]




 82%|████████▏ | 436/532 [09:42<01:39,  1.04s/it]




 82%|████████▏ | 437/532 [09:44<02:04,  1.31s/it]




 82%|████████▏ | 438/532 [09:45<01:54,  1.21s/it]




 83%|████████▎ | 439/532 [09:47<02:00,  1.30s/it]




 83%|████████▎ | 440/532 [09:48<02:06,  1.37s/it]




 83%|████████▎ | 441/532 [09:50<02:17,  1.51s/it]




 83%|████████▎ | 442/532 [09:51<01:54,  1.27s/it]




 83%|████████▎ | 443/532 [09:53<02:03,  1.38s/it]




 83%|████████▎ | 444/532 [09:53<01:46,  1.21s/it]




 84%|████████▎ | 445/532 [09:55<01:52,  1.30s/it]




 84%|████████▍ | 446/532 [09:56<01:47,  1.25s/it]




 84%|████████▍ | 447/532 [09:57<01:37,  1.14s/it]




 84%|████████▍ | 448/532 [09:58<01:39,  1.19s/it]




 84%|████████▍ | 449/532 [10:00<01:41,  1.22s/it]




 85%|████████▍ | 450/532 [10:01<01:38,  1.20s/it]




 85%|████████▍ | 451/532 [10:03<02:03,  1.53s/it]




 85%|████████▍ | 452/532 [10:04<01:59,  1.50s/it]




 85%|████████▌ | 453/532 [10:05<01:42,  1.30s/it]




 85%|████████▌ | 454/532 [10:06<01:37,  1.25s/it]




 86%|████████▌ | 455/532 [10:07<01:30,  1.17s/it]




 86%|████████▌ | 456/532 [10:09<01:27,  1.15s/it]




 86%|████████▌ | 457/532 [10:09<01:12,  1.03it/s]




 86%|████████▌ | 458/532 [10:11<01:25,  1.16s/it]





 86%|████████▋ | 460/532 [10:13<01:33,  1.29s/it]




 87%|████████▋ | 461/532 [10:15<01:34,  1.33s/it]




 87%|████████▋ | 462/532 [10:16<01:28,  1.26s/it]




 87%|████████▋ | 463/532 [10:18<01:41,  1.47s/it]




 87%|████████▋ | 464/532 [10:20<01:45,  1.55s/it]




 87%|████████▋ | 465/532 [10:21<01:42,  1.52s/it]





 88%|████████▊ | 467/532 [10:24<01:40,  1.54s/it]




 88%|████████▊ | 468/532 [10:25<01:28,  1.38s/it]





 88%|████████▊ | 469/532 [10:28<01:53,  1.80s/it]




 88%|████████▊ | 470/532 [10:30<01:52,  1.81s/it]




 89%|████████▊ | 472/532 [10:32<01:28,  1.48s/it]





 89%|████████▉ | 473/532 [10:35<01:50,  1.87s/it]




 89%|████████▉ | 474/532 [10:36<01:40,  1.73s/it]




 89%|████████▉ | 475/532 [10:38<01:39,  1.74s/it]




 90%|████████▉ | 477/532 [10:41<01:28,  1.60s/it]





 90%|████████▉ | 478/532 [10:43<01:30,  1.67s/it]




 90%|█████████ | 479/532 [10:45<01:38,  1.86s/it]




 90%|█████████ | 480/532 [10:46<01:23,  1.60s/it]




 90%|█████████ | 481/532 [10:47<01:11,  1.39s/it]




 91%|█████████ | 482/532 [10:47<00:54,  1.09s/it]




 91%|█████████ | 483/532 [10:50<01:19,  1.61s/it]




 91%|█████████ | 484/532 [10:51<01:07,  1.40s/it]




 91%|█████████ | 485/532 [10:52<01:05,  1.40s/it]




 91%|█████████▏| 486/532 [10:54<01:12,  1.58s/it]




 92%|█████████▏| 487/532 [10:55<01:01,  1.37s/it]




 92%|█████████▏| 488/532 [10:57<01:04,  1.48s/it]




 92%|█████████▏| 489/532 [10:59<01:09,  1.61s/it]




 92%|█████████▏| 490/532 [11:00<00:59,  1.43s/it]




 92%|█████████▏| 491/532 [11:01<00:57,  1.40s/it]




 92%|█████████▏| 492/532 [11:03<01:03,  1.58s/it]




 93%|█████████▎| 493/532 [11:05<00:58,  1.51s/it]




 93%|█████████▎| 494/532 [11:05<00:49,  1.30s/it]




 93%|█████████▎| 495/532 [11:06<00:43,  1.17s/it]




 93%|█████████▎| 496/532 [11:07<00:32,  1.11it/s]




 93%|█████████▎| 497/532 [11:08<00:34,  1.01it/s]




 94%|█████████▎| 498/532 [11:08<00:28,  1.18it/s]




 94%|█████████▍| 499/532 [11:10<00:33,  1.02s/it]




 94%|█████████▍| 500/532 [11:11<00:32,  1.02s/it]




 94%|█████████▍| 501/532 [11:13<00:46,  1.50s/it]




 94%|█████████▍| 502/532 [11:14<00:39,  1.33s/it]




 95%|█████████▍| 503/532 [11:15<00:32,  1.12s/it]




 95%|█████████▍| 504/532 [11:16<00:34,  1.24s/it]




 95%|█████████▍| 505/532 [11:17<00:31,  1.18s/it]




 95%|█████████▌| 506/532 [11:19<00:30,  1.17s/it]




 95%|█████████▌| 507/532 [11:20<00:28,  1.12s/it]




 95%|█████████▌| 508/532 [11:21<00:26,  1.11s/it]




 96%|█████████▌| 509/532 [11:23<00:35,  1.53s/it]




 96%|█████████▌| 510/532 [11:24<00:31,  1.41s/it]




 96%|█████████▌| 511/532 [11:26<00:33,  1.58s/it]




 96%|█████████▌| 512/532 [11:27<00:28,  1.45s/it]




 96%|█████████▋| 513/532 [11:28<00:24,  1.29s/it]




 97%|█████████▋| 514/532 [11:31<00:29,  1.65s/it]




 97%|█████████▋| 515/532 [11:32<00:23,  1.40s/it]




 97%|█████████▋| 516/532 [11:33<00:20,  1.27s/it]




 97%|█████████▋| 517/532 [11:33<00:16,  1.12s/it]




 97%|█████████▋| 518/532 [11:35<00:17,  1.24s/it]




 98%|█████████▊| 519/532 [11:36<00:15,  1.22s/it]




 98%|█████████▊| 520/532 [11:38<00:15,  1.28s/it]




 98%|█████████▊| 521/532 [11:38<00:13,  1.18s/it]




 98%|█████████▊| 522/532 [11:40<00:11,  1.17s/it]




 98%|█████████▊| 523/532 [11:42<00:13,  1.47s/it]




 98%|█████████▊| 524/532 [11:43<00:10,  1.29s/it]




 99%|█████████▊| 525/532 [11:45<00:10,  1.47s/it]




 99%|█████████▉| 526/532 [11:45<00:07,  1.26s/it]




 99%|█████████▉| 527/532 [11:47<00:06,  1.31s/it]




 99%|█████████▉| 528/532 [11:48<00:04,  1.25s/it]




 99%|█████████▉| 529/532 [11:48<00:03,  1.05s/it]




100%|█████████▉| 530/532 [11:49<00:01,  1.01it/s]




100%|█████████▉| 531/532 [11:51<00:01,  1.11s/it]




100%|██████████| 532/532 [11:52<00:00,  1.34s/it]


Unnamed: 0_level_0,ticker,headline
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-08 07:20:18,CSCO,<s>Dow Jones Selected Stocks 1710 - July 08<\s
2019-07-08 07:20:18,CVX,<s>Dow Jones Selected Stocks 1710 - July 08<\s
2019-07-08 19:50:01,CNP,<s>NYSE ORDER IMBALANCE <CNP.N> 156400.0 SHARE...
2019-07-08 23:16:50,CSCO,<s>Dow Jones Selected Stocks - July 09<\s
2019-07-09 06:01:05,AAPL,<s>RCS - IMImobile PLC - Vauxhall launch Apple...
2020-10-15 15:49:06,AAL,<s>BUZZ-U.S. STOCKS ON THE MOVE-Charles Schwab...
2020-10-15 16:00:00,AMZN,<s>Merkle Enters into Strategic Collaboration ...
2020-10-15 16:02:57,AEP,"<s>CLASS ACTION UPDATE for AEP, FENC and BMRN:..."
2020-10-15 16:04:09,ADBE,<s>Adobe (ADBE) Up 6.4% Since Last Earnings Re...
2020-10-15 16:06:53,AAL,<s>American Airlines Group Inc. - American Air...


In [12]:
# Summarise the combined news frame: every column, non-null counts and memory footprint.
# NOTE(review): newer pandas releases rename `null_counts` to `show_counts`; switch when upgrading.
news.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 938868 entries, 2019-07-08 07:20:18 to 2020-10-15 16:06:53
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ticker    938868 non-null  object
 1   headline  938868 non-null  object
dtypes: object(2)
memory usage: 21.5+ MB


In [13]:
# Resample news by frequency
def resample_headlines(news_frame, freq):
    """Concatenate each ticker's headlines inside every `freq`-wide time bucket.

    Parameters
    ----------
    news_frame : pandas.DataFrame
        Frame with a datetime index and at least the columns 'ticker' and 'headline'.
    freq : str
        Resampling rule passed straight to `resample` (e.g. 'D', 'H', '15min').

    Returns
    -------
    pandas.DataFrame
        One 'headline' column indexed by (ticker, bucket start); buckets whose
        joined headline is the empty string are removed.
    """
    bucketed = news_frame.groupby('ticker').resample(freq).agg({'headline': ''.join})
    # An empty join yields ''; turn it into NaN so dropna() discards that bucket.
    bucketed['headline'] = bucketed['headline'].replace('', np.nan)
    return bucketed.dropna()


# groupby/resample never mutates its input, so `news` is passed as-is (no copy needed).
daily_news = resample_headlines(news, 'D')
hourly_news = resample_headlines(news, 'H')
min_15_news = resample_headlines(news, '15min')

### Daily Data Processing

In [20]:
# Load the daily price table and show its schema, non-null counts and memory use.
daily = get_prices('daily')
daily.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 231075 entries, ('A', Timestamp('2019-01-02 00:00:00')) to ('ZTS', Timestamp('2020-10-14 00:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Open    231075 non-null  float64
 1   Close   231075 non-null  float64
dtypes: float64(2)
memory usage: 4.4+ MB


In [21]:
# Next-period close-to-close return per ticker. Both the percent change and the
# look-ahead shift run inside each ticker group, so a row can never pick up a
# neighbouring ticker's value regardless of how `daily` is ordered.
daily['returns'] = (
    daily['Close']
    .groupby(level='ticker')
    .pct_change()
    .groupby(level='ticker')
    .shift(-1)
)

# Direction label: -1.0 for a strictly negative return, +1.0 otherwise.
# Rows whose return is NaN also receive +1.0 here because `NaN < 0` is False.
daily['label'] = np.where(daily['returns'] < 0, -1.0, 1.0)

# Keep only the (ticker, timestamp) keys present in both prices and news.
intersect = daily.index.intersection(daily_news.index)

daily_news = daily_news.loc[intersect, :].sort_index()
daily = daily.loc[intersect, :].sort_index()

# Attach price columns and labels to the aggregated headlines.
daily = daily_news.join(daily[['Open', 'Close', 'returns', 'label']])


daily.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 123115 entries, ('A', Timestamp('2019-07-11 00:00:00')) to ('ZTS', Timestamp('2020-10-14 00:00:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  123115 non-null  object 
 1   Open      123115 non-null  float64
 2   Close     123115 non-null  float64
 3   returns   122729 non-null  float64
 4   label     123115 non-null  float64
dtypes: float64(4), object(1)
memory usage: 10.2+ MB


In [23]:
# Discard every row containing a NaN (original note: "drop based on returns NaN").
daily = daily.dropna(axis='index', how='any')

# Persist the labelled daily frame into the project HDF5 store.
with pd.HDFStore(DATA_STORE) as h5_store:
    h5_store.put('model_data/daily', daily)

In [24]:
# Hourly Processing

In [25]:
# Load the hourly price table and show its schema, non-null counts and memory use.
hourly = get_prices('hourly')
hourly.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1888760 entries, ('A', Timestamp('2019-01-02 14:00:00')) to ('ZTS', Timestamp('2020-10-14 20:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   Open    1888760 non-null  float64
 1   Close   1888760 non-null  float64
dtypes: float64(2)
memory usage: 36.3+ MB


In [26]:
# Close of the following bar compared with the open of the bar in which the news arrived.
def o_c_pct_change(df):
    """Relative move from each row's Open to the next row's Close.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing `Open` and `Close` columns, in the row order that
        defines "next".

    Returns
    -------
    pandas.Series
        (Close shifted up by one row - Open) / Open, aligned to `df`'s index.
        The final row is NaN because it has no following Close.
    """
    following_close = df.Close.shift(-1)
    open_price = df.Open
    return (following_close - open_price) / open_price

# Forward open-to-next-close return, computed separately for every ticker.
hourly_by_ticker = hourly.groupby('ticker', group_keys=False)
hourly['returns'] = hourly_by_ticker.apply(o_c_pct_change)

# -1.0 when the return is strictly negative, +1.0 in every other case (zero and NaN included).
hourly['label'] = np.where(hourly['returns'] < 0, -1.0, 1.0)

# Restrict both frames to the (ticker, timestamp) keys they share.
hourly_intersect = hourly.index.intersection(hourly_news.index)

hourly_news = hourly_news.loc[hourly_intersect, :].sort_index()
hourly = hourly.loc[hourly_intersect, :].sort_index()

# Headlines on the left, price columns and label joined on the shared index.
hourly = hourly_news.join(hourly[['Open', 'Close', 'returns', 'label']])

# Treat empty headline strings as missing values.
hourly['headline'] = hourly['headline'].replace('', np.nan)

hourly.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 239578 entries, ('A', Timestamp('2019-07-11 13:00:00')) to ('ZTS', Timestamp('2020-10-14 16:00:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  239578 non-null  object 
 1   Open      239578 non-null  float64
 2   Close     239578 non-null  float64
 3   returns   239507 non-null  float64
 4   label     239578 non-null  float64
dtypes: float64(4), object(1)
memory usage: 20.1+ MB


In [27]:
# Remove rows with any NaN, then keep only the model inputs: text and label.
hourly = hourly.dropna()[['headline', 'label']]

# Persist the labelled hourly frame into the project HDF5 store.
with pd.HDFStore(DATA_STORE) as h5_store:
    h5_store.put('model_data/hourly', hourly)

In [28]:
# 15-Minute Data Processing

In [29]:
# Load the 15-minute price table and show its schema, non-null counts and memory use.
min_15 = get_prices('15_min')
min_15.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6362236 entries, ('A', Timestamp('2019-01-02 14:30:00')) to ('ZTS', Timestamp('2020-10-14 20:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   Open    6362236 non-null  float64
 1   Close   6362236 non-null  float64
dtypes: float64(2)
memory usage: 134.3+ MB


In [30]:
# Forward open-to-next-close return, computed separately for every ticker.
min_15_by_ticker = min_15.groupby('ticker', group_keys=False)
min_15['returns'] = min_15_by_ticker.apply(o_c_pct_change)

# -1.0 when the return is strictly negative, +1.0 in every other case (zero and NaN included).
min_15['label'] = np.where(min_15['returns'] < 0, -1.0, 1.0)

# Restrict both frames to the (ticker, timestamp) keys they share.
min15_intersect = min_15.index.intersection(min_15_news.index)

min_15_news = min_15_news.loc[min15_intersect, :].sort_index()
min_15 = min_15.loc[min15_intersect, :].sort_index()

# Headlines on the left, price columns and label joined on the shared index.
min_15 = min_15_news.join(min_15[['Open', 'Close', 'returns', 'label']])

# Treat empty headline strings as missing values.
min_15['headline'] = min_15['headline'].replace('', np.nan)
min_15.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 263917 entries, ('A', Timestamp('2019-07-11 16:30:00')) to ('ZTS', Timestamp('2020-10-14 16:45:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  263917 non-null  object 
 1   Open      263917 non-null  float64
 2   Close     263917 non-null  float64
 3   returns   263903 non-null  float64
 4   label     263917 non-null  float64
dtypes: float64(4), object(1)
memory usage: 21.2+ MB


In [37]:
# Remove rows with any NaN, then keep only the model inputs: text and label.
min_15 = min_15.dropna()[['headline', 'label']]

# Persist the labelled 15-minute frame into the project HDF5 store.
with pd.HDFStore(DATA_STORE) as h5_store:
    h5_store.put('model_data/15_min', min_15)


In [67]:
# Character count of the shortest aggregated daily headline string.
daily['headline'].str.len().min()

11

In [50]:
# Spot-check one labelled row, selected by integer position.
# NOTE(review): 51209 is a hard-coded position — it points at a different row if `daily` changes.
daily.iloc[51209]

headline    <s>Reuters Insider - Blockchain Interviews wit...
Open                                                   205.82
Close                                                  201.68
returns                                             0.0214697
label                                                       1
Name: (GS, 2019-08-05 00:00:00), dtype: object

In [58]:
# Total characters across all raw GS headlines dated 2019-08-05.
gs_headlines_on_day = news.loc[news.ticker == 'GS'].loc['2019-08-05']['headline']
len(''.join(gs_headlines_on_day))

13763

In [63]:
# NOTE(review): scratch arithmetic with unexplained constants — presumably row-count
# bookkeeping for a data split; confirm the intent or delete this cell.
120000-7600

112400

In [65]:
# NOTE(review): scratch arithmetic — presumably samples / batch size (128) = steps per epoch;
# confirm the intent or delete this cell.
108871/128

850.5546875

In [66]:
# NOTE(review): scratch arithmetic — presumably samples / batch size (64) = steps per epoch;
# confirm the intent or delete this cell.
50000/64

781.25