# Setup

In [1]:
# Relative paths handed to the data managers further down
# (each manager receives the data path first, then the cache path).
_cache = './cache'
_data = './data'

## News Data

In [2]:
from src.news import NewsDataManager

In [3]:
# Build the news manager on top of the configured data and cache paths
newsData = NewsDataManager(_data, _cache)

# Prefer a previously saved 'news' snapshot; process from scratch only when none exists
if not newsData.save_exists('news'):
    newsData.process_data()
else:
    newsData.load('news')

Read Saved Data                                                                                                     


In [4]:
# Preview the first rows of the news table
newsData.data.head()

Unnamed: 0,Date,Headline
0,2008-08-08,"[georgia, down, two, russian, warplan, countri..."
1,2008-08-08,"[break, musharraf, impeach]"
2,2008-08-08,"[russia, today, column, troop, roll, south, os..."
3,2008-08-08,"[russian, tank, move, toward, capit, south, os..."
4,2008-08-08,"[afghan, children, rape, impun, UN, offici, sa..."


## Embeddings

In [106]:
import spacy
import gensim 
from gensim.models import Word2Vec 
import numpy as np
import gensim.downloader as api

In [107]:
# Load the "glove-twitter-25" vectors through gensim's downloader API
model_glove_twitter = api.load("glove-twitter-25")



In [110]:
# Read the embedding dimensionality directly from the model rather than probing
# an arbitrary word, which would raise KeyError if that word were not in the vocabulary.
encoding_size = model_glove_twitter.vector_size
encoding_size

25

In [111]:
# Toggle: True trains a Word2Vec model on the headlines themselves,
# False reuses the pretrained GloVe vectors held in model_glove_twitter.
create_custom = False

if create_custom:
    if ( newsData.save_exists('news+embeddings') ):
        newsData.load('news+embeddings')
    else:
        # Create a word to vec encoder 
        sentences = newsData.data.Headline.values
        news_model = Word2Vec(sentences, size=300, min_count=1)
        # Map the embeddings onto it, using the custom model's own dimensionality
        # (encoding_size was measured on the GloVe model and does not describe these vectors)
        newsData.use_embedding_map( news_model.wv, news_model.wv.vector_size )
        # Save output
        newsData.save('news+embeddings')
else:
    if ( newsData.save_exists('news+glove') ):
        newsData.load('news+glove')
    else:
        # No saved copy yet: map the GloVe vectors onto the headlines and save the result
        newsData.use_embedding_map( model_glove_twitter, encoding_size )
        newsData.save('news+glove')

KeyError: "word 'warplan' not in vocabulary"

In [7]:
# Preview the news table again, now that the embedding step has run
newsData.data.head()

Unnamed: 0,Date,Headline,Embedding
0,2008-08-08,"[georgia, down, two, russian, warplan, countri...","[0.03166535, -0.11373435, 0.16130243, 0.613904..."
1,2008-08-08,"[break, musharraf, impeach]","[-0.06930076, -0.13197942, 0.07616737, 0.27997..."
2,2008-08-08,"[russia, today, column, troop, roll, south, os...","[0.06206376, -0.085277244, 0.20672376, 0.48175..."
3,2008-08-08,"[russian, tank, move, toward, capit, south, os...","[-0.014135499, -0.07695068, 0.17176159, 0.5451..."
4,2008-08-08,"[afghan, children, rape, impun, UN, offici, sa...","[-0.23319589, -0.24008308, 0.007345368, 0.7770..."


## Stock Data

In [8]:
from src.stocks import StockDataManager

In [9]:
# Stock data manager constructed for 'SPY', with the same data and cache paths as the news manager
stockData = StockDataManager( 'SPY', _data, _cache )
stockData.data.head()

Stock Data Manager
Read Saved Data                                                                                                     


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,forward_gains,backward_gains
0,1993-01-29,43.96875,43.96875,43.75,43.9375,26.299288,1003200,0.02361,
1,1993-02-01,43.96875,44.25,43.96875,44.25,26.486324,480500,0.015983,
2,1993-02-02,44.21875,44.375,44.125,44.34375,26.542448,201300,0.013898,
3,1993-02-03,44.40625,44.84375,44.375,44.8125,26.822998,529400,-0.003498,
4,1993-02-04,44.96875,45.09375,44.46875,45.0,26.93524,531500,-0.006289,0.024181


## Merge Data

In [21]:
# Index both tables by date, then attach the stock columns to every headline row.
# DataFrame.join defaults to a left join, so every news row is kept.
news_by_date = newsData.data.set_index('Date')
stock_by_date = stockData.data.set_index('Date')
combinedData = news_by_date.join( stock_by_date )
combinedData.head()

Unnamed: 0_level_0,Headline,Embedding,Open,High,Low,Close,Adj Close,Volume,forward_gains,backward_gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-08-08,"[georgia, down, two, russian, warplan, countri...","[0.03166535, -0.11373435, 0.16130243, 0.613904...",126.580002,129.929993,126.379997,129.369995,101.503098,260811700,0.001312,0.035043
2008-08-08,"[break, musharraf, impeach]","[-0.06930076, -0.13197942, 0.07616737, 0.27997...",126.580002,129.929993,126.379997,129.369995,101.503098,260811700,0.001312,0.035043
2008-08-08,"[russia, today, column, troop, roll, south, os...","[0.06206376, -0.085277244, 0.20672376, 0.48175...",126.580002,129.929993,126.379997,129.369995,101.503098,260811700,0.001312,0.035043
2008-08-08,"[russian, tank, move, toward, capit, south, os...","[-0.014135499, -0.07695068, 0.17176159, 0.5451...",126.580002,129.929993,126.379997,129.369995,101.503098,260811700,0.001312,0.035043
2008-08-08,"[afghan, children, rape, impun, UN, offici, sa...","[-0.23319589, -0.24008308, 0.007345368, 0.7770...",126.580002,129.929993,126.379997,129.369995,101.503098,260811700,0.001312,0.035043


In [22]:
# Keep only the embedding and the gain columns.
# errors='ignore' makes this cell safe to re-run: combinedData is reassigned here,
# so on a second run the columns are already gone and a plain drop would raise KeyError.
combinedData = combinedData.drop(
    columns=['Headline', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
    errors='ignore',
)
combinedData.head()

Unnamed: 0_level_0,Embedding,forward_gains,backward_gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-08-08,"[0.03166535, -0.11373435, 0.16130243, 0.613904...",0.001312,0.035043
2008-08-08,"[-0.06930076, -0.13197942, 0.07616737, 0.27997...",0.001312,0.035043
2008-08-08,"[0.06206376, -0.085277244, 0.20672376, 0.48175...",0.001312,0.035043
2008-08-08,"[-0.014135499, -0.07695068, 0.17176159, 0.5451...",0.001312,0.035043
2008-08-08,"[-0.23319589, -0.24008308, 0.007345368, 0.7770...",0.001312,0.035043


In [23]:
# Regression variant of the dataset: start from an independent copy of combinedData
# (DataFrame.copy is deep by default) so scaling does not touch the original frame
regressionData = combinedData.copy()
def normalize( data, col ):
    """Min-max scale ``data[col]`` in place onto the [0, 1] range.

    Args:
        data: DataFrame-like object holding the column; it is modified in place.
        col: Label of the column to rescale.

    Returns:
        None. The scaled values are written back to ``data[col]``.

    A column whose values are all identical is mapped to 0.0 instead of NaN.
    """
    lowest = data[col].min()
    spread = data[col].max() - lowest
    # A constant column has zero spread; dividing by 1 instead avoids 0/0 -> NaN.
    divisor = spread if spread != 0 else 1
    data[col] = ( data[col] - lowest ) / divisor
# Rescale both gain columns of the regression frame, then preview the result
for gains_col in ('forward_gains', 'backward_gains'):
    normalize( regressionData, gains_col )
regressionData.head()

Unnamed: 0_level_0,Embedding,forward_gains,backward_gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-08-08,"[0.03166535, -0.11373435, 0.16130243, 0.613904...",0.591182,0.596434
2008-08-08,"[-0.06930076, -0.13197942, 0.07616737, 0.27997...",0.591182,0.596434
2008-08-08,"[0.06206376, -0.085277244, 0.20672376, 0.48175...",0.591182,0.596434
2008-08-08,"[-0.014135499, -0.07695068, 0.17176159, 0.5451...",0.591182,0.596434
2008-08-08,"[-0.23319589, -0.24008308, 0.007345368, 0.7770...",0.591182,0.596434


In [31]:
# Create binary dataset as well: each gain column becomes True when the gain is positive.
# Rows with a missing gain are dropped first, because NaN > 0 evaluates to False and
# would otherwise silently turn those rows into negative examples.
binaryData = combinedData.dropna( subset=['forward_gains', 'backward_gains'] ).copy( deep = True )
binaryData['forward_gains'] = binaryData['forward_gains'] > 0
binaryData['backward_gains'] = binaryData['backward_gains'] > 0
binaryData.head()

Unnamed: 0_level_0,Embedding,forward_gains,backward_gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-08-08,"[0.03166535, -0.11373435, 0.16130243, 0.613904...",True,True
2008-08-08,"[-0.06930076, -0.13197942, 0.07616737, 0.27997...",True,True
2008-08-08,"[0.06206376, -0.085277244, 0.20672376, 0.48175...",True,True
2008-08-08,"[-0.014135499, -0.07695068, 0.17176159, 0.5451...",True,True
2008-08-08,"[-0.23319589, -0.24008308, 0.007345368, 0.7770...",True,True


## Create Model

In [20]:
import tensorflow as tf

In [98]:
# Binary classifier over 300-dimensional embedding vectors: two hidden ReLU layers
# followed by a single probability output.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(300,)),
    tf.keras.layers.Dense(300, activation=tf.nn.relu),
    tf.keras.layers.Dense(300, activation=tf.nn.relu),
    # Sigmoid, not softmax: softmax over a single unit always outputs 1.0,
    # so the network could never predict the negative class.
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy'],
)

## Separate Dataset

In [99]:
import random

In [100]:
# Column of binaryData used as the training label (currently backward gains).
# NOTE(review): this comment used to say "forward gains" while the code selects
# 'backward_gains' — confirm which target is intended.
use_column = 'backward_gains'

In [101]:
# Embeddings of the rows labelled True, stacked into a single array
positive_mask = binaryData[use_column]
positive_examples = np.stack( binaryData['Embedding'][positive_mask].dropna().values )
positive_examples.shape

(28418, 300)

In [102]:
# Embeddings of the rows labelled False, stacked into a single array
negative_mask = ~binaryData[use_column]
negative_examples = np.stack( binaryData['Embedding'][negative_mask].dropna().values )
negative_examples.shape

(21300, 300)

In [103]:
# Assemble features and labels: 1 for positive examples, 0 for negative ones
X = np.concatenate([positive_examples, negative_examples])
y = np.concatenate([np.ones(positive_examples.shape[0]), np.zeros(negative_examples.shape[0])])

# Shuffle with a fixed seed. Keras carves validation_split off the END of the arrays
# before it shuffles, and the negatives are concatenated last, so without this step
# the held-out tail would contain only negative examples.
shuffle_order = np.random.RandomState(42).permutation(X.shape[0])
X, y = X[shuffle_order], y[shuffle_order]
print(X.shape, y.shape)

(49718, 300) (49718,)


## Training

In [104]:
# Fit for 4 epochs, holding out 10% of the samples for validation
model.fit( X, y, epochs=4, validation_split=0.1 )

Train on 44746 samples, validate on 4972 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x1eb2db9b688>