In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow and tf.keras
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
from tensorflow import keras
import numpy as np

import os
import time
import pandas as pd
from IPython.display import clear_output
import datetime

Importing the data. In this case it is a simple CSV file containing text and numbers.

In [None]:
# Load the raw CSV. Every later cell refers to this frame as `pandasMarketData`
# (it reads the "Text", "Date", "Ask" and "Bid" columns from it), so bind that name.
# `pandasData` is kept as an alias so any code relying on the old name still works.
pandasMarketData = pd.read_csv('/content/data.csv')
pandasData = pandasMarketData

### Separating Text

In [None]:
# Every "Text" cell carries three texts joined by the ",|," delimiter.
textRows = list(pandasMarketData['Text'])

# Whole corpus (delimiters included); the character vocabulary is derived from it.
allText = "".join(textRows)

# Split each row into its three parts. A row that does not split into exactly
# three parts raises ValueError on unpacking.
WSBText, stocksText, investingText = [], [], []
for WSB, Stocks, Investing in (row.split(",|,") for row in textRows):
  WSBText.append(WSB)
  stocksText.append(Stocks)
  investingText.append(Investing)

## Pre-processing the text

### Converting to numbers
Transforming the vocabulary into numbers makes the text easier to work with.

In [None]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
distinctChars = {character for character in allText}
vocab = sorted(distinctChars)

In [None]:
# Lookup tables between characters and integer ids: a character's id is its
# position in `vocab`, and `idx2char[id]` gives the character back.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

def textToInt(textToConvert):
  """Encode a string as a 1-D array of character ids.

  Args:
    textToConvert: iterable of characters; each one must be a key of the
      module-level `char2idx` mapping.

  Returns:
    np.ndarray of integer ids, one per character. An empty input gives an
    empty *integer* array (NumPy would otherwise default to float64).

  Raises:
    KeyError: if a character is not present in `char2idx`.
  """
  # dtype=int keeps the platform default integer type for non-empty input and
  # stops the empty case from silently becoming a float array.
  return np.array([char2idx[c] for c in textToConvert], dtype=int)

In [None]:
# Encode every text of each source as an array of character ids.
WSBTextInt = [textToInt(text) for text in WSBText]
stocksTextInt = [textToInt(text) for text in stocksText]
investingTextInt = [textToInt(text) for text in investingText]

### Padding the text
This makes sure all texts have the same length when we feed them to the model.

In [None]:
# Length of the longest encoded text across the three sources (0 when there are none).
longest = max(
  (len(encoded)
   for group in (WSBTextInt, stocksTextInt, investingTextInt)
   for encoded in group),
  default=0,
)
  
    

In [None]:
# Right-pad every encoded text with zeros up to `longest`, in place in the three lists.
# NOTE: 0 is also the id of the first vocabulary character (char2idx enumerates
# from 0), so padding cannot be told apart from that character downstream.
def _padToLength(encoded, targetLength):
  """Return `encoded` right-padded with zeros to `targetLength` (unchanged if already long enough)."""
  missing = targetLength - len(encoded)
  if missing <= 0:
    return encoded
  # One allocation per text instead of one np.append (full copy) per missing element.
  return np.pad(encoded, (0, missing), mode="constant", constant_values=0)

for encodedTexts in (WSBTextInt, stocksTextInt, investingTextInt):
  for i in range(len(encodedTexts)):
    encodedTexts[i] = _padToLength(encodedTexts[i], longest)

In [None]:
# One flat vector per row: WSB ids, then stocks ids, then investing ids.
allTextArray = [
  np.append(wsbIds, (stocksIds, investingIds))
  for wsbIds, stocksIds, investingIds in zip(WSBTextInt, stocksTextInt, investingTextInt)
]

# Same data as a dense 2-D float array with one row per sample.
allTexts = np.zeros((len(WSBTextInt), longest*3))
for rowNumber, combinedIds in enumerate(allTextArray):
  allTexts[rowNumber] = combinedIds

### Adding to database.

In [None]:
# Store the padded id arrays of each source as a column of the frame.
for columnName, encodedColumn in (
    ("WSBTextInt", WSBTextInt),
    ("stocksTextInt", stocksTextInt),
    ("investingTextInt", investingTextInt),
):
  pandasMarketData[columnName] = encodedColumn

In [None]:
# Store the concatenated (WSB + stocks + investing) id vector of each row;
# the prediction cells further down read this column.
pandasMarketData["allTextInt"] = allTextArray

In [None]:
# The raw text is no longer needed once its encoded versions are stored.
pandasMarketData = pandasMarketData.drop(columns="Text")

Visualizing the data

In [None]:
# Preview the first rows to check the encoded-text columns.
pandasMarketData.head()

## Data featuring

### Date to cyclical
Transforming the date fields into cyclical (sine/cosine) features improves the performance of the model.

In [None]:
# Per-row accumulators filled by the loop at the end of this cell.
hourSine, hourCos = [], []
daySine, dayCos = [], []
monthSine, monthCos = [], []
dateTime = []

def getSinAndCosineTime(minutesPastMidnight):
  """Encode a time of day as a point on the unit circle.

  Args:
    minutesPastMidnight: minutes elapsed since 00:00; one full turn is 24*60 minutes.

  Returns:
    Tuple (sin, cos) of the corresponding angle.
  """
  angle = 2*np.pi*minutesPastMidnight/(60*24)
  return (np.sin(angle), np.cos(angle))

def getSinAndCosineDay(dayOfMonth, daysInMonth=30):
  """Encode a day of the month as a point on the unit circle.

  Args:
    dayOfMonth: day number within the month.
    daysInMonth: number of days that make one full turn. Defaults to 30, the
      fixed period used so far; pass the real month length (28-31) for an
      exact cycle.

  Returns:
    Tuple (sin, cos) of the corresponding angle.
  """
  angle = 2*np.pi*dayOfMonth/daysInMonth
  return (np.sin(angle), np.cos(angle))

def getSinAndCosineMonth(monthOfYear):
  """Encode a month of the year as a point on the unit circle.

  Args:
    monthOfYear: month number; one full turn is 12 months.

  Returns:
    Tuple (sin, cos) of the corresponding angle.
  """
  angle = 2*np.pi*monthOfYear/(12)
  return (np.sin(angle), np.cos(angle))


# Derive the cyclical time features and a parsed timestamp for every row.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
for rawDate in pandasMarketData["Date"]:
  # Keep only what precedes the first "." (drops fractional seconds).
  # partition() also accepts timestamps that have no fractional part.
  currentDateString = rawDate.partition(".")[0]
  # Parse once and reuse the result for every feature.
  parsedDate = datetime.datetime.strptime(currentDateString, TIMESTAMP_FORMAT)

  # Time of day
  minutesPastMidnight = parsedDate.hour*60 + parsedDate.minute
  hourS, hourC = getSinAndCosineTime(minutesPastMidnight)
  hourSine.append(hourS)
  hourCos.append(hourC)

  # Day of month
  dayS, dayC = getSinAndCosineDay(parsedDate.day)
  daySine.append(dayS)
  dayCos.append(dayC)

  # Month of year
  monthS, monthC = getSinAndCosineMonth(parsedDate.month)
  monthSine.append(monthS)
  monthCos.append(monthC)

  # Full timestamp
  dateTime.append(parsedDate)




In [None]:
# Attach the derived date features (and the parsed timestamp) as columns.
dateFeatureColumns = {
  "hourSin": hourSine,
  "hourCos": hourCos,
  "daySin": daySine,
  "dayCos": dayCos,
  "monthSin": monthSine,
  "monthCos": monthCos,
  "dateTime": dateTime,
}
for columnName, columnValues in dateFeatureColumns.items():
  pandasMarketData[columnName] = columnValues

Drop the original `Date` column, which is now replaced by the derived features.

In [None]:
# The raw date string is superseded by the cyclical features and `dateTime`.
pandasMarketData = pandasMarketData.drop(columns="Date")

In [None]:
# Preview the first rows to check the new date-feature columns.
pandasMarketData.head()

# Text model

### Create dataset for text model

In [None]:
# Per row: slot 0 receives the concatenated text ids and slot 1 the label
# repeated to the same width (both filled in by the labelling loop below).
rawDataset = np.zeros((len(pandasMarketData),2,longest*3))

Building the dataset

In [None]:
# Spot-check the time gap between two consecutive rows (labels 8 and 9).
# NOTE(review): `.seconds` is only the seconds component of the timedelta; it
# does not include whole days.
delta = (pandasMarketData["dateTime"][9] - pandasMarketData["dateTime"][8])
delta.seconds

In [None]:
# Fill rawDataset with (text ids, label) for every row whose 10-rows-ahead
# timestamp lies between 550 and 800 seconds later, and remember those rows.
rowsToKeep = []
timestamps = pandasMarketData["dateTime"]
asks = pandasMarketData["Ask"]
bids = pandasMarketData["Bid"]
for i in range(len(allTexts)-10):
  # total_seconds() measures the whole gap. `.seconds` drops the days part, so
  # a gap of e.g. one day plus ten minutes would wrongly pass the filter.
  currentDelta = (timestamps[i+10] - timestamps[i]).total_seconds()
  if 550 < currentDelta < 800:
    rawDataset[i][0] = allTexts[i]
    # Label 1 if, in any of rows i..i+9, the bid exceeds this row's ask by more than 2.
    targetPrice = asks[i] + 2
    isHigher = int(any(targetPrice < bids[i+x] for x in range(10)))
    # The label is repeated so it fits the same row width as the text ids.
    rawDataset[i][1] = np.repeat(isHigher,longest*3)

    rowsToKeep.append(i)

In [None]:
# Expected: (number of rows, 2, longest*3), as allocated above.
rawDataset.shape

In [None]:
# Keep only the rows that passed the time-gap filter, in their original order.
keptRows = np.asarray(rowsToKeep, dtype=int)
cleanDataset = rawDataset[keptRows]

In [None]:
# Expected: (number of kept rows, 2, longest*3).
cleanDataset.shape

separating into input and target

NOTE: I need to split the data into training and test

In [None]:
# One dataset element per kept row; each element has shape (2, longest*3).
tensorDataset = tf.data.Dataset.from_tensor_slices(cleanDataset)

In [None]:
# Displays the dataset object produced by take(1); it does not print the element itself.
tensorDataset.take(1)

In [None]:
def split_input_target(chunk):
    """Split one dataset element into (input, target).

    Args:
        chunk: indexable pair whose first item is the input sequence and whose
            second item is the label repeated along the sequence.

    Returns:
        Tuple (input sequence, first entry of the repeated label).
    """
    features, repeated_label = chunk[0], chunk[1]
    return features, repeated_label[0]

# Turn every (2, longest*3) element into an (input ids, scalar label) pair.
dataset = tensorDataset.map(split_input_target)

In [None]:
# Inspect one unbatched (input, target) pair.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', input_example.shape)
  print ('Target data:', target_example)

batching

In [None]:
# Fixed batch size: it is also the default `batch_size` baked into the model's
# batch_input_shape, so incomplete final batches are dropped.
BATCH_SIZE = 64
datasetBatched = dataset.batch(BATCH_SIZE, drop_remainder=True)

shuffle

In [None]:
BUFFER_SIZE = 10000
# Shuffle individual examples and only then batch them. Shuffling the already
# batched dataset would merely reorder whole batches, leaving the same 64
# neighbouring rows together in every epoch.
# Batch shape is unchanged: (BATCH_SIZE, longest*3) inputs, incomplete batch dropped.
cleanedDataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Inspect one batch. `input_example` / `target_example` stay bound after the
# loop and are reused by the quick prediction check after the model is built.
for input_example, target_example in  cleanedDataset.take(1):
  print ('Input data: ', input_example.shape)
  print ('Target data:', target_example)

### Build model

In [None]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 512

# Number of RNN units
# NOTE(review): 1028 may be a typo for 1024 — confirm before changing, since
# saved checkpoints depend on this size.
rnn_units = 1028

In [None]:
def build_model(vocab_size,embedding_dim=embedding_dim,rnn_units=rnn_units,batch_size=BATCH_SIZE):
  """Build the character-level text classifier.

  Args:
    vocab_size: number of distinct character ids (input dimension of the embedding).
    embedding_dim: size of each character embedding. The default is the value
      the module-level variable had when this function was defined.
    rnn_units: number of GRU units (default bound the same way).
    batch_size: fixed batch size baked into the input shape (default bound the same way).

  Returns:
    tf.keras.Sequential mapping a (batch_size, longest*3) batch of character
    ids to (batch_size, 2) logits: one pair of class scores per example.
  """
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size,longest* 3]),
    # return_sequences=False: emit only the final state, i.e. one vector per
    # example. The targets are one scalar label per example, and both the loss
    # and the np.argmax-based checks expect exactly two logits per example;
    # per-timestep outputs of shape (batch, seq, 2) do not match them.
    # NOTE(review): stateful=True carries GRU state across batches although the
    # examples are independent; kept as-is, confirm it is intended.
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=False,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    # NOTE(review): these Dense layers have no activation, so together they
    # form a single linear map; consider adding a non-linearity.
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(20),
    tf.keras.layers.Dense(2) # logits for class 0 (no) and class 1 (yes)
  ])
  return model

In [None]:
# Training model: uses the default embedding size, GRU size and batch size.
model = build_model(vocab_size=len(vocab))
model.summary()

In [None]:
# Predicted class for the first example of the last previewed batch
# (`input_example` comes from the batch preview loop).
# NOTE(review): np.argmax without `axis` works on the flattened array, so the
# result is 0 or 1 only when the model returns exactly two logits per example —
# confirm against the model's output shape.
np.argmax(model(input_example)[0])

### Continue the model, Training loop

In [None]:
# Adam with its default settings; used by train_step.
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files ("{epoch}" is filled in with str.format when saving)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# NOTE(review): this callback is never passed to anything in the visible
# notebook; the custom training loop saves weights itself via checkpoint_prefix.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
@tf.function
def train_step(inp, target):
  """Run one optimisation step on a batch and return its mean loss.

  Uses the module-level `model` and `optimizer`.

  Args:
    inp: batch of input id sequences.
    target: batch of integer class labels.

  Returns:
    Scalar tensor: mean sparse categorical cross-entropy (from logits) of the batch.
  """
  weights = model.trainable_variables
  with tf.GradientTape() as tape:
    logits = model(inp)
    perExampleLoss = tf.keras.losses.sparse_categorical_crossentropy(
        target, logits, from_logits=True)
    loss = tf.reduce_mean(perExampleLoss)
  gradients = tape.gradient(loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

  return loss

In [None]:
# Training loop
EPOCHS = 20

for epoch in range(EPOCHS):
  start = time.time()

  # Clear the stateful GRU's carried-over state at the start of every epoch.
  # reset_states() returns None, so there is no value worth keeping.
  model.reset_states()

  # Defined up front so the report below still works (printing "nan") when the
  # dataset yields no batches, instead of raising NameError.
  loss = float('nan')
  for inp, target in cleanedDataset:
    loss = train_step(inp, target)

  # Save a checkpoint every 5 epochs. The file name carries the 0-based epoch
  # index (ckpt_4, ckpt_9, ...).
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch))

  # `loss` is the loss of the last batch of the epoch, not an epoch average.
  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save the final weights, whatever EPOCHS is.
model.save_weights(checkpoint_prefix.format(epoch=epoch))

### Saving the model

In [None]:
# Export the full model under saved_model/my_model (relative to the working directory).
model.save('saved_model/my_model')
#To load: new_model = tf.keras.models.load_model('saved_model/my_model')

In [None]:
!zip -r -r /content/model.zip /content/saved_model

## testing

In [None]:
# Run the model on a single batch; `predictions` stays bound for inspection.
for input_example, target_example in  cleanedDataset.take(1):
  predictions = (model(input_example))


In [None]:
# Count hits and misses, and the label balance, over 10 batches.
# NOTE: cleanedDataset is the same dataset the training loop iterates over, so
# these figures describe fit on training data, not generalisation.
correct = 0
incorrect = 0
totalOnes = 0
totalZeros = 0
for input_example, target_example in  cleanedDataset.take(10):
  predictions = model(input_example)
  for i in range(len(predictions)):
    # np.argmax is taken over all of example i's outputs (flattened).
    # NOTE(review): this is a class id only if the model returns two logits per example.
    if(np.argmax(predictions[i]) == target_example[i]):
      correct += 1
    else:
      incorrect += 1 
    if(target_example[i] == 1):
      totalOnes +=1
    else:
      totalZeros += 1

print("correct predictions: ", correct)
print("incorrect predictions: ",incorrect)
print("total ones: ", totalOnes)
print("total zeros: ",totalZeros)

  

### Predicting one example at a time

In [None]:
# Show which checkpoint in checkpoint_dir will be restored below.
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Rebuild the same architecture with batch size 1 so single rows can be scored.
# This rebinds the global `model`.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the most recent weights written by the training loop.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [None]:
# Confirm the rebuilt model's layer shapes.
model.summary()

# Testing the model more realistically.

In [None]:
# Score the first row; `[None]` adds the leading batch dimension of size 1.
np.argmax(model(pandasMarketData["allTextInt"][0][None]))

In [None]:
# Simple backtest: buy one unit at the ask when the model predicts 1 and no
# position is open; sell at the bid as soon as it exceeds the entry price, or
# unconditionally once 10 rows have passed.
prediction = 0
balance = 0
active = 0          # 1 while a position is open
price = 0           # entry price of the open position
endOf10Min = 0      # row index at which the open position is force-closed
buys = 0
losses = 0
wins = 0
balanceChart = []   # cumulative balance after every closed trade

asks = pandasMarketData["Ask"]
bids = pandasMarketData["Bid"]
textIds = pandasMarketData["allTextInt"]

for i in range(len(pandasMarketData) - 10):
  prediction = np.argmax(model(textIds[i][None]))
  if prediction == 1 and active == 0:
    active = 1
    price = asks[i]
    endOf10Min = i+10
    buys += 1

  if active == 1:
    currentBid = bids[i]
    timedOut = i >= endOf10Min
    if price < currentBid or timedOut:
      profit = currentBid - price
      balance += profit
      # Classify by the realised result. Previously every timed-out trade was
      # counted as a loss, even when it closed above the entry price.
      if profit > 0:
        wins += 1
      else:
        losses += 1
      active = 0
      price = 0
      balanceChart.append(balance)

  if i%100 == 0:
    print("completed: ", i/len(pandasMarketData)*100, "%")
    print("Balance: ", balance)
    print("buys: ", buys)
    print("wins: ", wins)
    print("losses: ", losses)
      


    


  

In [None]:
import matplotlib.pyplot as plt

# One point per closed trade: the cumulative balance after that trade.
plt.plot(balanceChart)
plt.xlabel("Closed trade")
plt.ylabel("Cumulative balance")
plt.title("Backtest balance")
# Call show(); the bare attribute `plt.show` does nothing.
plt.show()