In [1]:
import pandas as pd
import eikon as ek
import numpy as np
from IPython.display import HTML
import os
from time import sleep
from tqdm import tqdm
import sys
import datetime
from pytz import timezone # set timezone
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

1. Get the data and news headlines for the symbols of Dow Jones, Nasdaq100, SP500.
2. Prepare the data
- PCO to intialize weights help in time computation reduction and global optima finding
- Denoising input data helps predict small price changes
3. Build Languauge model based on character embedding
- Epoch means one pass over the full training set
- Batch means that you use all your data to compute the gradient during one iteration.
- Mini-batch means you only take a subset of all your data during one iteration.
- In the context of SGD, "Minibatch" means that the gradient is calculated across the entire batch before updating weights. If you are not using a "minibatch", every training example in a "batch" updates the learning algorithm's parameters independently.

- Batch Gradient Descent. Batch size is set to the total number of examples in the training dataset. (batch_size = len(train))
- Stochastic Gradient Descent. Batch size is set to one. (batch_size = 1)
- Minibatch Gradient Descent. Batch size is set to more than one and less than the total number of examples in the training dataset. (batch_size = 32,64...)


##### Benefits of Charac2vec:
- Having the character embedding, every single word’s vector can be formed even it is out-of-vocabulary words (optional). On the other hand, word embedding can only handle those seen words.
- Good fits for misspelling words
- handles infrequent words better than word2vec embedding as later one suffers from lack of enough training opportunity for those rare words
- Reduces model complexity and improving the performance (in terms of speed)

##### Byte Level:
- When ASCII encoding is used, there is no difference between reading characters or bytes. The ASCII-way of encoding characters allows for 256 characters to be encoded and (surprise…) these 256 possible characters are stored as bytes.
4. Train Language Model and save embeddings representation and weights of the model.
5. Use weights and embeddings representation of language model to intialize new model that predict price direction movement, ultimaetly.
- Direction prediction correctness(DPC) will be used as final metric to evaluate on test data.
- DPC is %CorrectPredictions. 

## Get all data 

In [9]:
length = 0
with pd.HDFStore('./news_update/news_db.h5', mode = 'r') as store:
    for i in store.keys():
        length += len(store[i])
print(length)


625478


In [47]:
unique_index_symbols.reverse()

In [104]:
df.loc[:, 'versionCreated'].min()

Timestamp('2020-08-14 20:03:19+0000', tz='UTC')

In [98]:
df = db[unique_index_symbols[524].split('.')[0]]
df.versionCreated.min()

Timestamp('2020-08-14 20:03:19+0000', tz='UTC')

In [74]:
(df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=18)).strftime('%Y-%m-%d')

'2020-07-27'

In [76]:
end_at = (df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
begin_from = (df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=18)).strftime('%Y-%m-%d')
for start in pd.date_range(start = begin_from, end = end_at,normalize=True, tz = 'US/Eastern',freq = 'D'):
    end = start + datetime.timedelta(days=1)
    start = start.strftime('%Y-%m-%dT%H:%M:%S')
    end = end.strftime('%Y-%m-%dT%H:%M:%S') 
    print('start: ',start, 'end: ', end)


start:  2020-07-27T00:00:00 end:  2020-07-28T00:00:00
start:  2020-07-28T00:00:00 end:  2020-07-29T00:00:00
start:  2020-07-29T00:00:00 end:  2020-07-30T00:00:00
start:  2020-07-30T00:00:00 end:  2020-07-31T00:00:00
start:  2020-07-31T00:00:00 end:  2020-08-01T00:00:00
start:  2020-08-01T00:00:00 end:  2020-08-02T00:00:00
start:  2020-08-02T00:00:00 end:  2020-08-03T00:00:00
start:  2020-08-03T00:00:00 end:  2020-08-04T00:00:00
start:  2020-08-04T00:00:00 end:  2020-08-05T00:00:00
start:  2020-08-05T00:00:00 end:  2020-08-06T00:00:00
start:  2020-08-06T00:00:00 end:  2020-08-07T00:00:00
start:  2020-08-07T00:00:00 end:  2020-08-08T00:00:00
start:  2020-08-08T00:00:00 end:  2020-08-09T00:00:00
start:  2020-08-09T00:00:00 end:  2020-08-10T00:00:00
start:  2020-08-10T00:00:00 end:  2020-08-11T00:00:00
start:  2020-08-11T00:00:00 end:  2020-08-12T00:00:00
start:  2020-08-12T00:00:00 end:  2020-08-13T00:00:00
start:  2020-08-13T00:00:00 end:  2020-08-14T00:00:00


In [50]:
length = 0
for i in range(ord('A'), ord('Z') + 1):
    length += len(db[chr(i)])

In [37]:
df = db['AAPL.N'.split('.')[0][0]].
df.loc[:, 'versionCreated'].min().strftime('%Y-%m-%d')

'2020-07-27'

In [94]:

import json
import os.path

if os.path.isfile('unique_tickers.json') == False:
    with open('unique_tickers.json', 'w', encoding='utf-8') as f:
        json.dump(unique_index_symbols, f, ensure_ascii=False, indent=4)
    
with open('unique_tickers.json', 'r') as data_file:
    unique_index_symbols = json.load(data_file)


In [95]:
import requests
import datetime
from time import sleep
from tqdm import tqdm
data = pd.DataFrame()
for t in tqdm(unique_index_symbols):
    r = requests.get("https://finnhub.io/api/v1/press-releases?symbol="+t+"&token=bsbhg3nrh5rd8518ll30")
    df = pd.DataFrame(r.json()['majorDevelopment'])
    data = data.append(df, ignore_index=True)
    sleep(0.5)

data = data.rename(columns={'datetime': 'time'})
data.time = pd.to_datetime(data.time)
data.time = data.time.apply(lambda dt: int((dt - datetime.datetime(1970,1,1)).total_seconds()))
# data['description'] = data['description'].str.replace(r"\W+|_", ' ')
# data['headline'] = data['headline'].str.replace(r"\W+|_", ' ')
data.to_csv('news_test.csv', header = True, index = False)
#df.datetime = pd.to_datetime(df.datetime, unit='s')

100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [06:58<00:00,  1.26it/s]


In [42]:
r2 = requests.get('https://finnhub.io/api/v1/company-news?symbol=DUK&from=2006-01-01&to=2006-01-02&token=bsbhg3nrh5rd8518ll30')
df = pd.DataFrame(r2.json())
# df = df[['datetime', 'headline', 'id','related', 'source', 'summary']]
# df['summary'] = df['summary'].str.replace(r"\W+|_", ' ')
# df['headline'] = df['headline'].str.replace(r"\W+|_", ' ')
# df = df.rename(columns={'related': 'symbol'})

In [96]:
#Summary Statistics
pd.to_datetime(data.time, unit='s').min(), pd.to_datetime(data.time, unit='s').max(), data.groupby('symbol')['headline'].count().mean()

(Timestamp('2019-09-02 00:00:00'),
 Timestamp('2020-08-10 23:14:47'),
 32.88030888030888)

## Prepare Data for Model

In [82]:
from keras.utils import to_categorical, plot_model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding, Bidirectional, TimeDistributed
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

In [83]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

if gpu_devices:
    print('Using GPU')
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
else:
    print('Using CPU')
    print(tf.config.list_physical_devices(device_type=None))
    tf.config.optimizer.set_jit(True) #@ 

Using CPU
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]


In [1100]:
#clean the already cleaned headlines but pre/appen token(<s>)
#8 bits will let you express 2^8 == 256 possible values.Thus restrict characters to 0 < ord() < 256. 
def clean_text(df, column):
    import re 
    #("".join(headline)).strip()
    headline = []
    for i in df[column].apply(lambda x: '<s>'+x+'<\s>'):
        #headline.append(re.sub('[!,*)@=#({|}_‑–?^;:{|}˚~\t\n“—’”/_]',r'',i))
        headline.append(i)
    return headline

https://realpython.com/python-encodings-guide/

In [1101]:
text = clean_text(data, 'headline')
txt = ''
# Count Unique Characters
for doc in text:
    for s in doc:
        txt += s
chars = sorted(set(txt))
print(chars)
print(len(chars)) #52+10+25

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '\xa0', '£', '¥', '®', '´', 'µ', 'Æ', 'É', '×', 'Ø', 'à', 'á', 'ä', 'å', 'é', 'ê', 'ë', 'í', 'ï', 'ñ', 'ó', 'ö', 'ú', 'ü', 'ē', 'Š', '˚', '\u200b', '\u200c', '\u200d', '‐', '‑', '–', '—', '’', '“', '”', '€', '™']
129


In [1102]:
#reserve 0 for padding: check if it is found.
#NOTE:  (len(bytes(i, encoding = 'utf-8')) > 1) == ord(i) > 127
for i in chars: 
    if (ord(i) == 0) | (ord(i) > 127):
        print(i,':',ord(i))
# {char: i for i, char in zip(map(ord, chars),chars)}

  : 160
£ : 163
¥ : 165
® : 174
´ : 180
µ : 181
Æ : 198
É : 201
× : 215
Ø : 216
à : 224
á : 225
ä : 228
å : 229
é : 233
ê : 234
ë : 235
í : 237
ï : 239
ñ : 241
ó : 243
ö : 246
ú : 250
ü : 252
ē : 275
Š : 352
˚ : 730
​ : 8203
‌ : 8204
‍ : 8205
‐ : 8208
‑ : 8209
– : 8211
— : 8212
’ : 8217
“ : 8220
” : 8221
€ : 8364
™ : 8482


In [1103]:
def encode2bytes(text):
    #text = tf.strings.unicode_split(text, 'UTF-8').to_list()
    final_list = []
    for sent in text:
        temp_list = []
        for char in sent:
            if ord(char) < 128 :
                temp_list.append(ord(char))
        final_list.append(temp_list)
    return final_list

In [1104]:
b_text = encode2bytes(text)

In [1105]:
max_sentence_len = max([len(sentence) for sentence in b_text])
max_sentence_len

412

In [1112]:
for i in range(0,21):
    print(data.headline[i])
    print(text[i])
    print(b_text[i])
    print('\n'*3)

Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes
<s>Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\s>
[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 65, 110, 110, 111, 117, 110, 99, 101, 115, 32, 80, 114, 105, 99, 105, 110, 103, 32, 79, 102, 32, 36, 53, 48, 48, 32, 77, 105, 108, 108, 105, 111, 110, 32, 79, 102, 32, 83, 101, 110, 105, 111, 114, 32, 78, 111, 116, 101, 115, 60, 92, 115, 62]




Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed
<s>Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed<\s>
[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 70, 105, 108, 101, 115, 32, 70, 111, 114, 32, 80, 111, 116, 101, 110, 116, 105, 97, 108, 32, 83, 101, 110, 105, 111, 114, 32, 78, 111, 116, 101, 115, 32, 79, 102, 102, 101, 114, 105, 110, 103, 32, 83, 105, 

In [1113]:
def split_X_y(text):
    X = []
    y = []
    for i in text:
        X.append(i[0:-1])
        y.append(i[1:])
    return X,y

In [1150]:
X, y = split_X_y(b_text)

In [1118]:
#Each sentence is represented as the concatenation of bytes that form its characters in utf-8 encoding.
num = np.random.randint(0, len(X))
print(X[num])
print(text[num])
print(y[num])

[60, 115, 62, 86, 101, 114, 105, 122, 111, 110, 32, 67, 111, 109, 109, 117, 110, 105, 99, 97, 116, 105, 111, 110, 115, 32, 73, 110, 99, 46, 32, 80, 114, 111, 118, 105, 100, 101, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 71, 117, 105, 100, 97, 110, 99, 101, 32, 102, 111, 114, 32, 116, 104, 101, 32, 83, 101, 99, 111, 110, 100, 32, 81, 117, 97, 114, 116, 101, 114, 32, 111, 102, 32, 50, 48, 49, 57, 60, 92, 115]
<s>Verizon Communications Inc. Provides Earnings Guidance for the Second Quarter of 2019<\s>
[115, 62, 86, 101, 114, 105, 122, 111, 110, 32, 67, 111, 109, 109, 117, 110, 105, 99, 97, 116, 105, 111, 110, 115, 32, 73, 110, 99, 46, 32, 80, 114, 111, 118, 105, 100, 101, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 71, 117, 105, 100, 97, 110, 99, 101, 32, 102, 111, 114, 32, 116, 104, 101, 32, 83, 101, 99, 111, 110, 100, 32, 81, 117, 97, 114, 116, 101, 114, 32, 111, 102, 32, 50, 48, 49, 57, 60, 92, 115, 62]


#  ###########OLD#########################

In [None]:
# Check UTF-8 Encoding 
byte_text = []
for i in text:
    i = i.encode('utf-8')
    byte_text.append(i)
byte_text[0:5]

In [None]:
def encode2bytes(text):
    final_list = []
    for sent in text:
        sent = sent.encode('utf-8')
        temp_list = []
        for char in sent:
            temp_list.append(char)
        final_list.append(temp_list)
    return final_list

In [25]:
char2ind = {char: index +1 for index, char in enumerate(chars)} #index +1 to leave 0 for padding
max_sentence_len = max([len(sentence) for sentence in text])

In [47]:
from keras.preprocessing.text import Tokenizer
# Initialization
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK', lower = False)
# Fitting
tk.fit_on_texts(text)

In [27]:
# Use char_dict to replace the tk.word_index
tk.word_index = char2ind 
# Add 'UNK' to the vocabulary 
tk.word_index[tk.oov_token] = max(char2ind.values()) + 1

# invert encoding
#index2char = {char: index for index, char in tk.word_index.items()}
chars.append(tk.oov_token)
chars.insert(0,'')
index2char = np.array(chars)

In [28]:
print(tk.word_index)
print()
print(index2char)

{' ': 1, '.': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, '<': 13, '>': 14, 'A': 15, 'B': 16, 'C': 17, 'D': 18, 'E': 19, 'F': 20, 'G': 21, 'H': 22, 'I': 23, 'J': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'Q': 31, 'R': 32, 'S': 33, 'T': 34, 'U': 35, 'V': 36, 'W': 37, 'X': 38, 'Y': 39, 'Z': 40, '\\': 41, 'a': 42, 'b': 43, 'c': 44, 'd': 45, 'e': 46, 'f': 47, 'g': 48, 'h': 49, 'i': 50, 'j': 51, 'k': 52, 'l': 53, 'm': 54, 'n': 55, 'o': 56, 'p': 57, 'q': 58, 'r': 59, 's': 60, 't': 61, 'u': 62, 'v': 63, 'w': 64, 'x': 65, 'y': 66, 'z': 67, 'µ': 68, 'Æ': 69, 'É': 70, 'Ø': 71, 'à': 72, 'á': 73, 'ä': 74, 'å': 75, 'é': 76, 'ê': 77, 'ë': 78, 'í': 79, 'ï': 80, 'ñ': 81, 'ó': 82, 'ö': 83, 'ú': 84, 'ü': 85, 'ē': 86, 'Š': 87, 'UNK': 88}

['' ' ' '.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '<' '>' 'A' 'B' 'C'
 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U'
 'V' 'W' 'X' 'Y' 'Z' '\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' '

In [29]:
print(max(char2ind.values())),print(max(tk.word_index.values())), print(len(tk.word_index));

88
88
88


In [50]:
sequences = list()
for line in text:
    # integer encode line
    encoded_seq = np.array([tk.word_index[char] for char in line])
    #encoded_seq = np.array([ord(char) for char in line])

    # store
    sequences.append(encoded_seq)

In [56]:
# tranform text characters to unique index
sequences = tk.texts_to_sequences(text)
print(text[0])
print(sequences[0])


<s>Agilent Technologies Announces Pricing Of 500 Million Of Senior Notes<\s>
[13, 4, 14, 23, 22, 10, 11, 3, 5, 9, 2, 34, 3, 12, 17, 5, 6, 11, 6, 22, 10, 3, 4, 2, 23, 5, 5, 6, 16, 5, 12, 3, 4, 2, 29, 7, 10, 12, 10, 5, 22, 2, 36, 26, 2, 54, 25, 25, 2, 38, 10, 11, 11, 10, 6, 5, 2, 36, 26, 2, 20, 3, 5, 10, 6, 7, 2, 43, 6, 9, 3, 4, 13, 24, 4, 14]


In [72]:
X = []
y = []
for i in sequences:
    X.append(i[0:-1])
    y.append(i[1:])

# ###########OLD#########################

Masking is a way to tell sequence-processing layers that certain timesteps in an input are missing, and thus should be skipped when processing the data.

Padding is a special form of masking where the masked steps are at the start or at the beginning of a sequence. Padding comes from the need to encode sequence data into contiguous batches: in order to make all sequences in a batch fit a given standard length, it is necessary to pad or truncate some sequences.
https://stackoverflow.com/questions/53172852/masking-zero-inputs-in-lstm-in-keras-without-using-embedding

In [1153]:
X = pad_sequences(X, maxlen = max_sentence_len, padding = 'post')
y = pad_sequences(y, maxlen = max_sentence_len, padding = 'post')

In [1121]:
# Add padding for longer sentences
print(text[num])
print(X[num])
print(y[num])

<s>Verizon Communications Inc. Provides Earnings Guidance for the Second Quarter of 2019<\s>
[ 60 115  62  86 101 114 105 122 111 110  32  67 111 109 109 117 110 105
  99  97 116 105 111 110 115  32  73 110  99  46  32  80 114 111 118 105
 100 101 115  32  69  97 114 110 105 110 103 115  32  71 117 105 100  97
 110  99 101  32 102 111 114  32 116 104 101  32  83 101  99 111 110 100
  32  81 117  97 114 116 101 114  32 111 102  32  50  48  49  57  60  92
 115   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   

In [1122]:
#split input / output
# X ,y = X_char[:,:-1], X_char[:,1:]
# print('X: ' , X[0], len(X[0]) ,'\n', 'y: ', y[0], len(y[0]))

In [1123]:
X.shape, y.shape

((17032, 412), (17032, 412))

### Another Option of Splitting Sequence
*But misses input/output since we had to pad before splitting*

In [822]:
padded_seq = pad_sequences(b_text, maxlen = max_sentence_len, padding = 'post')

Use the `tf.data.Dataset.from_tensor_slices` function to convert a text vector into a stream of character indices.

In [823]:
# Create Training Sequences
char_dataset = tf.data.Dataset.from_tensor_slices(padded_seq)
print(char_dataset)
for i in char_dataset.take(10):
     print("".join(map(chr, i)))

<TensorSliceDataset shapes: (401,), types: tf.int32>
<s>Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\s>                                                                                                                                                                                                                                                                                                                                    
<s>Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed<\s>                                                                                                                                                                                                                                                                                                                         
<s>Agilent Technologies Q2 GAAP Earnings Per Share $0.32<\s>                                                                                   

In [824]:
def create_seq_targets(seq):
    input_txt = seq[:-1]
    target_txt = seq[1:]
    return input_txt, target_txt

___________________________________________________________________________________________________________________________________

In [1335]:
# seq_data = char_dataset.map(create_seq_targets)
# seq_data
train_size = len(text) * 50//100
train_seq_data = tf.data.Dataset.from_tensor_slices((X[:train_size],y[:train_size]))
test_seq_data = tf.data.Dataset.from_tensor_slices((X[train_size:],y[train_size:]))

In [1336]:
#Checking OR next(itr(...))
for input_txt, target_txt in  seq_data.take(5):
    print('--------------------------------Headline--------------------------------')
    print(input_txt.numpy())
    print("".join(map(chr, input_txt.numpy())))
#     print(''.join(index2char[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    print("".join(map(chr, target_txt.numpy())))
    # There is an extra whitespace!
#     print(''.join(index2char[target_txt.numpy()]))

--------------------------------Headline--------------------------------
[ 60 115  62  65 103 105 108 101 110 116  32  84 101  99 104 110 111 108
 111 103 105 101 115  32  65 110 110 111 117 110  99 101 115  32  80 114
 105  99 105 110 103  32  79 102  32  36  53  48  48  32  77 105 108 108
 105 111 110  32  79 102  32  83 101 110 105 111 114  32  78 111 116 101
 115  60  92 115   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

In [1356]:
# Batch size
batch_size = 128

# Buffer size to shuffle the dataset so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in which it shuffles elements
buffer_size = 10000

dataset = train_seq_data.shuffle(buffer_size, seed = 42).batch(batch_size, drop_remainder=True)

In [1357]:
dataset

<BatchDataset shapes: ((128, 412), (128, 412)), types: (tf.int32, tf.int32)>

In [1358]:
train_example_batch, train_label_batch = next(iter(dataset.batch(1)))
train_example_batch

<tf.Tensor: shape=(1, 32, 412), dtype=int32, numpy=
array([[[ 60, 115,  62, ...,   0,   0,   0],
        [ 60, 115,  62, ...,   0,   0,   0],
        [ 60, 115,  62, ...,   0,   0,   0],
        ...,
        [ 60, 115,  62, ...,   0,   0,   0],
        [ 60, 115,  62, ...,   0,   0,   0],
        [ 60, 115,  62, ...,   0,   0,   0]]])>

## Build Language Model

In [1359]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
#https://datascience.stackexchange.com/questions/41921/sparse-categorical-crossentropy-vs-categorical-crossentropy-keras-accuracy
def sparse_cat_loss(y_true,y_pred):
  return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

#### Tips for LSTM Inputs 
- The LSTM input layer must be 3D.
- The meaning of the 3 input dimensions are: samples, time steps, and features (sequences, sequence_length, characters).
- The LSTM input layer is defined by the input_shape argument on the first hidden layer.
- The input_shape argument takes a tuple of two values that define the number of time steps and features.
- The number of samples is assumed to be 1 or more.
- The reshape() function on NumPy arrays can be used to reshape your 1D or 2D data to be 3D.
- The reshape() function takes a tuple as an argument that defines the new shape
- The LSTM return the entire sequence of outputs for each sample (one vector per timestep per sample), if you set return_sequences=True.
- Stateful RNN only makes sense if each input sequence in a batch starts exactly where the corresponding sequence in the previous batch left off. Our RNN model is stateless since each sample is different from the other and they dont form a text corpus but are separate headlines.

#### Tips for Embedding Layer
- Gives relationship between characters.
- Dense vector representation (n-Dimensional) of float point values. Map(char/byte) to a dense vector.
- Embeddings are trainable weights/paramaeters by the model equivalent to weights learned by dense layer.
- In our case each unique character/byte is represented with an N-Dimensional vector of floating point values, where the learned embedding forms a lookup table by "looking up" each characters dense vector in the table to encode it.
- A simple integer encoding of our characters is not efficient for the model to interpret since a linear classifier only learns the weights for a single feature but not the relationship (probability distribution) between each feature(characters) or there encodings.
- A higher dimensional embedding can capture fine-grained relationships between characters, but takes more data to learn.(256-Dimensions our case)


In [1360]:

# stateful=True: preserve hidden(final) state for next batch's intial state
def create_model(batch_size):
    model = Sequential(name = 'CharLSTM')
    model.add(Embedding(127, 256,batch_input_shape=[batch_size, None], mask_zero=True, name ='EmbedLayer'))
    model.add(Bidirectional(LSTM(1024, return_sequences=True,stateful=False,recurrent_initializer='glorot_uniform'), name = 'BiLSTM'))
    model.add(TimeDistributed(Dense(127, name = 'TimeDistDense')))
    model.compile(optimizer=tf.optimizers.SGD(learning_rate=1e-3), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))
    return model

In [1361]:
model = create_model(batch_size)

In [1362]:
model.summary()

Model: "CharLSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EmbedLayer (Embedding)       (128, None, 256)          32512     
_________________________________________________________________
BiLSTM (Bidirectional)       (128, None, 2048)         10493952  
_________________________________________________________________
time_distributed_15 (TimeDis (None, None, 127)         260223    
Total params: 10,786,687
Trainable params: 10,786,687
Non-trainable params: 0
_________________________________________________________________


In [1363]:
for input_example_batch, target_example_batch in dataset.take(1):
    print(input_example_batch[0], '\n')
    print(target_example_batch[0])


tf.Tensor(
[ 60 115  62  75 101 115 115 108 101 114  32  84 111 112  97 122  32  77
 101 108 116 122 101 114  32  38  32  67 104 101  99 107  44  32  76  76
  80  32  70 105 108 101 115  32  83 104  97 114 101 104 111 108 100 101
 114  32  67 108  97 115 115  32  65  99 116 105 111 110  32  76  97 119
 115 117 105 116  32  65 103  97 105 110 115 116  32  66  97 120 116 101
 114  32  73 110 116 101 114 110  97 116 105 111 110  97 108  32  73 110
  99  60  92 115   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0

In [1371]:
for input_example_batch, target_example_batch in dataset.take(1):

  # Predict off some random batch
  example_batch_predictions = model(input_example_batch)

  # Display the dimensions of the predictions
  print(example_batch_predictions.shape, " <=== (batch_size, sequence_length, byte/character)")


(128, 412, 127)  <=== (batch_size, sequence_length, byte/character)


#### Logits Predicting Log-Likelihood from Ouput Layer:
For each character/byte the model looks up the embedding, runs the LSTM one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character/Byte.
This distribution, for each predicted character/byte, is defined by the logits over the characters(i.e 1-127 Decimal Points bytes).

In [1374]:
# sum(np.where(example_batch_predictions[0][1] < 0, example_batch_predictions[0][1] * -1, example_batch_predictions[0][1])
example_batch_predictions[0]

<tf.Tensor: shape=(412, 127), dtype=float32, numpy=
array([[ 0.00267012, -0.00507424,  0.00358143, ...,  0.00626232,
         0.01144155, -0.00284741],
       [ 0.00349888, -0.00954466, -0.00494754, ...,  0.00844962,
         0.01703026,  0.00099191],
       [-0.00170235, -0.01700655, -0.00229823, ...,  0.00122966,
         0.01756719,  0.00086103],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)>

In [1377]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1) + 1

In [1378]:
sampled_indices

<tf.Tensor: shape=(412, 1), dtype=int64, numpy=
array([[107],
       [  2],
       [ 66],
       [ 23],
       [ 55],
       [ 14],
       [ 23],
       [105],
       [ 43],
       [ 51],
       [105],
       [ 16],
       [ 35],
       [  8],
       [118],
       [109],
       [ 97],
       [ 18],
       [115],
       [ 84],
       [ 84],
       [ 48],
       [107],
       [  3],
       [102],
       [  6],
       [ 48],
       [ 92],
       [ 12],
       [ 67],
       [ 55],
       [ 88],
       [ 28],
       [ 23],
       [ 88],
       [122],
       [ 13],
       [ 81],
       [ 18],
       [ 12],
       [122],
       [ 83],
       [ 12],
       [ 73],
       [123],
       [126],
       [  3],
       [ 99],
       [ 54],
       [ 94],
       [ 98],
       [ 53],
       [ 27],
       [ 76],
       [ 83],
       [ 14],
       [ 91],
       [101],
       [ 35],
       [ 15],
       [ 54],
       [ 66],
       [101],
       [ 42],
       [ 45],
       [ 39],
       [ 36],
       [110],


In [1379]:
# Reformat to not be a lists of lists
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [1380]:
chr(18)

'\x12'

In [1381]:
print("Given the input seq: \n")
print("".join(map(chr,input_example_batch[0])))
print(len("".join(map(chr,input_example_batch[0]))))
print('\n')
print("Chars Predictions: \n")
print("".join(map(chr,sampled_indices)))
print(len("".join(map(chr,sampled_indices))))
print(sampled_indices)

Given the input seq: 

<s>NextEra Energy Partners, LP (NYSE:NEP) completed the acquisition of 39.2% stake in Central Penn Line from Cabot Oil & Gas Corporation (NYSE:COG), WGL Midstream, Inc. and EIF Vega Midstream, LLC.<\s                                                                                                                                                                                                                    
412


Chars Predictions: 

|d?F7;L,5]K+M?5LS[e#6Be*-'$no+du1`Z^WY, I?TO|jM\Y}U5"K)5{idbEzsvm	2[ma4V</MN*P^
G][_)[We[3Qh_<I+\o}Z5'7A|yyUTay5d3}[JL\,9iFNx#[>,si{yJA"2Gk%1O9?ZS?DJ<xXjdeY~V"@w^aRVd'%2tz.m{=MUY@4%K
rcR}U`7Wv?tSo?HAc2h#<mw"<P3}ot"?Ww<G
412
[107   2  66  23  55  14  23 105  43  51 105  16  35   8 118 109  97  18
 115  84  84  48 107   3 102   6  48  92  12  67  55  88  28  23  88 122
  13  81  18  12 122  83  12  73 123 126   3  99  54  94  98  53  27  76
  83  14  91 101  35  15  54  66 10

## Train Language Model

In [1382]:
# Directory where the checkpoints will be saved
import os 
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [1383]:
# https://www.kdnuggets.com/2019/03/train-keras-model-20x-faster-tpu-free.html
epochs = 10
history = model.fit(dataset, epochs=epochs, use_multiprocessing = True, workers=5, verbose = 2, callbacks=[checkpoint_callback])

Epoch 1/10
66/66 - 18825s - loss: 1.0356
Epoch 2/10
66/66 - 27071s - loss: 1.0360
Epoch 3/10
66/66 - 30669s - loss: 1.0349
Epoch 4/10


KeyboardInterrupt: 

In [1385]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_3'

In [1390]:
model = create_model(batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1a50b34ff28>

In [1391]:
model.build(tf.TensorShape([1, None]))

In [1392]:
model.summary()

Model: "CharLSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EmbedLayer (Embedding)       (1, None, 256)            32512     
_________________________________________________________________
BiLSTM (Bidirectional)       (1, None, 2048)           10493952  
_________________________________________________________________
time_distributed_17 (TimeDis (None, None, 127)         260223    
Total params: 10,786,687
Trainable params: 10,786,687
Non-trainable params: 0
_________________________________________________________________


In [1406]:
X_test, y_test = next(iter(test_seq_data))

In [1422]:
len("".join(map(chr,X_test.numpy())).strip(chr(0)))

55

In [1594]:
new = X[np.random.randint(0,len(data))]

In [1595]:
print("".join(map(chr, new)).strip(chr(0)))
print(len("".join(map(chr, new)).strip(chr(0))))

<s>F5 Networks, Inc. Provides Earnings Guidance for the First Quarter of Fiscal Year 2020 Ending December 31, 2019<\s
117


In [1588]:
prediction = model(new.reshape(1,-1))

In [1589]:
# prediction.numpy()[:,54:,:] 
prediction = prediction[-1,:,:]

In [1591]:
print("".join(map(chr,np.argmax(prediction, axis = 1) + 1)))
print(len("s~P'PPsjjjs!!s!!\!!!!!\!!!!!!!!!!!f!f!!!ooff!!s!!!!!fo!!FffffQM:fIfff!eseHUssss.bD!!!!fffff"))

s~P'PPsjjjs!!s!!\!!!!!\!!!!!!!!!!!f!f!!!ooff!!s!!!!!f
o!!FffffQM:fIfff!eseHUssss.bD!!!!fffff!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
97


In [1529]:
p_i = np.zeros((prediction.shape))
for i in range(0, 412):
    p = np.exp(prediction[i])/np.sum(np.exp(prediction[i]))
    p_i[i] = p

In [1530]:
np.argmax(p_i, axis = 1)

array([125, 125,  70,  70, 101,  91,  32,  32, 118,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32, 114,  32,  32,  91, 101, 101,  32,
       114,  64, 101, 101, 101, 101,  15, 101,  15,  99, 110, 110,  97,
       101, 101, 101, 101, 109,  32,  32,  32,  32,  32,   7, 101,  32,
        32,  32, 100,  71, 101,  95,  32, 101,  32,  32,  32, 101,  32,
        32,  32,  32,  32,  32,  32, 101,  32,  37, 114, 101, 125, 125,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  3

In [1534]:
"".join(map(chr,np.argmax(p_i, axis = 1) + 1))

'~~GGf\\!!w!!!!!!!!!!s!!\\ff!sAffff\x10f\x10doobffffn!!!!!\x08f!!!eHf`!f!!!f!!!!!!!f!&sf~~!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [1463]:
prediction.shape

(412, 127)

In [870]:
model.evaluate(dataset, verbose=2, use_multiprocessing=1)

479/479 - 10507s - loss: 1.2081 - accuracy: 8.6585e-04


[1.2080851793289185, 0.0008658546721562743]

In [948]:
model = create_model(batch_size=128)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [1647]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 100

  # Converting our start string to numbers (vectorizing)
  input_eval = [ord(s) for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(chr(predicted_id))

  return (start_string + ''.join(text_generated))

In [1648]:
print(generate_text(model, start_string="<s>"))

u\F;q7z'^7XZFB]B2mLpx~fa-'?(*Q+u^4KMC#6]]:a.LS8jO
$[f*J1B#
$ILq&c


In [175]:


strs = "how much for the maple syrup? $%20.99? That's ”˚‑|–—=_ricidulous!!!???|||"

nstr = re.sub('[!,*)@=#-({|}_‑–?^;:{|}˚~\t\n“—’”/_]',r'',strs)
print(nstr)


how much for the maple syrup 20.99 Thats ricidulous


### Save Trained Vectors of Embeddings and Model Weights

In [1646]:

import io, csv

# save model weights
e = model.layers[0]
weights = e.get_weights()[0]
weights.shape # shape: (characters/bytes, embedding_dim) -->(127,256)

# save embeddings.
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
tsv_writer = csv.writer(out_m, delimiter='\t')

# for char in map(chr, (i for i in range(1,255+1))):
#     print(char)
for i in range(0,127):
    vec = weights[i] 
    tsv_writer.writerow(str(chr(i+1)))
#     out_m.write(chr(i+1), lineterminator='\n')# skip 0, it's padding.255 last vector
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()
    

### Stock Direction Prediction

By default, the output of a RNN layer contains a single vector per sample. This vector is the RNN cell output corresponding to the last timestep, containing information about the entire input sequence. The shape of this output is (batch_size, units) where units corresponds to the units argument passed to the layer's constructor.

In [None]:
def direction_model(batch_size):
    model = Sequential(name = 'RNNStocks')
    model.add(Embedding(iinput_dim = 127, output_dim = 256,
                        batch_input_shape=[batch_size, None], 
                        weights=[embedding_matrix],trainable=False, name ='EmbedLayer'))
    model.add(Bidirectional(LSTM(1024, return_sequences=False,stateful=False,recurrent_initializer='glorot_uniform'), name = 'BiLSTM'))
    #final state encodes full representation of passed headine
    model.add(Dense(512, activation = 'LeakyReLU',name = 'FullConnected'))
    model.add(Dense(1, name='Output', activation='sigmoid'))
    model.compile(optimizer=tf.optimizers.ADAM(learning_rate=1e-3), loss = 'binary_crossentropy')
    return model