In [1]:
import pandas as pd
import tensorflow as tf
import numpy
import pickle as pkl
import os
import finviz as fz
import yfinance as yf
import calendar
import datetime
import numpy as np

In [2]:
os.chdir(os.path.join(os.getcwd(), ".."))

In [3]:
def infer_year(current_month, current_year):
    dict_cont = {}
    dict_cont["current_year"] = current_year
    dict_cont["previous_month"] = current_month
    def _infer_year(x):
        delta_months = dict_cont["previous_month"] - x
        if x>=1 and delta_months>=0:
            dict_cont["current_year"]
            dict_cont["previous_month"] = x
        else:
            dict_cont["previous_month"] = x
            dict_cont["current_year"]-=1
        return dict_cont["current_year"]
    return _infer_year

# Inputs

In [4]:
perc_space = 0.8
perc_time = 0.8

WINDOW_SIZE = 21 # Include window + target
BATCH_SIZE = 8

## Paths

In [5]:
data_path = os.path.join(os.getcwd(), "data")
sp_100_file = os.path.join(data_path, "sp-100-index-07-02-2020.csv")

In [6]:
sp_100_df = pd.read_csv(sp_100_file)
simbols = sp_100_df.Symbol.values[:-1]
map_month2month_number = dict((v,k) for k,v in enumerate(calendar.month_abbr))

In [7]:
stock = {}
cont = 0
for simbol in simbols:
    try:
        insider_info = fz.get_insider(simbol)
        insider_data = pd.DataFrame.from_dict(insider_info)
        insider_data[["month_name", "day"]]= insider_data.Date.str.split(" ", expand=True)
        insider_data["month"] = insider_data["month_name"].map(map_month2month_number)
        insider_data["year"] = insider_data["month"].apply(infer_year(datetime.datetime.now().month,
                                                                      datetime.datetime.now().year))
        insider_data["date"] = pd.to_datetime(insider_data[["year", "month", "day"]], format="%y%m%d") 
        cont = cont + 1
    except Exception as inst:
        print(simbol, end=" ")
        print(type(inst), end=" ")# the exception instance
        print(inst) 
        continue
    curr_ticker = yf.Ticker(simbol)
    curr_hist = curr_ticker.history(period="1y")
    curr_table = pd.pivot_table(insider_data,
                                index=['date'],
                                columns=['Transaction'], aggfunc={"Transaction": len})
    curr_table = ~curr_table.isnull()
    curr_table.columns = [col[1] for col in curr_table.columns]
    curr_hist = curr_hist.merge(curr_table, how="left", left_index=True, right_index=True)
    curr_hist = curr_hist.fillna(False)
    if "Buy" not in curr_hist.columns:
        curr_hist["Buy"] = False
    if "Sale" not in curr_hist.columns:
        curr_hist["Sale"] = False
    if "Option Exercise" not in curr_hist.columns:
        curr_hist["Option Exercise"] = False
    stock[simbol] = curr_hist

BKNG <class 'IndexError'> list index out of range
BRK.B <class 'requests.exceptions.HTTPError'> 404 Client Error: Not Found for url: https://finviz.com/quote.ashx?t=BRK.B
C <class 'IndexError'> list index out of range
DD <class 'IndexError'> list index out of range
EXC <class 'IndexError'> list index out of range
GOOGL <class 'IndexError'> list index out of range
GS <class 'IndexError'> list index out of range
MO <class 'IndexError'> list index out of range
RTX <class 'IndexError'> list index out of range


## Creating frames for training

In [8]:
n_train_space = int(len(stock.keys())*perc_space)
n_train_out_space = len(stock.keys())-n_train_space

In [9]:
train_space = np.random.choice(list(stock.keys()), size=n_train_space, replace=False).tolist()
test_space = list(set(stock.keys()) - set(train_space))

In [10]:
min_date = stock[train_space[0]].index.min()
max_date = stock[train_space[0]].index.max()
samples = curr_hist.shape[0]
train_samples = int((samples-WINDOW_SIZE)*perc_time)

In [11]:
train_datasets = []
valid_datasets = []
test_datasets = []
for company in train_space:
    print("Company: ", company)
    curr_hist = stock[company]
    assert min_date == curr_hist.index.min()
    assert max_date == curr_hist.index.max()
    assert len(curr_hist) == samples
    stock[company] = stock[company].astype({'Open': np.float64,
                         'High': np.float64,
                         'Low': np.float64,
                         'Close': np.float64,
                         'Volume': np.float64,
                         'Dividends': np.float64,
                         'Stock Splits': np.float64,
                         'Buy': np.float64,
                         'Option Exercise': np.float64,
                         'Sale': np.float64})
    dataset = tf.data.Dataset.from_tensor_slices(stock[company])
    dataset = dataset.window(WINDOW_SIZE, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(WINDOW_SIZE))
    dataset = dataset.map(lambda window: (window[:-1], window[-1:, -3:]))
    train_datasets.append(dataset.take(train_samples))
    valid_datasets.append(dataset.skip(train_samples))
    
for company in test_space:
    print("Company: ", company)
    curr_hist = stock[company]
    assert min_date == curr_hist.index.min()
    assert max_date == curr_hist.index.max()
    assert len(curr_hist) == samples
    stock[company] = stock[company].astype({'Open': np.float64,
                         'High': np.float64,
                         'Low': np.float64,
                         'Close': np.float64,
                         'Volume': np.float64,
                         'Dividends': np.float64,
                         'Stock Splits': np.float64,
                         'Buy': np.float64,
                         'Option Exercise': np.float64,
                         'Sale': np.float64})
    dataset = tf.data.Dataset.from_tensor_slices(stock[company])
    dataset = dataset.window(WINDOW_SIZE, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(WINDOW_SIZE))
    dataset = dataset.map(lambda window: (window[:-1], window[-1:, -3:]))
    test_datasets.append(dataset)

Company:  CSCO
Company:  AMT
Company:  HON
Company:  V
Company:  T
Company:  MDLZ
Company:  DUK
Company:  SO
Company:  XOM
Company:  PG
Company:  CAT
Company:  AIG
Company:  LOW
Company:  EMR
Company:  COF
Company:  JNJ
Company:  ACN
Company:  MCD
Company:  GD
Company:  GILD
Company:  DIS
Company:  PM
Company:  MET
Company:  GM
Company:  GE
Company:  LLY
Company:  MSFT
Company:  TXN
Company:  AAPL
Company:  NKE
Company:  UPS
Company:  MMM
Company:  MA
Company:  QCOM
Company:  PFE
Company:  BMY
Company:  ABBV
Company:  BA
Company:  COP
Company:  COST
Company:  MDT
Company:  LMT
Company:  NFLX
Company:  NEE
Company:  DHR
Company:  PYPL
Company:  TMO
Company:  DOW
Company:  CHTR
Company:  BK
Company:  KO
Company:  MS
Company:  CVX
Company:  AXP
Company:  FB
Company:  F
Company:  BAC
Company:  VZ
Company:  USB
Company:  UNP
Company:  TGT
Company:  BIIB
Company:  NVDA
Company:  CVS
Company:  CRM
Company:  ORCL
Company:  CMCSA
Company:  OXY
Company:  FDX
Company:  SLB
Company:  BLK
Company: 

In [12]:
train_ds = None
val_ds = None
for sample in zip(train_datasets, valid_datasets):
    if train_ds is None:
        train_ds = sample[0]
        val_ds = sample[1]
    else:
        train_ds = train_ds.concatenate(sample[0])
        val_ds = val_ds.concatenate(sample[1])
        
test_ds = None
for sample in test_datasets:
    if test_ds is None:
        test_ds = sample
    else:
        test_ds = test_ds.concatenate(sample)

In [13]:
train_ds = train_ds.shuffle(buffer_size=5000)
train_ds = train_ds.batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)
test_ds = test_ds.batch(BATCH_SIZE)

In [19]:
def build_model():
    inputs = tf.keras.Input(shape=(20,10))
    x = tf.keras.layers.BatchNormalization(axis=2)(inputs)
    x = tf.keras.layers.GRU(64, return_sequences=False)(x)
    x = tf.keras.layers.Flatten()(x)
    sigmoids = tf.keras.layers.Dense(3, activation="sigmoid")(x)
    outputs = tf.keras.backend.expand_dims(sigmoids, axis=1)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [20]:
model = build_model()
model.compile(loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", "AUC"])

In [21]:
history = model.fit(train_ds, 
          validation_data=val_ds,
          epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:
model.evaluate(test_ds)



[0.12914402782917023, 0.24183303117752075, 0.7366621494293213]

# Comparing predictions and looking if it is learning anything
dataframe start from the 20, example df[20:] are all the samples in test, and each company has 232 records, so to iterate would be accooording to 232 and addding 20 to the origiinal df

In [26]:
out = model.predict(test_ds)

In [30]:
predicted_df = pd.DataFrame(np.squeeze(out), columns=["Buy", "Sell", "Option Exercise"])

In [31]:
predicted_df

Unnamed: 0,Buy,Sell,Option Exercise
0,0.024702,0.028202,0.010897
1,0.024804,0.028381,0.010907
2,0.024806,0.028279,0.010942
3,0.024803,0.028293,0.010982
4,0.024673,0.028150,0.010990
...,...,...,...
4403,0.028713,0.025139,0.006112
4404,0.028839,0.025163,0.005657
4405,0.030004,0.026465,0.005328
4406,0.027716,0.025217,0.007454


In [32]:
for company in test_space:
    print("Company: ", company)
    curr_hist = stock[company]
    break

Company:  SPG


In [43]:
a = iter(test_ds)

In [44]:
next(a)[0][0]

<tf.Tensor: shape=(20, 10), dtype=float64, numpy=
array([[1.5617e+02, 1.5829e+02, 1.5617e+02, 1.5740e+02, 1.4985e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5711e+02, 1.5834e+02, 1.5661e+02, 1.5764e+02, 2.7127e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5827e+02, 1.5855e+02, 1.5645e+02, 1.5701e+02, 1.5552e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5693e+02, 1.5795e+02, 1.5494e+02, 1.5574e+02, 1.0114e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5599e+02, 1.5620e+02, 1.5521e+02, 1.5546e+02, 8.4480e+05,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5574e+02, 1.5704e+02, 1.5528e+02, 1.5579e+02, 1.0197e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.5550e+02, 1.5634e+02, 1.5492e+02, 1.5523e+02, 1.1554e+06,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

In [48]:
curr_hist[20:] 

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Buy,Sale,Option Exercise
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-08-05,151.38,152.24,146.43,148.27,2052700.0,0.0,0.0,0.0,0.0,0.0
2019-08-06,148.55,149.42,146.73,147.03,1628300.0,0.0,0.0,0.0,0.0,0.0
2019-08-07,146.59,148.56,144.48,147.09,1792900.0,0.0,0.0,0.0,0.0,0.0
2019-08-08,147.31,148.88,146.03,148.65,1235500.0,0.0,0.0,0.0,0.0,0.0
2019-08-09,148.46,149.28,147.37,148.46,1229100.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2020-06-29,62.64,68.18,61.29,68.13,9412900.0,0.0,0.0,0.0,0.0,0.0
2020-06-30,71.36,71.50,67.57,68.38,10456500.0,0.0,0.0,0.0,0.0,0.0
2020-07-01,69.80,72.93,69.24,69.81,7979600.0,0.0,0.0,0.0,0.0,0.0
2020-07-02,71.81,72.96,68.68,68.81,6470600.0,0.0,0.0,0.0,0.0,0.0
