In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Trains on historical EOD to act as a single column in twitter dataset.

from pathlib import Path
from ams.config import constants
from ams.pipes.p_make_prediction.mp_process import get_stocks_merged
from ams.services.ticker_service import get_ticker_eod_data, make_one_hotted, get_ticker_info
import pandas as pd
from ams.twitter.twitter_ml_utils import combine_with_quarterly_stock_data
from ams.twitter.twitter_ml_utils import merge_with_stock_details

twit_root_path = constants.TWITTER_OUTPUT_RAW_PATH
src_path = Path(twit_root_path, "stock_merge_drop", "main")

df = get_stocks_merged(stock_merge_drop_path=src_path)

tickers = df["f22_ticker"].unique()

In [19]:

tickers = ['AAI', 'AAL', 'ACNAQ', 'ALGT', 'ASAI', 'CCAR', 'COMR', 'FLYI', 'FRNTQ', 'HA',
 'JBLU', 'MAIR', 'MDWY', 'MESA', 'MESAQ', 'PNCLQ', 'RENO', 'RJETQ', 'RYAAY', 'SKYW',
 'TOWR1', 'TRIP1', 'UAL', 'VA', 'VIRGY', 'VNGD']

max_tickers = 200
df_all = []
industries = ['Airlines']

for ndx, ticker in enumerate(tickers):
    df_eod = get_ticker_eod_data(ticker=ticker)
    df_all.append(df_eod)

    if max_tickers is not None and ndx >= max_tickers:
        break

df_eod_all = pd.concat(df_all, axis=0)

In [20]:
df_w_fundy, _ = combine_with_quarterly_stock_data(df=df_eod_all)

20:38:00 - ams.twitter.twitter_ml_utils:640 - INFO - Finished merging in quarterly stock data.


In [21]:
from typing import List

def merge_stock_data_w_details(df: pd.DataFrame, industries: List[str]):
    df_merged = merge_with_stock_details(df)

    df_merged.sort_values(by=["ticker", "date"], inplace=True)

    if df_merged.shape[0] == 0:
        logger.info("Not enough data after merge.")
        
    df_filtered = df_merged[df_merged["industry"].isin(industries)].copy()

    return df_filtered

df_merged = merge_stock_data_w_details(df=df_w_fundy, industries=industries)

In [24]:
df_merged["ticker"].unique()
df_merged.fillna(method="ffill", inplace=True)

In [25]:
import random
import numpy as np
import torch

# multivariate data preparation
from numpy import array
from numpy import hstack

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

In [34]:
cols = list(df_merged.columns)

col_drops = []
for c in cols:
    uniques = list(df_merged[c].unique())
    if len(uniques) == 1:
        col_drops.append(c)

df_dropped = df_merged.drop(columns=col_drops)

In [37]:
col_drops = ['calendardate', 'datekey', 'reportperiod']
df_hot_ready = df_dropped.drop(columns=col_drops)

df_all_tickers = get_ticker_info()
col_objs = [c for c in df_hot_ready.columns if str(df_dropped[c].dtype) == "object"]
col_objs.remove("date")

df_hotted = make_one_hotted(df=df_hot_ready, df_all_tickers=df_all_tickers, cols=col_objs)

rem_cols = list(df_hotted.columns)
rem_cols.remove("date")

In [32]:
df_col_ordered = df_hotted[rem_cols + ["stock_val_change", "date"]].copy()

df_col_ordered.fillna(0, inplace=True)

cols = df_col_ordered.columns

# print(list(cols))

df_flight = df_col_ordered[cols].copy()

# df_flight["date"]

KeyError: "['calendardate'] not found in axis"

In [None]:
dates = sorted(df_flight["date"].unique())

train_frac = 0.7

total_size = len(dates)
train_size = int(total_size * train_frac)
train_dates = dates[:train_size]
val_test_dates = dates[-train_size:]

test_frac = .50
test_size = int(len(val_test_dates) * test_frac)
val_dates = val_test_dates[:test_size]
test_dates = val_test_dates[-test_size:]

df_train = df_flight.loc[df_flight["date"].isin(train_dates)]
df_val = df_flight.loc[df_flight["date"].isin(val_dates)]
df_test = df_flight.loc[df_flight["date"].isin(test_dates)]

df_train = df_train.loc[:, df_train.columns != "date"]
df_val = df_val.loc[:, df_val.columns != "date"]
df_test = df_test.loc[:, df_test.columns != "date"]

print(list(df_test.columns)[-1])

In [None]:
class MV_LSTM(torch.nn.Module):
    def __init__(self, n_features, seq_length):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features
        self.seq_len = seq_length
        self.n_hidden = 20 # number of hidden states
        self.n_layers = 1 # number of LSTM layers (stacked)

        self.l_lstm = torch.nn.LSTM(input_size = n_features,
                                 hidden_size = self.n_hidden,
                                 num_layers = self.n_layers,
                                 batch_first = True)
        # according to pytorch docs LSTM output is
        # (batch_size,seq_len, num_directions * hidden_size)
        # when considering batch_first = True
        self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 1)


    def init_hidden(self, batch_size):
        # even with batch_first = True this remains same as docs
        hidden_state = torch.zeros(self.n_layers, batch_size, self.n_hidden)
        cell_state = torch.zeros(self.n_layers, batch_size, self.n_hidden)
        self.hidden = (hidden_state, cell_state)


    def forward(self, x, future=0):
        batch_size, seq_len, _ = x.size()

        lstm_out, self.hidden = self.l_lstm(x, self.hidden)
        # lstm_out(with batch_first = True) is
        # (batch_size,seq_len,num_directions * hidden_size)
        # for following linear layer we want to keep batch_size dimension and merge rest
        # .contiguous() -> solves tensor compatibility error
        x = lstm_out.contiguous().view(batch_size,-1)
        return self.l_linear(x)

In [None]:

dataset_train = df_train.astype('float32').to_numpy()
dataset_val = df_val.astype('float32').to_numpy()
dataset_test = df_test.astype('float32').to_numpy()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_arr = scaler.fit_transform(dataset_train)
val_arr = scaler.transform(dataset_val)
test_arr = scaler.transform(dataset_test)

In [None]:
n_features = len(df_flight.columns) - 2 # this is number of parallel inputs
n_timesteps = 60 # this is number of timesteps

x_train, y_train = split_sequences(train_arr, n_timesteps)
x_val, y_val = split_sequences(val_arr, n_timesteps)
x_test, y_test = split_sequences(test_arr, n_timesteps)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

In [None]:
import time
import random

class Model(torch.nn.Module):
    def __init__(self, input_size, seq_len, hidden_size, output_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = torch.nn.LSTMCell(self.input_size, self.hidden_size)
        self.linear = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, future=0, y=None):
        outputs = []

        # reset the state of LSTM
        # the state is kept till the end of the sequence
        h_t = torch.zeros(input.size(0), self.hidden_size, dtype=torch.float32)
        c_t = torch.zeros(input.size(0), self.hidden_size, dtype=torch.float32)

        print(h_t.size())
        print(c_t.size())

        for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):
            h_t, c_t = self.lstm(input_t, (h_t, c_t))
            output = self.linear(h_t)
            outputs += [output]

        for i in range(future):
            if y is not None and random.random() > 0.5:
                output = y[:, [i]]  # teacher forcing
            h_t, c_t = self.lstm(output, (h_t, c_t))
            output = self.linear(h_t)
            outputs += [output]
        outputs = torch.stack(outputs, 1).squeeze(2)
        return outputs

In [None]:
def generate_sequence(scaler, model, x_sample, future=1000):
    """ Generate future values for x_sample with the model """
    y_pred_tensor = model(x_sample, future=future)
    y_pred = y_pred_tensor.cpu().tolist()
    y_pred = scaler.inverse_transform(y_pred)
    return y_pred

def to_dataframe(actual, predicted):
    return pd.DataFrame({"actual": actual, "predicted": predicted})


def inverse_transform(scalar, df, columns):
    for col in columns:
        print(df[col].values.shape)
        values = df[col].values.reshape(-1, 1)
        df[col] = scaler.inverse_transform(values)
    return df

In [None]:
def generate_batch_data(x, y, batch_size: int):
    for batch_ndx, i in enumerate(range(0, len(x) - batch_size, batch_size)):
        x_batch = x[i:i + batch_size, :, :]
        y_batch = y[i:i + batch_size]

        x_batch = torch.tensor(x_batch, dtype=torch.float32)
        y_batch = torch.tensor(y_batch, dtype=torch.float32)

        yield x_batch, y_batch, batch_ndx

class Optimization:
    """ A helper class to train, test and diagnose the LSTM"""

    def __init__(self, model, loss_fn, optimizer, scheduler):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_losses = []
        self.val_losses = []
        self.futures = []

    def train(
        self,
        x_train,
        y_train,
        x_val=None,
        y_val=None,
        batch_size=16,
        n_epochs=120,
        do_teacher_forcing=None,
    ):
        seq_len = x_train.shape[1]
        self.model.train()
        for epoch in range(n_epochs):
            train_loss = 0
            start_time = time.time()
            for x_batch, y_batch, batch_ndx in generate_batch_data(x_train, y_train, batch_size):

                self.model.init_hidden(x_batch.size(0))

                y_pred = self._predict(x_batch, y_batch, seq_len, do_teacher_forcing)
                loss = self.loss_fn(y_pred.view(-1), y_batch)

                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                train_loss += loss.item()

            train_loss /= batch_ndx
            elapsed = time.time() - start_time
            print('step : ' , epoch , 'loss : ' , loss.item())

            self._validation(x_val, y_val, batch_size)

            print(
                "Epoch %d Train loss: %.8f. Validation loss: %.8f. Avg future: %.2f. Elapsed time: %.2fs."
                % (epoch + 1, train_loss, self.val_losses[-1], np.average(self.futures), elapsed)
            )

    def _predict(self, x_batch, y_batch, seq_len, do_teacher_forcing):
        if do_teacher_forcing:
            future = random.randint(1, int(seq_len) / 2)
            limit = x_batch.size(1) - future
            y_pred = self.model(x_batch[:, :limit], future=future, y=y_batch[:, limit:])
        else:
            future = 0
            y_pred = self.model(x_batch)
        self.futures.append(future)
        return y_pred

    def _validation(self, x_val, y_val, batch_size):
        if x_val is None or y_val is None:
            return
        with torch.no_grad():
            val_loss = 0
            batch_ndx = 0
            for x_batch, y_batch, batch_ndx in generate_batch_data(x_val, y_val, batch_size):
#                 y_pred = self.model(x_batch)
#                 loss = self.loss_fn(y_pred, y_batch)
#                 val_loss += loss.item()
                output = self.model(x_batch)
                loss = self.loss_fn(output.view(-1), y_batch)
                val_loss += loss.item()

            val_loss /= batch_ndx
            self.val_losses.append(val_loss)

    def evaluate(self, x_test, y_test, batch_size, future=1):
        with torch.no_grad():
            test_loss = 0
            batch_ndx = 0
            actual, predicted = [], []
            for x_batch, y_batch, batch_ndx in generate_batch_data(x_test, y_test, batch_size):
                y_pred = self.model(x_batch, future=future)

                y_batch = y_batch.reshape(-1, 1)

#                 print(y_pred.shape)
#                 print(y_batch.shape)
#                 print(y_batch.reshape(-1, 1).shape)
#                 print(y_batch.shape[1])
#                 print(y_pred[:, -len(y_batch)])
#                 print(y_pred)

                y_pred = (
                    y_pred[:, -len(y_batch)] if y_pred.shape[1] > y_batch.shape[1] else y_pred
                )


                loss = self.loss_fn(y_pred, y_batch)
                test_loss += loss.item()
                actual += torch.squeeze(y_batch[:, -1]).data.cpu().numpy().tolist()
                predicted += torch.squeeze(y_pred[:, -1]).data.cpu().numpy().tolist()
            test_loss /= batch_ndx
            return actual, predicted, test_loss

    def plot_losses(self):
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")

In [None]:
model_1 = MV_LSTM(n_features, n_timesteps)
loss_fn_1 = torch.nn.MSELoss()
optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=1e-2)
scheduler_1 = optim.lr_scheduler.StepLR(optimizer_1, step_size=5, gamma=0.1)

optimization_1 = Optimization(model_1, loss_fn_1, optimizer_1, scheduler_1)

optimization_1.train(x_train, y_train, x_val, y_val, do_teacher_forcing=False, n_epochs=1)

In [None]:
actual_1, predicted_1, test_loss_1 = optimization_1.evaluate(x_test, y_test, future=1, batch_size=batch_size)
df_result_1 = to_dataframe(actual_1, predicted_1)

# df_result_1
df_result_1 = inverse_transform(scaler, df_result_1, ['actual', 'predicted'])
# df_result_1.plot(figsize=(14, 7))
# print("Test loss %.4f" % test_loss_1)

In [None]:
# create NN
mv_net = MV_LSTM(n_features, n_timesteps)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-5)

train_episodes = 120
batch_size = 16

mv_net.train()
X = x_train
y = y_train
for t in range(train_episodes):
    for b in range(0,len(X),batch_size):
        inpt = X[b:b+batch_size,:,:]
        target = y[b:b+batch_size]

        x_batch = torch.tensor(inpt, dtype=torch.float32)
        y_batch = torch.tensor(target, dtype=torch.float32)

        mv_net.init_hidden(x_batch.size(0))
    #    lstm_out, _ = mv_net.l_lstm(x_batch,nnet.hidden)
    #    lstm_out.contiguous().view(x_batch.size(0),-1)
        output = mv_net(x_batch)
        loss = criterion(output.view(-1), y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print('step : ' , t , 'loss : ' , loss.item())
