In [1]:
import numpy as np
import pandas as pd
import torch
from collections import defaultdict
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, functional
from sklearn.preprocessing import StandardScaler
import dill

In [5]:
def process_data(df, input_size, pred_type, batch_size):
 
    if pred_type == 'low' or pred_type == 'estimated_high':
        df = df[:-2]
    
    if pred_type == 'estimated_high':
        pred_type = 'high'
        
    values, last_close = adjust_prev_close(df, pred_type)
    
    if len(values) % input_size != 0:
        size = len(values) // input_size * input_size
        values = values[-size:]
    
    transform = StandardScaler().fit(values.reshape(-1,1))
    values = transform.transform(values.reshape(-1,1))[:,0]
    values = torch.from_numpy(values).float()

    len_dataset = (len(df)-input_size -1)//batch_size * batch_size 

    dataset = torch.zeros(len_dataset,input_size)
    dataset[-1] = values[-input_size:]

    for i in range(2,len_dataset+1):
        dataset[-i] = values[-(i+input_size-1):-(i-1)]

    predset = dataset[-1].unsqueeze(0)

    return dataset, predset, {'transform':transform,'last_close':last_close}
    
def adjust_prev_close(df,pred_type):
    last_close = df.iloc[-1]['close']
    prev_close = df['close'].shift(fill_value=0).values[1:]
    column = df[pred_type].values[1:]
    result = (column - prev_close)/prev_close
    return result, last_close

def get_data(df,input_size, batch_size):
    data, dl, tf = defaultdict(dict), defaultdict(dict), {}
    
    for pred_type in ['low', 'high', 'estimated_high']:
        train_data,pred_input,transform = process_data(df, input_size, pred_type, batch_size)
        data[pred_type]['train'] = torch.utils.data.TensorDataset(train_data)
        data[pred_type]['pred'] = torch.utils.data.TensorDataset(pred_input)
        dl[pred_type]['train'] = DataLoader(data[pred_type]['train'], batch_size=batch_size, shuffle=True, num_workers=0)   
        dl[pred_type]['val'] = DataLoader(data[pred_type]['train'], batch_size=batch_size, shuffle=False, num_workers=0)
        tf[pred_type] = transform
    return dl, tf

def get_all_data(input_size, filepath, batch_size):
    datasets, dataloaders, transforms = defaultdict(dict), defaultdict(dict), defaultdict(dict)
    d_all = pd.read_pickle(filepath)
    for ticker, dates in d_all.items():    
        for date,df in dates.items():
            print(ticker,date)
            try:
                dataloaders[ticker][date]\
                , transforms[ticker][date] = get_data(df,input_size, batch_size)
            except Exception as err:
                print(ticker,date,err)
    return dataloaders, transforms


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class MaskedLinear(nn.Linear):
    """Masked linear layer for MADE: takes in mask as input and masks out connections in the linear layers."""

    def __init__(self, input_size, output_size, mask):
        super().__init__(input_size, output_size)
        self.register_buffer("mask", mask)

    def forward(self, x):
        return F.linear(x, self.mask * self.weight, self.bias)


class PermuteLayer(nn.Module):
    """Layer to permute the ordering of inputs.

    Because our data is 2-D, forward() and inverse() will reorder the data in the same way.
    """

    def __init__(self, num_inputs):
        super(PermuteLayer, self).__init__()
        self.perm = np.array(np.arange(0, num_inputs)[::-1])

    def forward(self, inputs):
        return inputs[:, self.perm], torch.zeros(
            inputs.size(0), 1, device=inputs.device
        )

    def inverse(self, inputs):
        return inputs[:, self.perm], torch.zeros(
            inputs.size(0), 1, device=inputs.device
        )


class MADE(nn.Module):
    """Masked Autoencoder for Distribution Estimation.
    https://arxiv.org/abs/1502.03509

    Uses sequential ordering as in the MAF paper.
    Gaussian MADE to work with real-valued inputs"""

    def __init__(self, input_size, hidden_size, n_hidden):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_hidden = n_hidden

        masks = self.create_masks()

        # construct layers: inner, hidden(s), output
        self.net = [MaskedLinear(self.input_size, self.hidden_size, masks[0])]
        self.net += [nn.ReLU(inplace=True)]
        # iterate over number of hidden layers
        for i in range(self.n_hidden):
            self.net += [MaskedLinear(self.hidden_size, self.hidden_size, masks[i + 1])]
            self.net += [nn.ReLU(inplace=True)]
        # last layer doesn't have nonlinear activation
        self.net += [
            MaskedLinear(self.hidden_size, self.input_size * 2, masks[-1].repeat(2, 1))
        ]
        self.net = nn.Sequential(*self.net)

    def create_masks(self):
        """
        Creates masks for sequential (natural) ordering.
        """
        masks = []
        input_degrees = torch.arange(self.input_size)
        degrees = [input_degrees]  # corresponds to m(k) in paper

        # iterate through every hidden layer
        for n_h in range(self.n_hidden + 1):
            degrees += [torch.arange(self.hidden_size) % (self.input_size - 1)]
        degrees += [input_degrees % self.input_size - 1]
        self.m = degrees

        # output layer mask
        for (d0, d1) in zip(degrees[:-1], degrees[1:]):
            masks += [(d1.unsqueeze(-1) >= d0.unsqueeze(0)).float()]

        return masks
    
    def forward(self, z):
        """
        Run the forward mapping (z -> x) for MAF through one MADE block.
        :param z: Input noise of size (batch_size, self.input_size)
        :return: (x, log_det). log_det should be 1-D (batch_dim,)
        """
        x = torch.zeros_like(z)

        for i in range(self.input_size):
            h = self.net(x)
            mu, alpha = torch.split(h, h.size(-1) // 2, dim =-1)
            x[:,i] = mu[:,i] + z[:,i] * torch.exp(alpha[:,i])
        log_det = -torch.sum(alpha, dim = -1)

        # YOUR CODE ENDS HERE

        return x, log_det

    def inverse(self, x):
        """
        Run one inverse mapping (x -> z) for MAF through one MADE block.
        :param x: Input data of size (batch_size, self.input_size)
        :return: (z, log_det). log_det should be 1-D (batch_dim,)
        """
        # YOUR CODE STARTS HERE
        h =  self.net(x)
        mu, alpha = torch.split(h, h.size(-1) // 2, dim =-1)
        z = (x - mu) / torch.exp(alpha)
        log_det = -torch.sum(alpha, dim = -1)
        # YOUR CODE ENDS HERE

        return z, log_det


class MAF(nn.Module):
    """
    Masked Autoregressive Flow, using MADE layers.
    https://arxiv.org/abs/1705.07057
    """

    def __init__(self, input_size, hidden_size, n_hidden, n_flows):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_hidden = n_hidden
        self.n_flows = n_flows
        self.base_dist = torch.distributions.normal.Normal(0, 1)

        # need to flip ordering of inputs for every layer
        nf_blocks = []
        for i in range(self.n_flows):
            nf_blocks.append(MADE(self.input_size, self.hidden_size, self.n_hidden))
            nf_blocks.append(PermuteLayer(self.input_size))  # permute dims
        self.nf = nn.Sequential(*nf_blocks)

    def log_probs(self, x):
        """
        Obtain log-likelihood p(x) through one pass of MADE
        :param x: Input data of size (batch_size, self.input_size)
        :return: log_prob. This should be a Python scalar.
        """
        # YOUR CODE STARTS HERE
        sum_log_det = torch.zeros(x.size(0),1)
        
        for flow in self.nf:
            x, log_det = flow.inverse(x)
            if len(log_det.size()) ==1:
                 log_det = log_det.unsqueeze(1)
            sum_log_det += log_det

        log_prob = torch.mean(torch.sum(self.base_dist.log_prob(x),dim=-1) + sum_log_det)
        
        return log_prob

    def loss(self, x):
        """
        Compute the loss.
        :param x: Input data of size (batch_size, self.input_size)
        :return: loss. This should be a Python scalar.
        """
        return -self.log_probs(x)

    def sample(self, device, n):
        """
        Draw <n> number of samples from the model.
        :param device: [cpu,cuda]
        :param n: Number of samples to be drawn.
        :return: x_sample. This should be a numpy array of size (n, self.input_size)
        """
        with torch.no_grad():
            x_sample = torch.randn(n, self.input_size).to(device)
            for flow in self.nf[::-1]:
                x_sample, log_det = flow.forward(x_sample)
            x_sample = x_sample.view(n, self.input_size)
            x_sample = x_sample.cpu().data.numpy()

        return x_sample

In [7]:
def train(model,n_epochs,loader,device,optimizer):
    model.train()
    total_loss = 0.0
    batch_idx = 0.0

    for epoch in range(1,n_epochs + 1):
        for data in loader:
            batch_idx += 1

            if isinstance(data, list):
                data = data[0]

            batch_size = len(data)
            data = data.view(batch_size, -1)

            data = data.to(device)

            # run MAF
            loss = model.loss(data)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # save stuff
            total_loss += loss.item()

        total_loss /= batch_idx + 1
        #print("Average train log-likelihood: {:.6f}".format(-total_loss))

    return model,optimizer,total_loss

In [8]:
def main(input_size, data_path,batch_size, n_epochs, device):
    dataloaders, transforms = get_all_data(input_size,data_path,batch_size)
    d_predictions = defaultdict(dict)
    for ticker,dates in dataloaders.items():
        #exit_loop=False
        for date in dates:
            print(ticker,date)
            d_temp ={}
            for pred_type in ['low', 'high', 'estimated_high']:
                #try:
                model = MAF(input_size, hidden_size=10, n_hidden=1, n_flows=5).to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
                model,optimizer,train_loss = train(model,n_epochs, dataloaders[ticker][date][pred_type]['train'],device,optimizer)
                print(train_loss)
                preds=transforms[ticker][date][pred_type]['transform'].inverse_transform(model.sample(device, n=1))
                prev_close = transforms[ticker][date][pred_type]['last_close']
                pred = np.mean(preds * prev_close + prev_close)
                d_temp[pred_type] =pred
                #except Exception as err:
                #    print(pred_type,err)
                #    exit_loop = True
                #    break
            #if exit_loop:
            #    exit_loop=False
            #    break
            d_predictions[ticker][date] = d_temp
        dill.dump(d_predictions,open('predictions_flow_est_high.pkd','wb'))
        print('updated predictions dictionary')
main(input_size=5, data_path = 'd_dfs.pkd', batch_size = 50, n_epochs=100, device='cpu')

COST 2022-10-27 00:00:00
COST 2023-02-02 00:00:00
COST 2023-05-04 00:00:00
COST 2023-08-24 00:00:00
COST 2023-11-02 00:00:00
CE 2022-10-28 00:00:00
CE 2023-07-28 00:00:00
CE 2023-10-27 00:00:00
AON 2022-10-31 00:00:00
AON 2023-01-31 00:00:00
AON 2023-04-28 00:00:00
AON 2023-07-31 00:00:00
AON 2023-10-31 00:00:00
MKTX 2022-11-01 00:00:00
MKTX 2023-02-07 00:00:00
MKTX 2023-05-09 00:00:00
MKTX 2023-08-01 00:00:00
CAG 2022-11-02 00:00:00
CAG 2023-11-01 00:00:00
NSC 2022-11-03 00:00:00
NSC 2023-08-03 00:00:00
AMP 2022-11-04 00:00:00
AMP 2023-02-09 00:00:00
AMP 2023-08-04 00:00:00
AWK 2022-11-07 00:00:00
AWK 2023-02-06 00:00:00
AWK 2023-05-08 00:00:00
AWK 2023-08-07 00:00:00
WST 2022-11-08 00:00:00
WST 2023-01-24 00:00:00
WST 2023-04-25 00:00:00
WST 2023-07-25 00:00:00
POOL 2022-11-09 00:00:00
POOL 2023-05-16 00:00:00
POOL 2023-08-09 00:00:00
GWW 2022-11-10 00:00:00
GWW 2023-02-10 00:00:00
GWW 2023-05-05 00:00:00
GWW 2023-08-11 00:00:00
TFX 2022-11-14 00:00:00
TFX 2023-03-02 00:00:00
TFX 202

0.06454422691399325
0.06984474208393208
0.06924707649241711
WST 2023-01-24 00:00:00
0.06742475016971054
0.06924187514013101
0.06918213010511919
WST 2023-04-25 00:00:00
0.06470209207654262
0.06504807923712592
0.06434502448386191
WST 2023-07-25 00:00:00
0.06068969109960541
0.061898519989108135
0.06219647979398496
updated predictions dictionary
POOL 2022-11-09 00:00:00
0.07143405867615898
0.07179801871333112
0.07080151576109925
POOL 2023-05-16 00:00:00
0.0642984114551875
0.06515675790556395
0.06649082721085395
POOL 2023-08-09 00:00:00
0.06767920412389844
0.06726038688906764
0.06727916140590927
updated predictions dictionary
GWW 2022-11-10 00:00:00
0.0695364353746883
0.07004704808292876
0.06911203871225184
GWW 2023-02-10 00:00:00
0.06935938649273901
0.06828921738128042
0.06412447630532479
GWW 2023-05-05 00:00:00
0.06892002533199305
0.0623307304501454
0.06282975838393784
GWW 2023-08-11 00:00:00
0.06811329206050845
0.06119951737630778
0.06174172677606591
updated predictions dictionary
TFX 20

0.05001281040153182
0.05695087251114911
0.05746651061849425
HUM 2023-03-30 00:00:00
0.06476575398805098
0.06402076270389505
0.06387263236356855
HUM 2023-06-29 00:00:00
0.06255879687055299
0.06389545356191953
0.06543760399429172
HUM 2023-09-28 00:00:00
0.058648160562097554
0.0649854955024848
0.06496235896377382
updated predictions dictionary
ESS 2022-12-30 00:00:00
0.06819276722086323
0.0684483327104846
0.06871905432846218
updated predictions dictionary
CMCSA 2023-01-03 00:00:00
0.06681588893302906
0.07082105057230158
0.06897430218885962
CMCSA 2023-10-03 00:00:00
0.06623123251623385
0.053785447920282246
0.055086243724433506
updated predictions dictionary
CPB 2023-01-04 00:00:00
0.06211913896827173
0.06747931490485036
0.06793655531918782
updated predictions dictionary
JPM 2023-01-05 00:00:00
0.06415565085233689
0.06603011099381517
0.06683061385882445
JPM 2023-07-05 00:00:00
0.06884668831129193
0.06812828546242099
0.06692805058655416
updated predictions dictionary
MA 2023-01-06 00:00:00
0

0.0654471986553419
0.06678730247409827
0.06703063686214443
CAT 2023-07-19 00:00:00
0.06810374281170796
0.06684385527623944
0.06637421517748097
CAT 2023-10-20 00:00:00
0.06700962700317095
0.062451372772286795
0.06175614779109788
updated predictions dictionary
AOS 2023-04-27 00:00:00
0.068747491764685
0.06653516550844783
0.06688498686572887
updated predictions dictionary
SYF 2023-05-01 00:00:00
0.06500609489588538
0.06733237586911503
0.06771291088841877
updated predictions dictionary
DHI 2023-05-02 00:00:00
0.06581607542555575
0.06571009956364454
0.06691943911272584
updated predictions dictionary
STZ 2023-05-03 00:00:00
0.06303369249316224
0.06588515383467829
0.06602263596579738
updated predictions dictionary
TECH 2023-05-12 00:00:00
0.06922296216666601
0.06657564118954083
0.06797961367935591
TECH 2023-08-17 00:00:00
0.06747303578937638
0.06586656376180028
0.06688025610514942
updated predictions dictionary
PSX 2023-05-19 00:00:00
0.06299813171935816
0.06759827300807057
0.0665085734793297