# GAN Development - Kochems Approach


## Import libraries

In [8]:
%load_ext autoreload
%autoreload 2

from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import csv
from collections import defaultdict
from data_reader import DataReader

np.set_printoptions(threshold=np.inf)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read in Data

### Functions to preprocess data

In [74]:
def convert_ask_bid_int(dataset):
    mask_ask = np.char.endswith(dataset[:,:,2], 'ask')
    mask_bid = np.char.endswith(dataset[:,:,2], 'bid')
    dataset[:,:,2][mask_ask] = '1'
    dataset[:,:,2][mask_bid] = '0'
    dataset = dataset.astype(np.float32)
    return dataset
    
    
def get_dataset_max_price(dataset):
    last_row_prices = dataset[:, 0, 0]
    max_val = np.max(last_row_prices)
    return max_val

def get_dataset_min_price(dataset):
    first_row_prices = dataset[:, -1, 0]
    min_val = np.min(first_row_prices)
    return min_val

def make_histogram_from_dataset(dataset, bin_width = 0.5):
    X_train = []
    hist_max = get_dataset_max_price(dataset)
    hist_min = get_dataset_min_price(dataset)
    print("range: ", hist_min, " ", hist_max)
    num_bins = int(np.ceil((hist_max-hist_min) / bin_width))
    bins = np.linspace(hist_min, hist_max, num_bins)
    for i in range(len(dataset)):
        orderbook = dataset[i];
        price = orderbook[:,0]
        quantity = orderbook[:,1]
        quantity[orderbook[:, 2] == 0] *= -1
        hist, bin_edges = np.histogram(price, bins=bins, weights=quantity)
        X_train.append(hist)
    X_train = np.array(X_train)
    return X_train
        

### Actually reading in data

In [85]:
data_reader = DataReader("orderbook_snapshots.csv", rows_per_orderbook=100)
data_reader.read_csv()
X_train_raw = data_reader.get_data()
X_train_raw = convert_ask_bid_int(X_train_raw)
print(X_train_raw.shape)

(3639, 100, 3)


### Preprocess data

In [86]:
X_train = make_histogram_from_dataset(X_train_raw, bin_width=0.5)
print(X_train.shape)
print(X_train[0])
# print(X_train_raw)

range:  2286.1   2316.7
(3639, 61)


1. Import Libraries and Load Data (Use my script that transforms the Orderbook csv file into 1-row-per-timestamp format)

In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import csv
from collections import defaultdict

# Load the data

# Input and output file names
# TODO (CHANGE THESE FROM MINE TO YOURS)
input_filename = '/Users/sina/Downloads/SCRATCH_GENERATIVE_ADV_NET/training_data/orderbook_snapshots.csv'
output_filename = '/Users/sina/Downloads/SCRATCH_GENERATIVE_ADV_NET/training_data/reformatted.csv'

# Initialise a dictionary to hold the data in correct order
orders = defaultdict(lambda: {'ask_price': None, 'ask_qty': None, 'bid_price': None, 'bid_qty': None})

# Read original csv and populate the orders dictionary
with open(input_filename, mode='r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        timestamp, _, price, quantity, order_type = row
        if order_type == '0_ask':
            orders[timestamp]['ask_price'] = price
            orders[timestamp]['ask_qty'] = quantity
        elif order_type == '0_bid':
            orders[timestamp]['bid_price'] = price
            orders[timestamp]['bid_qty'] = quantity

# Write the data to a new csv file
with open(output_filename, mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Timestamp', 'Best Ask Price Level', 'Best Ask Quantity', 'Best Bid Price Level', 'Best Bid Quantity'])
    for timestamp, order in orders.items():
        writer.writerow([timestamp, order['ask_price'], order['ask_qty'], order['bid_price'], order['bid_qty']])


data_path = output_filename
columns = ['timestamp', 'best_ask_price', 'best_ask_qty', 'best_bid_price', 'best_bid_qty']
data = pd.read_csv(data_path, header=None, names=columns, skiprows=1)


2. Preprocess Data

In [2]:
# Convert timestamp to a numerical format
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp'] = (data['timestamp'] - data['timestamp'].min()).dt.total_seconds()

# Normalize the price and quantity columns
scaler = MinMaxScaler()
data[['best_ask_price', 'best_ask_qty', 'best_bid_price', 'best_bid_qty']] = \
    scaler.fit_transform(data[['best_ask_price', 'best_ask_qty', 'best_bid_price', 'best_bid_qty']])

# Convert to PyTorch tensor
data_tensor = torch.tensor(data.values, dtype=torch.float32)

3. GAN Architecture (Generator & Discriminator Models)

In [3]:
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(3, 128),  # Assuming the noise vector size is 2 and conditional input size is 1
            nn.ReLU(),
            nn.Linear(128, 4)  # Output size is 4 (best_ask_price, best_ask_qty, best_bid_price, best_bid_qty)
        )

    def forward(self, noise, price_level):
        # Given that price_level is already [batch_size, 1], no need to unsqueeze
        # print("Noise shape:", noise.shape)
        # print("Price level shape before concat:", price_level.shape)  # Adjusted print statement for clarity
        
        # Concatenate noise and price_level directly
        x = torch.cat([noise, price_level], dim=1)  # Both tensors should now be compatible for concat
        
        return self.model(x)


class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(4, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, snapshot):
        return self.model(snapshot)

4. Instantiate Models & Optimisers

In [5]:
# Instantiate the generator and discriminator
generator = Generator()
discriminator = Discriminator()

# Define the optimisers
optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.001)
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.001)


5. Training Loop

In [6]:
criterion = nn.BCELoss()
n_epochs = 200  # ADJUST
batch_size = 64  # ADJUST
noise_dim = 2  # Size of noise vector

for epoch in range(n_epochs):
    for i in range(0, len(data_tensor), batch_size):
        # Prepare real data batch
        real_data = data_tensor[i:i+batch_size, 1:]  # Exclude timestamp from training
        real_labels = torch.ones(real_data.size(0), 1)
        fake_labels = torch.zeros(real_data.size(0), 1)
        
        # Train Discriminator
        optimizer_D.zero_grad()
        
        # Real data loss
        real_output = discriminator(real_data)
        d_loss_real = criterion(real_output, real_labels)
        
        # Generate fake data
        noise = torch.randn(real_data.size(0), noise_dim)
        # Conditional input, for now, let's use a random slice from the best_ask_price as an example
        conditional_input = real_data[:, 0].unsqueeze(1)  # This should be modified based on your specific conditional input


        # Right before generator(noise, conditional_input) call
        # print("Conditional input shape before generator:", conditional_input.shape)

        fake_data = generator(noise, conditional_input)
        
        # Fake data loss
        fake_output = discriminator(fake_data.detach())  # Detach to avoid training generator on these labels
        d_loss_fake = criterion(fake_output, fake_labels)
        
        # Combine loss and update discriminator
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_D.step()
        
        # Train Generator
        optimizer_G.zero_grad()
        
        # Trick discriminator into thinking the generated data is real
        output = discriminator(fake_data)
        g_loss = criterion(output, real_labels)
        
        g_loss.backward()
        optimizer_G.step()
        
        if i % 100 == 0:  # Adjust printing frequency based on your preference
            print(f"Epoch [{epoch+1}/{n_epochs}], Step [{i+1}/{len(data_tensor)//batch_size}], D Loss: {d_loss.item()}, G Loss: {g_loss.item()}")


Epoch [1/200], Step [1/149], D Loss: 1.3965988159179688, G Loss: 0.7215413451194763
Epoch [1/200], Step [1601/149], D Loss: 1.4688373804092407, G Loss: 0.5217849016189575
Epoch [1/200], Step [3201/149], D Loss: 1.3653583526611328, G Loss: 0.9481761455535889
Epoch [1/200], Step [4801/149], D Loss: 1.3138500452041626, G Loss: 1.0709651708602905
Epoch [1/200], Step [6401/149], D Loss: 1.1183561086654663, G Loss: 1.2066458463668823
Epoch [1/200], Step [8001/149], D Loss: 1.4968788623809814, G Loss: 0.562454879283905
Epoch [2/200], Step [1/149], D Loss: 1.1029284000396729, G Loss: 0.7625342607498169
Epoch [2/200], Step [1601/149], D Loss: 1.0214810371398926, G Loss: 0.7176451086997986
Epoch [2/200], Step [3201/149], D Loss: 1.2565090656280518, G Loss: 0.7303485870361328
Epoch [2/200], Step [4801/149], D Loss: 1.4943289756774902, G Loss: 0.712454080581665
Epoch [2/200], Step [6401/149], D Loss: 1.3722593784332275, G Loss: 0.9504902958869934
Epoch [2/200], Step [8001/149], D Loss: 1.227487206

6. Save Generator Model

In [8]:
# Save the generator's state dictionary
torch.save(generator.state_dict(), '/Users/sina/Downloads/SCRATCH_GENERATIVE_ADV_NET/Saved_Generator_States/generator_state_dict.pth')
print("Generator state has been saved.")

Generator state has been saved.


7. Generate Example Orderbook Snapshots

In [14]:
# Generate a sample order book snapshot
with torch.no_grad():
    test_noise = torch.randn(1, noise_dim)
    test_price_level = torch.tensor([[0.5]])  # Example price level, normalized
    generated_snapshot = generator(test_noise, test_price_level)
    inverse_transformed_snapshot = scaler.inverse_transform(generated_snapshot.numpy())
    print("Generated Order Book Snapshot:", inverse_transformed_snapshot)


Generated Order Book Snapshot: [[2.1985203e+03 4.8230276e+00 2.1969265e+03 4.6494180e-01]]
