# Deep Learning Assignment
### Elain Balderas and Nina McClure

**Set up**

In [112]:
### import libraries
import tensorflow as tf
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import time
import math
from tqdm import tqdm_notebook
import random

# floating-point dtype used for the placeholders and variables of the TF graphs below;
# swap which of the two lines is active to change between 32- and 64-bit precision
real_type = tf.float32
# real_type = tf.float64

**Black Scholes**

We assume the risk-free rate is zero for simplicity, which leads to simplified formulas under Black Scholes. **TODO:** decide whether to add a non-zero risk-free rate.

Option value/price:
$$ C_t = S_t*N(d_1) - K*N(d_2)$$,

where $N(\cdot)$ denotes the standard normal CDF, $T$ is the time remaining to maturity, and

$$ d_1 = \frac{\log(\frac{S_t}{K}) + 0.5\,\sigma^2 T}{\sigma\sqrt{T}}$$,
$$ d_2 = d_1 - \sigma\sqrt{T} $$

We assume the underlying assets follow a random walk, i.e., the underlying asset prices are log-normally distributed and follow a geometric Brownian motion:

$$ S_{t+\Delta t} = S_t\exp\left(-0.5\,\sigma^2\,\Delta t + \sigma\sqrt{\Delta t}\,Z\right) $$,

where $Z$ is a standard normal random variable.

The Greeks (with $T$ the time remaining to maturity, as in $d_1$):

$$
Delta := \frac{\partial C}{\partial S_t} = N(d_1)
$$
$$
Vega := \frac{\partial C}{\partial \sigma} = \frac{1}{\sqrt{2\pi}}e^{-0.5\,d_1^2}\, S_t\sqrt{T}
$$
$$
Gamma := \frac{\partial^2 C}{\partial S_t^2} = \frac{1}{\sqrt{2\pi}}e^{-0.5\,d_1^2}\,\frac{1}{S_t \sigma \sqrt{T}}
$$

***Train and test data simulation***

In [113]:
# define prices and greeks  
def bsPrice(spot, strike, vol, T):
    """Black-Scholes price of a European call with zero interest rate.

    spot, strike, vol and T (time to maturity) may be scalars or NumPy arrays
    that broadcast against each other; the result has the broadcast shape.
    """
    root_T = np.sqrt(T)
    log_moneyness = np.log(spot/strike)
    d1 = (log_moneyness + 0.5 * vol * vol * T) / vol / root_T
    d2 = d1 - vol * root_T
    spot_leg = spot * norm.cdf(d1)
    strike_leg = strike * norm.cdf(d2)
    return spot_leg - strike_leg

def bsDelta(spot, strike, vol, T):
    """Black-Scholes delta (dC/dS) of a European call with zero interest rate.

    Accepts scalars or broadcastable NumPy arrays; T is the time to maturity.
    """
    log_moneyness = np.log(spot/strike)
    drift_term = 0.5 * vol * vol * T
    d1 = (log_moneyness + drift_term) / vol / np.sqrt(T)
    delta = norm.cdf(d1)
    return delta

def bsVega(spot, strike, vol, T):
    """Black-Scholes vega (dC/dvol) of a European call with zero interest rate.

    Accepts scalars or broadcastable NumPy arrays; T is the time to maturity.
    """
    log_moneyness = np.log(spot/strike)
    d1 = (log_moneyness + 0.5 * vol * vol * T) / vol / np.sqrt(T)
    density = norm.pdf(d1)
    return spot * np.sqrt(T) * density

def bsGamma(spot, strike, vol, T):
    """Black-Scholes gamma (d2C/dS2) of a European call with zero interest rate.

    Accepts scalars or broadcastable NumPy arrays; T is the time to maturity.
    """
    log_moneyness = np.log(spot/strike)
    d1 = (log_moneyness + 0.5 * vol * vol * T) / vol / np.sqrt(T)
    inverse_scale = 1 / (spot * vol * np.sqrt(T))
    return norm.pdf(d1) * inverse_scale

# main class
class BlackScholes:
    """Two-period Black-Scholes simulator for a European call (zero rates).

    The asset starts at spot 1, is simulated to T1 with volatility
    ``vol * volMult`` and from T1 to T2 with volatility ``vol``. The option
    pays ``max(0, S2 - K)`` at T2.
    """

    def __init__(self,
                 vol=0.2,
                 T1=1,
                 T2=2,
                 K= 1.1,
                 volMult=1.5):

        self.spot = 1
        self.vol = vol
        self.T1 = T1
        self.T2 = T2
        self.K = K
        self.volMult = volMult

    def trainingSet(self, m, anti=True, seed=None):
        """Simulate m training examples.

        Parameters
        ----------
        m : number of simulated paths.
        anti : accepted for interface compatibility but currently ignored
            (no antithetic sampling is performed).
        seed : passed to ``np.random.seed``; this reseeds NumPy's global RNG.

        Returns
        -------
        X : (m, 1) asset price S1 at T1 (the network input).
        Y : (m, 1) sampled payoff max(0, S2 - K).
        Z : (m, 2) pathwise differentials of the payoff; column 0 is
            d payoff / d S1 (delta), column 1 is d payoff / d vol (vega,
            with respect to the T1->T2 volatility, S1 held fixed).
        """
        np.random.seed(seed)

        # 2 sets of standard normal draws, one per period
        returns = np.random.normal(size=[m, 2])

        # SDE: exact lognormal steps, a wider volatility is used up to T1
        vol0 = self.vol * self.volMult
        dt = self.T2 - self.T1
        sqrt_dt = np.sqrt(dt)
        R1 = np.exp(-0.5*vol0*vol0*self.T1 + vol0*np.sqrt(self.T1)*returns[:,0])
        R2 = np.exp(-0.5*self.vol*self.vol*dt + self.vol*sqrt_dt*returns[:,1])
        S1 = self.spot * R1  # asset price at T1
        S2 = S1 * R2         # asset price at T2

        # payoff of the call at T2
        pay = np.maximum(0, S2 - self.K)

        X = S1.reshape((-1, 1))
        Y = pay.reshape((-1, 1))

        # pathwise differentials: both are zero when the option expires
        # out of the money
        in_the_money = S2 > self.K
        # d payoff / d S1 = dS2/dS1 = R2 on in-the-money paths
        Delta = np.where(in_the_money, R2, 0.0).reshape((-1, 1))
        # d payoff / d vol = dS2/dvol = S2 * (sqrt(dt) * Z - vol * dt);
        # its expectation given S1 is the Black-Scholes vega over dt
        dS2_dvol = S2 * (sqrt_dt * returns[:,1] - self.vol * dt)
        Vega = np.where(in_the_money, dS2_dvol, 0.0).reshape((-1, 1))
        Z = np.concatenate((Delta, Vega), axis=1)

        # sizes X = m*1, Y = m*1, Z = m*2
        return X, Y, Z

    def testSet(self, lower=0.35, upper=1.65, num=1000, seed=None):
        """Build a uniform grid of spots with closed-form targets.

        ``seed`` is accepted for interface compatibility but unused: the
        grid is deterministic.

        Returns
        -------
        spots : (num, 1) grid of S1 values between lower and upper.
        spots : the same array again (used by callers as the plot x-axis).
        prices : (num, 1) Black-Scholes prices for maturity T2 - T1.
        deltas : (num, 2) true deltas (column 0) and vegas (column 1).
        vegas : (num, 1) true vegas.
        """
        spots = np.linspace(lower, upper, num).reshape((-1, 1))
        # closed-form prices, deltas and vegas over the remaining T2 - T1
        prices = bsPrice(spots, self.K, self.vol, self.T2 - self.T1)
        d = bsDelta(spots, self.K, self.vol, self.T2 - self.T1)
        vegas = bsVega(spots, self.K, self.vol, self.T2 - self.T1)
        deltas = np.concatenate((d, vegas), axis = 1)
        return spots, spots, prices.reshape((-1, 1)) , deltas, vegas.reshape((-1, 1))
   

In [114]:
# quick sanity check: build a generator with default parameters and
# inspect the shape of the stacked test-set differentials
c = BlackScholes()
#X, Y, Z = c.trainingSet(m = 3)
#c.trainingSet(m = 3)
# testSet returns the spot grid twice, so `spots` is bound twice here
spots, spots, prices, h, vegas = c.testSet()

# h is the 4th return value of testSet (deltas and vegas concatenated column-wise)
h.shape

(1000, 2)

**Feedforward network**

Activation functions are softplus.

$$ softplus(x) = log( 1+ exp(x) ) $$ 


The derivative of the softplus is 

$$ f'(x) = \frac{\exp(x)}{1+\exp(x)} = \frac{1}{1+\exp(-x)},$$

 which is also called the logistic function.

In [115]:
def vanilla_net(
    input_dim,      # dimension of inputs, e.g. 10
    hidden_units,   # units in hidden layers, assumed constant, e.g. 20
    hidden_layers,  # number of hidden layers, e.g. 4
    seed):          # seed for initialization or None for random
    """Build a feed-forward network with softplus activations.

    Creates a placeholder for the inputs plus one weight matrix and one bias
    vector per layer (variables named "w1", "b1", ..., created with
    tf.compat.v1.get_variable), and wires them into a graph with
    `hidden_layers` hidden layers of `hidden_units` units and a single output.

    Returns
    -------
    xs : input placeholder of shape [None, input_dim].
    (ws, bs) : lists of weight matrices and bias vectors; index 0 of each list
        is None so that list indices coincide with layer numbers.
    zs : list of layer values; zs[0] is the input and zs[l] (l >= 1) is the
        pre-activation value of layer l (softplus is applied when it is fed
        to the next layer).
    ys : the output layer, i.e. zs[hidden_layers + 1], of shape [None, 1].

    NOTE(review): uses v1 placeholders and get_variable, so this presumably has
    to be called while a tf.Graph is the default graph - confirm against callers.
    """
    
    # set seed
    tf.random.set_seed(seed)
    
    # input layer: placeholder fed with the (normalized) network inputs
    xs = tf.compat.v1.placeholder(shape=[None, input_dim], dtype=real_type)
    
    # connection weights and biases of hidden layers
    ws = [None]
    bs = [None]
    # layer 0 (input) has no parameters, hence the None entries at index 0
    
    # layer 0 = input layer
    zs = [xs] # eq.3, l=0
    
    # first hidden layer (index 1)
    # weight matrix
    ws.append(tf.compat.v1.get_variable("w1", [input_dim, hidden_units], \
        initializer = tf.keras.initializers.variance_scaling(), dtype=real_type))
    # bias vector
    bs.append(tf.compat.v1.get_variable("b1", [hidden_units], \
        initializer = tf.zeros_initializer(), dtype=real_type))
    # graph: the first layer is affine in the raw input (no activation on xs)
    zs.append(zs[0] @ ws[1] + bs[1]) # eq. 3, l=1
    
    # second hidden layer (index 2) to last (index hidden_layers)
    for l in range(1, hidden_layers): 
        ws.append(tf.compat.v1.get_variable("w%d"%(l+1), [hidden_units, hidden_units], \
            initializer = tf.keras.initializers.variance_scaling(), dtype=real_type))
        bs.append(tf.compat.v1.get_variable("b%d"%(l+1), [hidden_units], \
            initializer = tf.zeros_initializer(), dtype=real_type))
        # softplus is applied to the previous layer's pre-activation here
        zs.append(tf.nn.softplus(zs[l]) @ ws[l+1] + bs[l+1]) # eq. 3, l=2..L-1

    # output layer (index hidden_layers+1): a single unit
    ws.append(tf.compat.v1.get_variable("w"+str(hidden_layers+1), [hidden_units, 1], \
            initializer = tf.keras.initializers.variance_scaling(), dtype=real_type))
    bs.append(tf.compat.v1.get_variable("b"+str(hidden_layers+1), [1], \
        initializer = tf.zeros_initializer(), dtype=real_type))
    # eq. 3, l=L
    zs.append(tf.nn.softplus(zs[hidden_layers]) @ ws[hidden_layers+1] + bs[hidden_layers+1]) 
    
    # result = output layer
    ys = zs[hidden_layers+1]
    
    # return input layer, (parameters = weight matrices and bias vectors), 
    # [all layers] and output layer
    return xs, (ws, bs), zs, ys

**Back-prop and twin net**

In [116]:
# derivative of the network output with respect to its inputs,
# obtained by explicit back-propagation through the vanilla net
def backprop(
    weights_and_biases, # 2nd output from vanilla_net()
    zs):                # 3rd output from vanilla_net()
    """Return d(output)/d(input) for the network built by vanilla_net().

    weights_and_biases is the (ws, bs) pair and zs the list of layer values;
    only the weights are needed. The result has one column per input.
    """
    weights, _ = weights_and_biases
    output_index = len(zs) - 1

    # adjoint of the output layer is 1 (eq. 4, l=L)
    adjoint = tf.ones_like(zs[output_index])

    # walk back through the hidden layers L-1, ..., 1; the sigmoid is the
    # derivative of the softplus activation applied to zs[layer]
    for layer in reversed(range(1, output_index)):
        adjoint = (adjoint @ tf.transpose(weights[layer + 1])) * tf.nn.sigmoid(zs[layer])

    # the first layer is affine in the input, so no activation derivative here
    return adjoint @ tf.transpose(weights[1])

# value network and its input-derivative network in a single graph
def twin_net(input_dim, hidden_units, hidden_layers, seed):
    """Build the feed-forward net together with its backprop differentiation.

    Returns the input placeholder, the predicted values and the predicted
    derivatives of the values with respect to the inputs.
    """
    # feed-forward part
    inputs, params, layers, outputs = vanilla_net(input_dim, hidden_units, hidden_layers, seed)

    # differentiation part, sharing the same weights
    input_derivs = backprop(params, layers)

    return inputs, outputs, input_derivs

***Training***

In [117]:
def diff_training_graph(
    # same as vanilla
    input_dim, 
    hidden_units, 
    hidden_layers, 
    seed, 
    # balance relative weight of values and differentials 
    # loss = alpha * MSE(values) + beta * MSE(greeks, lambda_j) 
    # see online appendix
    alpha, 
    beta,
    lambda_j):
    """Build the twin network plus its combined value/derivative training loss.

    Parameters
    ----------
    input_dim, hidden_units, hidden_layers, seed : forwarded to twin_net().
    alpha, beta : weights of the value MSE and the derivative MSE in the loss.
    lambda_j : per-input scaling applied to both derivative labels and
        derivative predictions before taking their MSE (broadcast against
        a [batch, input_dim] tensor).

    Returns
    -------
    (inputs, labels, derivs_labels, predictions, derivs_predictions,
     learning_rate, loss, minimizer), where `labels` has shape [None, 1],
    `derivs_labels` has one column per network input, `learning_rate` is a
    scalar placeholder and `minimizer` is the Adam update op.
    """
    # net, now a twin
    inputs, predictions, derivs_predictions = twin_net(input_dim, hidden_units, hidden_layers, seed)
    
    # placeholder for labels, now also derivs labels
    labels = tf.compat.v1.placeholder(shape=[None, 1], dtype=real_type)
    # derivs_predictions is rank 2 ([batch, input_dim]); its last axis gives the
    # number of derivative labels per example (index 2 does not exist)
    derivs_labels = tf.compat.v1.placeholder(
        shape=[None, derivs_predictions.shape[-1]], dtype=real_type)
    
    # make sure the scaling has the same dtype as the graph tensors
    deriv_weights = tf.cast(tf.convert_to_tensor(lambda_j), real_type)
    
    # loss, now combined values + derivatives; each term is reduced to a scalar
    # explicitly so the result does not depend on which `tf.losses` is active
    value_loss = tf.reduce_mean(tf.math.squared_difference(labels, predictions))
    derivs_loss = tf.reduce_mean(tf.math.squared_difference(
        derivs_labels * deriv_weights, derivs_predictions * deriv_weights))
    loss = alpha * value_loss + beta * derivs_loss
    
    # optimizer, as vanilla
    learning_rate = tf.compat.v1.placeholder(real_type)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate)
    
    # return all necessary tensors, including derivatives
    # predictions and labels
    return inputs, labels, derivs_labels, predictions, derivs_predictions, \
            learning_rate, loss, optimizer.minimize(loss)

def diff_train_one_epoch(inputs, labels, derivs_labels, 
                         # graph
                         lr_placeholder, minimizer,             
                         # training set, extended
                         x_train, y_train, dydx_train,          
                         # params
                         learning_rate, batch_size, session):   
    """Run one pass over the training set in consecutive mini-batches.

    For every batch the minimizer op is run in `session`, feeding the batch's
    inputs, value labels and derivative labels plus the learning rate. The
    last batch is shorter when the set size is not a multiple of batch_size.
    """
    num_examples, _ = x_train.shape

    start = 0
    while start < num_examples:
        stop = min(start + batch_size, num_examples)
        batch = slice(start, stop)
        feed = {
            inputs: x_train[batch],
            labels: y_train[batch],           # value targets
            derivs_labels: dydx_train[batch], # derivative targets
            lr_placeholder: learning_rate,
        }
        session.run(minimizer, feed_dict=feed)
        start = stop

In [118]:
def train(description,
          # neural approximator
          approximator,              
          # training params
          reinit=True, 
          epochs=100, 
          # one-cycle learning rate schedule
          learning_rate_schedule=[    (0.0, 1.0e-8), \
                                      (0.2, 0.1),    \
                                      (0.6, 0.01),   \
                                      (0.9, 1.0e-6), \
                                      (1.0, 1.0e-8)  ], 
          batches_per_epoch=16,
          min_batch_size=256,
          # callback function and when to call it
          callback=None,           # arbitrary callable
          callback_epochs=[]):     # call after what epochs, e.g. [5, 20]
    """Train an approximator for a number of epochs with a learning-rate cycle.

    Parameters
    ----------
    description : label of the progress bar.
    approximator : object exposing m, session, initializer, the graph tensors
        (inputs, labels, derivs_labels, learning_rate, minimizer) and the
        normalized training data (x, y, dy_dx).
    reinit : when True, run the variable initializer before training.
    epochs : number of passes over the training set.
    learning_rate_schedule : (fraction of training, rate) knots; the rate of
        each epoch is linearly interpolated between them.
    batches_per_epoch, min_batch_size : the batch size is
        max(min_batch_size, m // batches_per_epoch).
    callback : optional callable invoked as callback(approximator, k).
    callback_epochs : numbers k of completed epochs after which the callback
        fires; 0 means before training and `epochs` means after the last
        epoch. Each requested k fires exactly once.
    """
    # batching
    batch_size = max(min_batch_size, approximator.m // batches_per_epoch)
    
    # one-cycle learning rate schedule
    lr_schedule_epochs, lr_schedule_rates = zip(*learning_rate_schedule)
            
    # reset
    if reinit:
        approximator.session.run(approximator.initializer)
    
    # callback on epoch 0 (nothing trained yet), if requested
    if callback and 0 in callback_epochs:
        callback(approximator, 0)
        
    # loop on epochs, with progress bar (tqdm)
    for epoch in tqdm_notebook(range(epochs), desc=description):
        
        # interpolate learning rate in cycle
        learning_rate = np.interp(epoch / epochs, lr_schedule_epochs, lr_schedule_rates)
        
        # train one epoch
        diff_train_one_epoch(
            approximator.inputs, 
            approximator.labels, 
            approximator.derivs_labels,
            approximator.learning_rate, 
            approximator.minimizer, 
            approximator.x, 
            approximator.y, 
            approximator.dy_dx,
            learning_rate, 
            batch_size, 
            approximator.session)
        
        # callback, if requested: `epoch` is zero-based, so epoch + 1 epochs
        # are complete at this point (this also covers the final epoch)
        completed = epoch + 1
        if callback and completed in callback_epochs:
            callback(approximator, completed)


***Normalise data***

In [119]:
# basic data preparation
epsilon = 1.0e-08
def normalize_data(x_raw, y_raw, dydx_raw = None):
    """Standardize inputs and labels and rescale derivative labels to match.

    Parameters
    ----------
    x_raw : (m, n) array of inputs.
    y_raw : (m, 1) array of value labels.
    dydx_raw : optional array of derivative labels with m rows.

    Returns
    -------
    x_mean, x_std, x, y_mean, y_std, y, dy_dx, lambda_j where x and y are the
    standardized data (epsilon is added to each std to avoid division by
    zero), dy_dx are the derivatives of normalized y with respect to
    normalized x, and lambda_j (shape (1, k)) is the inverse root-mean-square
    size of each derivative column. dy_dx and lambda_j are None when
    dydx_raw is None.
    """
    # normalize dataset
    x_mean = x_raw.mean(axis=0)
    x_std = x_raw.std(axis=0) + epsilon
    x = (x_raw - x_mean) / x_std
    y_mean = y_raw.mean(axis=0)
    y_std = y_raw.std(axis=0) + epsilon
    y = (y_raw - y_mean) / y_std

    # no derivative labels: nothing to rescale or weight
    if dydx_raw is None:
        return x_mean, x_std, x, y_mean, y_std, y, None, None

    # normalize derivatives (chain rule for the two rescalings)
    dy_dx = dydx_raw / y_std * x_std
    # weights of derivatives in cost function = 1 / (quad) mean size;
    # a column that is identically zero gets weight 1 instead of infinity
    rms = np.sqrt((dy_dx ** 2).mean(axis=0)).reshape(1, -1)
    lambda_j = 1.0 / np.where(rms > 0.0, rms, 1.0)

    return x_mean, x_std, x, y_mean, y_std, y, dy_dx, lambda_j

In [120]:
class Neural_Approximator():
    """Trains a twin network on (inputs, values, derivatives) and predicts.

    Typical use: construct with the raw data, call prepare() to normalize the
    first m examples and build the graph, call train(), then call
    predict_values() or predict_values_and_derivs() on raw inputs.
    """
    
    def __init__(self, x_raw, y_raw, 
                 dydx_raw=None):      # derivatives labels, 
        # raw (un-normalized) training data; dydx_raw needs one column per
        # input and is required for differential training
        self.x_raw = x_raw
        self.y_raw = y_raw
        self.dydx_raw = dydx_raw
        # created by prepare() / build_graph()
        self.graph = None
        self.session = None

    def build_graph(self,         
                differential,       # differential or not           
                lam,                # balance cost between values and derivs  
                hidden_units, 
                hidden_layers, 
                weight_seed):
        """Build a fresh graph and session for the current normalized data.

        Expects self.n and self.lambda_j to be set (prepare() does this).
        When `differential` is False the derivative term gets zero weight
        (alpha = 1, beta = 0), i.e. standard training on values only.
        """
        # release the session of a previously built graph, if any
        previous_session = getattr(self, "session", None)
        if previous_session is not None:
            previous_session.close()
  
        self.graph = tf.Graph()     # Graphs are used to represent the function's computations.  
        
        with self.graph.as_default():
        
            # build the graph (add parameters)
            self.differential = differential
            # standard training = differential training with no derivative cost
            effective_lam = lam if differential else 0.0
            self.alpha = 1.0 / (1.0 + effective_lam * self.n)
            self.beta = 1.0 - self.alpha             
            self.inputs, \
            self.labels, \
            self.derivs_labels, \
            self.predictions, \
            self.derivs_predictions, \
            self.learning_rate, \
            self.loss, \
            self.minimizer = diff_training_graph(self.n, hidden_units, \
                                                     hidden_layers, weight_seed, \
                                                     self.alpha, self.beta, self.lambda_j)
        
            # global initializer
            self.initializer = tf.compat.v1.global_variables_initializer()
            
        # done
        self.graph.finalize()
        self.session = tf.compat.v1.Session(graph=self.graph)
                        
    # prepare for training with m examples, standard or differential
    def prepare(self, 
                m, 
                differential,
                lam=1,              # balance cost between values and derivs  
                # standard architecture
                hidden_units=20, 
                hidden_layers=4, 
                weight_seed=None):
        """Normalize the first m training examples and build the graph.

        Raises ValueError when differential training is requested without
        derivative labels, or when the derivative labels do not have one
        column per input (the network only differentiates with respect to
        its inputs).
        """
        # keep only the first m examples
        x_raw = self.x_raw[:m]
        y_raw = self.y_raw[:m]

        has_derivs = self.dydx_raw is not None
        if has_derivs:
            dydx_raw = self.dydx_raw[:m]
            if dydx_raw.shape[1] != x_raw.shape[1]:
                raise ValueError(
                    "derivative labels have %d column(s) but the inputs have %d; "
                    "one derivative per input is required"
                    % (dydx_raw.shape[1], x_raw.shape[1]))
        elif differential:
            raise ValueError("differential training requires derivative labels (dydx_raw)")
        else:
            # stand-in so the normalization runs; replaced by zero labels below
            dydx_raw = np.ones_like(x_raw, dtype=float)

        # prepare dataset (normalise)
        self.x_mean, self.x_std, self.x, self.y_mean, self.y_std, self.y, self.dy_dx, self.lambda_j = \
            normalize_data(x_raw, y_raw, dydx_raw)

        if not has_derivs:
            # values-only training: the derivative term has zero weight, so
            # feed neutral labels and weights
            self.dy_dx = np.zeros_like(self.x)
            self.lambda_j = np.ones((1, self.x.shape[1]))
        
        # build graph        
        self.m, self.n = self.x.shape        
        self.build_graph(differential, lam, hidden_units, hidden_layers, weight_seed)
        
    def train(self,            
              description="training",
              # training params
              reinit=True, 
              epochs=100, 
              # one-cycle learning rate schedule
              learning_rate_schedule=[
                  (0.0, 1.0e-8), 
                  (0.2, 0.1), 
                  (0.6, 0.01), 
                  (0.9, 1.0e-6), 
                  (1.0, 1.0e-8)], 
              batches_per_epoch=16,
              min_batch_size=256,
              # callback and when to call it
              # we don't use callbacks, but this is very useful, e.g. for debugging
              callback=None,           # arbitrary callable
              callback_epochs=[]):     # call after what epochs, e.g. [5, 20]
        """Train the prepared network; thin wrapper around the module-level train()."""
        train(description, 
              self, 
              reinit, 
              epochs, 
              learning_rate_schedule, 
              batches_per_epoch, 
              min_batch_size,
              callback, 
              callback_epochs)
     
    def predict_values(self, x):
        """Predict values for raw (un-normalized) inputs x."""
        # scale
        x_scaled = (x-self.x_mean) / self.x_std 
        # predict scaled
        y_scaled = self.session.run(self.predictions, feed_dict = {self.inputs: x_scaled})
        # unscale
        y = self.y_mean + self.y_std * y_scaled
        return y

    def predict_values_and_derivs(self, x):
        """Predict values and their derivatives w.r.t. raw inputs x."""
        # scale
        x_scaled = (x-self.x_mean) / self.x_std
        # predict scaled
        y_scaled, dyscaled_dxscaled = self.session.run(
            [self.predictions, self.derivs_predictions], 
            feed_dict = {self.inputs: x_scaled})
        # unscale (chain rule undoes both normalizations for the derivatives)
        y = self.y_mean + self.y_std * y_scaled
        dydx = self.y_std / self.x_std * dyscaled_dxscaled
        return y, dydx

***Implementation***

In [126]:
def test(generator,         # how we want to generate the data (Black Scholes)
         sizes,             # size of the data
         nTest,             # number of test sets
         simulSeed=None,    # seed for training data generation
         testSeed=None,     # seed for test data generation
         weightSeed=None,   # seed for weight initialisation
         deltidx=0):        # column of the differentials to report
    """Train standard and differential networks for each size and predict.

    One training set of max(sizes) examples and one test set of nTest points
    are generated. For every size a network is prepared and trained twice
    (standard, then differential) and evaluated on the test set.

    Returns
    -------
    xAxis, yTest : test x-axis and true values from generator.testSet.
    dydxTest[:, deltidx] : the selected column of the true differentials.
    vegas : as returned by generator.testSet.
    predvalues, preddeltas : dicts keyed by (("standard" | "differential"),
        size) holding the predicted values and the selected column of the
        predicted derivatives.
    """
    ### simulation
    print("simulating training, valid and test sets")
    # generate train set
    xTrain, yTrain, dydxTrain = generator.trainingSet(max(sizes), seed=simulSeed)
    # generate test sets
    xTest, xAxis, yTest, dydxTest, vegas = generator.testSet(num=nTest, seed=testSeed)
    print("done")

    # neural approximator
    print("initializing neural appropximator")
    regressor = Neural_Approximator(xTrain, yTrain, dydxTrain)
    print("done")
    
    predvalues = {}    
    preddeltas = {}
    for size in sizes:        
            
        print("\nsize %d" % size)

        # same procedure for both training modes, standard first
        for regType, differential in (("standard", False), ("differential", True)):
            regressor.prepare(size, differential, weight_seed=weightSeed)
            regressor.train("%s training" % regType)
            predictions, deltas = regressor.predict_values_and_derivs(xTest)
            predvalues[(regType, size)] = predictions
            preddeltas[(regType, size)] = deltas[:, deltidx]
        
    return xAxis, yTest, dydxTest[:, deltidx], vegas, predvalues, preddeltas


In [127]:
generator = BlackScholes()
# build the test set here: these names are otherwise only local to test(),
# so this cell would raise NameError on a fresh kernel
xTest, xAxis, yTest, dydxTest, vegas = generator.testSet(num=100)
print(xTest.shape, xAxis.shape, yTest.shape, vegas.shape)

(100, 1) (100, 1) (100, 1) (100, 1)


In [None]:
def graph(title, 
          predictions, 
          xAxis, 
          xAxisName, 
          yAxisName, 
          targets, 
          sizes, 
          computeRmse=False, 
          weights=None):
    """Plot predictions against targets, one row per size, one column per mode.

    Parameters
    ----------
    title : figure title prefix.
    predictions : dict keyed by (("standard" | "differential"), size).
    xAxis : x coordinates shared by predictions and targets.
    xAxisName : x label used when computeRmse is False.
    yAxisName : y label, also appended to the figure title.
    targets : true values to compare against.
    sizes : training-set sizes, one subplot row each.
    computeRmse : when True, the x label shows the RMSE (in units of 1/100)
        between predictions and targets instead of xAxisName.
    weights : optional divisor applied to the errors before the RMSE.
    """
    numRows = len(sizes)
    numCols = 2

    fig, ax = plt.subplots(numRows, numCols, squeeze=False)
    fig.set_size_inches(4 * numCols + 1.5, 4 * numRows)

    # row captions on the left of each row
    for i, size in enumerate(sizes):
        ax[i,0].annotate("size %d" % size, xy=(0, 0.5), 
          xytext=(-ax[i,0].yaxis.labelpad-5, 0),
          xycoords=ax[i,0].yaxis.label, textcoords='offset points',
          ha='right', va='center')
  
    ax[0,0].set_title("standard")
    ax[0,1].set_title("differential")
    
    for i, size in enumerate(sizes):        
        for j, regType in enumerate(["standard", "differential"]):

            predicted = predictions[(regType, size)]

            if computeRmse:
                # flatten both sides so (n, 1) and (n,) arrays are compared
                # element-wise instead of broadcasting to (n, n)
                errors = 100 * (np.ravel(predicted) - np.ravel(targets))
                if weights is not None:
                    errors = errors / np.ravel(weights)
                # plain float: formatting an ndarray with %.2f is deprecated
                rmse = float(np.sqrt(np.mean(errors ** 2)))
                t = "rmse %.2f" % rmse
            else:
                t = xAxisName
                
            ax[i,j].set_xlabel(t)            
            ax[i,j].set_ylabel(yAxisName)

            ax[i,j].plot(xAxis*100, predicted*100, 'co', \
                         markersize=2, markerfacecolor='white', label="predicted")
            ax[i,j].plot(xAxis*100, targets*100, 'r.', markersize=0.5, label='targets')

            ax[i,j].legend(prop={'size': 8}, loc='upper left')

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.suptitle("% s -- %s" % (title, yAxisName), fontsize=16)
    plt.show()

In [None]:
# training-set sizes to run (one row of plots per entry)
sizes = [1000, 1000]

# whether to show deltas
showDeltas = True

# number of test scenarios
nTest = 100    

# seeds: the simulation seed is drawn at random and printed so that a run
# can be reproduced (e.g. simulSeed = 1234); weights are not seeded
weightSeed = None
simulSeed = np.random.randint(0, 10000) 
print("using seed %d" % simulSeed)

# simulate, train and predict
generator = BlackScholes()
xAxis, yTest, dydxTest, vegas, values, deltas = test(
    generator, sizes, nTest,
    simulSeed=simulSeed, testSeed=None, weightSeed=weightSeed)

In [None]:
# predicted vs. true option values, with the RMSE shown under each panel
graph("Black & Scholes", values, xAxis, "", "values", yTest, sizes, computeRmse=True)

# predicted vs. true deltas, with the RMSE shown under each panel
graph("Black & Scholes", deltas, xAxis, "", "deltas", dydxTest, sizes, computeRmse=True)