In [1]:
from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import *
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

In [3]:
def prob_t_is_max(max_so_far, t, T):
    coeff = 1.0/(T - t)
    prod = (1.0 - np.power(norm.cdf(max_so_far), T-t))
    return(coeff*prod)

def prob_any_next_t_is_max(max_so_far, t, T):
    prod = (1.0 - np.power(norm.cdf(max_so_far), T-t))
    return(prod)

def log_utility(x):
    y = 2.0*np.log(1 + np.power(x,1/2))
    return(np.sum(y))

def log_utility_opt_x(T, pi_cp):
    frac = (2*T)/pi_cp
    x_1 = frac + np.sqrt((frac - 1)*(frac + 1))
    x_2 = frac - np.sqrt((frac - 1)*(frac + 1))
    return((x_1, x_2))


In [4]:
def batch_sample_list(datalist, batchsize):
    #list of training, target pair tuples
    remainder = len(datalist) % batchsize
    diff = batchsize - remainder
    tail = datalist[-diff:] + datalist[0:remainder]
    out = [ datalist[i*batchsize:(i+1)*batchsize] for i in range(int(float(len(datalist))/float(batchsize)))]
    out = out + [tail]
    return(out)

def batch_data_arrays(data, labels, batchsize, sampledim = 1):
    remainder = data.shape[sampledim] % batchsize
    diff = batchsize - remainder
    tail = data[0,-diff:]
    out = [ (torch.Tensor(data[:,i*batchsize:(i+1)*batchsize].T), torch.Tensor(labels[i*batchsize:(i+1)*batchsize])) for i in range(int(float(data.shape[sampledim])/float(batchsize))) ]
    return(out)
    

def torch_reshape_data(databatch):
    #flattens array inputs for a single list of training, target pairs
    inputs = []
    labels = []
    for sample in databatch:
        inputs.append(sample[0].flatten())
        labels.append(sample[1].flatten())
    return(torch.Tensor(np.asarray(inputs)), torch.Tensor(np.asarray(labels)))

In [5]:
#just try a multilayer NN
class nnet(nn.Module):
    def __init__(self, params):
        super(nnet, self).__init__()
        self.D_in = params['FEATURE_DIM']
        self.H1 = params['HIDDEN_1']
        self.D_out = params['OUTPUT_DIM']
        self.l1 = nn.Linear(self.D_in, self.H1)
        self.l2 = nn.Linear(self.H1, self.D_out)
    
    def forward(self, x):
        x = torch.sigmoid(self.l1(x))
        x = self.l2(x)
        return(x)
    
    
def train(net_obj, loss_func, opt_func, trainX, trainY, valX, valY, batchsize=100, epochs=50, verbose=True):
    print("Training")
    train_batches = batch_data_arrays(trainX, trainY, batchsize)
    num_batches = len(train_batches)
    
    train_epoch_loss = []
    val_epoch_loss = []
    
    for epoch in range(epochs):
        running_loss = 0.0
        for i, data in enumerate(train_batches):
            inputs, labels = data[0], data[1].unsqueeze(1)
            
            #make Variable
            inputs, labels = Variable(inputs), Variable(labels) 
            #inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            
            opt_func.zero_grad()
            
            outputs = net_obj(inputs)
            loss = loss_func(outputs, labels)
            loss.backward()
            opt_func.step()
            
            running_loss += 0.0
            
        
        
        train_epoch_loss.append(running_loss)
        val_out = net_obj(torch.Tensor(valX.T))
        val_loss = loss_func(val_out, torch.Tensor(valY).unsqueeze(1))
        val_epoch_loss.append(val_loss)
        
        if verbose==True:
            print("==epoch " + str(epoch) + "==")
            print("training loss: " + str(running_loss))
            print("validation loss: " + str(val_loss))
        
    return(train_epoch_loss, val_epoch_loss)

In [35]:
T = 2
grid_file_name = "/home/chase/projects/peakload/notebooks/t" + str(T) + "_grid_search.txt"

with open(grid_file_name, 'rb') as d:
    grid_MC = np.load(grid_file_name)

In [36]:
np.max(grid_MC)


1.4833222224851716

In [37]:
#sample some paths

g = np.arange(0,1.1,0.1)
s_vals = np.arange(-2.0,2.0,0.1)
ramp_const = 0.3

N = 1000

path_x = []
path_s = []
best_responses = []

X = np.zeros((4, N))

Y = np.zeros((N,))



for i in range(N):
    x_1_i = np.random.randint(0,11) #we do better if we ignore states we'd never start in e.g. np.random.randint(4,11)
    X[0,i] = g[x_1_i]  #previous state

    s_1_i = np.random.randint(s_vals.shape[0])
    X[1,i] = s_vals[s_1_i] #maximum noise seen so far

    X[2,i] = 1 #rounds left to go
    
    X[3,i] = np.random.uniform(0,1) #linear bias term
    
    best_response_i = np.argmax(grid_MC[x_1_i, s_1_i, :])#pick best x_2 that maximizes expected reward given x_1, s_1
    Y[i] = g[best_response_i]

In [38]:
train_X = X[:,0:int(0.8*X.shape[1])]
train_Y = Y[0:int(0.8*Y.shape[0])]
val_X = X[:,int(0.8*X.shape[1]):]
val_Y = Y[int(0.8*Y.shape[0]):]

In [39]:
train_X.shape, train_Y.shape

((4, 800), (800,))

In [40]:
params = {'FEATURE_DIM': 4, 'OUTPUT_DIM': 1, 'HIDDEN_1': 4}

net = nnet(params)#.cuda()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(),lr=0.1, momentum=0.9)

In [41]:
train_loss, val_loss = train(net, criterion, optimizer, train_X, train_Y, val_X, val_Y, batchsize=100, epochs=500)

Training
==epoch 0==
training loss: 0.0
validation loss: tensor(0.1657, grad_fn=<MseLossBackward>)
==epoch 1==
training loss: 0.0
validation loss: tensor(0.2166, grad_fn=<MseLossBackward>)
==epoch 2==
training loss: 0.0
validation loss: tensor(0.0504, grad_fn=<MseLossBackward>)
==epoch 3==
training loss: 0.0
validation loss: tensor(0.0651, grad_fn=<MseLossBackward>)
==epoch 4==
training loss: 0.0
validation loss: tensor(0.0578, grad_fn=<MseLossBackward>)
==epoch 5==
training loss: 0.0
validation loss: tensor(0.0407, grad_fn=<MseLossBackward>)
==epoch 6==
training loss: 0.0
validation loss: tensor(0.0371, grad_fn=<MseLossBackward>)
==epoch 7==
training loss: 0.0
validation loss: tensor(0.0348, grad_fn=<MseLossBackward>)
==epoch 8==
training loss: 0.0
validation loss: tensor(0.0288, grad_fn=<MseLossBackward>)
==epoch 9==
training loss: 0.0
validation loss: tensor(0.0238, grad_fn=<MseLossBackward>)
==epoch 10==
training loss: 0.0
validation loss: tensor(0.0199, grad_fn=<MseLossBackward>)


==epoch 94==
training loss: 0.0
validation loss: tensor(0.0094, grad_fn=<MseLossBackward>)
==epoch 95==
training loss: 0.0
validation loss: tensor(0.0094, grad_fn=<MseLossBackward>)
==epoch 96==
training loss: 0.0
validation loss: tensor(0.0094, grad_fn=<MseLossBackward>)
==epoch 97==
training loss: 0.0
validation loss: tensor(0.0094, grad_fn=<MseLossBackward>)
==epoch 98==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 99==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 100==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 101==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 102==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 103==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackward>)
==epoch 104==
training loss: 0.0
validation loss: tensor(0.0093, grad_fn=<MseLossBackw

==epoch 214==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 215==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 216==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 217==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 218==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 219==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 220==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 221==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 222==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 223==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLossBackward>)
==epoch 224==
training loss: 0.0
validation loss: tensor(0.0086, grad_fn=<MseLos

==epoch 320==
training loss: 0.0
validation loss: tensor(0.0078, grad_fn=<MseLossBackward>)
==epoch 321==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 322==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 323==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 324==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 325==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 326==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 327==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 328==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 329==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLossBackward>)
==epoch 330==
training loss: 0.0
validation loss: tensor(0.0077, grad_fn=<MseLos

==epoch 427==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 428==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 429==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 430==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 431==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 432==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 433==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 434==
training loss: 0.0
validation loss: tensor(0.0070, grad_fn=<MseLossBackward>)
==epoch 435==
training loss: 0.0
validation loss: tensor(0.0069, grad_fn=<MseLossBackward>)
==epoch 436==
training loss: 0.0
validation loss: tensor(0.0069, grad_fn=<MseLossBackward>)
==epoch 437==
training loss: 0.0
validation loss: tensor(0.0069, grad_fn=<MseLos

In [42]:
check = torch.Tensor(np.array([0.9, 2.0, 1.0, 1.0])).unsqueeze(1).transpose(1,0)
print(check.shape)

torch.Size([1, 4])


In [43]:
check_out = net(check)
print(float(check_out.data[0,0]))

1.0781599283218384


In [44]:
#how does the approx value function perform against naive?
x_bar = 1
pi_cp_perc = 0.6
pi_cp = 0.6*T*log_utility(x_bar)

naive_reward = T*log_utility(x_bar) - pi_cp*x_bar

naive_opts = log_utility_opt_x(T, pi_cp)
print(naive_opts)

nopt_reward = T*log_utility(naive_opts[1]) - pi_cp *naive_opts[1]

print(naive_reward)
print(nopt_reward)

(4.591174268424552, 0.2178092012053261)
1.1090354888959124
1.1697236254452936


In [46]:
#how does the approx value function perform against naive?
MC = 100

winloss = []
rewards = []
best_responses = []
gaps = []
cvs = 0

x_1 = 0.7
for i in range(MC):
    smi = np.random.randint(0,s_vals.shape[0])
    #try with  noise we trained with
    sm = s_vals[smi]
    
    inputvec = torch.Tensor(np.array([x_1, sm, 1.0, 1.0])).unsqueeze(1).transpose(1,0)
    #inputvec = torch.Tensor(np.array([x_1, sm, 1.0])).unsqueeze(1).transpose(1,0) #without linear bias
    
    br = net(inputvec)
    best_response = float(br.data[0,0])
    
    if best_response > x_bar:
        best_response = x_bar
    if best_response < 0.0:
        best_response = 0.0
        
    if np.abs(x_1 - best_response) > ramp_const:
        cvs += 1
    
    best_responses.append(best_response)
    plays = np.array([x_1, best_response])
    noises = np.array([sm, np.random.choice(s_vals)])
    
    reward = log_utility(plays) - pi_cp*plays[np.argmax(noises)]
    rewards.append(reward)
    if reward > nopt_reward:
        winloss.append(1)
    else:
        winloss.append(0)
    gaps.append(reward - nopt_reward)
        
print("win/loss perc: " + str(np.mean(winloss)))
print("expected utility gap: " + str(np.mean(gaps)))
print("constraint violation perc: " + str(cvs/MC))

win/loss perc: 0.86
expected utility gap: 0.17289745530875766
constraint violation perc: 0.0


In [47]:
net.l1.weight.data

tensor([[ 0.0299,  0.5195, -0.7092, -0.7588],
        [ 0.3733,  1.8390, -0.5340, -0.5034],
        [-2.2650,  1.0337, -0.9802, -0.2564],
        [ 1.8102,  0.1315, -0.6048,  0.0329]])

In [48]:
net.l2.weight.data

tensor([[ 0.0003,  0.5817, -1.4422,  1.0523]])

In [None]:
#repeat process for T = 3

In [5]:
T = 3
grid_file_name = "/home/chase/projects/peakload/notebooks/t" + str(T) + "_grid_search.txt"

with open(grid_file_name, 'rb') as d:
    grid_MC = np.load(grid_file_name)

In [6]:
grid_MC.shape

(11, 40, 11, 40, 11)

In [7]:
a = np.zeros((2,2))
a[1,1] = 1.0
a[0,0] = 2.0



a[np.nonzero(a)]

array([2., 1.])

In [8]:
#maximum mean slice
#max_xi = 0
#m_run = 0#
#
#for i in range(grid_MC.shape[0]):
#    for k in range(grid_MC.shape[2]):
#        noise_outcomes = grid_MC[i,:,k,:,:] #all choices of x_3 w.r.t. s_1, s_2
#        mean_en = np.mean(grid_MC[i,:,k,:,:])
#        if mean_en > m_run:
#            m_run = mean_en
#            max_xij = (i,k)
#        
#print(g[i],g[k])
#print(np.mean(grid_MC[i,:,k,:,:]))

In [9]:
#sample some paths even split between x_1 and x_2

g = np.arange(0,1.1,0.1)
s_vals = np.arange(-2.0,2.0,0.1)
ramp_const = 0.3

N = 5000
half = int(N/2)

best_responses = []

X = np.zeros((4, N))

Y = np.zeros((N,))

ind = np.arange(0,N,1)

for i in list(ind[0:half]):
    x_1_i = np.random.randint(0,11)
    x_1 = g[x_1_i]
    X[0,i] = x_1  #previous state

    s_1_i = np.random.randint(s_vals.shape[0])
    X[1,i] = s_vals[s_1_i] #maximum noise seen so far

    X[2,i] = 2 #rounds left to go
    
    X[3,i] = np.random.uniform(0,1) #linear bias term
    
    best_response_i = np.argmax(grid_MC[x_1_i, s_1_i, :])#pick best x_2 that maximizes expected reward given x_1, s_1
    
    #max expected reward independent of noise
    #get all choices of x_2, x_3 with respect to noise s_1
    best_reward = 0.0
    best_x_2_i = 0
    outcomes = grid_MC[x_1_i,s_1_i,:,:,:]
    for x_2_i in range(g.shape[0]):
        x_2 = g[x_2_i]
        rewards_over_s_2 = []
        if np.abs(x_2 - x_1) > ramp_const:
            pass
        else:
            for s_2_i in range(s_vals.shape[0]):
                for x_3_i in range(g.shape[0]):
                    x_3 = g[x_3_i]
                    if np.abs(x_3 - x_2) > ramp_const:
                        pass
                    else:
                        rewards_over_s_2.append(grid_MC[x_1_i, s_1_i, x_2_i, s_2_i, x_3_i])
        expected_reward = np.mean(rewards_over_s_2)
        if expected_reward > best_reward:
            best_reward = expected_reward
            best_x_2_i = x_2_i
                
    Y[i] = g[best_x_2_i]
    
for i in list(ind[half:]):
    x_2_i = np.random.randint(0,11) #ignore values you'd never start from
    X[0,i] = g[x_2_i]  #previous state

    s_1_i = np.random.randint(s_vals.shape[0])
    s_2_i = np.random.randint(s_vals.shape[0])
    X[1,i] = np.max([s_vals[s_1_i], s_vals[s_2_i]]) #maximum noise seen so far, want to reflect true s_m distribution shape so include iid samples from previous rounds

    X[2,i] = 1 #rounds left to go
    
    X[3,i] = np.random.uniform(0,1) #linear bias term
    
    best_response_i = np.argmax(grid_MC[x_1_i, s_1_i, x_2_i, s_2_i, :])#pick best x_3 that maximizes expected reward given x_1, s_1, x_2, s_2
    Y[i] = g[best_response_i]

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [10]:
train_X = X[:,0:int(0.8*X.shape[1])]
train_Y = Y[0:int(0.8*Y.shape[0])]
val_X = X[:,int(0.8*X.shape[1]):]
val_Y = Y[int(0.8*Y.shape[0]):]

In [28]:
params = {'FEATURE_DIM': 4, 'OUTPUT_DIM': 1, 'HIDDEN_1': 4}

net3 = nnet(params)#.cuda()
criterion = nn.MSELoss()
optimizer = optim.SGD(net3.parameters(),lr=0.1, momentum=0.9)

In [29]:
train_loss, val_loss = train(net3, criterion, optimizer, train_X, train_Y, val_X, val_Y, batchsize=100, epochs=1000, verbose=False)

Training


In [59]:
#how does the approx value function perform against naive?
T = 3
x_bar = 1
pi_cp_perc = 0.6
pi_cp = 0.6*T*log_utility(x_bar)

naive_reward = T*log_utility(x_bar) - pi_cp*x_bar

naive_opts = log_utility_opt_x(T, pi_cp)
print(naive_opts)

nopt_reward = T*log_utility(naive_opts[1]) - pi_cp *naive_opts[1]

print(naive_reward)
print(nopt_reward)

(4.5911742684245525, 0.2178092012053261)
1.6635532333438685
1.7545854381679407


In [32]:
#how does the approx value function perform against naive?
MC = 100

winloss = []
rewards = []
best_responses = []
gaps = []
cvs = 0


######need to recurse on net()######################################
x_1 = 0.7
for i in range(MC):
    noises = []
    plays = []
    x_curr = x_1
    for i in range(T):

        t = i+1 #we've already set x_1
        
        si = np.random.randint(0,s_vals.shape[0])
        s = s_vals[si]
        noises.append(s)
        
        sm = np.max(noises) #max so far
    
        inputvec = torch.Tensor(np.array([x_curr, sm, T-t, 1.0])).unsqueeze(1).transpose(1,0)
    
        br = net3(inputvec)
        best_response = float(br.data[0,0])
        
        if best_response > x_bar:
            best_response = x_bar
        if best_response < 0.0:
            best_response = 0.0
        
        if np.abs(x_1 - best_response) > ramp_const:
            cvs += 1

        #attach previous play
        plays.append(x_curr)
        x_curr = best_response #update play
    
    reward = log_utility(np.array(plays)) - pi_cp*plays[np.argmax(noises)]
    rewards.append(reward)
    if reward > nopt_reward:
        winloss.append(1)
    else:
        winloss.append(0)
    gaps.append(reward - nopt_reward)
        
print("win/loss perc: " + str(np.mean(winloss)))
print("expected utility gap: " + str(np.mean(gaps)))
print("constraint violation perc: " + str(cvs/MC))

win/loss perc: 0.67
expected utility gap: 0.17411611076800748
constraint violation perc: 1.62


In [49]:
net.l1.weight.data

tensor([[ 0.0299,  0.5195, -0.7092, -0.7588],
        [ 0.3733,  1.8390, -0.5340, -0.5034],
        [-2.2650,  1.0337, -0.9802, -0.2564],
        [ 1.8102,  0.1315, -0.6048,  0.0329]])

In [33]:
net3.l1.weight.data

tensor([[ 4.9488,  1.1332, -2.3304, -0.1350],
        [13.9292,  0.3053, -6.9772, -0.0780],
        [-2.1691, -0.2620, -0.7883, -0.1084],
        [-5.3243,  1.2536, -1.2958,  0.1740]])

In [50]:
net.l2.weight.data

tensor([[ 0.0003,  0.5817, -1.4422,  1.0523]])

In [34]:
net3.l2.weight.data

tensor([[ 0.6252, -1.0269, -0.9762, -0.8576]])

In [None]:
#need to normalize rounds_left_to_go input data
#parameter tuning, training is sensitive
#call this function a policy since its a mapping from state to action

In [58]:
x_1 = 0.7
ramp_const = 0.3
g[(g > x_1 - ramp_const) & (g < x_1 + ramp_const)]
x_2_i = np.random.choice(g[(g > x_1 - ramp_const) & (g < x_1 + ramp_const)])
print(x_2_i)

0.9


In [6]:
#forward simulate, take the best result
#how does the approx value function perform against naive?
T = 3
x_bar = 1
pi_cp_perc = 0.6
pi_cp = 0.6*T*log_utility(x_bar)

naive_reward = T*log_utility(x_bar) - pi_cp*x_bar

naive_opts = log_utility_opt_x(T, pi_cp)
print(naive_opts)

nopt_reward = T*log_utility(naive_opts[1]) - pi_cp *naive_opts[1]

print(naive_reward)
print(nopt_reward)

#sample some paths even split between x_1 and x_2

g = np.arange(0,1.1,0.1)
s_vals = np.arange(-2.0,2.0,0.1)
ramp_const = 0.3

N = 5000
half = int(N/2) #divide up by number of possible initial states, x_1, x_2 for T = 3

best_responses = []

X = np.zeros((4, N))

Y = np.zeros((N,))

ind = np.arange(0,N,1)

sim_MC = 100

for i in list(ind[0:half]):
    x_1_i = np.random.randint(0,11)
    x_1 = g[x_1_i]
    X[0,i] = x_1  #previous state

    s_1_i = np.random.randint(s_vals.shape[0])
    s_1 = s_vals[s_1_i]
    X[1,i] = s_1 #maximum noise seen so far

    X[2,i] = 2 #rounds left to go
    
    X[3,i] = np.random.uniform(0,1) #linear bias term

    #forward simulate x_2, x_3
    best_x_2_i = 0
    best_reward = 0.0
    for x_2_i in range(g.shape[0]):
        x_2 = g[x_2_i]
        if np.abs(x_2 - x_1) > ramp_const:
            pass
        else:
            for s in range(sim_MC):
                g_const = g[(g > x_2 - ramp_const) & (g < x_2 + ramp_const)]
                x_3 = np.random.choice(g_const)
                s_2 = np.random.choice(s_vals)
                s_3 = np.random.choice(s_vals)
                plays = np.array([x_1, x_2, x_3])
                noises = np.array([s_1, s_2, s_3])

                sim_reward = log_utility(plays) - pi_cp*plays[np.argmax(noises)] 
                if sim_reward > best_reward:
                    best_reward = sim_reward
                    best_x_2 = x_2
                
    Y[i] = best_x_2
    
for i in list(ind[half:]):
    x_2_i = np.random.randint(0,11)
    x_2 = g[x_2_i]
    x_1 = np.random.choice(g[(g > x_2 - ramp_const) & (g < x_2 + ramp_const)])
    X[0,i] = x_2  #previous state

    s_1_i = np.random.randint(s_vals.shape[0])
    s_1 = s_vals[s_1_i]
    s_2_i = np.random.randint(s_vals.shape[0])
    s_2 = s_vals[s_2_i]
    X[1,i] = np.max([s_1, s_2]) #maximum noise seen so far, want to reflect true s_m distribution shape so include iid samples from previous rounds

    X[2,i] = 1 #rounds left to go
    
    X[3,i] = np.random.uniform(0,1) #linear bias term
    
    g_const = g[(g > x_2 - ramp_const) & (g < x_2 + ramp_const)]
    best_x_3 = 0
    best_reward = 0.0
    for s in range(sim_MC):
        x_3 = np.random.choice(g_const)
        s_3 = np.random.choice(s_vals)
        
        noises = np.array([s_1, s_2, s_3])
        plays = np.array([x_1, x_2, x_3])
    
        sim_reward = log_utility(plays) - pi_cp*plays[np.argmax(noises)]
        if sim_reward > best_reward:
            best_reward = sim_reward
            best_x_3 = x_3
    Y[i] = best_x_3

(4.5911742684245525, 0.2178092012053261)
1.6635532333438685
1.7545854381679407


In [7]:
train_X = X[:,0:int(0.8*X.shape[1])]
train_Y = Y[0:int(0.8*Y.shape[0])]
val_X = X[:,int(0.8*X.shape[1]):]
val_Y = Y[int(0.8*Y.shape[0]):]

In [8]:
params = {'FEATURE_DIM': 4, 'OUTPUT_DIM': 1, 'HIDDEN_1': 4}

net3 = nnet(params)#.cuda()
criterion = nn.MSELoss()
optimizer = optim.SGD(net3.parameters(),lr=0.1, momentum=0.9)

In [9]:
train_loss, val_loss = train(net3, criterion, optimizer, train_X, train_Y, val_X, val_Y, batchsize=100, epochs=1000, verbose=False)

Training


In [14]:
#how does the approx value function perform against naive?
MC = 100

winloss = []
rewards = []
best_responses = []
gaps = []
cvs = 0


######need to recurse on net()######################################
x_1 = 0.7
for j in range(MC):
    noises = []
    plays = []
    x_curr = x_1
    for i in range(T):

        t = i+1 #we've already set x_1
        
        si = np.random.randint(0,s_vals.shape[0])
        s = s_vals[si]
        noises.append(s)
        
        sm = np.max(noises) #max so far
    
        inputvec = torch.Tensor(np.array([x_curr, sm, T-t, 1.0])).unsqueeze(1).transpose(1,0)
    
        br = net3(inputvec)
        best_response = float(br.data[0,0])
        
        if best_response > x_bar:
            best_response = x_bar
        if best_response < 0.0:
            best_response = 0.0
            
        #snap to nearest constraint
        if np.abs(x_curr - best_response) > ramp_const:
            if best_response > x_curr:
                best_response = x_curr + ramp_const
            elif best_response < x_curr:
                best_response = x_curr - ramp_const
        
        #if np.abs(x_1 - best_response) > ramp_const:
        #    cvs += 1

        #attach previous play
        plays.append(x_curr)
        x_curr = best_response #update play
    
    reward = log_utility(np.array(plays)) - pi_cp*plays[np.argmax(noises)]
    rewards.append(reward)
    if reward > nopt_reward:
        winloss.append(1)
    else:
        winloss.append(0)
    gaps.append(reward - nopt_reward)
        
print("win/loss perc: " + str(np.mean(winloss)))
print("expected utility gap: " + str(np.mean(gaps)))
print("constraint violation perc: " + str(cvs/MC))

win/loss perc: 0.84
expected utility gap: 0.17209520052769747
constraint violation perc: 0.0


In [None]:
#can it outperform a concentration-inequality based method on E[s_max]?

#to do
#refactor above simulation code for arbitrary T, divide samples amongst x_t evenly
#determine expected utility for T = 2, 10 (50?) 
#compare to naive opt for T = 2, 10 (50?)
#compare to grid search for T = 2, 3, 4