# Creating a simulation environment for the Contextual Bandit problem

### To-Do List:
- [ ] Create the basic environment
- [ ] Get the DQN to train on the basic environment
- [ ] Incorporate the ABAQUS simulator to calculate deformation values

The idea is to create an environment that functions as a stochastic non-stationary contextual bandit (similar to the one used in the `sspe.NonStationaryStochasticBandit` environment from Tensorflow). In a non-stationary environment, the reward distributions are perceived as random distributions that depend on the observation received from the environment $r_t \sim f(a_t, \mathcal{O})$.

In [39]:
import gym
import numpy as np
from gym import spaces, utils, error
from gym.utils import seeding
from sympy import symbols

gym.__version__

'0.21.0'

The general idea for this environment is as follows: the agent is presented with a context (the hole that is going to be drilled) and then chooses a single action* in the form of a fixture location. The algorithm would then return a value for the deformation in two directions which is then fed into a normal distribution that returns the reward. 

In [40]:
class ContextBandit(gym.Env):
    
    def __init__(self, contexts, actions):
        
        low = np.array([
            0,0,0,0
        ]).astype(np.float32)
        
        high = np.array([
            10, 10, 10, 100
        ]).astype(np.float32)
        
        self.viewer = None
        self.num_contexts = len(contexts)
        self.num_actions = len(actions)
        
        self.action_space = spaces.Discrete(self.num_actions)
        self.observation_space = spaces.Box(low, high)
        
        self._seed()
        self._reset()
        self.xvec, self.zvec = symbols('x z')
        self.a = 1
        self.b = 1
        self.c = 0
        
    def _reset(self):
        self.state = np.random.randint(self.num_contexts)
        return np.array(self.state)
    
    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    
    def _render(self, mode='human', close=False):
        pass
    
    def step(self, action):
        
        assert self.action_space.contains(action)
        
        reward = 0
        done = True
        observation = self._get_observation(action, self.state)
        
        reward = self._get_reward(observation)
        
        if self.state == 0:
            self.state = 1
        else:
            self.state = 0
            
        return np.array(self.state), reward, done, {}
    
    def _get_observation(self, action, context):
        """
        Observation formulation: 
         - x_def
         - y_def
         - z_def
         - von mises
        """
        return np.random.randint(low=0, high=10, size=4)
    
    def _get_reward(self, observation):
        
        x = observation[0]
        z = observation[2]
        
        R = (1/(1 + ((self.xvec-self.c)/self.a)**(2*self.b)) + 1/(1 + ((self.zvec-self.c)/self.a)**(2*self.b)))/2
        return R.subs(self.xvec, x).subs(self.zvec, z)
    
context = np.random.randint(low=0, high=16, size=15)
action = np.asarray([1, 2, 3, 4, 5])
env = ContextBandit(context, action)
env.step(2)

(array(0), 43/884, True, {})

In [42]:
x = symbols('x')
y = symbols('y')
a = 1
b = 1
c = 0
R = (1/(1 + ((x-c)/a)**(2*b)) + 1/(1 + ((y-c)/a)**(2*b)))/2
R.subs(x, 0.01).subs(y, 5)

0.519180774230269