In [1]:
import gym
import gym_interf
import cv2

env = gym.make('interf-v1')

class Wrapper(gym.ObservationWrapper):
    def __init__(self, env):
        type(env).n_points = 256
        type(env).x_min = -4
        type(env).x_max = 4
        super().__init__(env)
    
    
    def observation(self, state):
        result = np.ndarray(shape=(16,64,64), dtype=np.uint8)
        for i, image in enumerate(state):
            resized = cv2.resize(image, (64, 64))
            result[i] = resized
        return result
    
#env = Wrapper(env)


In [2]:
%matplotlib inline
import math
import random
import tqdm
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque
from itertools import count
from copy import deepcopy
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display    
print("Is python : {}".format(is_ipython))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device : {}".format(device))


ACTIONS_NUM = 8
print("Number of actions : {}".format(ACTIONS_NUM))

Is python : True
Device : cuda
Number of actions : 8


In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:

    def __init__(self, capacity = 40000):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
class DQN(nn.Module):
    def __init__(self, in_channels=16, num_actions=ACTIONS_NUM):
        
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.conv4 = nn.Conv2d(64,1024,kernel_size=4,stride=1)
        self.advantage = nn.Linear(512, num_actions)
        self.value = nn.Linear(512, 1)

    def forward(self, x):
        x = x.float() / 255
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        advantage,value = torch.split(x,512,dim=1)
        
        advantage = advantage.view(advantage.shape[0],-1)
        value = value.view(value.shape[0],-1)
        
        advantage = self.advantage(advantage)
        value = self.value(value)
        q_value = value.expand(value.shape[0],ACTIONS_NUM) +\
        advantage-torch.mean(advantage,dim=1).unsqueeze(1).expand(advantage.shape[0],ACTIONS_NUM)
        return q_value

In [5]:
policy_net = DQN().to(device)

#state = torch.load('drive/My Drive/colab/policy_net2')
#policy_net.load_state_dict(state)
policy_net.eval()

target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer =optim.Adam(policy_net.parameters(),lr=1e-5)

memory = ReplayMemory()

def select_action(state, eps_threshold):
    global steps_done
    sample = random.random()
    if sample > eps_threshold:
        with torch.no_grad():
            state=state.float()
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(ACTIONS_NUM)]], device=device, dtype=torch.long)

train_rewards = []

mean_size = 100
mean_step = 1

def plot_rewards(rewards = train_rewards, name = "Train"):
    plt.figure(2)
    plt.clf()
    plt.title(name)
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(rewards)
    if len(rewards) > mean_size:
        means = np.array([rewards[i:i+mean_size:] for i in range(0, len(rewards) - mean_size, mean_step)]).mean(1)
        means = np.concatenate((np.zeros(mean_size - 1), means))
        plt.plot(means)

In [6]:
BATCH_SIZE = 32
GAMMA = 0.99

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    
    
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.uint8)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    
    state_batch = torch.cat(batch.state)
    
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    
    state_batch=state_batch.float()
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    
    
    non_final_next_states=non_final_next_states.float()
    next_state_values = torch.zeros((BATCH_SIZE,1), device=device)
    next_state_actions = torch.zeros(BATCH_SIZE,dtype=torch.long, device=device)
    
    next_state_actions[non_final_mask] = policy_net(non_final_next_states).max(1)[1]
    next_state_values[non_final_mask] = target_net(non_final_next_states).gather(1, next_state_actions[non_final_mask].unsqueeze(1))
    next_state_values=next_state_values.squeeze(1)
    
    
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    
    
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1).detach())
    
    
    
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    
    del non_final_mask
    del non_final_next_states
    del state_batch
    del action_batch
    del reward_batch
    del state_action_values
    del next_state_values
    del expected_state_action_values
    del loss

In [7]:
from tqdm import trange

def evaluate(n_games=10):
    n_solved = 0
    n_steps = 0
    dist = 0
    angle = 0
    delta_visib = 0
    for i in range(n_games):
        state = env.reset()
        istep = 0
        visib = env.visib
        while(True):
            state = np.array(state,dtype=np.float32)
            state = torch.tensor(state, dtype=torch.float32, device=device)
            state = state.unsqueeze(0)
            action = select_action(state, 0)
            state, reward, done, info = env.step(action)
            istep += 1
            if done:
                n_solved += istep < 200
                n_steps += istep
                dist += info['dist']
                angle += info['angle_between_beams']
                delta_visib = env.visib - visib
                break
    return n_solved / n_games, n_steps / n_games, dist / n_games, angle / n_games, delta_visib / n_games

In [None]:
%matplotlib inline

from matplotlib import pyplot as plt
from IPython.display import clear_output

from tensorboardX import SummaryWriter

writer = SummaryWriter('../a2c/runs/dqn_arsen')


mean_rewards = []
mean_steps = []
eval_solved_games = []
mean_eval_steps = []
n_frames = []



NUM_EPISODES = 200000


OPTIMIZE_MODEL_STEP = 4


TARGET_UPDATE=10000



STEPS_BEFORE_TRAIN = 30000



EPS_START = 1
EPS_END = 0.1
EPS_DECAY = 1000000

EPS_START_v2 = 0.1
EPS_END_v2 = 0.01

policy_net.train()
target_net.eval()
test_rewards = []


steps_done = 0

for e in range(NUM_EPISODES):

    state = env.reset() 
    state = torch.tensor(state, dtype=torch.uint8, device=device)
    state = state.unsqueeze(0)
    ep_rewards=0
    
    for t in range(180000):

        if steps_done<EPS_DECAY:
            if steps_done>STEPS_BEFORE_TRAIN:
                fraction=min(float(steps_done)/EPS_DECAY,1)
                eps_threshold= EPS_START + (EPS_END - EPS_START) * fraction
                action = select_action(state,eps_threshold)
            else:
                action=torch.tensor([[random.randrange(ACTIONS_NUM)]], device=device, dtype=torch.long)
        
        else:
            fraction=min(float(steps_done)/2*EPS_DECAY,1)
            eps_threshold= EPS_START_v2 + (EPS_END_v2 - EPS_START_v2) * fraction
            action = select_action(state,eps_threshold)
            
            
        
        next_state, reward, done,info = env.step(action.item())
        ep_rewards += reward
        
        next_state = torch.tensor(next_state,dtype=torch.uint8,device=device)
        next_state = next_state.unsqueeze(0)
        
        
        reward = torch.tensor([reward], device=device)
        if not done:
            memory.push(state, action,next_state, reward)
        else:
            next_state=None
            memory.push(state, action,next_state, reward)  
              
        steps_done+=1
        state=next_state
    
        if (steps_done > STEPS_BEFORE_TRAIN) and steps_done % OPTIMIZE_MODEL_STEP == 0:
            optimize_model()
        

        if steps_done % TARGET_UPDATE == 0:
            print("Target net updated!")
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(),'curr_policy_net')
            
        if steps_done % 10**4 == 0:
            policy_net.eval()
            n_solved, n_steps, dist, angle, delta_visib = evaluate(n_games=10)
        
            writer.add_scalar('eval_solved_games', n_solved, steps_done)
            writer.add_scalar('eval_steps', n_steps, steps_done)
            writer.add_scalar('dist_between_beams', dist, steps_done)
            writer.add_scalar('angle_between_beams', angle, steps_done)
            writer.add_scalar('reward', delta_visib, steps_done)
            policy_net.train()
            
            print('---- steps_done {} ----- Number of steps needed {} --- delta_visib = {}'.format(
                steps_done, n_steps, delta_visib)
            )
        
        if done:
            break


Target net updated!
---- steps_done 10000 ----- Number of steps needed 200.0 --- delta_visib = -0.0841806750543028


  band_width_y = InterfEnv.lamb / abs(wave_vector2[1])


Target net updated!
---- steps_done 20000 ----- Number of steps needed 200.0 --- delta_visib = -0.058427625671386055
Target net updated!
---- steps_done 30000 ----- Number of steps needed 200.0 --- delta_visib = -0.06179509777820136
Target net updated!
---- steps_done 40000 ----- Number of steps needed 200.0 --- delta_visib = -0.03885810518591383
Target net updated!
---- steps_done 50000 ----- Number of steps needed 200.0 --- delta_visib = -0.047975155422429014
Target net updated!
---- steps_done 60000 ----- Number of steps needed 200.0 --- delta_visib = -0.024277376148961527
Target net updated!
---- steps_done 70000 ----- Number of steps needed 200.0 --- delta_visib = -0.0249659298361983
Target net updated!
---- steps_done 80000 ----- Number of steps needed 200.0 --- delta_visib = 0.02316129006835333
Target net updated!
---- steps_done 90000 ----- Number of steps needed 180.8 --- delta_visib = 0.050301202946648506
Target net updated!
---- steps_done 100000 ----- Number of steps needed

Target net updated!
---- steps_done 730000 ----- Number of steps needed 162.8 --- delta_visib = 0.06847457323802779
Target net updated!
---- steps_done 740000 ----- Number of steps needed 143.0 --- delta_visib = 0.024583854977130205
Target net updated!
---- steps_done 750000 ----- Number of steps needed 11.4 --- delta_visib = 0.039292285517214844
Target net updated!
---- steps_done 760000 ----- Number of steps needed 180.4 --- delta_visib = 0.059555890364553374
Target net updated!
---- steps_done 770000 ----- Number of steps needed 29.8 --- delta_visib = 0.061481285745713335
Target net updated!
---- steps_done 780000 ----- Number of steps needed 181.6 --- delta_visib = 0.09221942143940902
Target net updated!
---- steps_done 790000 ----- Number of steps needed 125.2 --- delta_visib = 0.05408246121507303
Target net updated!
---- steps_done 800000 ----- Number of steps needed 142.4 --- delta_visib = 0.03923203892281855
Target net updated!
---- steps_done 810000 ----- Number of steps neede

Target net updated!
---- steps_done 1440000 ----- Number of steps needed 106.4 --- delta_visib = 0.021624555679776402
Target net updated!
---- steps_done 1450000 ----- Number of steps needed 9.2 --- delta_visib = 0.010323458420070075
Target net updated!
---- steps_done 1460000 ----- Number of steps needed 11.2 --- delta_visib = 0.07355027546974294
Target net updated!
---- steps_done 1470000 ----- Number of steps needed 11.2 --- delta_visib = 0.023302275724470422
Target net updated!
---- steps_done 1480000 ----- Number of steps needed 12.4 --- delta_visib = 0.02339532349143274
Target net updated!
---- steps_done 1490000 ----- Number of steps needed 10.8 --- delta_visib = 0.02458675120585794
Target net updated!
---- steps_done 1500000 ----- Number of steps needed 9.0 --- delta_visib = 0.011499845826673571
Target net updated!
---- steps_done 1510000 ----- Number of steps needed 87.4 --- delta_visib = 0.011009640420571353
Target net updated!
---- steps_done 1520000 ----- Number of steps ne

Target net updated!
---- steps_done 2150000 ----- Number of steps needed 13.2 --- delta_visib = 0.023705081716869948
Target net updated!
---- steps_done 2160000 ----- Number of steps needed 11.4 --- delta_visib = 0.015393843233729465
Target net updated!
---- steps_done 2170000 ----- Number of steps needed 10.4 --- delta_visib = 0.04964933445808878
Target net updated!
---- steps_done 2180000 ----- Number of steps needed 87.8 --- delta_visib = 0.015559765000257164
Target net updated!
---- steps_done 2190000 ----- Number of steps needed 10.2 --- delta_visib = 0.06856790866855604
Target net updated!
---- steps_done 2200000 ----- Number of steps needed 11.6 --- delta_visib = 0.06842886229751063
Target net updated!
---- steps_done 2210000 ----- Number of steps needed 11.6 --- delta_visib = 0.011464107805857804
Target net updated!
---- steps_done 2220000 ----- Number of steps needed 11.8 --- delta_visib = 0.09457490458173332
Target net updated!
---- steps_done 2230000 ----- Number of steps ne

Target net updated!
---- steps_done 2860000 ----- Number of steps needed 12.4 --- delta_visib = 0.07158688125763747
Target net updated!
---- steps_done 2870000 ----- Number of steps needed 11.0 --- delta_visib = 0.07969799693047815
Target net updated!
---- steps_done 2880000 ----- Number of steps needed 13.0 --- delta_visib = 0.055163356221892945
Target net updated!
---- steps_done 2890000 ----- Number of steps needed 8.0 --- delta_visib = 0.011510449674239965
Target net updated!
---- steps_done 2900000 ----- Number of steps needed 14.6 --- delta_visib = 0.0693036526524761
Target net updated!
---- steps_done 2910000 ----- Number of steps needed 11.2 --- delta_visib = 0.05719304336188151
Target net updated!
---- steps_done 2920000 ----- Number of steps needed 11.2 --- delta_visib = 0.08177500872422945
Target net updated!
---- steps_done 2930000 ----- Number of steps needed 13.0 --- delta_visib = 0.00284211649280357
Target net updated!
---- steps_done 2940000 ----- Number of steps needed

In [None]:
torch.save(policy_net.state_dict(),'policy_net')

In [None]:
!kill -9 -1

In [None]:
# TEST_EPS = 0.005
 
dist_all = []
action_all = []
visib_all = []
steps = []
TEST_EPS = 0.0
env.reset_actions = 1000
env.max_steps = 200
for _ in range(100):
  state = env.reset()
  total_reward = 0
  for i in count():
    state = np.array(state,dtype=np.float32)
    state = torch.tensor(state, dtype=torch.float32, device=device)
    state = state.unsqueeze(0)
    action = select_action(state, TEST_EPS)
    state, _, done, info = env.step(action)
    reward = -1.0+info['visib']
    total_reward+=reward
    # dist_all.append(info['dist'])
    # visib_all.append(info['visib'])
    # action_all.append(action)
    if done:
        steps.append(i)
        break
    

In [None]:
steps = np.array(steps)
steps[steps<100].shape

In [None]:
np.sort(steps)

In [None]:
steps = []
dist_all = []


In [None]:
# TEST_EPS = 0.005
 
action_all = []
visib_all = []
TEST_EPS = 0.0
env.reset_actions = 5000
env.max_steps = 200
state = env.reset()
total_reward = 0
for i in count():
  state = np.array(state,dtype=np.float32)
  state = torch.tensor(state, dtype=torch.float32, device=device)
  state = state.unsqueeze(0)
  action = select_action(state, TEST_EPS)
  state, _, done, info = env.step(action)
  reward = -1.0+info['visib']
  total_reward+=reward
  if i == 0:
    dist_all.append(info['dist'])
  visib_all.append(info['visib'])
  action_all.append(action)
  if done:
    
    steps.append(i)
    break

In [None]:
plt.scatter(steps,dist_all)

In [None]:
plt.plot(dist_all)

In [None]:
plt.plot(visib_all)

In [None]:
visib_all[-1]