# Carlos Guzman
## AI2 Final Project 
## TD Actor-Critic (TD-AC)

In [None]:
import time
import random
import serial

import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

from CSI_Camera import CSI_Camera
from collections import namedtuple, deque

import torch
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim


In [None]:
class Environment(object):
    """Camera-based environment that turns frames into states and rewards.

    The environment owns a ``CSI_Camera`` and remembers the previously
    captured frame so that ``reward`` can compare consecutive frames.
    Actions are integer indices translated through ``ACTION_MAP`` into
    single-character commands: 's' (stop), 'f'/'b' (forward/backward) and
    'l'/'r' (turn left/right).
    """

    def __init__(self):
        self.camera = CSI_Camera(1)
        # Number of consecutive steps the current action has been repeated;
        # it scales the stop penalty and the forward/backward reward.
        self.time_last_action = 0
        # Command character of the previous step (None before the first step).
        self.last_action = None
        # Raw frame of the previous step, compared against the new frame.
        self.last_frame = None
        self.spin_pen = 0  # not read by any method of this class
        self.ACTION_MAP = {0: 's', 1: 'f', 2: 'b', 3: 'l', 4: 'r'}

    def reset(self):
        """Start an episode: make sure the camera runs and return the first state.

        Returns:
            The state tensor produced by ``get_state`` for the first frame.
        """
        # Open and start the camera unless it is already open and running.
        if not self.camera.isRunning() or not self.camera.isOpened():
            self.camera.open()
            self.camera.start()

        # Forget everything about the previous episode.
        self.time_last_action = 0
        self.last_action = None

        # First element of the pair returned by read() is ignored.
        # NOTE(review): presumably a success flag - confirm against CSI_Camera.
        _, frame = self.camera.read()

        self.last_frame = frame
        return self.get_state(frame)

    def close(self):
        """Stop and release the camera if it is currently opened."""
        # The guard used to be inverted (``not isOpened()``), which meant an
        # open camera was never stopped or released.
        if self.camera.isOpened():
            self.camera.stop()
            self.camera.release()

    def get_state(self, frame):
        """Convert a camera frame into a network input.

        The frame is resized to 240x240 with bicubic interpolation, converted
        with torchvision's ``ToTensor`` and given a leading batch dimension.

        Args:
            frame: Image as returned by the camera.
                NOTE(review): assumed to be an HxWx3 uint8 array - confirm.

        Returns:
            Tensor with a leading batch dimension of size 1.
        """
        # frame = cv.cvtColor(frame, cv.COLOR_BGR2GRAY, dst=None)
        frame = cv.resize(frame, (240, 240), dst=None, interpolation=cv.INTER_CUBIC)
        state = T.ToTensor()(frame).unsqueeze(0)
        return state

    def step(self, action):
        """Observe the outcome of ``action`` and compute its reward.

        Args:
            action: Integer key of ``ACTION_MAP``.

        Returns:
            Tuple ``(state, reward, collision)``: the new state tensor, the
            numeric reward and a bool telling whether a collision was detected.

        Raises:
            RuntimeError: Re-raised after printing the error message. The
                error used to be swallowed, so ``None`` was returned and the
                caller failed later while unpacking the result.
        """
        try:
            action = self.ACTION_MAP[action]
            # A different action restarts the repetition counter.
            if self.last_action != action:
                self.time_last_action = 1

            # Read new frame from camera
            _, frame = self.camera.read()
            state = self.get_state(frame)

            # Reward of the last (state, action); needs the previous frame,
            # so it must run before last_frame is overwritten below.
            reward, collision = self.reward(action, frame)

            # Accumulate the time spent executing the same action.
            self.time_last_action += 1

            # Update last_action and last_frame
            self.last_action = action
            self.last_frame = frame

            return state, reward, collision
        except RuntimeError:
            print("Error: Camera not opened.")
            raise

    def reward(self, action, frame):
        """Compute the reward for ``action`` given the newly captured frame.

        For every action except 's' the hue/saturation histograms of the
        previous and the current frame are compared; a correlation of at
        least 0.99 is treated as a collision and yields -10. Otherwise:

        * 's' gives ``max(-5, -time_last_action)``,
        * 'l' / 'r' give 1,
        * 'f' / 'b' give ``min(10, 2 * time_last_action)``.

        Args:
            action: Command character (a value of ``ACTION_MAP``).
            frame: Current BGR frame; only used when ``action`` is not 's'.

        Returns:
            Tuple ``(reward, collision)``.
        """
        collision = False
        # if self.last_action == action:
        if action != 's':
            # Detect a collision by comparing the histograms of the current
            # frame and the last frame: the robot was told to move, so a
            # nearly unchanged image is taken as being stuck on a barrier.

            # Convert to HSV color space
            last_frame_hsv = cv.cvtColor(self.last_frame, cv.COLOR_BGR2HSV)
            frame_hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)

            # Histogram parameters: 50 bins over [0, 180) for channel 0 and
            # 60 bins over [0, 256) for channel 1 of the HSV image.
            hist_size = [50, 60]
            ranges = [0, 180, 0, 256]
            channels = [0, 1]

            # Calculate the histogram of the last frame and normalize it.
            hist_last_frame = cv.calcHist([last_frame_hsv], channels, None, hist_size, ranges, accumulate=False)
            cv.normalize(hist_last_frame, hist_last_frame, alpha=0, beta=1, norm_type=cv.NORM_MINMAX)

            # Calculate the histogram of the current frame and normalize it.
            hist_frame = cv.calcHist([frame_hsv], channels, None, hist_size, ranges, accumulate=False)
            cv.normalize(hist_frame, hist_frame, alpha=0, beta=1, norm_type=cv.NORM_MINMAX)

            # Correlation coefficient between the histograms (named constant
            # instead of the bare method id 0).
            correlation = cv.compareHist(hist_last_frame, hist_frame, cv.HISTCMP_CORREL)
            # A correlation of 0.99 or more is treated as a collision.
            if correlation >= 0.99:
                collision = True
                reward = -10
        if not collision:
            if action == 's':
                # Penalty for stopping grows with repetition, capped at -5.
                reward = max(-5, -1 * self.time_last_action)
            elif action in ['l', 'r']:
                reward = 1
            elif action in ['f', 'b']:
                # Going forward or backwards: grows with repetition, capped at 10.
                reward = min(10, 2 * self.time_last_action)
        return reward, collision

In [None]:
class ActorLoss(nn.Module):
    """Policy-gradient loss: the mean of ``-log_prob * advantage``."""

    def __init__(self):
        super().__init__()

    def forward(self, action_log_probs, advantage):
        """Return the averaged negative advantage-weighted log-probability.

        Args:
            action_log_probs: Log-probability of the chosen action(s).
            advantage: TD advantage weighting each log-probability.

        Returns:
            Scalar tensor ``mean(-action_log_probs * advantage)``.
        """
        weighted = action_log_probs * advantage
        return -weighted.mean()
        
class ActorCriticLoss(nn.Module):
    """Combined one-step TD actor-critic loss with an entropy bonus.

    ``loss = actor_loss + critic_loss - beta * policy_entropy`` where the
    actor loss is the advantage-weighted negative log-probability and the
    critic loss is the Huber loss between the value estimate and the TD
    target ``reward + discount * next_value``.
    """

    def __init__(self):
        super(ActorCriticLoss, self).__init__()
        self.actor_criterion = ActorLoss()
        self.critic_criterion = nn.HuberLoss(reduction='mean')

    def forward(self, action_prob, next_value, value, reward, policy_entropy, beta=0.001, discount=0.99):
        """Compute the loss for one transition.

        Args:
            action_prob: Probability the policy assigned to the chosen action.
            next_value: Critic estimate for the next state (tensor).
            value: Critic estimate for the current state (tensor).
            reward: Reward received for the transition.
            policy_entropy: Entropy of the policy distribution.
            beta: Weight of the entropy bonus.
            discount: Discount factor applied to ``next_value``.

        Returns:
            Scalar tensor with the total loss to minimize.
        """
        # TD target (state-action value). It is a fixed regression target, so
        # no gradient may flow through it.
        q_value = (reward + discount * next_value).detach()
        # TD advantage. Detached so the actor term only updates the policy;
        # otherwise its gradient would also push on the critic's value output.
        advantage = (q_value - value).detach()
        # Log-probability of the chosen action.
        action_log_probs = torch.log(action_prob)

        # Actor loss (advantage-weighted negative log-probability)
        loss_actor = self.actor_criterion(action_log_probs, advantage).to(torch.float)
        # Critic loss (Huber) between the estimate and the TD target
        loss_critic = self.critic_criterion(value, q_value).to(torch.float)
        # Entropy regularization. The loss is minimized, so the entropy must
        # be SUBTRACTED to reward high-entropy (exploratory) policies; adding
        # it, as was done before, penalized exploration instead.
        regularizer = (policy_entropy * beta)
        return loss_actor + loss_critic - regularizer

class ActorCritic(nn.Module):
    """Shared convolutional trunk with a policy head and a value head.

    ``CNN`` reduces an image batch to 36 features. ``Actor`` maps them to a
    softmax distribution over ``num_actions`` and ``Critic`` maps them to a
    single state value. The first linear layer expects 1152 inputs, which is
    what a 3x240x240 image produces (32 channels x 6 x 6 after pooling).
    """

    def __init__(self, num_actions):
        super().__init__()
        # Layers are created in the same order as their position in the
        # Sequential so state_dict keys (CNN.0, CNN.2, ...) stay stable.
        trunk = [
            nn.Conv2d(3, 16, kernel_size=5, stride=3),
            nn.LeakyReLU(),
            nn.Conv2d(16, 16, kernel_size=5, stride=3),
            nn.LeakyReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),
            nn.LeakyReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(1152, 288),
            nn.LeakyReLU(),
            nn.Linear(288, 36),
            nn.LeakyReLU(),
        ]
        self.CNN = nn.Sequential(*trunk)
        # Policy head: probabilities over the actions (softmax along dim 1).
        self.Actor = nn.Sequential(
            nn.Linear(36, num_actions),
            nn.Softmax(dim=1),
        )
        # Value head: one scalar per sample.
        self.Critic = nn.Sequential(nn.Linear(36, 1))

    def forward(self, state):
        """Return ``(value, policy)`` for a batch of image states."""
        embedding = self.CNN(state)
        state_value = self.Critic(embedding)
        action_probs = self.Actor(embedding)
        return state_value, action_probs

In [None]:
class Agent(object):
    """Actor-critic agent that drives the robot through a serial link.

    The agent owns the ``ActorCritic`` model, its Adam optimizer and loss,
    a serial connection used to send single-character commands, and a
    ``HISTORY`` dict collecting per-episode statistics.
    """

    def __init__(self, LEARNING_RATE=25e-4):
        """Open the serial driver and build the model, optimizer and loss.

        Args:
            LEARNING_RATE: Learning rate passed to the Adam optimizer.
        """
        # 0-4 are the policy's actions (stop, forward, backward, left, right);
        # 5 and 6 are the two commands used only by wait_for_next_epsiode.
        self.ACTION_MAP = {0:'s', 1:'f', 2:'b', 3:'l', 4:'r', 5:'(', 6:')'}
        self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.HISTORY = {'Returns': [], 'Collisions': [], 'Actions': []}

        self.driver = serial.Serial("/dev/ttyTHS1", 9600, timeout=1)
        self.driver.reset_input_buffer()
        # Command character sent by the previous execute_action call.
        self.last_action = None

        self.model = ActorCritic(5).to(self.DEVICE)
        self.optimizer = optim.Adam(params=self.model.parameters(), lr=LEARNING_RATE)
        self.criterion = ActorCriticLoss()

    def call_actor_critic(self, state):
        """Run the model on ``state`` and return ``(value, policy)``.

        The state is moved to ``self.DEVICE`` first.
        """
        # Use this instance's model. The previous code reached for a
        # module-level variable named ``agent``, which only worked when such
        # a global existed and referred to this very object.
        value, policy = self.model(state.to(self.DEVICE))
        return value, policy

    def select_action(self, policy):
        """Sample an action index from the flattened policy probabilities."""
        policy_dist = torch.distributions.Categorical(policy.view(-1))
        action = policy_dist.sample().item()
        return action

    def execute_action(self, action):
        """Send the command for ``action`` (a key of ``ACTION_MAP``) to the driver.

        A stop command is sent first when reversing direction between
        forward and backward, and before every turn.
        """
        action = self.ACTION_MAP[action]
        if action in ['f','b'] and self.last_action in ['f','b'] and self.last_action != action:
            # Have to stop first to go opposite direction
            self.driver.write('s'.encode('ascii'))
            time.sleep(0.5)
        if action in ['l', 'r']:
            # Turning left or right: stop, wait, then send the turn command.
            self.driver.write('s'.encode('ascii'))
            time.sleep(0.5)
            self.driver.write(action.encode('ascii'))
        else:
            self.driver.write(action.encode('ascii'))
            time.sleep(0.5)
        self.last_action = action

    def wait_for_next_epsiode(self):
        """Stop the robot, then send one of the two spin commands at random."""
        spin = self.ACTION_MAP[random.randint(5,6)]
        self.driver.write('s'.encode('ascii'))
        # Short 8 ms pause between the stop and the spin command.
        time.sleep(1/1000 * 8)
        self.driver.write(spin.encode('ascii'))

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    wait_for_next_episode = wait_for_next_epsiode

    def optimize_policy(self, action_prob, next_value, value, reward, policy_entropy):
        """Compute the actor-critic loss and apply one optimizer step."""
        # Calculate Loss
        loss = self.criterion(action_prob, next_value, value, reward, policy_entropy)

        # Backprop & weight update
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [None]:
# Create the environment. This only constructs the camera wrapper; the camera
# is opened and started by env.reset().
env = Environment()

In [None]:
# Hyperparameters
EPISODES = 32  # number of episodes run by the training loop
STEPS = 128  # number of environment steps per episode

# Build the agent (serial driver, actor-critic model, Adam optimizer) with an
# explicit learning rate instead of the constructor default of 25e-4.
agent = Agent(LEARNING_RATE=2.5e-6)

In [None]:
# Online one-step actor-critic training loop.
# Each step: evaluate the current state, sample and execute an action, observe
# the next state and reward, then update the model from that transition.

# Have robot spin to keep arduino on.
agent.wait_for_next_epsiode()
for episode in range(EPISODES):
    # Per-episode statistics, stored in agent.HISTORY at the end of the episode.
    episode_reward = 0
    episode_collisions = 0
    action_sequence = []
    state = env.reset()
    for step in range(STEPS):
        print(f'\rStep {step+1}/{STEPS}',end='')
        # Forward pass WITH gradient tracking: value and policy are reused
        # below to build the loss.
        value, policy = agent.call_actor_critic(state)

        # Select action from policy (sampling itself needs no gradient).
        with torch.autograd.no_grad():
            action = agent.select_action(policy)

        # Probability the policy assigned to the sampled action.
        action_prob = policy.view(-1)[action]

        # Calculate policy entropy: -sum(p * log p) over the actions.
        # NOTE(review): a probability of exactly 0 would give 0 * log(0) = NaN.
        policy_entropy = -torch.sum(policy.view(-1) * torch.log(policy.view(-1))) 

        if step == 0:
            # Stop robot from spinning.
            agent.execute_action(0)
        agent.execute_action(action)
        action_sequence.append(action)

        # Transition to next state
        new_state, reward, collision = env.step(action)

        episode_collisions += int(collision)
        episode_reward += reward
        # print(f'Step: {step+1}, Action: {action}, Reward: {reward}, Collided?: {collision}')#, end='')

        state = new_state
        # `step < STEPS` always holds inside this loop, so the effective
        # condition is `step > 0`: the first transition of every episode is
        # not used for an update.
        if 0 < step < STEPS:
            # Bootstrap value of the next state, computed without gradients.
            with torch.inference_mode():
                next_value, _ = agent.call_actor_critic(state)
            agent.optimize_policy(action_prob, next_value.view(-1), value.view(-1), reward, policy_entropy)

    agent.HISTORY['Returns'].append(episode_reward)
    agent.HISTORY['Collisions'].append(episode_collisions)
    agent.HISTORY['Actions'].append(action_sequence)
    # Progress report on episodes 1, 6, 11, ... (zero-based index divisible by 5).
    if episode % 5 == 0:
        print(f'\nEpisode {episode+1}/{EPISODES}: Reward {episode_reward}')

    # Have robot spin to keep arduino on.
    agent.wait_for_next_epsiode()
    
# Training finished: send the stop command (action 0).
agent.execute_action(0)

In [None]:
# Persist the per-episode statistics (returns, collisions, action sequences)
# and the trained model weights to the current working directory.
torch.save(agent.HISTORY, 'A2C_History.pth')
torch.save(agent.model.state_dict(), 'A2C_Weights.pth')

In [None]:
# Ask the environment to shut down its camera now that training is done.
env.close()