<a href="https://colab.research.google.com/github/dvisanth/Association/blob/master/AGI_sample_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import random

# Define the environment class
class Environment:
    def __init__(self, grid_size, agent_position, goal_position, obstacle_positions, reward_positions):
        self.grid_size = grid_size
        self.agent_position = agent_position
        self.goal_position = goal_position
        self.obstacle_positions = obstacle_positions
        self.reward_positions = reward_positions

    def get_current_state(self):
        return self.agent_position

    def get_possible_actions(self, state):
        row, col = state
        possible_actions = []
        if row > 0:  # Up
            possible_actions.append('up')
        if row < self.grid_size - 1:  # Down
            possible_actions.append('down')
        if col > 0:  # Left
            possible_actions.append('left')
        if col < self.grid_size - 1:  # Right
            possible_actions.append('right')
        return possible_actions

    def execute_action(self, action):
        row, col = self.agent_position
        if action == 'up' and row > 0 and (row - 1, col) not in self.obstacle_positions:
            self.agent_position = (row - 1, col)
        elif action == 'down' and row < self.grid_size - 1 and (row + 1, col) not in self.obstacle_positions:
            self.agent_position = (row + 1, col)
        elif action == 'left' and col > 0 and (row, col - 1) not in self.obstacle_positions:
            self.agent_position = (row, col - 1)
        elif action == 'right' and col < self.grid_size - 1 and (row, col + 1) not in self.obstacle_positions:
            self.agent_position = (row, col + 1)
        return self.agent_position

    def get_reward(self, position):
        if position == self.goal_position:
            return 1  # Positive reward for reaching the goal
        elif position in self.reward_positions:
            return 0.5  # Partial reward for collecting a reward
        else:
            return 0  # No reward for other positions

# Define the agent class
class Agent:
    def __init__(self, environment, start_position, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1):
        self.environment = environment
        self.position = start_position
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.q_table = {}  # Dictionary to store Q-values

    def move(self, action):
        # Execute action in the environment and update agent's position
        next_position = self.environment.execute_action(action)
        self.position = next_position

    def update_q_value(self, state, action, reward, next_state):
        # Update Q-value using Q-learning update rule
        q_value = self.q_table.get((state, action), 0.0)
        max_next_q_value = max([self.q_table.get((next_state, a), 0.0) for a in self.environment.get_possible_actions(next_state)])
        new_q_value = q_value + self.learning_rate * (reward + self.discount_factor * max_next_q_value - q_value)
        self.q_table[(state, action)] = new_q_value

    def decide_action(self, state):
        if random.random() < self.exploration_rate:
            # Explore: randomly select an action
            return random.choice(self.environment.get_possible_actions(state))
        else:
            # Exploit: select action with the highest Q-value
            return max(self.environment.get_possible_actions(state), key=lambda a: self.q_table.get((state, a), 0.0))

# Define the grid-world environment
grid_size = 5
environment = [[0 for _ in range(grid_size)] for _ in range(grid_size)]
agent_symbol = 'A'
goal_symbol = 'G'
obstacle_symbol = 'X'
reward_symbol = 'R'
empty_symbol = ' '
agent_position = (0, 0)
goal_position = (4, 4)
obstacle_positions = [(1, 1), (2, 2)]
reward_positions = [(3, 3)]
environment[agent_position[0]][agent_position[1]] = agent_symbol
environment[goal_position[0]][goal_position[1]] = goal_symbol
for obstacle_pos in obstacle_positions:
    environment[obstacle_pos[0]][obstacle_pos[1]] = obstacle_symbol
for reward_pos in reward_positions:
    environment[reward_pos[0]][reward_pos[1]] = reward_symbol

# Define the number of episodes for training
num_episodes = 100

# Training loop
for episode in range(num_episodes):
    # Reset environment for new episode
    environment = Environment(grid_size, agent_position, goal_position, obstacle_positions, reward_positions)
    # Initialize cumulative reward for the episode
    total_reward = 0

    # Initialize agent with the environment and starting position
    agent = Agent(environment, agent_position)

    # Loop within each episode until terminal condition is met
    while True:
        # Get current state
        current_state = environment.get_current_state()

        # Agent decides on action
        action = agent.decide_action(current_state)

        # Agent executes action in the environment
        agent.move(action)

        # Agent receives reward from the environment
        reward = environment.get_reward(agent.position)
        total_reward += reward

        # Get next state after action execution
        next_state = environment.get_current_state()

        # Update Q-values based on observed reward and next state
        agent.update_q_value(current_state, action, reward, next_state)

        # Check if terminal condition is met
        if agent.position == environment.goal_position:
            break

    # Print total reward for the episode
    print("Episode:", episode, "Total Reward:", total_reward)

# Define the number of evaluation episodes
num_evaluation_episodes = 100

# Evaluation loop
total_rewards = []
for episode in range(num_evaluation_episodes):
    # Reset environment for new episode
    environment = Environment(grid_size, agent_position, goal_position, obstacle_positions, reward_positions)
    total_reward = 0

    # Initialize agent with the environment and starting position
    agent = Agent(environment, agent_position)

    # Loop within each episode until terminal condition is met
    while True:
        # Get current state
        current_state = environment.get_current_state()

        # Agent selects action based on learned policy
        action = agent.decide_action(current_state)

        # Agent executes action in the environment
        agent.move(action)

        # Agent receives reward from the environment
        reward = environment.get_reward(agent.position)
        total_reward += reward

        # Check if terminal condition is met
        if agent.position == environment.goal_position:
            break

    # Store total reward for the episode
    total_rewards.append(total_reward)

# Print average total reward across evaluation episodes
average_reward = sum(total_rewards) / num_evaluation_episodes
print("Average Total Reward across Evaluation Episodes:", average_reward)


Episode: 0 Total Reward: 213.0
Episode: 1 Total Reward: 82.0
Episode: 2 Total Reward: 1
Episode: 3 Total Reward: 531.0
Episode: 4 Total Reward: 116.0
Episode: 5 Total Reward: 42.5
Episode: 6 Total Reward: 74.5
Episode: 7 Total Reward: 22.5
Episode: 8 Total Reward: 75.0
Episode: 9 Total Reward: 1
Episode: 10 Total Reward: 380.0
Episode: 11 Total Reward: 104.5
Episode: 12 Total Reward: 5.5
Episode: 13 Total Reward: 438.0
Episode: 14 Total Reward: 261.5
Episode: 15 Total Reward: 153.5
Episode: 16 Total Reward: 151.0
Episode: 17 Total Reward: 621.0
Episode: 18 Total Reward: 338.0
Episode: 19 Total Reward: 5.0
Episode: 20 Total Reward: 82.5
Episode: 21 Total Reward: 179.0
Episode: 22 Total Reward: 73.5
Episode: 23 Total Reward: 871.5
Episode: 24 Total Reward: 161.5
Episode: 25 Total Reward: 442.5
Episode: 26 Total Reward: 214.5
Episode: 27 Total Reward: 521.5
Episode: 28 Total Reward: 26.0
Episode: 29 Total Reward: 136.0
Episode: 30 Total Reward: 56.5
Episode: 31 Total Reward: 70.0
Episode: