<a href="https://colab.research.google.com/github/faizankshaikh/ForaGym/blob/main/examples/trial1_foragymSimple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install required libraries

In [1]:
# Fetch the ForaGym sources from GitHub.
!git clone --quiet https://github.com/faizankshaikh/ForaGym.git
# Move into the cloned repository so that the editable install below can target ".".
%cd ForaGym
# Install ForaGym itself in editable mode.
!pip install -q -e .
# stable-baselines3 provides the DQN implementation imported later in this notebook.
# NOTE(review): no version is pinned here — consider pinning it for reproducibility.
!pip install stable-baselines3

# 2. Create Gym environment

In [1]:
# import important libs and modules
import gym
import foragym
import numpy as np

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Initialize the gym environment.
# The id is given in "module:env-id" form.
# NOTE(review): presumably the "foragym:" prefix makes gym import the package
# (and thereby register the environment) before looking the id up — confirm.
env = gym.make("foragym:foragym/ForaGym-v0")

In [3]:
# Visualize the transition matrix: for every (state, action) pair, print each
# entry of env.P together with the decoded fields of the starting state.
num_states = env.NUM_STATES
num_actions = env.NUM_ACTIONS

# Entries that cannot be decoded, unpacked or printed are skipped (best effort),
# but they are counted so that skipping is visible instead of silent.
num_skipped = 0

for state in range(num_states):
    for action in range(num_actions):
        for items in env.P[state][action]:
            try:
                days_left, life_point, field, weather = env.decode(state)
                p, new_state, reward, is_dead = items
                print("="*10)
                print(f"enc_state: {state}, days_left: {days_left}, life_point: {life_point}, field: {field}, weather: {weather}")
                print(f"action: {env.ACTION_DICT[action]}, new_state: {new_state}, p: {round(p, 3):.3f}, reward: {reward}, is_dead: {is_dead}")
                print("="*10)
            except Exception:
                # Catch `Exception` rather than using a bare `except:` so that
                # KeyboardInterrupt (kernel interrupt) still stops this long loop.
                num_skipped += 1

if num_skipped:
    print(f"skipped {num_skipped} transition entries that could not be displayed")

enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 2, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 3, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 4, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 5, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 6, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 7, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 8, p: 0.100, reward: -1, is_dead: True
enc_state: 74, days_left: 1, life_point: 1, field: 1, weather: 0
action: Wait, new_state: 9, p: 0

# 3. Solve the foraging task

## 3.1 Heuristic method

In [4]:
# Demo of the heuristic baseline: play a couple of episodes, always taking
# action 0, and print the game state after every step.
num_episodes = 2

for episode in range(num_episodes):
    # episode banner: title line followed by a ruler
    print(f"Episode #{episode+1}", "=" * 10, sep="\n")

    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    # Every episode takes at least one step, so test for termination at the
    # bottom of the loop.
    while True:
        # fixed choice; env.action_space.sample() would pick a random action instead
        action = 0
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(
            f'Chance to find food: {info["chance"]:.2f}',
            f"Reward?: {reward}",
            f"is Dead?: {is_dead}",
            "",
            "Current game state:-",
            sep="\n",
        )
        env.render()
        print()
        if is_dead:
            break

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [1 1 1 0 0]
--Current life: 2
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [0 0 1 0 0]
--Current life: 1
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 3
--State of Field: [0 1 1 0 1]
--Current life: 0
--Type of Weather: Clear

----------
Episode #2
Initial state:-
--Days left: 5
--State of Field: [1 1 1 0 0]
--Current life: 2
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [1 1 0 1 1]
--Current life: 1
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 3
--State of Field: [0 0 1 0 1]
--Current life: 0
--Type of Weather: Clear

----------


In [16]:
# Score the heuristic baseline (always action 0): collect the total reward of
# each episode, then report mean and standard deviation.
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    obs = env.reset()
    total_reward = 0

    # at least one step is always taken; stop once the env reports done
    while True:
        # fixed choice; env.action_space.sample() would pick a random action instead
        action = 0
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break

    episode_reward.append(total_reward)

print(
    "mean_reward:"
    f"{np.mean(episode_reward):.2f}"
    " +/- "
    f"{np.std(episode_reward):.2f}"
)


mean_reward:-1.00 +/- 0.00


## 3.2 Value Iteration method

In [23]:
# Find the optimal policy with a single backward sweep over the state space.
#
# States are visited in increasing order of `days_left`, and each state's value
# is computed from V of its successor states.
# NOTE(review): this is only correct if every transition in env.P leads to a
# state with fewer days left (so its V entry is already final) — confirm.

# Index 2 is an extra pseudo-action used purely for display/bookkeeping: it
# marks states in which both real actions (0 and 1) have the same value.
env.ACTION_DICT[2] = "Indifference"

# Two action values closer than this are considered tied. The values are sums
# of floating-point products, so an exact `!=` comparison can report a strict
# preference where the two actions are mathematically equal.
TIE_TOLERANCE = 1e-9

num_states = env.NUM_STATES
num_actions = env.NUM_ACTIONS

V = np.zeros(num_states)

# Any state with zero life points starts with value -1.
for state in range(num_states):
    _, life_points, _, _ = env.decode(state)
    if life_points == 0:
        V[state] = -1

# One-hot policy table with one extra column (index 2) for "Indifference".
optimal_policy = np.zeros([num_states, num_actions + 1])


for days_left in range(1, env.NUM_DAYS_LEFT):
    for life_point in range(1, env.NUM_LIFE_POINTS):
        for field in range(1, env.NUM_FIELDS + 1):
            for weather in range(env.NUM_WEATHER_TYPES):
                enc_state = env.encode(days_left, life_point, field, weather)
                actions_s = np.zeros(num_actions)

                # Expected return of each action from this state.
                for action in range(num_actions):
                    for transition_prob, next_state, reward, done in env.P[enc_state][action]:
                        if done:
                            # terminal transition: only the immediate reward counts
                            actions_s[action] += transition_prob * (reward)
                        else:
                            actions_s[action] += transition_prob * (reward + V[next_state])

                V[enc_state] = actions_s.max()
                if np.isclose(actions_s[0], actions_s[1], rtol=0.0, atol=TIE_TOLERANCE):
                    # both actions are equally good -> record indifference
                    optimal_policy[enc_state, 2] = 1.0
                else:
                    optimal_policy[enc_state, np.argmax(actions_s)] = 1.0

                print(f"days_left: {days_left}, life_point: {life_point}, field: {field}/{env.NUM_FIELDS}, weather: {env.WEATHER_DICT[weather]}:- action: {env.ACTION_DICT[np.argmax(optimal_policy[enc_state])]}, value: {actions_s}")

days_left: 1, life_point: 1, field: 1/5, weather: Clear:- action: Forage, value: [-1.  -0.8]
days_left: 1, life_point: 1, field: 1/5, weather: Rainy:- action: Forage, value: [-1.  -0.9]
days_left: 1, life_point: 1, field: 2/5, weather: Clear:- action: Forage, value: [-1.  -0.6]
days_left: 1, life_point: 1, field: 2/5, weather: Rainy:- action: Forage, value: [-1.  -0.7]
days_left: 1, life_point: 1, field: 3/5, weather: Clear:- action: Forage, value: [-1.  -0.4]
days_left: 1, life_point: 1, field: 3/5, weather: Rainy:- action: Forage, value: [-1.  -0.5]
days_left: 1, life_point: 1, field: 4/5, weather: Clear:- action: Forage, value: [-1.  -0.2]
days_left: 1, life_point: 1, field: 4/5, weather: Rainy:- action: Forage, value: [-1.  -0.3]
days_left: 1, life_point: 1, field: 5/5, weather: Clear:- action: Forage, value: [-1.  0.]
days_left: 1, life_point: 1, field: 5/5, weather: Rainy:- action: Forage, value: [-1.  -0.1]
days_left: 1, life_point: 2, field: 1/5, weather: Clear:- action: Wait, 

In [7]:
# Demo of the value-iteration policy: play a couple of episodes following
# `optimal_policy` and print the game state after every step.
num_episodes = 2

for episode in range(num_episodes):
    # episode banner: title line followed by a ruler
    print(f"Episode #{episode+1}", "=" * 10, sep="\n")

    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    # Every episode takes at least one step, so test for termination at the
    # bottom of the loop.
    while True:
        # the encoded state uses the number of set entries in the field vector
        enc_state = env.encode(
            obs["days_left"],
            obs["life_points"],
            sum(obs["field_state"]),
            obs["weather_type"],
        )
        # policy column 2 ("Indifference") is not a real action: clamp it to 1
        action = min(np.argmax(optimal_policy[enc_state]), 1)
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(
            f'Chance to find food: {info["chance"]:.2f}',
            f"Reward?: {reward}",
            f"is Dead?: {is_dead}",
            "",
            "Current game state:-",
            sep="\n",
        )
        env.render()
        print()
        if is_dead:
            break

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [1 1 1 0 0]
--Current life: 1
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.13
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 4
--State of Field: [0 0 1 0 1]
--Current life: 0
--Type of Weather: Clear

----------
Episode #2
Initial state:-
--Days left: 5
--State of Field: [0 0 1 0 1]
--Current life: 3
--Type of Weather: Rainy

Action to take: Forage
Chance to find food: 0.03
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [1 1 1 0 0]
--Current life: 1
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.08
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 3
--State of Field: [1 1 0 1 1]
--Current life: 0
--Type of Weather: Rainy

----------


In [8]:
# Score the value-iteration policy: collect the total reward of each episode,
# then report mean and standard deviation.
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    obs = env.reset()
    total_reward = 0

    # at least one step is always taken; stop once the env reports done
    while True:
        # the encoded state uses the number of set entries in the field vector
        enc_state = env.encode(
            obs["days_left"],
            obs["life_points"],
            sum(obs["field_state"]),
            obs["weather_type"],
        )
        action = np.argmax(optimal_policy[enc_state])
        # policy column 2 ("Indifference") is not a real action: play action 1
        action = 1 if action == 2 else action
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break

    episode_reward.append(total_reward)

print(
    "mean_reward:"
    f"{np.mean(episode_reward):.2f}"
    " +/- "
    f"{np.std(episode_reward):.2f}"
)


mean_reward:-0.67 +/- 0.47


## 3.3 DQN (Stable baselines)

In [17]:
# Build the DQN agent, then train it; `model` ends up bound to whatever
# `learn` returns, exactly as in a chained call.
model = DQN(
    "MultiInputPolicy",
    env,
    learning_starts=1e4,
    gamma=0.9,
    exploration_fraction=0.3,
)
model = model.learn(total_timesteps=int(2e4))

In [19]:
# Run the trained DQN agent on a couple of sample episodes, printing the game
# state after every step.
num_episodes = 2

for episode in range(num_episodes):
    print(f"Episode #{episode+1}")
    print("=" * 10)

    is_dead = False
    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    while not is_dead:
        action, _ = model.predict(obs, deterministic=True)
        # predict() returns a NumPy value; depending on the installed
        # stable-baselines3 version this may be a 0-d ndarray, which is
        # unhashable and cannot be used as the ACTION_DICT key below.
        # Convert it to a plain int first.
        action = int(action)
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(f'Chance to find food: {info["chance"]:.2f}')
        print(f"Reward?: {reward}")
        print(f"is Dead?: {is_dead}")
        print()
        print("Current game state:-")
        env.render()
        print()

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [0 1 1 0 1]
--Current life: 3
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.84
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [1 0 0 0 0]
--Current life: 1
--Type of Weather: Rainy

Action to take: Forage
Chance to find food: 0.88
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 3
--State of Field: [0 0 1 1 0]
--Current life: 2
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 2
--State of Field: [0 0 0 0 1]
--Current life: 1
--Type of Weather: Rainy

Action to take: Forage
Chance to find food: 0.84
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 1
--State of Field: [1 0 1 0 1]
--Current life: 2
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: True

Current game state:-
--Days left: 0
--State of Field: [1 0 1 0 0

In [20]:
# Visualize the learned DQN policy: build an observation for every encoded
# state and print the action the model picks for it.
for idx in range(num_states):
    days_left, life_point, field, weather = env.decode(idx)
    # Rebuild a field vector whose first `field` entries are set to 1.
    field_state = np.zeros(env.NUM_FIELDS, dtype=int)
    field_state[:field] = 1
    obs = {
        "days_left": days_left,
        "life_points": life_point,
        "field_state": field_state,
        "weather_type": weather
    }

    # Only query states with life points, days and fields all above zero.
    if life_point>0 and days_left>0 and field>0:
        action, _ = model.predict(obs, deterministic=True)
        # predict() returns a NumPy value; depending on the installed
        # stable-baselines3 version this may be a 0-d ndarray, which is
        # unhashable and cannot be used as the ACTION_DICT key below.
        # Convert it to a plain int first.
        action = int(action)
        print(f"days_left: {days_left}, life_point: {life_point}, field: {field}/{env.NUM_FIELDS}, weather: {env.WEATHER_DICT[weather]}:- action: {env.ACTION_DICT[action]}")

days_left: 1, life_point: 1, field: 1/5, weather: Clear:- action: Forage
days_left: 1, life_point: 1, field: 1/5, weather: Rainy:- action: Forage
days_left: 1, life_point: 1, field: 2/5, weather: Clear:- action: Forage
days_left: 1, life_point: 1, field: 2/5, weather: Rainy:- action: Forage
days_left: 1, life_point: 1, field: 3/5, weather: Clear:- action: Forage
days_left: 1, life_point: 1, field: 3/5, weather: Rainy:- action: Forage
days_left: 1, life_point: 1, field: 4/5, weather: Clear:- action: Forage
days_left: 1, life_point: 1, field: 4/5, weather: Rainy:- action: Forage
days_left: 1, life_point: 1, field: 5/5, weather: Clear:- action: Forage
days_left: 1, life_point: 1, field: 5/5, weather: Rainy:- action: Forage
days_left: 1, life_point: 2, field: 1/5, weather: Clear:- action: Wait
days_left: 1, life_point: 2, field: 1/5, weather: Rainy:- action: Wait
days_left: 1, life_point: 2, field: 2/5, weather: Clear:- action: Wait
days_left: 1, life_point: 2, field: 2/5, weather: Rainy:-

In [21]:
# Score the trained DQN agent: collect the total reward of each episode, then
# report mean and standard deviation.
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    obs = env.reset()
    total_reward = 0

    # at least one step is always taken; stop once the env reports done
    while True:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break

    episode_reward.append(total_reward)

print(
    "mean_reward:"
    f"{np.mean(episode_reward):.2f}"
    " +/- "
    f"{np.std(episode_reward):.2f}"
)


mean_reward:-0.64 +/- 0.48
