<a href="https://colab.research.google.com/github/faizankshaikh/ForaGym/blob/main/examples/trial1_foragymSimple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install required libraries

In [1]:
!git clone --quiet https://github.com/faizankshaikh/ForaGym.git
%cd ForaGym
!pip install -q -e .
!pip install stable-baselines3

# 2. Create Gym environment

In [1]:
# import important libs and modules
import gym
import foragym
import numpy as np

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# initiatlize gym env
env = gym.make("foragym:foragym/ForaGym-v0")

In [3]:
# # visualize transition matrix
# for key, items in env.P.items():
#     try:
#         days_left, life_point, field, weather = env.decode(key)
#         p_wait, new_state_wait, reward_wait, is_dead_wait = items[0][0]
#         p_forage_failure, new_state_forage_failure, reward_forage_failure, is_dead_forage_failure = items[1][0]

#         print("="*10)
#         print(f"days_left: {days_left}, life_point: {life_point}, field: {field}, weather: {weather}, enc_state: {key}")
#         print(f"p_wait: {p_wait:.2f}, new_state_wait: {new_state_wait}, reward_wait: {reward_wait}, is_dead_wait: {is_dead_wait}")
#         print(f"p_forage_failure: {p_forage_failure:.2f}, new_state_forage_failure: {new_state_forage_failure}, reward_forage_failure: {reward_forage_failure}, is_dead_forage_failure: {is_dead_forage_failure}")
#         p_forage_success, new_state_forage_success, reward_forage_success, is_dead_forage_success = items[1][12]
#         print(f"p_forage_success: {p_forage_success:.2f}, new_state_forage_success: {new_state_forage_success}, reward_forage_success: {reward_forage_success}, is_dead_forage_success: {is_dead_forage_success}")
#         print("="*10)
#     except:
#         pass
    

# 3. Solve the foraging task

## 3.1 Heuristic method

In [4]:
# run algorithm on sample episodes
num_episodes = 2

for episode in range(num_episodes):
    print(f"Episode #{episode+1}")
    print("=" * 10)

    is_dead = False
    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    while not is_dead:
        action = 0 #env.action_space.sample()
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(f'Chance to find food: {info["chance"]:.2f}')
        print(f"Reward?: {reward}")
        print(f"is Dead?: {is_dead}")
        print()
        print("Current game state:-")
        env.render()
        print()

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [0 0 0 0 1]
--Current life: 6
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [0 0 1 0 1]
--Current life: 5
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 3
--State of Field: [0 1 1 0 0]
--Current life: 4
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 2
--State of Field: [0 1 1 1 0]
--Current life: 3
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 1
--State of Field: [0 0 1 0 1]
--Current life: 2
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: True

Current game state:-
--Days left: 1
--State of Field: [1 1 0 1 1]
--Cu

In [5]:
# evaluate algorithm
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    done = False
    obs = env.reset()

    total_reward = 0

    while not done:
        action = 0 #env.action_space.sample()
        obs, reward, done, info = env.step(action)

        total_reward += reward

    episode_reward.append(total_reward)

print(f"mean_reward:{np.mean(episode_reward):.2f} +/- {np.std(episode_reward):.2f}")


mean_reward:-0.83 +/- 0.37


## 3.2 Value Iteration method

In [6]:
# find optimal policy
num_states = env.NUM_STATES
num_actions = env.NUM_ACTIONS

V = np.zeros(num_states)
V[:95] = -1

optimal_policy = np.zeros([num_states, num_actions])


for days_left in range(1, env.NUM_DAYS_LEFT):
	for life_point in range(1, env.NUM_LIFE_POINTS):
		for field in range(env.NUM_FIELDS + 1):
			for weather in range(env.NUM_WEATHER_TYPES):
				enc_state = env.encode(days_left, life_point, field, weather)
				actions_s = np.zeros(num_actions)

				for action in range(env.NUM_ACTIONS):
					for transition_prob, next_state, reward, done in env.P[enc_state][action]:
						actions_s[action] += reward + transition_prob * V[next_state]
				V[enc_state] = actions_s.max()
				if actions_s[0] != actions_s[1]:
					optimal_policy[enc_state, np.argmax(actions_s)] = 1.0
				else:
					optimal_policy[enc_state, 1] = 1.0

				# print(f"days_left: {days_left}, life_point: {life_point}, field: {field}/{env.NUM_FIELDS}, weather: {env.WEATHER_DICT[weather]}:- action: {env.ACTION_DICT[np.argmax(optimal_policy[enc_state])]}, value: {V[enc_state]:.2f}")
	

In [7]:
# run algorithm on sample episodes
num_episodes = 2

for episode in range(num_episodes):
    print(f"Episode #{episode+1}")
    print("=" * 10)

    is_dead = False
    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    while not is_dead:
        enc_state = env.encode(obs["days_left"], obs["life_points"], sum(obs["field_state"]), obs["weather_type"])
        action = np.argmax(optimal_policy[enc_state])
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(f'Chance to find food: {info["chance"]:.2f}')
        print(f"Reward?: {reward}")
        print(f"is Dead?: {is_dead}")
        print()
        print("Current game state:-")
        env.render()
        print()

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [0 1 0 1 1]
--Current life: 5
--Type of Weather: Rainy

Action to take: Forage
Chance to find food: 0.51
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [0 0 1 1 1]
--Current life: 6
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.08
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 3
--State of Field: [0 0 0 1 0]
--Current life: 4
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 2
--State of Field: [1 1 1 0 1]
--Current life: 3
--Type of Weather: Clear

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 1
--State of Field: [1 0 1 1 1]
--Current life: 2
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: True

Current game state:-
--Days left: 1
--State of Field: [1 1 0 0 0]


In [8]:
# evaluate algorithm
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    done = False
    obs = env.reset()

    total_reward = 0

    while not done:
        enc_state = env.encode(obs["days_left"], obs["life_points"], sum(obs["field_state"]), obs["weather_type"])
        action = np.argmax(optimal_policy[enc_state])
        obs, reward, done, info = env.step(action)

        total_reward += reward

    episode_reward.append(total_reward)

print(f"mean_reward:{np.mean(episode_reward):.2f} +/- {np.std(episode_reward):.2f}")


mean_reward:-0.59 +/- 0.49


## 3.3 DQN (Stable baselines)

In [9]:
# create model and train
model = DQN('MultiInputPolicy', env, learning_starts=1e5, gamma=0.9, exploration_fraction=0.3).learn(total_timesteps=int(2e5))

In [10]:
# run algorithm on sample episodes
num_episodes = 2

for episode in range(num_episodes):
    print(f"Episode #{episode+1}")
    print("=" * 10)

    is_dead = False
    obs = env.reset()

    print("Initial state:-")
    env.render()
    print()

    while not is_dead:
        action, _ = model.predict(obs, deterministic=True)
        print(f"Action to take: {env.ACTION_DICT[action]}")
        obs, reward, is_dead, info = env.step(action)
        print(f'Chance to find food: {info["chance"]:.2f}')
        print(f"Reward?: {reward}")
        print(f"is Dead?: {is_dead}")
        print()
        print("Current game state:-")
        env.render()
        print()

    print("-" * 10)

Episode #1
Initial state:-
--Days left: 5
--State of Field: [0 0 0 1 0]
--Current life: 1
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.12
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 5
--State of Field: [1 1 0 0 1]
--Current life: 1
--Type of Weather: Clear

----------
Episode #2
Initial state:-
--Days left: 5
--State of Field: [0 0 0 0 1]
--Current life: 4
--Type of Weather: Rainy

Action to take: Forage
Chance to find food: 0.41
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 4
--State of Field: [0 0 1 1 1]
--Current life: 2
--Type of Weather: Rainy

Action to take: Wait
Chance to find food: 0.00
Reward?: 0
is Dead?: False

Current game state:-
--Days left: 3
--State of Field: [1 1 1 1 1]
--Current life: 1
--Type of Weather: Clear

Action to take: Forage
Chance to find food: 0.75
Reward?: -1
is Dead?: True

Current game state:-
--Days left: 3
--State of Field: [0 0 0 0 1]
--Current life: 1
--Type of Weather: Clear

----------


In [11]:
# # visualize policy
# for idx in range(num_states):
#     days_left, life_point, field, weather = env.decode(idx)
#     field_state = np.zeros(env.NUM_FIELDS, dtype=int)
#     field_state[:field] = 1
#     obs = {
#         "days_left": days_left,
#         "life_points": life_point,
#         "field_state": field_state,
#         "weather_type": weather
#     }

#     action, _ = model.predict(obs, deterministic=True)
#     print(f"days_left: {days_left}, life_point: {life_point}, field: {field}/{env.NUM_FIELDS}, weather: {env.WEATHER_DICT[weather]}:- action: {env.ACTION_DICT[action]}")

In [12]:
# evaluate algorithm
num_episodes = 1000
episode_reward = []

for episode in range(num_episodes):
    done = False
    obs = env.reset()

    total_reward = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)

        total_reward += reward

    episode_reward.append(total_reward)

print(f"mean_reward:{np.mean(episode_reward):.2f} +/- {np.std(episode_reward):.2f}")


mean_reward:-0.48 +/- 0.50
