# Reinforcement Learning Agent for Resource Optimization

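This notebook demonstrates a reinforcement learning (RL) agent for resource optimization in a simple simulated environment: the agent learns to keep a resource level close to its midpoint by choosing, at each step, whether to consume or replenish one unit.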
We cover:
- Environment setup
- Defining the RL agent
- Training the agent using Q-learning
- Evaluation and example policy


In [None]:
!pip install numpy matplotlib gym

import numpy as np
import matplotlib.pyplot as plt
import gym


In [None]:
class ResourceEnv:
    """Toy environment tracking a single integer resource level.

    The level starts at ``max_resources // 2``. Action ``0`` consumes one unit
    (never dropping below 0); any other action replenishes one unit (never
    rising above ``max_resources``). The reward is the negative distance of
    the new level from ``max_resources // 2``, so staying at the midpoint
    yields the maximum reward of 0.
    """

    def __init__(self, max_resources=5):
        """Create the environment with levels ``0..max_resources``."""
        self.max_resources = max_resources
        self.n_states = max_resources + 1
        self.action_space = [0, 1]  # 0: consume, 1: replenish
        self.n_actions = len(self.action_space)
        self.state = max_resources // 2  # start at the midpoint

    def reset(self):
        """Put the level back at the midpoint and return it."""
        midpoint = self.max_resources // 2
        self.state = midpoint
        return midpoint

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``done`` is always ``False`` and ``info`` is always an empty dict.
        """
        level = self.state
        if action == 0:
            # consume: lose one unit, floor at zero
            level = max(0, level - 1)
        else:
            # replenish: gain one unit, cap at max_resources
            level = min(self.max_resources, level + 1)
        self.state = level

        # Penalise distance from the midpoint.
        distance = abs(level - self.max_resources // 2)
        return level, -distance, False, {}

    def render(self):
        """Print the current resource level."""
        print(f'Resources level: {self.state}')


In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        n_states: Number of discrete states (rows of the Q-table).
        n_actions: Number of discrete actions (columns of the Q-table).
        alpha: Learning rate applied to each temporal-difference error.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of picking a uniformly random action.
        Q: ``(n_states, n_actions)`` array of action values, initialised to 0.
    """

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = np.zeros((n_states, n_actions))

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value is returned.
        ``np.argmax`` resolves ties in favour of the lowest action index.
        The result is always a plain Python ``int``.
        """
        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.n_actions))
        return int(np.argmax(self.Q[state]))

    def update(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Index of the resulting state.
            done: Whether ``next_state`` is terminal. When true, no future
                value is bootstrapped, so the target is just ``reward``.
                Defaults to ``False``, which matches the previous behaviour.
        """
        best_next = 0.0 if done else np.max(self.Q[next_state])
        td_target = reward + self.gamma * best_next
        self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])


In [None]:
# Train a tabular Q-learning agent on the resource environment and plot the
# total (undiscounted) reward collected in each episode.
env = ResourceEnv()
agent = QLearningAgent(env.n_states, env.n_actions)

episodes = 1000
max_steps = 10  # upper bound on steps per episode
rewards = []  # total reward of each episode, in training order

for ep in range(episodes):
    state = env.reset()
    total_reward = 0
    for t in range(max_steps):
        action = agent.choose_action(state)
        # Bind the info dict to a real name: assigning to `_` would shadow
        # IPython's "last result" variable for the rest of the session.
        next_state, reward, done, info = env.step(action)
        agent.update(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        # ResourceEnv never signals termination, but honour the flag so the
        # loop stays correct if the environment is swapped for an episodic one.
        if done:
            break
    rewards.append(total_reward)

plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Training Progress')
plt.show()


In [None]:
# Roll out the greedy policy (no exploration) from the reset state and print
# the state visited and the action chosen at each of the 10 steps.
state = env.reset()
print("Learned policy (0: consume, 1: replenish) over 10 steps:")
for t in range(10):
    # Greedy choice: highest-valued action in the current state's Q-row.
    action = agent.Q[state].argmax()
    print(f'Step {t+1}, State {state}, Action {action}')
    # Only the next state is needed; reward, done and info are ignored.
    state = env.step(action)[0]


This notebook demonstrates a basic RL agent using Q-learning:
- Custom environment simulating resource management
- Q-learning update and epsilon-greedy policy
- Training progress visualization
- Evaluation of learned policy

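The framework can be extended to more complex environments. Note that the Q-table used here stores one entry per state-action pair, so much larger state-action spaces will typically require function approximation instead of a table.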
