In [14]:
# Mathematics learning model

In [15]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces

In [16]:
class MathEnv(gym.Env):
    def __init__(self):
        super(MathEnv, self).__init__()
        self.num_range = 10  # Range of numbers
        self.action_space = spaces.Discrete(21)  # Answers from -10 to +10
        self.observation_space = spaces.Box(low=0, high=10, shape=(2,), dtype=np.int32)
        self.current_question = None
        self.correct_answer = None

    def reset(self):
        num1 = np.random.randint(0, self.num_range)
        num2 = np.random.randint(0, self.num_range)
        operation = np.random.choice(["+", "-"])
        self.correct_answer = num1 + num2 if operation == "+" else num1 - num2
        self.current_question = (num1, num2, operation)
        return np.array([num1, num2])

    def step(self, action):
        reward = 1 if action == self.correct_answer else -1
        done = True  # Single-step problem
        return np.array(self.current_question[:2]), reward, done, {}

    def render(self):
        print(f"Question: {self.current_question[0]} {self.current_question[2]} {self.current_question[1]}")

In [17]:
import numpy as np
import random

class QLearningAgent:
    def __init__(self, state_size, action_size, lr=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.99):
        self.q_table = np.zeros((state_size, state_size, action_size))
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.action_size = action_size

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return random.randint(0, self.action_size - 1)  # Explore
        return np.argmax(self.q_table[state[0], state[1]])  # Exploit

    def update_q(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state[0], next_state[1]])
        self.q_table[state[0], state[1], action] = self.q_table[state[0], state[1], action] + \
            self.lr * (reward + self.gamma * self.q_table[next_state[0], next_state[1], best_next_action] - self.q_table[state[0], state[1], action])

        self.epsilon *= self.epsilon_decay  # Reduce exploration over time

In [18]:
env = MathEnv()
agent = QLearningAgent(state_size=10, action_size=21)

episodes = 10000
for episode in range(episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.update_q(state, action, reward, next_state)
        state = next_state

In [19]:
np.save("qtable.npy", agent.q_table)

In [20]:
def test_agent(question):
    num1, num2, operation = question
    correct_answer = num1 + num2 if operation == "+" else num1 - num2
    state = np.array([num1, num2])

    action = np.argmax(agent.q_table[state[0], state[1]])  # Get the best action from the Q-table
    print(f"Agent's Answer: {action}, Correct Answer: {correct_answer}")
    return action == correct_answer  # Returns True if correct

# Example
test_agent((4, 2, "+"))  # Test agent on 4 + 2
test_agent((7, 3, "-"))  # Test agent on 7 - 3

Agent's Answer: 2, Correct Answer: 6
Agent's Answer: 10, Correct Answer: 4


np.False_

In [24]:
import numpy as np

# Define Q-table
q_table = np.zeros((101, 101, 2, 200))  # (a, b, operation, result)

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995

# Training loop
for episode in range(1000):
    a = np.random.randint(0, 101)
    b = np.random.randint(0, 101)
    op = np.random.choice(['+', '-'])
    if op == '+':
        correct_result = a + b
    else:
        correct_result = a - b

    # Choose action (predicted result)
    if np.random.rand() < epsilon:
        predicted_result = np.random.randint(0, 200)
    else:
        predicted_result = np.argmax(q_table[a, b, 1 if op == '+' else 0])

    # Calculate reward
    reward = -abs(predicted_result - correct_result)

    # Update Q-table
    q_table[a, b, 1 if op == '+' else 0, predicted_result] += alpha * (
        reward + gamma * np.max(q_table[a, b, 1 if op == '+' else 0]) -
        q_table[a, b, 1 if op == '+' else 0, predicted_result]
    )

    # Decay epsilon
    epsilon *= epsilon_decay

# Test the agent
a, b, op = 4, 2, '+'
predicted_result = np.argmax(q_table[a, b, 1 if op == '+' else 0])
print(f"Predicted result for {a} {op} {b}: {predicted_result}")

Predicted result for 4 + 2: 0


In [25]:
import numpy as np

# Define Q-table
action_space = list(range(-100, 101))  # Allow results from -100 to 100
num_actions = len(action_space)
q_table = np.random.rand(101, 101, 2, num_actions) * 0.01  # Small random initialization

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995
epsilon_min = 0.01

# Training loop
num_episodes = 10000
for episode in range(num_episodes):
    a = np.random.randint(0, 101)
    b = np.random.randint(0, 101)
    op = np.random.choice(['+', '-'])
    if op == '+':
        correct_result = a + b
    else:
        correct_result = a - b

    # Choose action (predicted result)
    if np.random.rand() < epsilon:
        action_idx = np.random.randint(0, num_actions)  # Explore: random action
    else:
        action_idx = np.argmax(q_table[a, b, 1 if op == '+' else 0])  # Exploit: best action

    predicted_result = action_space[action_idx]

    # Calculate reward
    if predicted_result == correct_result:
        reward = 10  # High reward for correct prediction
    else:
        reward = -abs(predicted_result - correct_result)  # Penalize incorrect predictions

    # Update Q-table
    old_value = q_table[a, b, 1 if op == '+' else 0, action_idx]
    best_future_value = np.max(q_table[a, b, 1 if op == '+' else 0])
    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * best_future_value)
    q_table[a, b, 1 if op == '+' else 0, action_idx] = new_value

    # Decay epsilon
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Print progress
    if episode % 1000 == 0:
        print(f"Episode {episode}, Epsilon: {epsilon:.3f}")

# Test the agent
def test_agent(a, b, op):
    action_idx = np.argmax(q_table[a, b, 1 if op == '+' else 0])
    predicted_result = action_space[action_idx]
    return predicted_result

# Test cases
print(f"Predicted result for 4 + 2: {test_agent(4, 2, '+')}")
print(f"Predicted result for 10 - 5: {test_agent(10, 5, '-')}")
print(f"Predicted result for 50 + 30: {test_agent(50, 30, '+')}")
print(f"Predicted result for 100 - 20: {test_agent(100, 20, '-')}")

Episode 0, Epsilon: 0.995
Episode 1000, Epsilon: 0.010
Episode 2000, Epsilon: 0.010
Episode 3000, Epsilon: 0.010
Episode 4000, Epsilon: 0.010
Episode 5000, Epsilon: 0.010
Episode 6000, Epsilon: 0.010
Episode 7000, Epsilon: 0.010
Episode 8000, Epsilon: 0.010
Episode 9000, Epsilon: 0.010
Predicted result for 4 + 2: 56
Predicted result for 10 - 5: 1
Predicted result for 50 + 30: 54
Predicted result for 100 - 20: 73


In [26]:
import numpy as np

# Define Q-table
action_space = list(range(-20, 21))  # Allow results from -20 to 20 (smaller range)
num_actions = len(action_space)
q_table = np.random.rand(21, 21, 2, num_actions) * 0.01  # Small random initialization

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995
epsilon_min = 0.01

# Training loop
num_episodes = 10000
for episode in range(num_episodes):
    a = np.random.randint(0, 21)  # Limit numbers to 0-20
    b = np.random.randint(0, 21)
    op = np.random.choice(['+', '-'])
    if op == '+':
        correct_result = a + b
    else:
        correct_result = a - b

    # Choose action (predicted result)
    if np.random.rand() < epsilon:
        action_idx = np.random.randint(0, num_actions)  # Explore: random action
    else:
        action_idx = np.argmax(q_table[a, b, 1 if op == '+' else 0])  # Exploit: best action

    predicted_result = action_space[action_idx]

    # Calculate reward
    if predicted_result == correct_result:
        reward = 10  # High reward for correct prediction
    else:
        reward = -abs(predicted_result - correct_result)  # Penalize incorrect predictions

    # Update Q-table
    old_value = q_table[a, b, 1 if op == '+' else 0, action_idx]
    best_future_value = np.max(q_table[a, b, 1 if op == '+' else 0])
    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * best_future_value)
    q_table[a, b, 1 if op == '+' else 0, action_idx] = new_value

    # Decay epsilon
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Print progress
    if episode % 1000 == 0:
        print(f"Episode {episode}, Epsilon: {epsilon:.3f}")

# Test the agent
def test_agent(a, b, op):
    action_idx = np.argmax(q_table[a, b, 1 if op == '+' else 0])
    predicted_result = action_space[action_idx]
    return predicted_result

# Test cases
print(f"Predicted result for 4 + 2: {test_agent(4, 2, '+')}")
print(f"Predicted result for 10 - 5: {test_agent(10, 5, '-')}")
print(f"Predicted result for 15 + 3: {test_agent(15, 3, '+')}")
print(f"Predicted result for 20 - 10: {test_agent(20, 10, '-')}")

Episode 0, Epsilon: 0.995
Episode 1000, Epsilon: 0.010
Episode 2000, Epsilon: 0.010
Episode 3000, Epsilon: 0.010
Episode 4000, Epsilon: 0.010
Episode 5000, Epsilon: 0.010
Episode 6000, Epsilon: 0.010
Episode 7000, Epsilon: 0.010
Episode 8000, Epsilon: 0.010
Episode 9000, Epsilon: 0.010
Predicted result for 4 + 2: 8
Predicted result for 10 - 5: 8
Predicted result for 15 + 3: 18
Predicted result for 20 - 10: -16
