In [1]:
import csv
import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import torch
import pandas as pd
import numpy as np
import plotly.express as px

In [19]:
from environment.environment import Environment

from train.train import train_and_eval_agent

from agents.mab_agent import MAB_Agent
from agents.mc_agent import MC_Agent
from agents.sarsa_agent import SARSA_Agent
from agents.dqn_agent import DQN_Agent

In [20]:
tickers = ["AAPL", "AMZN", "GOOGL", "META", "MSFT", "NVDA", "TSLA"]

data = {i: {t: float(row[t]) for t in tickers} \
    for i, row in enumerate(csv.DictReader( \
    open("data/nasdaq_stock_prices.csv", mode='r'), delimiter=','))
}

In [21]:
INITIAL_BALANCE = 10000
WINDOW_SIZE = 5
EPISODES = 1000
VERBOSE = True
LOW_EPSILON = 0.1
HIGH_EPSILON = 0.5
GAMMA = 0.95
ALPHA = 0.3
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"

environment = Environment(data, window_size=WINDOW_SIZE, initial_balance=INITIAL_BALANCE, verbose=False)

In [6]:
mab_agent_low = MAB_Agent(environment, epsilon=LOW_EPSILON)
mab_agent_high = MAB_Agent(environment, epsilon=HIGH_EPSILON)

mc_agent_low = MC_Agent(environment, epsilon=LOW_EPSILON, gamma=GAMMA)
mc_agent_high = MC_Agent(environment, epsilon=HIGH_EPSILON, gamma=GAMMA)

sarsa_agent_low = SARSA_Agent(environment, epsilon=LOW_EPSILON, gamma=GAMMA)
sarsa_agent_high = SARSA_Agent(environment, epsilon=HIGH_EPSILON, alpha=ALPHA, gamma=GAMMA)

dqn_agent_low = DQN_Agent(environment, epsilon=LOW_EPSILON, gamma=GAMMA, device=DEVICE)
dqn_agent_high = DQN_Agent(environment, epsilon=HIGH_EPSILON, gamma=GAMMA, device=DEVICE)

Device: mps
Device: mps


In [7]:
results_mab_agent_low = train_and_eval_agent(mab_agent_low, environment, episodes=EPISODES, train=True, verbose=VERBOSE)
results_mab_agent_high = train_and_eval_agent(mab_agent_high, environment, episodes=EPISODES, train=True, verbose=VERBOSE)

results_mc_agent_low = train_and_eval_agent(mc_agent_low, environment, episodes=EPISODES, train=True, verbose=VERBOSE)
results_mc_agent_high = train_and_eval_agent(mc_agent_high, environment, episodes=EPISODES, train=True, verbose=VERBOSE)

results_sarsa_agent_low = train_and_eval_agent(sarsa_agent_low, environment, episodes=EPISODES, train=True, verbose=VERBOSE)
results_sarsa_agent_high = train_and_eval_agent(sarsa_agent_high, environment, episodes=EPISODES, train=True, verbose=VERBOSE)

results_dqn_agent_low = train_and_eval_agent(dqn_agent_low, environment, episodes=EPISODES, train=True, verbose=VERBOSE)
results_dqn_agent_high = train_and_eval_agent(dqn_agent_high, environment, episodes=EPISODES, train=True, verbose=VERBOSE)

Training | Épisode 1/1000 — Total reward: 527.91
Training | Épisode 2/1000 — Total reward: 518.66
Training | Épisode 3/1000 — Total reward: 730.61
Training | Épisode 4/1000 — Total reward: -645.54
Training | Épisode 5/1000 — Total reward: 1638.40
Training | Épisode 6/1000 — Total reward: 1950.79
Training | Épisode 7/1000 — Total reward: 1850.39
Training | Épisode 8/1000 — Total reward: 477.73
Training | Épisode 9/1000 — Total reward: 4166.53
Training | Épisode 10/1000 — Total reward: -580.51
Training | Épisode 11/1000 — Total reward: 788.31
Training | Épisode 12/1000 — Total reward: 1197.45
Training | Épisode 13/1000 — Total reward: -2216.20
Training | Épisode 14/1000 — Total reward: 2040.45
Training | Épisode 15/1000 — Total reward: 2747.89
Training | Épisode 16/1000 — Total reward: -1595.33
Training | Épisode 17/1000 — Total reward: 764.25
Training | Épisode 18/1000 — Total reward: -816.45
Training | Épisode 19/1000 — Total reward: 210.26
Training | Épisode 20/1000 — Total reward: -1

In [8]:
# 1. Résultats formatés dans un DataFrame
results_df = pd.DataFrame({
    f"MAB {mab_agent_low.epsilon}": results_mab_agent_low,
    f"MAB {mab_agent_high.epsilon}": results_mab_agent_high,
    f"MC {mc_agent_low.epsilon}": results_mc_agent_low,
    f"MC {mc_agent_high.epsilon}": results_mc_agent_high,
    f"SARSA {sarsa_agent_low.epsilon}": results_sarsa_agent_low,
    f"SARSA {sarsa_agent_high.epsilon}": results_sarsa_agent_high,
    f"DQN {dqn_agent_low.epsilon}": results_dqn_agent_low,
    f"DQN {dqn_agent_high.epsilon}": results_dqn_agent_high
})

# 2. Lissage via moyenne glissante
window = 20
ma_df = results_df.rolling(window).mean()
ma_df.columns = [f"{col} (MA)" for col in ma_df.columns]

# 3. Fusion et passage au format long
merged_df = pd.concat([results_df, ma_df], axis=1).reset_index().rename(columns={"index": "Episode"})
long_df = merged_df.melt(id_vars="Episode", var_name="Agent", value_name="Reward")
long_df["Type"] = long_df["Agent"].apply(lambda x: "Moving Average" if "(MA)" in x else "Raw")

# 4. Graphique interactif
fig = px.line(
    long_df[long_df["Type"] == "Moving Average"],
    x="Episode",
    y="Reward",
    color="Agent",
    title="Agent Performance Comparison (Moving Average)",
    log_y=False,
    labels={"Reward": "Total Rewards ($)", "Episode": "Episode"},
    color_discrete_sequence=px.colors.qualitative.Set2
)

# 5. Ajout de phases d'entraînement
n = len(results_df)
fig.add_vrect(x0=0, x1=n//3, fillcolor="blue", opacity=0.05, line_width=0, annotation_text="Early Training", annotation_position="top left")
fig.add_vrect(x0=n//3, x1=2*n//3, fillcolor="green", opacity=0.05, line_width=0, annotation_text="Middle Training", annotation_position="top left")
fig.add_vrect(x0=2*n//3, x1=n, fillcolor="red", opacity=0.05, line_width=0, annotation_text="Late Training", annotation_position="top left")

fig.update_layout(
    font=dict(family="Arial", size=14),
    title_font=dict(size=20),
    xaxis_title="Episode",
    yaxis_title="Total Rewards ($)",
    legend_title_text="Agent",
    hovermode="x unified",
    margin=dict(t=60, l=20, r=20, b=20)
)

fig.show()

# 7. Résumé statistique
print("\n PERFORMANCE SUMMARY")
print(f"{'Agent':<15} {'Mean':>10} {'Std':>10} {'Median':>10} {'Min':>10} {'Max':>10}")
for name, data in zip(results_df.columns, results_df.values.T):
    print(f"{name:<15} {np.mean(data):>10.2f} {np.std(data):>10.2f} {np.median(data):>10.2f} {np.min(data):>10.2f} {np.max(data):>10.2f}")


 PERFORMANCE SUMMARY
Agent                 Mean        Std     Median        Min        Max
MAB 0.1            1064.97    2628.77     667.59   -4591.83   23739.30
MAB 0.5            1270.01    2934.89     918.79   -5979.07   17580.71
MC 0.1             5338.71    6206.86    3880.50   -4247.85   56342.49
MC 0.5             2933.91    4345.92    2073.31   -4966.09   34036.63
SARSA 0.1          1282.11    2832.44    1017.82   -5285.85   24098.02
SARSA 0.5          1328.40    3229.53     919.49   -5411.92   28379.69
DQN 0.05           3102.11    5166.73    1806.13   -5029.89   34621.89


In [9]:
EPISODES_TEST = 100

In [22]:
# Basculer l'environnement en mode test
environment.train_mode = False

# Optionnel : désactiver l'exploration pendant le test (si méthode train dispo)
for agent in [mab_agent_low, mab_agent_high, mc_agent_low, mc_agent_high, sarsa_agent_low, sarsa_agent_high]:
    if hasattr(agent, "train"):
        agent.train(False)

# Lancer les tests
test_results_mab_low = train_and_eval_agent(mab_agent_low, environment, episodes=EPISODES_TEST, train=False, verbose=True)
test_results_mab_high = train_and_eval_agent(mab_agent_high, environment, episodes=EPISODES_TEST, train=False, verbose=True)

test_results_mc_low = train_and_eval_agent(mc_agent_low, environment, episodes=EPISODES_TEST, train=False, verbose=True)
test_results_mc_high = train_and_eval_agent(mc_agent_high, environment, episodes=EPISODES_TEST, train=False, verbose=True)

test_results_sarsa_low = train_and_eval_agent(sarsa_agent_low, environment, episodes=EPISODES_TEST, train=False, verbose=True)
test_results_sarsa_high = train_and_eval_agent(sarsa_agent_high, environment, episodes=EPISODES_TEST, train=False, verbose=True)

test_results_dqn_low = train_and_eval_agent(dqn_agent_low, environment, episodes=EPISODES_TEST, train=False, verbose=True)
test_results_dqn_high = train_and_eval_agent(dqn_agent_high, environment, episodes=EPISODES_TEST, train=False, verbose=True)

# 6. Résumé statistique des tests
print("\n PERFORMANCE SUMMARY (TEST)")
print(f"{'Agent':<15} {'Mean':>10} {'Std':>10} {'Median':>10} {'Min':>10} {'Max':>10}")
for name, data in zip(results_df.columns, [test_results_mab_low, test_results_mab_high, test_results_mc_low, test_results_mc_high, test_results_sarsa_low, test_results_sarsa_high, test_results_dqn_low, test_results_dqn_high]):
    print(f"{name:<15} {np.mean(data):>10.2f} {np.std(data):>10.2f} {np.median(data):>10.2f} {np.min(data):>10.2f} {np.max(data):>10.2f}")

# Affichage synthétique
print("\n📊 Résultats de test sur les 100 derniers jours :")
print(f"MAB (ε={mab_agent_low.epsilon}):  {test_results_mab_low[0]:.2f}")
print(f"MAB (ε={mab_agent_high.epsilon}): {test_results_mab_high[0]:.2f}")
print(f"MC  (ε={mc_agent_low.epsilon}):   {test_results_mc_low[0]:.2f}")
print(f"MC  (ε={mc_agent_high.epsilon}):  {test_results_mc_high[0]:.2f}")
print(f"SARSA (ε={sarsa_agent_low.epsilon}):  {test_results_sarsa_low[0]:.2f}")
print(f"SARSA (ε={sarsa_agent_high.epsilon}): {test_results_sarsa_high[0]:.2f}")
print(f"DQN (ε={dqn_agent_low.epsilon}):  {test_results_dqn_low[0]:.2f}")
print(f"DQN (ε={dqn_agent_high.epsilon}): {test_results_dqn_high[0]:.2f}")

Testing | Épisode 1/100 — Total reward: 163.13
Testing | Épisode 2/100 — Total reward: -772.39
Testing | Épisode 3/100 — Total reward: 150.81
Testing | Épisode 4/100 — Total reward: -157.45
Testing | Épisode 5/100 — Total reward: 163.13
Testing | Épisode 6/100 — Total reward: 460.92
Testing | Épisode 7/100 — Total reward: 1067.97
Testing | Épisode 8/100 — Total reward: 163.13
Testing | Épisode 9/100 — Total reward: 163.13
Testing | Épisode 10/100 — Total reward: 55.78
Testing | Épisode 11/100 — Total reward: -51.39
Testing | Épisode 12/100 — Total reward: 1076.13
Testing | Épisode 13/100 — Total reward: 19.99
Testing | Épisode 14/100 — Total reward: 157.87
Testing | Épisode 15/100 — Total reward: 163.13
Testing | Épisode 16/100 — Total reward: 163.13
Testing | Épisode 17/100 — Total reward: 163.13
Testing | Épisode 18/100 — Total reward: 1230.41
Testing | Épisode 19/100 — Total reward: 521.59
Testing | Épisode 20/100 — Total reward: 1335.60
Testing | Épisode 21/100 — Total reward: 2393