In [1]:
# Setup
# NOTE: the lines starting with `!` and `%` below are IPython/Jupyter syntax
# (shell escape and line magic); this cell is not valid plain Python outside
# a notebook kernel.
!pip install -q pandas numpy matplotlib seaborn tqdm

# pandas, seaborn plotting helpers and pickle are not used by the code visible
# in this cell -- presumably later cells rely on them (verify before removing).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from collections import defaultdict
from tqdm import tqdm
import pickle
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet, seaborn colour palette, and
# inline rendering of figures in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Progress marker printed once the cell's setup has run.
print("Setup completato!")

# Blackjack Environment
class BlackjackEnv:
    """Simplified multi-deck blackjack environment with two actions.

    Cards are plain integers: aces are stored as 11, number cards as their
    face value (2-10) and the three face cards as 10.  Only STAND (action 0)
    and HIT (action 1) are supported.  Rewards are +1 (player wins), -1
    (player loses) and 0 (push, or a non-terminal hit).

    A game state is a dict with the keys ``'player_hand'`` (list of card
    values), ``'dealer_hand'`` (list of card values) and ``'dealer_showing'``
    (the dealer's first card).
    """

    def __init__(self, num_decks=8):
        """Create the environment and shuffle a fresh shoe.

        Args:
            num_decks: Number of 52-card decks in the shoe.

        Raises:
            ValueError: If ``num_decks`` is smaller than 1 (an empty shoe
                could never deal a card).
        """
        if num_decks < 1:
            raise ValueError(f"num_decks must be >= 1, got {num_decks}")
        self.num_decks = num_decks
        self.reset_deck()

    def reset_deck(self):
        """Rebuild the full shoe (``num_decks`` decks) and shuffle it."""
        # One suit: ace (11), 2..10, then jack/queen/king (all worth 10).
        one_suit = [11] + list(range(2, 11)) + [10, 10, 10]
        deck = one_suit * (4 * self.num_decks)
        random.shuffle(deck)
        self.deck = deck

    def draw_card(self):
        """Pop and return the top card, reshuffling first if the shoe is low.

        When fewer than 20 cards remain the whole shoe is rebuilt, which also
        puts cards currently held in hands back into play.
        """
        if len(self.deck) < 20:
            self.reset_deck()
        return self.deck.pop()

    def get_hand_value(self, hand):
        """Return ``(value, is_soft)`` for a hand.

        Aces start at 11 and are demoted to 1 one at a time while the hand
        would otherwise exceed 21.  ``is_soft`` is True when at least one ace
        is still counted as 11 and the hand is not bust.
        """
        value = sum(hand)
        aces = hand.count(11)
        while value > 21 and aces > 0:
            value -= 10
            aces -= 1
        is_soft = (aces > 0 and value <= 21)
        return value, is_soft

    def is_bust(self, hand):
        """Return True when the best value of ``hand`` exceeds 21."""
        value, _ = self.get_hand_value(hand)
        return value > 21

    def dealer_play(self, dealer_hand):
        """Play out the dealer's hand in place and return it.

        The dealer draws while below 17 and also on a soft 17; it stops on
        hard 17 or more, soft 18 or more, or a bust.
        """
        while True:
            value, is_soft = self.get_hand_value(dealer_hand)
            must_hit = value < 17 or (value == 17 and is_soft)
            if not must_hit:
                break
            dealer_hand.append(self.draw_card())
        return dealer_hand

    def reset(self):
        """Deal a new round and return its initial state dict.

        Two cards go to the player first, then two to the dealer; the
        dealer's first card is the one exposed as ``'dealer_showing'``.
        """
        player_hand = [self.draw_card(), self.draw_card()]
        dealer_hand = [self.draw_card(), self.draw_card()]
        return {
            'player_hand': player_hand,
            'dealer_hand': dealer_hand,
            'dealer_showing': dealer_hand[0]
        }

    def step(self, state, action):
        """Apply ``action`` to ``state`` and return the transition.

        The hands inside ``state`` are copied, so the caller's state dict is
        never mutated (the shoe itself is consumed).

        Args:
            state: State dict as produced by :meth:`reset` or a previous step.
            action: 0 to STAND, 1 to HIT.

        Returns:
            ``(next_state, reward, done, info)``.  ``info['outcome']`` is set
            on terminal transitions to one of ``'player_bust'``,
            ``'dealer_bust'``, ``'player_wins'``, ``'dealer_wins'`` or
            ``'push'``; it is an empty dict after a non-terminal hit.

        Raises:
            ValueError: If ``action`` is neither 0 nor 1.  Previously such an
                action was silently ignored and returned a non-terminal state.
        """
        if action not in (0, 1):
            raise ValueError(f"action must be 0 (STAND) or 1 (HIT), got {action!r}")

        player_hand = list(state['player_hand'])
        dealer_hand = list(state['dealer_hand'])
        dealer_showing = state['dealer_showing']
        done = False
        reward = 0
        info = {}

        if action == 1:  # HIT
            player_hand.append(self.draw_card())
            if self.is_bust(player_hand):
                reward = -1
                done = True
                info['outcome'] = 'player_bust'
        else:  # STAND
            done = True
            dealer_hand = self.dealer_play(dealer_hand)
            player_value, _ = self.get_hand_value(player_hand)
            dealer_value, _ = self.get_hand_value(dealer_hand)

            if dealer_value > 21:
                reward = 1
                info['outcome'] = 'dealer_bust'
            elif player_value > dealer_value:
                reward = 1
                info['outcome'] = 'player_wins'
            elif player_value < dealer_value:
                reward = -1
                info['outcome'] = 'dealer_wins'
            else:
                reward = 0
                info['outcome'] = 'push'

        next_state = {
            'player_hand': player_hand,
            'dealer_hand': dealer_hand,
            'dealer_showing': dealer_showing
        }
        return next_state, reward, done, info

# Progress marker printed once the BlackjackEnv class has been defined.
print("Environment pronto")
# SARSA Agent
def state_to_tuple(state, env):
    """Collapse a game state dict into a hashable key.

    Args:
        state: Dict providing ``'player_hand'`` and ``'dealer_showing'``.
        env: Object whose ``get_hand_value(hand)`` returns
            ``(value, is_soft)``.

    Returns:
        ``(player_value, soft_flag, dealer_showing)`` where ``soft_flag`` is
        ``is_soft`` converted to an int.
    """
    total, soft = env.get_hand_value(state['player_hand'])
    soft_flag = int(soft)
    upcard = state['dealer_showing']
    return total, soft_flag, upcard

class SARSAAgent:
    """Tabular SARSA agent for a two-action (STAND=0 / HIT=1) card game.

    Q-values live in a nested ``defaultdict`` keyed first by the tuple
    returned from the module-level ``state_to_tuple`` and then by action;
    unseen entries read as 0.0.  The class also relies on ``random``,
    ``tqdm`` and ``np`` being available at module scope.

    Attributes set by ``__init__``: ``lr``, ``gamma``, ``epsilon``,
    ``epsilon_decay``, ``epsilon_min``, ``q_table``, ``episode_rewards``
    (one entry per training episode) and the cumulative training counters
    ``wins``, ``losses`` and ``draws``.
    """

    # The complete action set; greedy selection must consider both entries
    # even when only one of them has been written to the Q-table so far.
    _ACTIONS = (0, 1)

    def __init__(self, learning_rate=0.1, discount_factor=0.95,
                 epsilon=1.0, epsilon_decay=0.9999, epsilon_min=0.01):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            learning_rate: Step size applied to each TD error.
            discount_factor: Weight of the next state-action value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor multiplied into epsilon after each episode.
            epsilon_min: Lower bound for epsilon.
        """
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.episode_rewards = []
        self.wins = 0
        self.losses = 0
        self.draws = 0

    @staticmethod
    def _greedy_action(q_values):
        """Return the action with the highest Q-value, ties broken randomly.

        Actions missing from ``q_values`` count as 0.0 (the table default), so
        an untried action is preferred over a tried one with negative value.
        """
        action_values = {a: q_values.get(a, 0.0) for a in SARSAAgent._ACTIONS}
        best = max(action_values.values())
        return random.choice([a for a, q in action_values.items() if q == best])

    def _td_update(self, state_key, action, target):
        """Move ``Q[state_key][action]`` towards ``target`` by ``lr``."""
        current_q = self.q_table[state_key][action]
        self.q_table[state_key][action] = current_q + self.lr * (target - current_q)

    def get_best_action(self, state, env):
        """Return the greedy action for ``state`` under the current Q-table.

        Unknown states yield a uniformly random action.  The lookup uses
        ``.get`` so that merely querying a state does not add it to the table.
        """
        state_key = state_to_tuple(state, env)
        return self._greedy_action(self.q_table.get(state_key, {}))

    def choose_action(self, state, env, training=True):
        """Pick an action epsilon-greedily (purely greedy when not training).

        A player total of 21 or more always results in STAND (0), regardless
        of epsilon or the Q-table.
        """
        player_value, _ = env.get_hand_value(state['player_hand'])
        if player_value >= 21:
            return 0
        if training and random.random() < self.epsilon:
            return random.choice(self._ACTIONS)
        return self.get_best_action(state, env)

    def update_q_value(self, state, action, reward, next_state, next_action, env):
        """Apply the SARSA update for a non-terminal transition.

        ``Q(s, a) += lr * (reward + gamma * Q(s', a') - Q(s, a))``
        """
        state_key = state_to_tuple(state, env)
        next_state_key = state_to_tuple(next_state, env)
        # Read the current entry first so table insertion order matches the
        # order in which states are visited.
        self.q_table[state_key][action]
        next_q = self.q_table[next_state_key][next_action]
        self._td_update(state_key, action, reward + self.gamma * next_q)

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below the minimum."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def train(self, env, num_episodes=50000, verbose_every=5000):
        """Run SARSA for ``num_episodes`` episodes, updating the Q-table.

        Args:
            env: Environment exposing ``reset()``, ``step(state, action)`` and
                ``get_hand_value(hand)``.
            num_episodes: Number of episodes to play.
            verbose_every: Print a progress line every this many episodes;
                0 or None disables the periodic report.

        The counters and ``episode_rewards`` accumulate across calls, so the
        reported win rates are computed over all episodes trained so far.
        """
        print(f"Training SARSA: {num_episodes} episodi...")
        for episode in tqdm(range(num_episodes)):
            state = env.reset()
            action = self.choose_action(state, env, training=True)
            done = False
            episode_reward = 0
            steps = 0

            # The step cap is a safety net against a non-terminating episode.
            while not done and steps < 50:
                next_state, reward, done, _info = env.step(state, action)

                if done:
                    # Terminal transition: there is no next state-action value.
                    self._td_update(state_to_tuple(state, env), action, reward)
                else:
                    next_action = self.choose_action(next_state, env, training=True)
                    self.update_q_value(state, action, reward, next_state, next_action, env)
                    state = next_state
                    action = next_action

                episode_reward += reward
                steps += 1

            self.episode_rewards.append(episode_reward)
            if episode_reward > 0:
                self.wins += 1
            elif episode_reward < 0:
                self.losses += 1
            else:
                self.draws += 1

            self.decay_epsilon()

            if verbose_every and (episode + 1) % verbose_every == 0:
                recent_rewards = self.episode_rewards[-verbose_every:]
                avg_reward = np.mean(recent_rewards)
                # self.wins is cumulative, so divide by the cumulative count.
                win_rate = self.wins / len(self.episode_rewards)
                print(f"\nEp {episode + 1}: Avg Reward={avg_reward:.3f}, WinRate={win_rate:.3f}, Eps={self.epsilon:.4f}")

        total_episodes = len(self.episode_rewards)
        overall_win_rate = self.wins / total_episodes if total_episodes else 0.0
        print(f"\n✓ Training completato!")
        print(f"  Q-table: {len(self.q_table)} stati")
        print(f"  Win rate: {overall_win_rate:.4f}")

    def evaluate(self, env, num_episodes=10000):
        """Play ``num_episodes`` greedy episodes and report the outcome rates.

        The Q-table is not updated and the training counters are untouched.

        Returns:
            Dict with ``'win_rate'``, ``'loss_rate'``, ``'draw_rate'`` and
            ``'avg_reward'``; all rates are 0 when no episode is played.
        """
        print(f"\nEvaluation su {num_episodes} episodi...")
        wins = 0
        losses = 0
        draws = 0
        total_reward = 0

        for _ in tqdm(range(num_episodes)):
            state = env.reset()
            done = False
            episode_reward = 0
            steps = 0

            while not done and steps < 50:
                action = self.choose_action(state, env, training=False)
                state, reward, done, _info = env.step(state, action)
                episode_reward += reward
                steps += 1

            total_reward += episode_reward
            if episode_reward > 0:
                wins += 1
            elif episode_reward < 0:
                losses += 1
            else:
                draws += 1

        # Avoid a ZeroDivisionError when asked to evaluate zero episodes.
        denom = max(num_episodes, 1)
        win_rate = wins / denom
        loss_rate = losses / denom
        draw_rate = draws / denom
        avg_reward = total_reward / denom

        print(f"\nRisultati:")
        print(f"  Win Rate:  {win_rate:.4f}")
        print(f"  Loss Rate: {loss_rate:.4f}")
        print(f"  Draw Rate: {draw_rate:.4f}")
        print(f"  Avg Reward: {avg_reward:.4f}")

        return {'win_rate': win_rate, 'loss_rate': loss_rate, 'draw_rate': draw_rate, 'avg_reward': avg_reward}

# Progress marker printed once the SARSAAgent class has been defined.
print("Agente SARSA pronto")

Setup completato!
Environment pronto
Agente SARSA pronto
