In [13]:

# PIPELINE 3 - OFFLINE Q-LEARNING (ROBUSTA END-TO-END)


import pandas as pd
import json
import ast
import numpy as np
import random
from collections import defaultdict
from tqdm import tqdm

# STEP 0 - CSV REPAIR (NO ROWS DROPPED)

INPUT_CSV  = "blackjack_simulator.csv"
FIXED_CSV  = "blackjack_simulator_fixed.csv"

with open(INPUT_CSV, "r", encoding="utf-8", errors="ignore") as f:
    content = f.read()

quote_count = content.count('"')

# Se numero di virgolette dispari → stringa non chiusa a EOF
if quote_count % 2 != 0:
    print("⚠️ CSV invalido: stringa non chiusa a fine file → chiusura forzata")
    content += '"\n'

with open(FIXED_CSV, "w", encoding="utf-8") as f:
    f.write(content)

print("✓ CSV riparato:", FIXED_CSV)


# STEP 1 - LOAD DATASET

dataset = pd.read_csv(FIXED_CSV)
print(f"✓ Dataset caricato: {len(dataset)} righe")

# STEP 2 - PARSING FUNZIONI

def parse_hand(x):
    if not isinstance(x, str):
        return []
    try:
        return json.loads(x)
    except:
        try:
            return ast.literal_eval(x)
        except:
            return []

def card_value(card):
    c = str(card).strip().upper()
    if c in {"J", "Q", "K"}:
        return 10
    if c == "A":
        return 11
    try:
        return int(c)
    except:
        return 0

def hand_value_and_soft(hand):
    values = [card_value(c) for c in hand]
    total = sum(values)
    aces = hand.count("A")
    while total > 21 and aces > 0:
        total -= 10
        aces -= 1
    return total, int(aces > 0)

# STEP 3 - FEATURE ENGINEERING

dataset["initial_hand"] = dataset["initial_hand"].apply(parse_hand)

tmp = dataset["initial_hand"].apply(hand_value_and_soft)
dataset["player_sum"] = tmp.apply(lambda x: x[0]).astype(int)
dataset["player_is_soft"] = tmp.apply(lambda x: x[1]).astype(int)
dataset["dealer_up"] = dataset["dealer_up"].astype(int)

def extract_action(actions):
    if not isinstance(actions, str):
        return None
    try:
        acts = json.loads(actions)
    except:
        try:
            acts = ast.literal_eval(actions)
        except:
            return None
    if not isinstance(acts, list) or len(acts) == 0:
        return None
    last = acts[-1]
    if last == "H":
        return 1
    if last == "S":
        return 0
    return None

dataset["action"] = dataset["actions_taken"].apply(extract_action)
dataset = dataset.dropna(subset=["action"])
dataset["action"] = dataset["action"].astype(int)

def reward_from_win(w):
    if w > 0:
        return 1.0
    if w == 0:
        return 0.0
    return -1.0

dataset["reward"] = dataset["win"].apply(reward_from_win)

dataset = dataset[
    ["player_sum", "dealer_up", "player_is_soft", "action", "reward"]
].copy()

print(f"✓ Transizioni valide: {len(dataset)}")

# STEP 4 - OFFLINE Q-LEARNING

states  = list(zip(dataset["player_sum"],
                   dataset["dealer_up"],
                   dataset["player_is_soft"]))
actions = dataset["action"].tolist()
rewards = dataset["reward"].tolist()

class OfflineQLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9):
        self.Q = defaultdict(lambda: [0.0, 0.0])
        self.alpha = alpha
        self.gamma = gamma

    def update(self, s, a, r, s_next):
        q = self.Q[s][a]
        target = r + self.gamma * max(self.Q[s_next])
        self.Q[s][a] += self.alpha * (target - q)

agent = OfflineQLearningAgent()

print("\n1. Training Offline Q-learning...")
for i in tqdm(range(len(states) - 1), desc="Offline training"):
    agent.update(states[i], actions[i], rewards[i], states[i + 1])

print("✓ Training completato")
print(f"✓ Stati Q-table: {len(agent.Q)}")

def policy_action(state):
    return int(np.argmax(agent.Q[state])) if state in agent.Q else 0

# STEP 5 - BLACKJACK ENV (EVALUATION)

class BlackjackEnv:
    def __init__(self):
        self.deck = []

    def reset(self):
        self.deck = ['2','3','4','5','6','7','8','9','10','J','Q','K','A'] * 24
        random.shuffle(self.deck)
        self.player = [self.deck.pop(), self.deck.pop()]
        self.dealer = [self.deck.pop(), self.deck.pop()]
        return self.state()

    def card(self, c):
        return 10 if c in ['J','Q','K'] else 11 if c == 'A' else int(c)

    def value(self, h):
        v = sum(self.card(c) for c in h)
        aces = h.count('A')
        while v > 21 and aces > 0:
            v -= 10
            aces -= 1
        return v

    def state(self):
        return (
            self.value(self.player),
            self.card(self.dealer[0]),
            int('A' in self.player and self.value(self.player) <= 21)
        )

    def step(self, a):
        if a == 1:
            self.player.append(self.deck.pop())
            if self.value(self.player) > 21:
                return self.state(), -1, True
            return self.state(), 0, False

        while self.value(self.dealer) < 17:
            self.dealer.append(self.deck.pop())

        p, d = self.value(self.player), self.value(self.dealer)
        if d > 21 or p > d:
            return self.state(), 1, True
        if p == d:
            return self.state(), 0, True
        return self.state(), -1, True

# STEP 6 - EVALUATION

env = BlackjackEnv()
N = 1_000_000
w = l = d = 0

for _ in tqdm(range(N), desc="Evaluation"):
    s = env.reset()
    done = False
    while not done:
        s, r, done = env.step(policy_action(s))
    if r > 0:
        w += 1
    elif r < 0:
        l += 1
    else:
        d += 1

print("\n=== RISULTATI FINALI ===")
print(f"Win Rate:  {w/N:.4f}")
print(f"Loss Rate: {l/N:.4f}")
print(f"Draw Rate: {d/N:.4f}")


✓ CSV riparato: blackjack_simulator_fixed.csv
✓ Dataset caricato: 1000000 righe
✓ Transizioni valide: 1000000

1. Training Offline Q-learning...


Offline training: 100%|██████████| 999999/999999 [00:01<00:00, 573733.87it/s]


✓ Training completato
✓ Stati Q-table: 190


Evaluation: 100%|██████████| 1000000/1000000 [02:19<00:00, 7144.34it/s]


=== RISULTATI FINALI ===
Win Rate:  0.3814
Loss Rate: 0.5663
Draw Rate: 0.0523



