In [9]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print("Hola Mundo")

Hola Mundo


## Pipeline y prototipo de detección de eventos
Este notebook añade un pipeline mínimo para:
- Cargar `data/dataset/england_epl.pkl` (o `data/dataset/england_epl_2015-2016.pkl`, más pequeño, si existe)
- Convertir `annotations` a etiquetas por frame (asumiendo `position` en ms -> frame = position//1000)
- Preparar ventanas temporales (secuencias)
- Definir un modelo temporal pequeño (Conv1D) en PyTorch
- Hacer un smoke test para validar shapes

Eventos objetivo por defecto: `['Goal', 'Shots on target']`. Si quieres otros eventos, indícalos y lo modifico.

In [17]:
# Cargar datos y convertir anotaciones a etiquetas por frame
import pickle
import numpy as np
import os
from collections import defaultdict

# Dataset selection: start from the full pickle and switch to the smaller one
# when it is present on disk, so quick experiments load faster.
PKL_SMALL = 'data/dataset/england_epl_2015-2016.pkl'
PKL_FULL = 'data/dataset/england_epl.pkl'
DATA_PKL = PKL_FULL
if os.path.exists(PKL_SMALL):
    DATA_PKL = PKL_SMALL
TARGET_EVENTS = ['Goal', 'Shots on target']  # edit this list to target other events

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open files that come from a trusted source.
with open(DATA_PKL, 'rb') as f:
    data = pickle.load(f)

# Per the original author's note, 'X' is a list of matches where each match is
# a list of per-camera arrays; 'y' is indexed per match alongside it.
X_all, y_all = data['X'], data['y']

print('matches:', len(X_all))

# Función para convertir annotations a array por frame
def annotations_to_frame_labels(annotations, num_frames, target_events, ms_per_frame=1000):
    """Build a per-frame multi-hot label matrix from event annotations.

    Args:
        annotations: iterable of dict-like objects. Only the 'label' and
            'position' keys are read (via ``.get``); 'position' is treated as
            a time offset in milliseconds. ``None`` is treated as "no events".
        num_frames: number of rows in the returned matrix.
        target_events: ordered event labels; column ``i`` of the result
            corresponds to ``target_events[i]``. Other labels are ignored.
        ms_per_frame: milliseconds covered by one frame. Defaults to 1000,
            i.e. ``frame = position // 1000``.

    Returns:
        ``np.float32`` array of shape ``(num_frames, len(target_events))``
        holding 1.0 where an event of that class falls in that frame.

    Notes:
        - Annotations whose position cannot be converted with ``int()`` or is
          negative are skipped.
        - Positions past the end are clamped to the last frame.
    """
    n_classes = len(target_events)
    labels = np.zeros((num_frames, n_classes), dtype=np.float32)
    # With zero frames there is no "last frame" to clamp to; indexing row -1
    # of an empty array would raise IndexError, so return the empty matrix.
    if num_frames == 0 or annotations is None:
        return labels

    label_to_idx = {lab: i for i, lab in enumerate(target_events)}
    last_frame = num_frames - 1
    for ann in annotations:
        lab = ann.get('label')
        pos = ann.get('position')
        if lab not in label_to_idx or pos is None:
            continue
        try:
            pos_ms = int(pos)
        except (TypeError, ValueError, OverflowError):
            # Unparseable position (e.g. non-numeric string, NaN, inf): skip it.
            continue
        frame = int(pos_ms // ms_per_frame)
        if frame < 0:
            continue
        labels[min(frame, last_frame), label_to_idx[lab]] = 1.0
    return labels

# Example on the first match.
first_X = X_all[0]
# The frame count is taken from the first camera only; the captured output
# below shows the second camera of this match has a different count.
num_frames = first_X[0].shape[0]
print('num_frames example:', num_frames)
labels0 = annotations_to_frame_labels(y_all[0]['annotations'], num_frames, TARGET_EVENTS)
# Summing over axis 0 gives the number of labelled frames per target event.
print('labels0 shape:', labels0.shape, 'sum per class:', labels0.sum(axis=0))

# Combinar cámaras: concatenamos features por eje de características
def combine_cameras(X_match, mode='min'):
    """Concatenate per-camera arrays along the feature axis (axis=1).

    NOTE: a function with this same name is defined again in a later cell of
    this notebook; once that cell runs, it replaces this definition. This
    version additionally prints the per-camera frame counts on every call.

    Args:
        X_match: list of arrays, one per camera, indexed as (frames, features).
        mode: how to reconcile cameras with different frame counts.
            'min' truncates every camera to the shortest one;
            'pad' appends zero rows up to the longest one.

    Returns:
        A single array whose rows are frames and whose columns are the
        cameras' features side by side.

    Raises:
        ValueError: if X_match is not a list, is empty, a camera has 0 frames
            in 'min' mode, or mode is not 'min' or 'pad'.
    """
    if not isinstance(X_match, list):
        raise ValueError('X_match debe ser una lista de arrays')
    if not X_match:
        # Without this guard the failure surfaces as an opaque
        # "min() arg is an empty sequence" further down.
        raise ValueError('X_match must not be empty')
    lengths = [arr.shape[0] for arr in X_match]
    print('per-camera frame counts:', lengths)
    if len(set(lengths)) == 1:
        return np.concatenate(X_match, axis=1)
    if mode == 'min':
        min_len = min(lengths)
        if min_len == 0:
            # Same guard as the later redefinition: truncating to 0 frames
            # would silently produce an empty match.
            raise ValueError('Una de las cámaras tiene 0 frames')
        truncated = [arr[:min_len] for arr in X_match]
        print(f'Warning: unequal frame counts {lengths}, truncating to min={min_len}')
        return np.concatenate(truncated, axis=1)
    elif mode == 'pad':
        max_len = max(lengths)
        padded = []
        for arr in X_match:
            if arr.shape[0] < max_len:
                # Zero rows with the camera's own width and dtype.
                pad = np.zeros((max_len - arr.shape[0], arr.shape[1]), dtype=arr.dtype)
                padded.append(np.vstack([arr, pad]))
            else:
                padded.append(arr)
        return np.concatenate(padded, axis=1)
    else:
        raise ValueError("mode must be 'min' or 'pad'")

X0_comb = combine_cameras(first_X, mode='min')
print('combined feature shape:', X0_comb.shape)

# Debug: print the per-camera frame counts for the first six matches.
for mi, Xm in enumerate(X_all[:6]):
    lens = [arr.shape[0] for arr in Xm]
    print(f'match {mi}: {lens}')

# Note: for quick tests the smaller PKL file is used when it is present.
print('Using dataset:', DATA_PKL)

matches: 49
num_frames example: 5690
labels0 shape: (5690, 2) sum per class: [ 4. 24.]
per-camera frame counts: [5690, 5828]
combined feature shape: (5690, 1024)
match 0: [5690, 5828]
match 1: [5679, 5619]
match 2: [5400, 5399]
match 3: [5400, 5400]
match 4: [5964, 6174]
match 5: [5400, 5399]
Using dataset: data/dataset/england_epl_2015-2016.pkl
Using dataset: data/dataset/england_epl_2015-2016.pkl


In [18]:
# Dataset y DataLoader PyTorch (uso de secuencias)
import torch
from torch.utils.data import Dataset, DataLoader

SEQUENCE_LENGTH = 30  # window length in frames (the original note equates frames with seconds); adjust as needed
STRIDE = 5  # step, in frames, between the starts of consecutive windows

# Función robusta para combinar cámaras
def combine_cameras(X_match, mode='min'):
    """Concatenate per-camera arrays along the feature axis (axis=1).

    Args:
        X_match: non-empty list of arrays, one per camera, indexed as
            (frames, features).
        mode: how to reconcile cameras with different frame counts.
            'min' truncates every camera to the shortest one;
            'pad' appends zero rows up to the longest one.

    Returns:
        An array of shape (num_frames, total_feat_dim).

    Raises:
        ValueError: if X_match is not a list, is empty, a camera has 0 frames
            in 'min' mode, or mode is not 'min' or 'pad'.
    """
    if not isinstance(X_match, list):
        raise ValueError('X_match debe ser una lista de arrays')
    if not X_match:
        # Without this guard the failure surfaces as an opaque
        # "min() arg is an empty sequence" / "max() arg ..." further down.
        raise ValueError('X_match must not be empty')
    lengths = [arr.shape[0] for arr in X_match]
    if len(set(lengths)) == 1:
        # All cameras already agree on the frame count.
        return np.concatenate(X_match, axis=1)

    if mode == 'min':
        min_len = min(lengths)
        if min_len == 0:
            raise ValueError('Una de las cámaras tiene 0 frames')
        truncated = [arr[:min_len] for arr in X_match]
        print(f"Warning: unequal frame counts {lengths}, truncating to min={min_len}")
        return np.concatenate(truncated, axis=1)
    elif mode == 'pad':
        max_len = max(lengths)
        padded = []
        for arr in X_match:
            if arr.shape[0] < max_len:
                # Zero rows with the camera's own width and dtype.
                pad = np.zeros((max_len - arr.shape[0], arr.shape[1]), dtype=arr.dtype)
                padded.append(np.vstack([arr, pad]))
            else:
                padded.append(arr)
        return np.concatenate(padded, axis=1)
    else:
        raise ValueError("mode must be 'min' or 'pad'")

# Build a small example dataset from the first 4 matches.
N_EXAMPLE_MATCHES = 4
X_small = []
y_small = []
for i, X_match in enumerate(X_all[:N_EXAMPLE_MATCHES]):
    # Cameras with different frame counts are truncated to the shortest one.
    combined = combine_cameras(X_match, mode='min')
    X_small.append(combined)
    # Labels are sized from the combined array so features and labels have
    # the same number of rows.
    num_frames = combined.shape[0]
    labels = annotations_to_frame_labels(y_all[i]['annotations'], num_frames, TARGET_EVENTS)
    y_small.append(labels)

# Feature normalization: compute per-column mean/std over X_small and apply them.
# Standardized inputs usually help the model converge.
def compute_mean_std(X_list):
    """Return the per-column mean and (std + 1e-6) over all rows in X_list.

    X_list is a list of arrays that are stacked vertically before the
    statistics are taken along axis 0.
    """
    stacked = np.vstack(list(X_list))
    col_mean = stacked.mean(axis=0)
    # The small epsilon keeps a later division by std away from zero.
    col_std = stacked.std(axis=0) + 1e-6
    return col_mean, col_std

mean_feat, std_feat = compute_mean_std(X_small)
print('feature dim:', mean_feat.shape[0])
# Apply the normalization by building new arrays and rebinding X_small; the
# arrays previously held in the list are not modified.
# NOTE(review): the statistics come from every match in X_small, including the
# matches the later training cell holds out for validation (X_small[split:]);
# consider computing them from the training matches only.
X_small = [(x - mean_feat) / std_feat for x in X_small]
print('Normalized X_small using computed mean/std')

# Dataset que acepta arrays ya combinados o listas de arrays
class SoccerSequenceDataset(Dataset):
    """Sliding-window dataset over per-match feature and label arrays.

    Each sample is a window of ``seq_len`` consecutive rows taken from one
    match, paired with the same row range of that match's labels. Window
    starts advance by ``stride`` rows; windows never cross match boundaries.

    Args:
        X_matches: one entry per match. An entry may be an already combined
            array, or a list of arrays that is concatenated along axis 1.
        y_matches: one label array per match, indexed like X_matches.
        seq_len: number of rows per window (must be > 0).
        stride: step between consecutive window starts (must be > 0).

    Raises:
        ValueError: if seq_len or stride is not positive, or if a match that
            yields windows has no labels or fewer label rows than its last
            window needs.
    """
    def __init__(self, X_matches, y_matches, seq_len=SEQUENCE_LENGTH, stride=STRIDE):
        if seq_len <= 0 or stride <= 0:
            raise ValueError('seq_len and stride must be positive integers')
        self.seq_len = seq_len
        self.stride = stride
        self.samples = []  # (match index, window start row) pairs
        self.combined = []
        # X_matches may contain combined arrays (ndarray) or lists of arrays
        for xm in X_matches:
            if isinstance(xm, list):
                self.combined.append(np.concatenate(xm, axis=1))
            else:
                self.combined.append(xm)
        self.y_matches = y_matches
        for midx, arr in enumerate(self.combined):
            n = arr.shape[0]
            starts = range(0, n - seq_len + 1, stride)
            if len(starts) == 0:
                # Match shorter than one window: contributes no samples.
                continue
            # Fail early instead of returning truncated label windows later.
            needed = starts[-1] + seq_len
            if midx >= len(y_matches):
                raise ValueError(f'no labels provided for match {midx}')
            if len(y_matches[midx]) < needed:
                raise ValueError(
                    f'match {midx}: labels have {len(y_matches[midx])} rows '
                    f'but windows need {needed}'
                )
            self.samples.extend((midx, start) for start in starts)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        midx, start = self.samples[idx]
        stop = start + self.seq_len
        x = self.combined[midx][start:stop]  # seq_len rows of features
        y = self.y_matches[midx][start:stop]  # the matching seq_len rows of labels
        # Convert to float32 tensors.
        return torch.from_numpy(x).float(), torch.from_numpy(y).float()

# Create the DataLoader from the already combined arrays.
ds = SoccerSequenceDataset(X_small, y_small, seq_len=SEQUENCE_LENGTH, stride=STRIDE)
loader = DataLoader(ds, batch_size=8, shuffle=True)

print('dataset samples:', len(ds))
# Smoke test: pull a single batch to check the tensor shapes, then stop.
for xb, yb in loader:
    print('batch x shape:', xb.shape, 'batch y shape:', yb.shape)
    break


feature dim: 1024
Normalized X_small using computed mean/std
dataset samples: 4400
batch x shape: torch.Size([8, 30, 1024]) batch y shape: torch.Size([8, 30, 2])


In [None]:
# Fast temporal model (Conv1D) plus a short training run with a per-match train/val split
try:
    import torch
    import torch.nn as nn
    from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
    import os

    class TemporalConvModel(nn.Module):
        """Two-layer 1D-convolutional network producing per-frame logits.

        ``forward`` takes x of shape (B, T, D) and returns (B, T, n_classes).
        Both convolutions use kernel_size=3 with padding=1, so the sequence
        length T is unchanged.
        """
        def __init__(self, input_dim, n_classes, hidden=256, dropout=0.2):
            super().__init__()
            # Conv1d expects (B, C_in, L); forward permutes before the convs
            self.net = nn.Sequential(
                nn.Conv1d(input_dim, hidden, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden, hidden // 2, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Dropout(dropout)
            )
            self.fc = nn.Linear(hidden // 2, n_classes)

        def forward(self, x):
            # x: (B, T, D) -> conv needs (B, D, T)
            x = x.permute(0, 2, 1)
            out = self.net(x)  # (B, hidden // 2, T)
            out = out.permute(0, 2, 1)  # (B, T, hidden // 2)
            out = self.fc(out)  # (B, T, n_classes)
            return out

    # Prepare data: split by match (train/val) so windows from one match
    # never appear on both sides.
    # NOTE(review): X_small was normalized earlier with statistics computed
    # over all matches, including the validation ones selected here.
    n_matches = len(X_small)
    if n_matches < 2:
        raise RuntimeError('Se necesitan al menos 2 partidos en X_small para train/val split')
    split = int(0.75 * n_matches)
    train_X = X_small[:split]
    train_y = y_small[:split]
    val_X = X_small[split:]
    val_y = y_small[split:]

    # Build the datasets and loaders for each side of the split
    ds_train = SoccerSequenceDataset(train_X, train_y, seq_len=SEQUENCE_LENGTH, stride=STRIDE)
    ds_val = SoccerSequenceDataset(val_X, val_y, seq_len=SEQUENCE_LENGTH, stride=STRIDE)
    loader_train = DataLoader(ds_train, batch_size=8, shuffle=True)
    loader_val = DataLoader(ds_val, batch_size=8, shuffle=False)

    # Compute pos_weight for BCE (class balancing) from the training labels:
    # negatives / positives per class, or 1.0 when a class has no positives.
    all_y_train = np.vstack([m.reshape(-1, m.shape[-1]) for m in train_y])
    pos_counts = all_y_train.sum(axis=0)
    neg_counts = all_y_train.shape[0] - pos_counts
    pos_weights = [
        float(n / p) if p > 0 else 1.0
        for p, n in zip(pos_counts, neg_counts)
    ]
    # Created on the CPU by default; moved to the training device below.
    pos_weight_tensor = torch.tensor(pos_weights, dtype=torch.float32)

    input_dim = X_small[0].shape[1]
    n_classes = len(TARGET_EVENTS)
    model = TemporalConvModel(input_dim, n_classes, hidden=256, dropout=0.2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor.to(device))
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Train for several epochs, evaluating on the validation split after each
    N_EPOCHS = 10
    for epoch in range(N_EPOCHS):
        model.train()
        total_loss = 0.0
        n_batches = 0
        for xb, yb in loader_train:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            n_batches += 1
        avg_loss = total_loss / max(1, n_batches)
        print(f'epoch {epoch} avg_loss {avg_loss:.4f}')

        # Validation: collect per-frame targets and sigmoid probabilities
        model.eval()
        y_trues = []
        y_probs = []
        with torch.no_grad():
            for xb, yb in loader_val:
                p = torch.sigmoid(model(xb.to(device))).cpu().numpy()
                y_trues.append(yb.numpy().reshape(-1, yb.shape[-1]))
                y_probs.append(p.reshape(-1, p.shape[-1]))
        if len(y_trues) == 0:
            print('No hay samples de validación en este split')
            continue
        y_trues = np.vstack(y_trues)
        y_probs = np.vstack(y_probs)
        for c, lab in enumerate(TARGET_EVENTS):
            # A metric that is undefined for this split (e.g. only one class
            # present in the targets) is reported as None; any other error
            # is allowed to surface instead of being silently swallowed.
            try:
                auc = roc_auc_score(y_trues[:, c], y_probs[:, c])
            except ValueError:
                auc = None
            try:
                ap = average_precision_score(y_trues[:, c], y_probs[:, c])
            except ValueError:
                ap = None
            f1_val = f1_score(y_trues[:, c], (y_probs[:, c] > 0.2).astype(int), zero_division=0)
            print(f'Val - class {c} ({lab}): AUROC={auc} AP={ap} F1@0.2={f1_val}')

    # Save a checkpoint of the trained weights
    ckpt_dir = 'checkpoints'
    os.makedirs(ckpt_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(ckpt_dir, 'temporal_conv.pt'))
    print('Saved checkpoint to', os.path.join(ckpt_dir, 'temporal_conv.pt'))

except ModuleNotFoundError as e:
    # The try block also imports sklearn, so only print the PyTorch install
    # help when the missing module is actually torch.
    missing = e.name or ''
    if missing == 'torch' or missing.startswith('torch.'):
        print('PyTorch no está instalado en este entorno. Para instalar, ejecuta en PowerShell:')
        print('CPU-only (Windows PowerShell):')
        print('pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu')
        print('DirectML (Intel Iris Xe, Windows) - experimental:')
        print('pip install --upgrade pip')
        print('pip install torch-directml -U')
        print('Si usas conda o GPU, revisa https://pytorch.org/get-started/locally/')
    else:
        print(f'Missing required module: {missing!r}. Install it before running this cell.')
    print('Error:', e)

_IncompleteInputError: incomplete input (2422226349.py, line 120)