In [1]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset
from datasets import load_dataset, Dataset as HFDataset
from transformers import (
    VideoMAEFeatureExtractor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class MyVideoDataset(Dataset):
    """Map-style dataset that yields one clip (a folder of frame images) per CSV row.

    For every row the frames found in the row's ``path`` folder are loaded as
    RGB PIL images, optionally transformed, and handed to the feature
    extractor; the row's ``label`` is attached as a ``torch.long`` tensor.
    """

    def __init__(self, csv_path, feature_extractor, transform=None):
        """
        csv_path: CSV with columns: path, start, end, label
            (only ``path`` and ``label`` are read by this class)
        feature_extractor: callable applied to the list of frames,
            e.g. Hugging Face's VideoMAEFeatureExtractor
        transform: optional per-frame transform applied to each PIL image
        """
        self.df = pd.read_csv(csv_path)
        self.feature_extractor = feature_extractor
        self.transform = transform

    def __len__(self):
        """Return the number of clips, i.e. the number of CSV rows."""
        return len(self.df)

    def _frame_paths(self, folder_path):
        """Return the frame files of ``folder_path`` in lexicographic name order.

        Sub-directories and hidden files (names starting with ".") are skipped
        so that stray entries do not reach ``Image.open``.
        """
        # NOTE: sorting is lexicographic, so frame names must be zero-padded
        # ("frame_02" rather than "frame_2") to come back in temporal order.
        candidates = [
            os.path.join(folder_path, name)
            for name in sorted(os.listdir(folder_path))
            if not name.startswith(".")
        ]
        return [path for path in candidates if os.path.isfile(path)]

    def __getitem__(self, idx):
        """Load, preprocess and label the clip stored at row ``idx``.

        Returns:
            The feature extractor's output (requested with
            ``return_tensors="pt"``) with an extra ``"labels"`` entry.

        Raises:
            ValueError: if the clip folder contains no frame files.
        """
        row = self.df.iloc[idx]
        folder_path = row['path']   # e.g. '/path/part1'
        label = row['label']

        frame_paths = self._frame_paths(folder_path)
        if not frame_paths:
            # Fail here with the offending folder instead of deep inside the extractor.
            raise ValueError(f"No frame files found in folder: {folder_path}")

        images_pil = []
        for frame_path in frame_paths:
            # The context manager closes the file handle as soon as the pixel
            # data has been copied into the converted image.
            with Image.open(frame_path) as raw:
                img = raw.convert("RGB")
            if self.transform is not None:
                img = self.transform(img)
            images_pil.append(img)

        # The feature extractor receives the whole clip as a list of frames.
        # Presumably the result holds 'pixel_values' shaped
        # [1, num_frames, 3, H, W] — TODO confirm against the extractor in use.
        encoding = self.feature_extractor(images_pil, return_tensors="pt")

        # Attach the class label next to the pixel values.
        encoding["labels"] = torch.tensor(label, dtype=torch.long)
        return encoding

    

# EXAMPLE: build the training and validation datasets.
# Both splits share a single pretrained VideoMAE feature extractor.
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
train_data, val_data = (
    MyVideoDataset(csv_path=split_csv, feature_extractor=feature_extractor)
    for split_csv in ("/percorso/train.csv", "/percorso/val.csv")
)

# Wrap each split in a Hugging Face dataset "on the fly": the HF dataset only
# stores row indices under the "input" column.
# (MyVideoDataset could also be used directly in a custom PyTorch training
#  loop; this shows the standard HF route instead.)
hf_train_data, hf_val_data = (
    HFDataset.from_dict({"input": [*range(len(split))]})
    for split in (train_data, val_data)
)

# Collator that turns a list of per-example records into one model-ready batch.
def collate_fn(examples, dataset=None):
    """Collate ``examples`` into a single ``pixel_values`` / ``labels`` batch.

    Each example may be either
      * an already materialised item holding "pixel_values" and "labels", or
      * an index record ``{"input": i}``, which is resolved as ``dataset[i]``.

    Args:
        examples: non-empty list of example dicts.
        dataset: indexable source used to resolve index records. When omitted,
            the module-level ``train_data`` is used; pass ``val_data``
            explicitly to collate validation indices.

    Returns:
        dict with "pixel_values" (clips concatenated along dim 0, one row per
        example) and "labels" (stacked along dim 0).

    Raises:
        ValueError: if ``examples`` is empty.
    """
    if not examples:
        raise ValueError("collate_fn received an empty list of examples")

    def resolve(example):
        # Materialised examples are used as they are; index records are looked
        # up lazily so ``train_data`` is only touched when actually needed.
        if "pixel_values" in example:
            return example
        source = train_data if dataset is None else dataset
        return source[example["input"]]

    clips = []
    targets = []
    for example in examples:
        item = resolve(example)
        clip = item["pixel_values"]
        if clip.dim() == 4:
            # A clip without a leading batch axis gets one, so concatenating
            # along dim 0 always yields one row per example.
            clip = clip.unsqueeze(0)
        clips.append(clip)
        targets.append(torch.as_tensor(item["labels"]))

    return {
        "pixel_values": torch.cat(clips, dim=0),
        "labels": torch.stack(targets, dim=0),
    }

# The HF Trainer reads examples through the transforms installed with
# `set_transform`; each split gets a transform bound to its own dataset.

def gen_batch(batch, dataset=None):
    """Materialise a batch of row indices into model inputs.

    Args:
        batch: dict whose "input" entry is a list of row indices.
        dataset: indexable source of items holding "pixel_values" and
            "labels". When omitted, the module-level ``train_data`` is used.

    Returns:
        dict with "pixel_values" (per-item tensors concatenated along dim 0)
        and "labels" (per-item tensors stacked along dim 0).
    """
    source = train_data if dataset is None else dataset
    items = [source[index] for index in batch["input"]]
    pixel_values = torch.cat([item["pixel_values"] for item in items], dim=0)
    labels = torch.stack([item["labels"] for item in items], dim=0)
    return {"pixel_values": pixel_values, "labels": labels}


def gen_val_batch(batch):
    """Validation counterpart of ``gen_batch``: resolves indices in ``val_data``."""
    return gen_batch(batch, dataset=val_data)


hf_train_data.set_transform(gen_batch)
# The validation split must read from val_data; reusing the training transform
# would evaluate on training clips.
hf_val_data.set_transform(gen_val_batch)

model = VideoMAEForVideoClassification.from_pretrained(
    "MCG-NJU/videomae-base", 
    num_labels=2  # binary classification
)

# Training configuration
training_args = TrainingArguments(
    output_dir="./video-checkpoints",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=1,  # depends on available GPU memory
    per_device_eval_batch_size=1,
    logging_steps=10,
    learning_rate=1e-4,
    fp16=False,  # set to True for mixed precision
    # Keep the "input" index column: the transforms above need it, and the
    # Trainer would otherwise drop it for not matching the model's forward().
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_data,
    eval_dataset=hf_val_data,
)

trainer.train()
trainer.save_model("./video-checkpoints/binary-video-model")