# Caricare le varie librerie pytorch e il modello scelto

In [None]:
# Download del dataset dal drive
import gdown
def download_google_file(shader_url, output_name):
  """Download a Google Drive file given its sharing link.

  Args:
      shader_url: Sharing URL of the form
          ``https://drive.google.com/file/d/<FILE_ID>/view?...``; the file id
          is taken as the sixth ``/``-separated component of the URL.
      output_name: Local file name handed to ``gdown.download`` as output.
  """
  file_id = shader_url.split("/")[5]
  direct_url = f"https://drive.google.com/uc?id={file_id}"
  gdown.download(direct_url, output_name)

# Videos stored on Google Drive (mp4 files), downloaded as VIDEOS.zip
download_google_file("https://drive.google.com/file/d/1tCMqO-PpKseWG_20bjYJwq80oziIY7t2/view?usp=drive_link", "VIDEOS.zip")
!unzip VIDEOS.zip

# Ground-truth annotations stored on Google Drive (rtf files), downloaded as GT.zip
download_google_file("https://drive.google.com/file/d/1BF1dlcI0DYdeciJzWtV0-wMOp5O_pks7/view?usp=drive_link", "GT.zip")
!unzip GT.zip

# Funzioni di estrazione dei frame dal video

In [None]:
# Paths
# Folder that is searched recursively for the source video files.
videos_path = "TRAINING_SET"
# Root folder under which the extracted frames of every video are written.
frames_path = "FRAMES"

#Pulizia folder
!rm -R FRAMES/TRAINING_SET/

In [None]:
import cv2, os, argparse, glob, PIL, tqdm
from PIL import Image
def extract_frames(video):
    """Decode ``video`` and save every frame as a numbered JPEG.

    Frames are written to ``<frames_path>/<video>/NNNNN.jpg`` using a 1-based,
    zero-padded index. The destination directory must already exist.

    Args:
        video: Path of the video file, as accepted by ``cv2.VideoCapture``.
    """
    cap = cv2.VideoCapture(video)
    try:
        frame_id = 0
        while True:
            ok, frame_bgr = cap.read()
            if not ok:
                break
            frame_id += 1
            # OpenCV decodes frames as BGR while PIL interprets the array as
            # RGB: without this conversion the saved JPEGs have the red and
            # blue channels swapped.
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            Image.fromarray(frame_rgb).save(
                os.path.join(frames_path, video, "{:05d}.jpg".format(frame_id)))
    finally:
        # Release the capture handle even if saving a frame fails.
        cap.release()

# Collect every regular file found below the videos folder (recursive search).
search_pattern = os.path.join(videos_path, "**")
file_list = list(filter(os.path.isfile, glob.glob(search_pattern, recursive=True)))
print(file_list)

# Extract the frames of each video; videos whose frame folder already exists
# are skipped.
for video in tqdm.tqdm(file_list):
  target_dir = os.path.join(frames_path, video)
  if not os.path.isdir(target_dir):
    os.makedirs(target_dir)
    extract_frames(video)


In [None]:
# per leggere i file rtf
!pip install striprtf
!pip install pytorchvideo

In [None]:
import os
import os.path
import numpy as np
from PIL import Image
from torchvision import transforms
import torch
from typing import List, Union, Tuple, Any
from striprtf.striprtf import rtf_to_text
import albumentations


class VideoRecord(object):
    """
    Metadata of a single video sample, used by VideoFrameDataset.

    Args:
        root_datapath: the system path to the root folder of the videos.
        row: A list with four or more elements:
             1) path to the sample's frame folder, relative to root_datapath
             2) id of the first frame of the sample
             3) id of the last frame of the sample (inclusive)
             4) label index
             5) any further elements are extra labels (multi-label case)
    """
    def __init__(self, row, root_datapath):
        self._path = os.path.join(root_datapath, row[0])
        self._data = row

    @property
    def path(self) -> str:
        return self._path

    @property
    def start_frame(self) -> int:
        return int(self._data[1])

    @property
    def end_frame(self) -> int:
        return int(self._data[2])

    @property
    def num_frames(self) -> int:
        # Both ends of the range are inclusive, hence the +1.
        last, first = self.end_frame, self.start_frame
        return last - first + 1

    @property
    def label(self) -> Union[int, List[int]]:
        labels = [int(label_id) for label_id in self._data[3:]]
        # Exactly four elements means a single label: return it as a scalar.
        return labels[0] if len(self._data) == 4 else labels


class VideoFrameDataset(torch.utils.data.Dataset):
    r"""
    Dataset that samples a fixed number of RGB frames from folders of
    pre-extracted video frames (sparse temporal sampling).

    The frame range [START_FRAME, END_FRAME] of each video is divided into
    NUM_SEGMENTS segments and FRAMES_PER_SEGMENT consecutive frames are taken
    from each segment.

    The samples are discovered by scanning ``root_path/<class>/<video>/``;
    label and start frame come from an ``.rtf`` annotation file whose path is
    derived from the frame folder path (see ``_parse_annotationfile``).

    Each item is ``(inputs, label)`` where ``inputs`` is the list of tensors
    produced by the module-level ``transform`` pipeline (see ``_get``).

    Note:
        Adapted from
        https://github.com/RaivoKoot/Video-Dataset-Loading-Pytorch

    Note:
        This dataset broadly corresponds to the frame sampling technique
        introduced in ``Temporal Segment Networks`` at ECCV2016
        https://arxiv.org/abs/1608.00859.

    Args:
        root_path: The root path in which the ``<class>/<video>`` frame
                   folders lie.
        num_segments: The number of segments the video should
                      be divided into to sample frames from.
        frames_per_segment: The number of frames that should
                            be loaded per segment. For each segment's
                            frame-range, a random start index or the
                            center is chosen, from which frames_per_segment
                            consecutive frames are loaded.
        imagefile_template: The image filename template that video frame files
                            have inside of their video folders.
        transform: Albumentations transform; it is wrapped in an
                   ``albumentations.Compose`` that receives the sampled frames
                   as the targets ``image``, ``image0``, ``image1``, ...
        totensor: If True, the frames are converted to a single tensor with
                  ``ImglistOrdictToTensor``.
                  NOTE(review): ``_get`` calls ``images.permute`` right after,
                  so ``totensor=False`` is presumably unsupported - confirm.
        test_mode: If True, frames are taken from the center of each
                   segment, instead of a random location in each segment.

    """
    def __init__(self,
                 root_path: str,
                 num_segments: int = 3,
                 frames_per_segment: int = 1,
                 imagefile_template: str='{:05d}.jpg',
                 transform=None,
                 totensor=True,
                 test_mode: bool = False):
        super(VideoFrameDataset, self).__init__()

        self.root_path = root_path
        self.num_segments = num_segments
        self.frames_per_segment = frames_per_segment
        self.imagefile_template = imagefile_template
        self.test_mode = test_mode

        if transform is None:
            self.transform = None
        else:
            # Register one extra "imageN" target for every sampled frame after
            # the first, so that all frames of a sample can be passed to the
            # pipeline in a single call (see _get).
            additional_targets = {}
            for i in range(self.num_segments * self.frames_per_segment - 1):
                additional_targets["image%d" % i] = "image"
            self.transform = albumentations.Compose([transform],
                                                    additional_targets=additional_targets,
                                                    p=1)
        self.totensor = totensor
        self.totensor_transform = ImglistOrdictToTensor()

        self._parse_annotationfile()
        self._sanity_check_samples()

    def _load_image(self, directory: str, idx: int) -> np.ndarray:
        """Load frame ``idx`` from ``directory`` as an RGB numpy array."""
        return np.asarray(Image.open(os.path.join(directory, self.imagefile_template.format(idx))).convert('RGB'))

    def _parse_annotationfile(self):
        """
        Build ``self.video_list`` by scanning ``root_path/<class>/<video>``.

        For every frame folder:
          * ``end_frame`` is the number of entries in the folder;
          * the annotation path is the folder path with ``FRAMES/`` replaced
            by ``GT/`` and ``.mp4`` replaced by ``.rtf``;
          * a non-empty annotation text gives label 1 and a start frame equal
            to the first comma-separated integer (0 is mapped to 1); an empty
            text gives label 0 and start frame 1.
        """
        self.video_list = []
        for class_name in os.listdir(self.root_path):
            for video_name in os.listdir(os.path.join(self.root_path, class_name)):
                frames_dir = os.path.join(self.root_path, class_name, video_name)
                if os.path.isdir(frames_dir):
                    frame_path = os.path.join(class_name, video_name)
                    end_frame = len(os.listdir(frames_dir))
                    # When the frames live in a k-fold folder, rewrite the path
                    # so that it looks like the original frames path: the
                    # second "/"-separated component is replaced with
                    # "TRAINING_SET" and "KFOLD/" with "FRAMES/". This keeps it
                    # compatible with the annotation lookup below.
                    # (split_string[:2] + split_string[2:] is the whole list.)
                    if("KFOLD" in frames_dir):
                      split_string = frames_dir.split("/")
                      split_string[1]="TRAINING_SET"
                      frames_dir = "/".join(split_string[:2] + split_string[2:]).replace("KFOLD/", "FRAMES/")
                    annotation_path = frames_dir\
                        .replace("\\", "/") \
                        .replace("FRAMES/", "GT/") \
                        .replace(".mp4", ".rtf")

                    with open(annotation_path, 'r') as file:
                        text = rtf_to_text(file.read())
                    if len(text):
                        label = 1
                        start_frame = int(text.split(",")[0])
                        # Frame files are numbered from 1.
                        if start_frame == 0:
                          start_frame = 1
                    else:
                        label = 0
                        start_frame = 1

                    self.video_list.append(VideoRecord(
                        [frame_path, start_frame, end_frame, label],
                        self.root_path))

    def _sanity_check_samples(self):
        """Print a warning for samples with too few frames to be sampled."""
        for record in self.video_list:
            if record.num_frames <= 0 or record.start_frame == record.end_frame:
                print(f"\nDataset Warning: video {record.path} seems to have zero RGB frames on disk!\n")

            elif record.num_frames < (self.num_segments * self.frames_per_segment):
                print(f"\nDataset Warning: video {record.path} has {record.num_frames} frames "
                      f"but the dataloader is set up to load "
                      f"(num_segments={self.num_segments})*(frames_per_segment={self.frames_per_segment})"
                      f"={self.num_segments * self.frames_per_segment} frames. Dataloader will throw an "
                      f"error when trying to load this video.\n")

    def _get_start_indices(self, record: VideoRecord) -> 'np.ndarray[int]':
        """
        For each segment, choose a start index from where frames
        are to be loaded from.

        The returned indices are offsets relative to the record's start
        frame (``_get`` adds ``record.start_frame`` to them).

        Args:
            record: VideoRecord denoting a video sample.
        Returns:
            Array with one start offset per segment.
        """
        # choose start indices that are perfectly evenly spread across the video frames.
        if self.test_mode:
            distance_between_indices = (record.num_frames - self.frames_per_segment + 1) / float(self.num_segments)

            start_indices = np.array([int(distance_between_indices / 2.0 + distance_between_indices * x)
                                      for x in range(self.num_segments)])
        # randomly sample start indices that are approximately evenly spread across the video frames.
        else:
            max_valid_start_index = (record.num_frames - self.frames_per_segment + 1) // self.num_segments

            start_indices = np.multiply(list(range(self.num_segments)), max_valid_start_index) + \
                      np.random.randint(max_valid_start_index, size=self.num_segments)

        return start_indices

    def __getitem__(self, idx: int) -> Union[
        Tuple[List[Image.Image], Union[int, List[int]]],
        Tuple['torch.Tensor[num_frames, channels, height, width]', Union[int, List[int]]],
        Tuple[Any, Union[int, List[int]]],
        ]:
        """
        For video with id idx, loads self.num_segments * self.frames_per_segment
        frames from evenly chosen locations across the video.

        Args:
            idx: Video sample index.
        Returns:
            The tuple ``(inputs, label)`` built by ``_get``. Label is either a
            single integer or a list of integers in the case of multiple labels.
        """
        record: VideoRecord = self.video_list[idx]

        frame_start_indices: 'np.ndarray[int]' = self._get_start_indices(record)

        return self._get(record, frame_start_indices)

    def _get(self, record: VideoRecord, frame_start_indices: 'np.ndarray[int]') -> Union[
        Tuple[List[Image.Image], Union[int, List[int]]],
        Tuple['torch.Tensor[num_frames, channels, height, width]', Union[int, List[int]]],
        Tuple[Any, Union[int, List[int]]],
        ]:
        """
        Loads the frames of a video at the corresponding
        indices and packs them for the network.

        Args:
            record: VideoRecord denoting a video sample.
            frame_start_indices: Offsets (relative to the record's start
                frame) from which to load consecutive frames.
        Returns:
            A tuple ``(inputs, label)``. ``inputs`` is the list obtained from
            the ``"video"`` entry returned by the module-level ``transform``,
            with every size-1 axis squeezed out of each element. Label is
            either a single integer or a list of integers in the case of
            multiple labels.
        """

        # Turn the relative offsets into absolute frame ids.
        frame_start_indices = frame_start_indices + record.start_frame
        images = list()

        # from each start_index, load self.frames_per_segment
        # consecutive frames
        for start_index in frame_start_indices:
            frame_index = int(start_index)

            # load self.frames_per_segment consecutive frames
            for _ in range(self.frames_per_segment):
                image = self._load_image(record.path, frame_index)
                images.append(image)

                # Never step past the last frame: it is repeated instead.
                if frame_index < record.end_frame:
                    frame_index += 1

        if self.transform is not None:
            # First frame goes in as "image", the others as "image0", "image1", ...
            transform_input = {"image": images[0]}
            for i, image in enumerate(images[1:]):
                transform_input["image%d" % i] = image
            images = self.transform(**transform_input)

        if self.totensor:
            images = self.totensor_transform(images)

        # The following fragment was added to build the input of the
        # SlowFast_R50 network, which takes a list of two tensors: the first
        # is the Slow pathway and the second is the Fast pathway. The list and
        # the corresponding label are returned.
        # NOTE(review): `transform` below is NOT `self.transform`; it resolves
        # to a module-level name (elsewhere in this notebook the SlowFast
        # ApplyTransformToKey pipeline is assigned to `transform`), which must
        # be defined before any item is fetched.
        # permute(1,0,2,3) swaps the first two axes of the stacked frames
        # (NUM_IMAGES x CHANNELS x H x W -> CHANNELS x NUM_IMAGES x H x W).
        frames_tensor = {'video': images.permute(1,0,2,3)}
        video_data = transform(frames_tensor)
        inputs = video_data["video"]
        # Adds a leading axis and then squeezes every size-1 axis of each tensor.
        inputs = [i[None, ...].squeeze() for i in inputs]
        return inputs, record.label

    def __len__(self):
        """Number of video samples found under ``root_path``."""
        return len(self.video_list)


class ImglistOrdictToTensor(torch.nn.Module):
    """
    Turns a list or a dict of numpy images into one torch.FloatTensor
    of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH).
    Can be used as first transform for ``VideoFrameDataset``.
    """
    @staticmethod
    def forward(img_list_or_dict):
        """
        Convert every numpy image to a torch Tensor and stack the results.

        Args:
            img_list_or_dict: list of numpy images, or dict whose values are
                numpy images (stacked in key iteration order).
        Returns:
            tensor of size ``NUM_IMAGES x CHANNELS x HEIGHT x WIDTH``
        """
        if isinstance(img_list_or_dict, list):
            frames = img_list_or_dict
        else:
            frames = [img_list_or_dict[key] for key in img_list_or_dict.keys()]
        return torch.stack([transforms.functional.to_tensor(frame) for frame in frames])

# Struttura del modello


Questi sono i valori delle trasformazioni standard utilizzate per addestrare la SlowFast_R50. È importante notare che alcune di queste trasformazioni standard sono state rimosse per motivi specifici:

1. Lambda: Questa operazione è gestita dalla classe ImglistOrdictToTensor.
2. ShortSideScale e CenterCrop: Sono state inserite nel preprocessing per migliorare le prestazioni.







In [None]:
#@title  { display-mode: "form" }
#@markdown Parametri SlowFast Transform


# PackPathway keeps frames.shape[1] // alpha frames for the slow pathway.
alpha = 4 #@param {type: "number"}
# Number of frames requested from UniformTemporalSubsample in the SlowFast transform.
num_frames = 32 #@param {type: "number"}
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed.
sampling_rate = 2 #@param {type: "number"}

#@markdown ---

In [None]:
####################
# SlowFast transform
####################
import torch
import json
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

# Mean and standard deviation (three values each) passed to NormalizeVideo
# in the transform pipeline defined in this cell.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]

class PackPathway(torch.nn.Module):
    """
    Packs a clip tensor into the ``[slow_pathway, fast_pathway]`` list.

    The fast pathway is the input itself; the slow pathway keeps
    ``frames.shape[1] // alpha`` evenly spaced entries along dimension 1,
    where ``alpha`` is the module-level parameter.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        # Evenly spaced indices along dimension 1, from the first to the last.
        total = frames.shape[1]
        keep = torch.linspace(0, total - 1, total // alpha).long()
        slow_pathway = frames.index_select(1, keep)
        return [slow_pathway, frames]

# Pipeline applied to the "video" entry of a sample dict: temporal subsampling
# to num_frames frames, normalisation with mean/std, then packing into the
# [slow, fast] list produced by PackPathway.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            NormalizeVideo(mean, std),
            PackPathway()
        ]
    ),
)


In questa funzione carichiamo il modello. La prima volta che viene eseguita, il modello viene scaricato dalla rete; nelle esecuzioni successive sarà già disponibile in locale. Successivamente sostituiamo il classificatore dell'ultimo blocco con uno nuovo, inizializzato casualmente, e ripetiamo la reinizializzazione dei pesi per i sei layer convoluzionali precedenti, che vengono resi addestrabili.<br>
Abbiamo scelto di ricaricare il modello ogni volta per garantire maggiore sicurezza ed evitare il rischio di riutilizzare i pesi del fold precedente.

In [None]:
def clear_model():
    """
    Load a pretrained SlowFast-R50 and prepare it for two-class fine-tuning.

    Steps:
      * load ``slowfast_r50`` through ``torch.hub``;
      * disable gradients on every block except the last one;
      * replace the projection of the last block with a new
        ``Linear(2304, 128) -> ReLU -> Linear(128, 2)`` classifier
        (Xavier-initialised) followed by a softmax;
      * re-initialise (Xavier) and re-enable gradients on ``conv_a``,
        ``conv_b`` and ``conv_c`` of residual blocks 1 and 2 of
        ``model.blocks[-3].multipathway_blocks[1]``.

    The function takes no arguments: a fresh model is loaded on every call.

    Returns:
        Model: the modified network.
    """
    # Load the network
    model_name = "slowfast_r50"
    model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
    # Disable the gradient on everything but the head
    model.blocks[:-1].requires_grad_(False)

    # New classification head
    head = model.blocks[-1]
    head.proj = torch.nn.Sequential(torch.nn.Linear(2304,128),torch.nn.ReLU(),torch.nn.Linear(128,2))
    torch.nn.init.xavier_uniform_(head.proj[0].weight)
    torch.nn.init.xavier_uniform_(head.proj[2].weight)
    head.proj.requires_grad_(True)
    # Softmax over the class dimension. dim=1 is what the implicit-dim
    # Softmax() already selected for inputs with more than 3 dimensions, so
    # this only makes the choice explicit and avoids the deprecation warning.
    # NOTE(review): the model now outputs probabilities, not logits - confirm
    # the loss function used for training expects that.
    head.output_pool = torch.nn.Softmax(dim=1)

    # Re-initialise and unfreeze the six convolutions that precede the head
    # (same order as before: block 1 conv_a/b/c, then block 2 conv_a/b/c).
    pathway = model.blocks[-3].multipathway_blocks[1]
    for block_index in (1, 2):
        branch = pathway.res_blocks[block_index].branch2
        for conv in (branch.conv_a, branch.conv_b, branch.conv_c):
            conv.requires_grad_(True)
            torch.nn.init.xavier_uniform_(conv.weight)

    return model

In [None]:
import torch
import json
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from typing import Dict

# Build the model
model=clear_model()

# Gestione di KFold e di generazione dei dataloader


Si è deciso di usare `shutil.move` al posto di `shutil.copytree` per ragioni di efficienza: all'interno dello stesso filesystem la move si riduce a una semplice ridenominazione, mentre la copytree deve riscrivere ogni singolo file sul disco, impiegando tempi molto maggiori.

In [None]:
# Paths
videos_path = "TRAINING_SET"
frames_path = "FRAMES"
# NOTE(review): presumably the folder with the extracted frames grouped by label,
# used as the source of the k-fold split - confirm at the call site.
initial_path="FRAMES/TRAINING_SET"
# NOTE(review): presumably the root under which the fold_<i> folders are created -
# confirm at the call site.
kfold_path="KFOLD"

In [None]:
import os
import random
import shutil

def k_fold_split(folder_path,destination_path, k):
    """
    Split a folder containing the label sub-folders "1" and "0" into k folds,
    taking entries from both labels proportionally.

    Every entry of ``<folder_path>/<label>`` is moved to
    ``<destination_path>/fold_<i>/<label>`` (i = 1..k). Each fold first
    receives the rounded proportional share of a label; the entries left over
    are then handed out one at a time, cycling over the folds, until none
    remain in the source folder.

    Args:
        folder_path (str): Folder containing the "1" and "0" sub-folders.
        destination_path (str): Folder under which the fold folders are created.
        k (int): Number of folds.

    Returns:
        bool: True if the split was performed, False if ``folder_path`` is not
        a directory or ``k`` is smaller than 1.

    Raises:
        FileNotFoundError: if one of the label sub-folders is missing.
    """
    if not os.path.isdir(folder_path) or k < 1:
        return False

    labels = ("1", "0")
    fold_names = [f"fold_{i+1}" for i in range(k)]
    label_folders = {label: os.path.join(folder_path, label) for label in labels}
    label_counts = {label: len(os.listdir(label_folders[label])) for label in labels}

    # Number of entries per fold, split proportionally between the labels.
    total_files = sum(label_counts.values())
    files_per_fold = total_files // k

    for label in labels:
        if total_files:
            per_fold = round(label_counts[label] * files_per_fold / total_files)
        else:
            per_fold = 0
        _distribute_label(label_folders[label], destination_path, label,
                          fold_names, per_fold)

    return True


def _distribute_label(label_folder, destination_path, label, fold_names, per_fold):
    """Move every entry of ``label_folder`` into the ``<fold>/<label>`` folders."""
    entries = os.listdir(label_folder)
    random.shuffle(entries)

    # The destination folders must exist, otherwise shutil.move cannot rename
    # into them: it fails for regular files and degrades to a full copy for
    # directories.
    destinations = []
    for fold_name in fold_names:
        destination_folder = os.path.join(destination_path, fold_name, label)
        os.makedirs(destination_folder, exist_ok=True)
        destinations.append(destination_folder)

    # Proportional share of every fold.
    for i, destination_folder in enumerate(destinations):
        for file_name in entries[i * per_fold:(i + 1) * per_fold]:
            shutil.move(os.path.join(label_folder, file_name),
                        os.path.join(destination_folder, file_name))

    # Leftovers, one at a time, cycling over the folds: rounding can leave
    # more than k entries, so a single pass over the folds is not enough.
    leftovers = entries[len(destinations) * per_fold:]
    for i, file_name in enumerate(leftovers):
        destination_folder = destinations[i % len(destinations)]
        shutil.move(os.path.join(label_folder, file_name),
                    os.path.join(destination_folder, file_name))

In [None]:
import os
import shutil

def restore_k_fold(folder_path,destination_path, k):
    """
    Move every entry of the fold folders back into the original folder.

    For each ``<folder_path>/fold_<i>`` (i = 1..k) the content of every label
    sub-folder is moved to ``<destination_path>/<label>`` and the fold folder
    is then removed.

    Args:
        folder_path (str): Folder containing the fold folders.
        destination_path (str): Folder that receives the label sub-folders.
        k (int): Number of folds.

    Returns:
        bool: True if the entries were moved back, False if ``folder_path`` is
        not a directory.
    """
    if not os.path.isdir(folder_path):
        return False

    for fold_index in range(1, k + 1):
        fold_path = os.path.join(folder_path, f"fold_{fold_index}")
        if not os.path.isdir(fold_path):
            # Fold already restored or never created: nothing to move back.
            continue

        for label_folder in os.listdir(fold_path):
            label_folder_path = os.path.join(fold_path, label_folder)
            if not os.path.isdir(label_folder_path):
                # Stray file inside the fold folder: it is not a label folder
                # and is removed together with the fold below.
                continue

            # Make sure the destination label folder exists so that the move
            # is a plain rename.
            destination_label_path = os.path.join(destination_path, label_folder)
            os.makedirs(destination_label_path, exist_ok=True)
            for file_name in os.listdir(label_folder_path):
                shutil.move(os.path.join(label_folder_path, file_name),
                            os.path.join(destination_label_path, file_name))

        # Remove the (now empty) fold folder
        shutil.rmtree(fold_path)

    return True

Si è deciso di gestire il k-fold creando un dataset distinto per ogni fold di training, invece di un unico dataset, poiché riunire fisicamente i fold avrebbe avuto un costo in termini di tempo per la copia dei file. Il risultato non cambia, poiché i dataset dei singoli fold vengono concatenati e serviti da un unico dataloader.<br>

In [None]:
def make_training_validation_dataloaders(dir,k, validation_num,preprocessing,augumentation,num_segments,frames_per_segment,transform_probability,batch_size,num_workers):
    """
    Build the training and validation dataloaders for one k-fold round.

    One ``VideoFrameDataset`` is created for every ``<dir>/fold_<i>`` folder
    (i = 1..k). The fold ``validation_num`` becomes the validation set (built
    with ``test_mode=True``); the other folds are concatenated into the
    training set.

    Args:
        dir (str): Folder containing the fold folders.
        k (int): Number of folds.
        validation_num (int): 1-based index of the validation fold.
        preprocessing: Albumentations transform applied first.
        augumentation: Albumentations transform applied after preprocessing.
        num_segments (int): Forwarded to ``VideoFrameDataset``.
        frames_per_segment (int): Forwarded to ``VideoFrameDataset``.
        transform_probability (float): ``p`` of the ``albumentations.Compose``
            wrapping preprocessing and augmentation.
        batch_size (int): Batch size of both dataloaders.
        num_workers (int): Worker processes of both dataloaders.

    Returns:
        (DataLoader, DataLoader): training and validation dataloaders.

    Raises:
        ValueError: if ``validation_num`` is not an integer between 1 and k.
    """
    if(validation_num>k):
      raise ValueError("k deve essere maggiore di validation_num!")
    # A value below 1 (or a non-integer) never matches a fold index: without
    # this check the validation set would never be created.
    if validation_num not in range(1, k + 1):
      raise ValueError("validation_num deve essere un intero compreso tra 1 e k!")

    from torch.utils.data import DataLoader,ConcatDataset

    def build_fold_dataset(fold_index, test_mode):
      # Same configuration for every fold; only the sampling mode differs.
      return VideoFrameDataset(root_path=dir+"/fold_"+str(fold_index),
                               num_segments=num_segments,
                               frames_per_segment=frames_per_segment,
                               transform=albumentations.Compose([
                                   preprocessing,
                                   augumentation],
                                   p=transform_probability,
                               ),
                               totensor=True,
                               test_mode=test_mode,
                               )

    # Datasets are built in fold order; the validation fold uses test_mode.
    training_sets=list()
    for i in range(1,k+1):
      if i!=validation_num:
        training_sets.append(build_fold_dataset(i, False))
      else:
        validation_set=build_fold_dataset(i, True)

    training_set = ConcatDataset(training_sets)
    dataloader_train = DataLoader(training_set, shuffle=True,
                              batch_size=batch_size, num_workers=num_workers, pin_memory=True)
    dataloader_validation = DataLoader(validation_set, shuffle=False,
                              batch_size=batch_size, num_workers=num_workers, pin_memory=True)

    return dataloader_train, dataloader_validation

# Definizione singola epoca

Funzione realizzata per eseguire una singola epoca di training e per valutare le performance del validation.<br>
Vengono utilizzati diversi indici di prestazione:
1. Accuracy
2. Loss
3. Precision
4. Recall
5. F-score

In [None]:
from torch import Tensor
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from torch.nn import Module
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def _confusion_counts(output, y):
  '''Count the confusion-matrix entries for one batch of binary predictions.

    A sample is predicted positive when column 1 of `output` is >= 0.5.


    Args:

    output (Tensor): Model output; column 1 is read as the positive-class score.
    y (Tensor): Ground-truth labels; 1 is treated as positive, 0 as negative.


    Returns:
    tuple: (true_positive, false_positive, true_negative, false_negative),
      each as a float count.
    '''
  # NOTE(review): if the model returns raw logits (which is what
  # CrossEntropyLoss expects), 0.5 is a logit threshold rather than a
  # probability threshold - confirm this is intended.
  positive = output.detach()[:, 1] >= 0.5
  negative = ~positive
  label_is_one = y.detach() == 1
  label_is_zero = y.detach() == 0

  def count(mask):
    return float(mask.count_nonzero().item())

  return (count(positive & label_is_one),
          count(positive & label_is_zero),
          count(negative & label_is_zero),
          count(negative & label_is_one))


def _precision_recall_f_score(true_positive, false_positive, false_negative):
  '''Return (precision, recall, f_score); any undefined ratio is reported as 0.0.'''
  predicted_positive = true_positive + false_positive
  actual_positive = true_positive + false_negative
  precision = true_positive / predicted_positive if predicted_positive > 0 else 0.0
  recall = true_positive / actual_positive if actual_positive > 0 else 0.0
  f_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
  return precision, recall, f_score


def one_epoch( model, dataloader_train, dataloader_validation,
              lossFunction, optimizer, writer,
               epoch_num):
  ''' Run one training epoch and then evaluate the model on the validation set.

    Per-batch training loss, precision, recall and F-score, and the aggregated
    validation metrics, are written to `writer`.

    The training phase runs in whatever mode (train/eval) the caller left the
    model in. The validation phase always runs in eval mode under
    `torch.no_grad()`, and the caller's mode is restored afterwards.


    Args:

    model (Module): The neural network. It is called with the list `X` produced
      by the dataloaders.
    dataloader_train (DataLoader): Yields `(X, y)` training batches, where `X`
      is a mutable sequence whose first two items are tensors.
    dataloader_validation (DataLoader): Yields `(X, y)` validation batches.
    lossFunction (Module): Loss applied to `(output, y)`.
    optimizer (Optimizer): Optimizer stepped once per training batch.
    writer (SummaryWriter): Receives the scalars through `add_scalar`.
    epoch_num (int): Index of the current epoch; used to compute the global
      TensorBoard step.


    Returns:
    accuracy_final (float): Accuracy over the whole validation set.
    loss_final (float): Validation loss, averaged over batches weighted by batch size.
    precision_final (float): Precision over the whole validation set.
    recall_final (float): Recall over the whole validation set.
    f_score_final (float): F-score over the whole validation set.
    '''

  # Move the batches to the device the model lives on instead of hard-coding
  # CUDA; fall back to CUDA when the model exposes no parameters.
  first_parameter = next(model.parameters(), None)
  device = first_parameter.device if first_parameter is not None else torch.device('cuda')

  # Global step counter for TensorBoard
  i_start = epoch_num * len(dataloader_train)
  # Step used for the validation scalars: the last training step of this epoch
  # (or the epoch start if the training loader yields nothing).
  last_step = i_start

  # --- TRAINING ---
  for i, (X, y) in enumerate(dataloader_train):
    last_step = i_start + i
    X[0] = X[0].to(device)
    X[1] = X[1].to(device)
    y = y.to(device)

    # Reset the gradients
    optimizer.zero_grad()

    # Forward pass (through __call__ so that module hooks run), loss and
    # backpropagation
    output = model(X)
    loss = lossFunction(output, y)
    loss.backward()

    # Parameter update
    optimizer.step()

    # Per-batch performance indices
    true_positive, false_positive, _, false_negative = _confusion_counts(output, y)
    precision, recall, f_score = _precision_recall_f_score(
        true_positive, false_positive, false_negative)

    # Write the training results to TensorBoard
    writer.add_scalar('train/loss', loss.detach().item(), last_step)
    writer.add_scalar('train/precision', precision, last_step)
    writer.add_scalar('train/recall', recall, last_step)
    writer.add_scalar('train/f_score', f_score, last_step)

  # --- VALIDATION ---
  true_positive_total = 0.0
  false_positive_total = 0.0
  true_negative_total = 0.0
  false_negative_total = 0.0
  elements = []
  losses = []

  # Evaluate in eval mode so that layers such as dropout and batch norm behave
  # deterministically, then hand the model back in the caller's mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for X, y in dataloader_validation:
        X[0] = X[0].to(device)
        X[1] = X[1].to(device)
        y = y.to(device)

        elements.append(len(y))

        output = model(X)
        losses.append(lossFunction(output, y).item())

        tp, fp, tn, fn = _confusion_counts(output, y)
        true_positive_total += tp
        false_positive_total += fp
        true_negative_total += tn
        false_negative_total += fn
  finally:
    model.train(was_training)

  # Batch losses are means over the batch, so weight them by the batch size
  loss_final = np.average(losses, weights=elements)
  precision_final, recall_final, f_score_final = _precision_recall_f_score(
      true_positive_total, false_positive_total, false_negative_total)
  classified = (true_positive_total + true_negative_total
                + false_positive_total + false_negative_total)
  accuracy_final = (true_positive_total + true_negative_total) / classified if classified > 0 else 0.0

  # Write the validation results to TensorBoard
  writer.add_scalar('validation/accuracy', accuracy_final, last_step)
  writer.add_scalar('validation/loss', loss_final, last_step)
  writer.add_scalar('validation/precision', precision_final, last_step)
  writer.add_scalar('validation/recall', recall_final, last_step)
  writer.add_scalar('validation/f_score', f_score_final, last_step)

  return accuracy_final, loss_final, precision_final, recall_final, f_score_final

# Preparare i parametri per i frame

Preprocessing dei frame in accordo agli standard della SlowFast_R50 ed augmentations tramite le funzioni di Albumentations.

In [None]:
# Frame preprocessing: SmallestMaxSize(256) followed by a 224x224 center crop.
# p=1 is passed explicitly because Sequential's own default is p=0.5;
# albumentations releases that honor this probability would otherwise skip the
# resize/crop on part of the samples and yield frames of different sizes.
preprocessing = albumentations.Sequential(
    [
        albumentations.SmallestMaxSize(max_size=256, always_apply=True),
        albumentations.CenterCrop(height=224, width=224, always_apply=True),
    ],
    p=1,
)

# Frame augmentation: the OneOf block is applied with probability 0.1 and, when
# applied, uses exactly one of the listed transforms.
augumentation = albumentations.Compose([
    albumentations.OneOf([
        albumentations.ColorJitter(p=0.5),
        albumentations.GaussianBlur(p=0.5),
        albumentations.HorizontalFlip(p=1),
        albumentations.RandomBrightnessContrast(p=0.5),
        albumentations.HueSaturationValue(p=0.5)
    ], p=0.1)
])

A causa della natura della SlowFast_R50, che gestisce sia uno slow path che un fast path, il numero di frame per segmento (`frames_per_segment`) è stato impostato a 32, come previsto dalla struttura della rete. Poiché è la rete stessa a gestire il flusso temporale, abbiamo impostato il parametro `num_segments` a 1: in questo modo vengono considerati 32 frame consecutivi, mantenendo un approccio sequenziale nell'analisi dei video durante il training.

In [None]:
# Colab form cell: the "#@" lines are form directives and are left untouched.
#@title  { display-mode: "form" }
#@markdown Parametri per l'augumentation
#@markdown ---

# Probability `p` given to the albumentations.Compose that wraps
# preprocessing + augmentation when the datasets are built.
transform_probability = 1 #@param {type: "number"}

#@markdown ---
#@title  { display-mode: "form" }
#@markdown Parametri per il Dataloader
#@markdown ---

# Worker processes used by the training and validation DataLoaders.
num_workers=2 #@param {type: "number"}

# Batch size used by the training and validation DataLoaders.
batch_size=16 #@param {type: "number"}

# Forwarded to VideoFrameDataset as `num_segments`.
num_segments = 1 #@param {type:"integer"}

# Forwarded to VideoFrameDataset as `frames_per_segment`.
frames_per_segment = 32 #@param {type:"integer"}
#@markdown ---

# Preparare i parametri di training

Di seguito sono riportati i parametri utilizzati durante il training del modello consigliato.

In [None]:
# Colab form cell: the "#@" lines are form directives and are left untouched.
#@title  { display-mode: "form" }
#@markdown Parametri per il training

# Maximum number of epochs run for each fold, and number of folds.
epochs = 125 #@param {type:"integer"}
k_folds = 5 #@param {type:"integer"}

#@markdown ---

# Optimizer hyper-parameters.
learning_rate = 0.000015 #@param {type: "number"}
# NOTE(review): the training cell of this notebook builds Adam with `lr` and
# `weight_decay` only; `momentum` does not appear to be used there - confirm.
momentum = 0 #@param {type:"number"}
weight_decay = 0 #@param {type: "number"}

#@markdown ---

# Early stopping: when enabled, training of a fold stops once the validation
# loss has failed to drop below the previous epoch's value for
# `early_stopping_patience` epochs in a row.
early_stopping = True #@param {type:"boolean"}
early_stopping_patience = 5 #@param {type:"integer"}

#@markdown ---

# Avviare TensorBoard

In [None]:
# Disabled on purpose: uncomment to delete the logs of previous runs.
#!rm -r './executions'
# Stop any TensorBoard process that is already running, load the notebook
# extension and open TensorBoard on the ./executions/ log directory.
!killall tensorboard
%load_ext tensorboard
%tensorboard --logdir ./executions/

# Allenamento della rete

In questa fase, stiamo eseguendo il training della rete.<br>
Abbiamo scelto di utilizzare l'ottimizzatore "ADAM" perché, a differenza di SGD (Stochastic Gradient Descent), è meno sensibile al learning rate. L'ottimizzatore ADAM adatta il learning rate per ogni parametro del modello, rendendo il processo di addestramento più stabile e riducendo la necessità di una scelta manuale accurata del learning rate. Questo ci permette di ottenere una maggiore efficienza nel training della rete e una migliore convergenza verso soluzioni ottimali.

In [None]:
import os
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# shutil is needed to wipe the log folder; it is imported here so that this
# cell does not depend on another cell having imported it.
import shutil

# Other training settings
dataloader_params = {'pin_memory': True}
K_folds = [k_folds] # add further values here to try different numbers of folds

lossFunction = CrossEntropyLoss()
folder_path = "executions"
# Always start from an empty log folder: drop it if it exists, then (re)create it
if os.path.exists(folder_path):
    shutil.rmtree(folder_path)
os.makedirs(folder_path)

# For every fold configuration
for K in K_folds:
  # Validation metrics, one row per fold and one column per epoch.
  # Epochs skipped because of early stopping keep the initial value 0.
  validation_accuracy_results = torch.zeros(K, epochs)
  validation_loss_results = torch.zeros(K, epochs)
  validation_precision_results = torch.zeros(K, epochs)
  validation_recall_results = torch.zeros(K, epochs)
  validation_f_score_results = torch.zeros(K, epochs)

  # Experiment name
  experiment_fold = 'exp-tot_folds-{}'.format(K)

  # Experiment folder
  experiment_path = os.path.join(folder_path, experiment_fold)
  os.mkdir(experiment_path)

  # Split the dataset into folds
  k_fold_split(initial_path,kfold_path, K)
  try:
    # For every fold
    for k in range(K):
      # Reset the model
      clear_model()
      model.cuda()

      # Lowest validation loss seen so far for this fold
      min_loss = 1e10

      # Name and folder of this fold's run
      experiment_name = (experiment_fold + '-fold-{}').format(k+1)
      path = os.path.join(folder_path, experiment_fold, experiment_name)
      os.mkdir(path)

      # Training and validation dataloaders for this fold
      # (k+1 because fold numbering starts at 1)
      dataloader_train, dataloader_validation = make_training_validation_dataloaders(
          kfold_path, K, k+1,
          preprocessing, augumentation, num_segments, frames_per_segment,
          transform_probability, batch_size, num_workers)

      # TensorBoard writer for this fold; closed in the `finally` below so the
      # events are flushed and the writer is released even if training fails.
      writer = SummaryWriter(path)
      try:
        optimizer = Adam(model.parameters(),
                         lr=learning_rate,
                         weight_decay=weight_decay)

        # Remaining early-stopping budget
        early_stopping_counter = early_stopping_patience

        # Run the epochs
        for epoch in tqdm(range(epochs), desc=experiment_name):
          (validation_accuracy, validation_loss, validation_precision,
           validation_recall, validation_f_score) = one_epoch(
               model, dataloader_train, dataloader_validation,
               lossFunction, optimizer, writer, epoch)

          # Store the validation metrics of this epoch
          validation_accuracy_results[k, epoch] = validation_accuracy
          validation_loss_results[k, epoch] = validation_loss
          validation_precision_results[k, epoch] = validation_precision
          validation_recall_results[k, epoch] = validation_recall
          validation_f_score_results[k, epoch] = validation_f_score

          # Keep the weights of the model with the lowest validation loss so far
          if validation_loss < min_loss:
            min_loss = validation_loss
            print("Epoca: "+str((epoch)))
            print("Min loss: "+str(min_loss))
            torch.save(model.state_dict(), os.path.join(path, 'model.pth'))
            early_stopping_counter = early_stopping_patience

          print("Accuracy: "+str(validation_accuracy))

          # Early stopping: spend one unit of patience whenever the loss does
          # not drop below the previous epoch's value, refill it otherwise.
          if early_stopping and epoch > 0:
            if validation_loss >= validation_loss_results[k, epoch-1]:
              early_stopping_counter -= 1
            else:
              early_stopping_counter = early_stopping_patience
            if early_stopping_counter == 0:
              break
      finally:
        writer.close()
  finally:
    # Put the dataset back in its original layout, even if training failed
    restore_k_fold(kfold_path,initial_path, K)

  # Save the validation results
  torch.save(validation_accuracy_results, os.path.join(experiment_path, 'validation_accuracy_results.pth'))
  torch.save(validation_loss_results, os.path.join(experiment_path, 'validation_loss_results.pth'))
  torch.save(validation_precision_results, os.path.join(experiment_path, 'validation_precision_results.pth'))
  torch.save(validation_recall_results, os.path.join(experiment_path, 'validation_recall_results.pth'))
  torch.save(validation_f_score_results, os.path.join(experiment_path, 'validation_f_score_results.pth'))