In [1]:
import pandas as pd
from pathlib import Path

# 1. CARICAMENTO DEL DATASET
file_path = r".\archive\All_Players_1992-2025.csv"
file_path:Path = Path(file_path)

# Leggiamo il CSV. 
df_original = pd.read_csv(file_path, encoding='utf-8')

print(f"Dimensioni originali del dataset: {df_original.shape}")
print("Esempio formato stagione:", df_original['Season'].unique()[:5])

# Applichiamo il filtro: Vogliamo che l'anno di inizio sia >= 2018
# Questo includerà 2018-2019, 2019-2020, ecc.
df = df_original[df_original['Season'] >= "2017-2018"].copy()

# 3. VERIFICA
print("-" * 30)
print(f"Dimensioni dopo il filtro: {df.shape}")
print("Stagioni rimaste:", sorted(df['Season'].unique()))

# Visualizziamo le prime righe
df.head()

Dimensioni originali del dataset: (92170, 120)
Esempio formato stagione: ['1992-1993' '1993-1994' '1999-2000' '2000-2001' '1994-1995']
------------------------------
Dimensioni dopo il filtro: (27040, 120)
Stagioni rimaste: ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


Unnamed: 0,PlayerID,Player,Squad,League,Nation,Pos,Age,Born,Season,MP,...,The Best FIFA Mens Player,UEFA Best Player,UCL_MP,UCL_Gls,UCL_xG,UCL_Ast,UCL_xA,UCL_KP,UCL_GCA,UCL_SCA
5504,1681,Claudio Pizarro,Köln,Bundesliga,PER,"FW,MF",38.0,1978.0,2017-2018,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5513,1681,Claudio Pizarro,Werder Bremen,Bundesliga,PER,FW,39.0,1978.0,2018-2019,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5514,1681,Claudio Pizarro,Werder Bremen,Bundesliga,PER,"FW,MF",40.0,1978.0,2019-2020,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6280,1842,Roman Weidenfeller,Dortmund,Bundesliga,GER,GK,36.0,1980.0,2017-2018,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6465,1877,Alexander Meier,Eint Frankfurt,Bundesliga,GER,FW,34.0,1983.0,2017-2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Riempie i NaN con 0 per le stats, non dovrebbero esserci valori a NaN
df_clean = df.fillna(0).copy()

# creo un ID univoco per ogni giocatore in una stagione.
# in modo che MOFA lo consideri come un'entità separata
df_clean['SampleID'] = df_clean['Player'] + '_' + df_clean['Season']
df_clean = df_clean.set_index('SampleID')

# definisco le viste
views_definition = {
    'Attacco': ['Goals', 'Shots', 'SoT', 'Assists', 'G/Sh', 'G/SoT'],
    'Possesso': ['Passes_Total', 'Passes_Completed', 'Dribbles_Completed', 'Touches'],
    'Difesa': ['Tackles', 'Interceptions', 'Blocks', 'Clearances', 'Aerials_Won'],
    'Disciplina': ['Cards_Yellow', 'Cards_Red', 'Fouls']
}

data_for_mofa = [] # Lista di matrici
view_names = []    # Lista dei nomi delle viste

print("Creazione matrici per MOFA...")
for view, cols in views_definition.items():
    # Selezioniamo solo le colonne che esistono davvero nel dataset
    valid_cols = [c for c in cols if c in df_clean.columns]
    
    if valid_cols:
        # Estraiamo la matrice di dati
        matrix = df_clean[valid_cols].values
        
        # MOFA vuole le dimensioni (N_samples, N_features)
        print(f"Vista '{view}': {matrix.shape[1]} features trovate.")
        
        data_for_mofa.append(matrix)
        view_names.append(view)
    else:
        print(f"ATTENZIONE: Nessuna colonna trovata per la vista '{view}'. Controlla i nomi.")

Creazione matrici per MOFA...
Vista 'Attacco': 3 features trovate.
Vista 'Possesso': 1 features trovate.
Vista 'Difesa': 1 features trovate.
ATTENZIONE: Nessuna colonna trovata per la vista 'Disciplina'. Controlla i nomi.
