## Import Dataset e Data Preparation

In [2]:
import sys
import pandas as pd
import os

current_dir = os.getcwd()  # Cartella corrente
root_path = os.path.abspath(os.path.join(current_dir, '..')) # Percorso Root
sys.path.append(root_path)


In [3]:
# Definizione dei percorsi relativi ai dataset
path_dataset = "dataset"
path_train = os.path.join(path_dataset, "train")
path_test = os.path.join(path_dataset, "test")

In [4]:
# Funzione per creare DataFrame a partire dai file CSV mantenendo precisione completa
def create_dataframe(path):
    dfs_data = []
    data_path = os.path.join(path, "data")
    
    # Controllo se la cartella esiste
    if not os.path.exists(data_path):
        print(f"Errore: la cartella {data_path} non esiste.")
        return None, None

    filenames = sorted(os.listdir(data_path))
    
    for case, filename in enumerate(filenames):
        if filename.endswith('.csv'):
            file_path = os.path.join(data_path, filename)

            # Lettura CSV come stringhe per evitare approssimazioni
            df = pd.read_csv(file_path, dtype=str)  
            
            # Converti manualmente le colonne numeriche in float64 per mantenere precisione
            for col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")  

            df["Case"] = case + 1  # Aggiunta della colonna "Case"
            dfs_data.append(df)

    if dfs_data:
        df_data = pd.concat(dfs_data, ignore_index=True)
    else:
        df_data = None

    return df_data, dfs_data

In [5]:
# Funzione per importare i file di label
def create_df_labels(path, filename):
    file_path = os.path.join(path, filename)
    df = pd.read_excel(file_path, header=1)
    
    if "train" in path:
        df = df.rename(columns={"Unnamed: 0": "Case", "Unnamed: 1": "Spacecraft", "Unnamed: 2": "Condition"})
    elif "test" in path:
        df = df.rename(columns={"Case#": "Case", "Spacecraft#": "Spacecraft"})
        
    return df

In [6]:

# Creazione dei DataFrame per train e test
df_train, train_list = create_dataframe(path_train)
df_test, test_list = create_dataframe(path_test)

# Importazione dei label
df_labels_train = create_df_labels(path_train, "labels.xlsx")
df_labels_test = create_df_labels(path_test, "labels_spacecraft.xlsx")

# Visualizzazione dei DataFrame con display()
if df_train is not None:
    print("Train Dataset:")
    display(df_train.head())

if df_test is not None:
    print("Test Dataset:")
    display(df_test.head())

if df_labels_train is not None:
    print("Labels Train:")
    display(df_labels_train.head())

if df_labels_test is not None:
    print("Labels Test:")
    display(df_labels_test.head())

Train Dataset:


Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case
0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
1,0.001,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
2,0.002,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
3,0.003,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
4,0.004,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1


Test Dataset:


Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case
0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
1,0.001,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
2,0.002,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
3,0.003,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
4,0.004,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1


Labels Train:


Unnamed: 0,Case,Spacecraft,Condition,SV1,SV2,SV3,SV4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,BV1
0,1,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
1,2,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
2,3,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
3,4,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
4,5,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No


Labels Test:


Unnamed: 0,Case,Spacecraft
0,178,1
1,179,1
2,180,1
3,181,1
4,182,1


In [7]:
# Creazione della cartella "dataframes" se non esiste
path_dataframes = os.path.join("dataset", "dataframes")
os.makedirs(path_dataframes, exist_ok=True)

# Funzione per salvare in formato Pickle
def save_dataframe_pickle(df, filename):
    filepath = os.path.join(path_dataframes, filename)
    df.to_pickle(filepath)
    print(f"Salvato: {filepath}")

# Caricamento dei DataFrame
df_train = pd.read_csv("dataset/dataframes/train_data.csv")
df_test = pd.read_csv("dataset/dataframes/test_data.csv")
df_labels_train = pd.read_csv("dataset/dataframes/labels_train.csv") if os.path.exists("dataset/dataframes/labels_train.csv") else None
df_labels_test = pd.read_csv("dataset/dataframes/labels_test.csv") if os.path.exists("dataset/dataframes/labels_test.csv") else None

# Salvataggio in formato Pickle
save_dataframe_pickle(df_train, "train_data.pkl")
save_dataframe_pickle(df_test, "test_data.pkl")

if df_labels_train is not None:
    save_dataframe_pickle(df_labels_train, "labels_train.pkl")
if df_labels_test is not None:
    save_dataframe_pickle(df_labels_test, "labels_test.pkl")

Salvato: dataset\dataframes\train_data.pkl
Salvato: dataset\dataframes\test_data.pkl
Salvato: dataset\dataframes\labels_train.pkl
Salvato: dataset\dataframes\labels_test.pkl


Dataframe etichettato di train non aggregato

In [8]:
# Caricamento dei dataset
df_train = pd.read_csv(os.path.join(path_dataframes, "train_data.csv"))
df_labels_train = pd.read_csv(os.path.join(path_dataframes, "labels_train.csv"))

# Unione dei DataFrame usando la colonna "Case" come chiave
df_train_labeled = df_train.merge(df_labels_train, on="Case", how="left")

# Salvataggio del nuovo DataFrame in CSV
df_train_labeled.to_csv(os.path.join(path_dataframes, "train_data_labeled.csv"), index=False)
save_dataframe_pickle(df_train_labeled, "train_data_labeled.pkl")

print("✅ DataFrame unito salvato in formato Pickle: dataset/dataframes/train_data_labeled.pkl")
print("✅ DataFrame unito salvato in: dataset/dataframes/train_data_labeled.csv")

# Mostrare le prime righe del nuovo dataset per verifica
display(df_train_labeled)


Salvato: dataset\dataframes\train_data_labeled.pkl
✅ DataFrame unito salvato in formato Pickle: dataset/dataframes/train_data_labeled.pkl
✅ DataFrame unito salvato in: dataset/dataframes/train_data_labeled.csv


Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case,Spacecraft,...,SV3,SV4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,BV1
0,0.000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1,1,...,100,100,No,No,No,No,No,No,No,No
1,0.001,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1,1,...,100,100,No,No,No,No,No,No,No,No
2,0.002,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1,1,...,100,100,No,No,No,No,No,No,No,No
3,0.003,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1,1,...,100,100,No,No,No,No,No,No,No,No
4,0.004,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1,1,...,100,100,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212572,1.196,1.948734,1.912482,1.904362,1.835654,1.924899,1.857220,1.920020,177,3,...,100,100,No,No,No,No,No,No,No,Yes
212573,1.197,1.948823,1.899824,1.884360,1.825497,1.927513,1.846068,1.919081,177,3,...,100,100,No,No,No,No,No,No,No,Yes
212574,1.198,1.957784,1.911383,1.893740,1.859805,1.940953,1.861668,1.950562,177,3,...,100,100,No,No,No,No,No,No,No,Yes
212575,1.199,1.970451,1.950009,1.945417,1.913911,1.953648,1.835381,1.983321,177,3,...,100,100,No,No,No,No,No,No,No,Yes


In [None]:
# Caricamento dei dataset
df_train = pd.read_csv(os.path.join(path_dataframes, "test_data.csv"))
df_labels_train = pd.read_csv(os.path.join(path_dataframes, "labels_test.csv"))

# Unione dei DataFrame usando la colonna "Case" come chiave
df_train_labeled = df_train.merge(df_labels_train, on="Case", how="left")

# Salvataggio del nuovo DataFrame in CSV
df_train_labeled.to_csv(os.path.join(path_dataframes, "train_data_labeled.csv"), index=False)
save_dataframe_pickle(df_train_labeled, "train_data_labeled.pkl")

print("✅ DataFrame unito salvato in formato Pickle: dataset/dataframes/train_data_labeled.pkl")
print("✅ DataFrame unito salvato in: dataset/dataframes/train_data_labeled.csv")

# Mostrare le prime righe del nuovo dataset per verifica
display(df_train_labeled)