In [26]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import shutil
import csv
import re
from collections import defaultdict
from datetime import datetime
import pandas as pd

import sys
sys.path.append("./missing_data")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def get_files_from_folder(folder, extension, verbose=False):
    """
    Recursively collect every path under ``folder`` whose name ends in
    ``.<extension>``.

    Parameters
    ----------
    folder : str or Path
        Root directory to search (all nested directories are visited).
    extension : str
        File extension without the leading dot, e.g. ``"png"``.
    verbose : bool, optional
        When True, print how many paths were found.

    Returns
    -------
    list of Path
        Matching paths, in the order produced by ``Path.rglob``.
    """
    glob_pattern = f"*.{extension}"
    matches = [path for path in Path(folder).rglob(glob_pattern)]
    if verbose:
        print(f"Ci sono {len(matches)} files")
    return matches

def extract_dates_pattern_airmass_rgb_20200101_0000(filename):
    """
    Parse the acquisition timestamp encoded in an airmass RGB file name.

    Expected file name shape:
    airmass_rgb_20200101_0000.png  (airmass_rgb_YYYYMMDD_HHMM.png)

    Parameters
    ----------
    filename : str
        Bare file name (no directory component).

    Returns
    -------
    datetime or None
        The timestamp built from the date and time fields, or None when the
        name does not follow the expected pattern.
    """
    parsed = re.match(r"^airmass_rgb_(\d{8})_(\d{4})\.png$", filename)
    if parsed is None:
        return None
    # Group 1 is YYYYMMDD, group 2 is HHMM; joined they form one timestamp.
    day_part, clock_part = parsed.groups()
    return datetime.strptime(day_part + clock_part, '%Y%m%d%H%M')

In [32]:
# Path to the folder holding the frames
input_dir = "./from_gcloud"  # change this path as needed
output_dir = "./airmassRGB"  # where the subfolders and CSV files are saved
os.makedirs(output_dir, exist_ok=True)

filenames = get_files_from_folder(folder=input_dir, extension="png")

# Pair every file with the timestamp parsed from its name (None when the name
# does not follow the airmass_rgb_YYYYMMDD_HHMM.png pattern).
file_metadata = [
    (fname, extract_dates_pattern_airmass_rgb_20200101_0000(fname.name))
    for fname in filenames
]

# Files without a parsable timestamp cannot be ordered: sorting a mix of None
# and datetime values raises TypeError, so they are left out and reported.
dated_files = [item for item in file_metadata if item[1] is not None]
num_skipped = len(file_metadata) - len(dated_files)
if num_skipped:
    print(f"Ignorati {num_skipped} file con nome non riconosciuto")

sorted_files = sorted(dated_files, key=lambda x: x[1])  # order by timestamp
sorted_filenames = [item[0] for item in sorted_files]


In [10]:
#sorted_filenames

### copia le immagini in sottocartelle .../partN

In [None]:
def split_into_subfolders_and_track_dates(images, output_dir, num_frames=16):
    """
    Copy ``images`` into consecutive ``partN`` subfolders of ``num_frames``
    frames each and record one representative date per subfolder.

    Parameters
    ----------
    images : sequence of Path
        Source image paths, already in the desired order.
    output_dir : str
        Directory in which the ``part1``, ``part2``, ... folders are created.
    num_frames : int, optional
        Number of frames copied into each subfolder.

    Returns
    -------
    list of dict
        One ``{"folder": <subfolder path>, "date": <datetime or None>}`` entry
        per subfolder; the date is parsed from the name of the frame at
        offset ``num_frames // 2`` inside the group.

    Notes
    -----
    Only complete groups are written: trailing images that do not fill a
    whole group of ``num_frames`` are not copied.
    """
    chunk_count = len(images) // num_frames
    half_window = num_frames // 2
    records = []

    for part_no in range(1, chunk_count + 1):
        first = (part_no - 1) * num_frames
        target_dir = os.path.join(output_dir, f"part{part_no}")
        os.makedirs(target_dir, exist_ok=True)

        # Frames are renamed img_00001.png, img_00002.png, ... inside the group.
        for position, source in enumerate(images[first:first + num_frames], start=1):
            shutil.copy(source, os.path.join(target_dir, f"img_{position:05d}.png"))

        # The central frame of the group provides the date for the whole group.
        central_frame = images[first + half_window]
        records.append({
            "folder": target_dir,
            "date": extract_dates_pattern_airmass_rgb_20200101_0000(central_frame.name),
        })

    return records

In [34]:
# Copy the timestamp-sorted frames into output_dir/partN subfolders and keep,
# for each subfolder, its path and the date parsed from its central frame.
subfolder_info = split_into_subfolders_and_track_dates(sorted_filenames, output_dir)

In [16]:
#subfolder_info

### Suddivide in train/test/val e scrive i CSV

In [28]:
# Display the directory that is passed to create_csv below.
output_dir

'E:/Medicanes_Data/airmassRGB'

In [30]:
from build_dataset import create_csv

In [33]:
# NOTE(review): create_csv comes from build_dataset (not visible here); judging
# by its printed output it writes train.csv, test.csv and val.csv into
# output_dir - confirm the split logic in that module.
create_csv(output_dir)

File CSV generati:
Train: E:/Medicanes_Data/airmassRGB\train.csv
Test: E:/Medicanes_Data/airmassRGB\test.csv
Validation: E:/Medicanes_Data/airmassRGB\val.csv


In [29]:
# Rewrites train/test/val CSV files from the partN subfolders found on disk.

def write_to_csv(dirs, csv_file, num_frames=16):
    """
    Write one CSV row per clip folder in the ``path,start,end`` format.

    Parameters
    ----------
    dirs : iterable of str
        Clip folder paths, one row each.
    csv_file : str
        Destination CSV path (overwritten if it exists).
    num_frames : int, optional
        Index of the last frame of each clip, written in the ``end`` column;
        the ``start`` column is always 1.
    """
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["path", "start", "end"])  # header
        for dir_path in dirs:
            writer.writerow([dir_path, 1, num_frames])


def _part_sort_key(path):
    """Order folders by their trailing number so part2 precedes part10."""
    trailing_number = re.search(r"(\d+)$", os.path.basename(path))
    index = int(trailing_number.group(1)) if trailing_number else float("inf")
    return (index, path)


# List the subfolders BEFORE computing the split sizes: the split indices are
# derived from this list, so it must exist first on a fresh kernel.
# A plain lexicographic sort would interleave the folders (part1, part10,
# part100, part2, ...); the numeric key keeps them in creation order.
subfolders = sorted(
    (
        os.path.join(output_dir, d)
        for d in os.listdir(output_dir)
        if os.path.isdir(os.path.join(output_dir, d))
    ),
    key=_part_sort_key,
)

total = len(subfolders)
train_split = int(total * 0.7)   # first 70% of the folders -> train
test_split = int(total * 0.99)   # next 29% -> test, remainder -> validation

train_dirs = subfolders[:train_split]
test_dirs = subfolders[train_split:test_split]
val_dirs = subfolders[test_split:]

# Destination files live next to the partN folders.
train_csv = os.path.join(output_dir, "train.csv")
test_csv = os.path.join(output_dir, "test.csv")
val_csv = os.path.join(output_dir, "val.csv")

write_to_csv(train_dirs, train_csv)
write_to_csv(test_dirs, test_csv)
write_to_csv(val_dirs, val_csv)

print(f"File CSV generati:\nTrain: {train_csv}\nTest: {test_csv}\nValidation: {val_csv}")

File CSV generati:
Train: E:/Medicanes_Data/airmassRGB\train.csv
Test: E:/Medicanes_Data/airmassRGB\test.csv
Validation: E:/Medicanes_Data/airmassRGB\val.csv


# Per il dataset supervisionato

In [35]:
def create_supervised_csv_from_info(subfolder_info, medicane_csv, out_csv, num_frames=16):
    """
    Write a labelled dataset CSV with one row per clip folder.

    Each row has the form ``path,start,end,label`` where ``label`` is 1 when
    the folder's date falls inside one of the Medicane intervals and 0
    otherwise.

    Parameters
    ----------
    subfolder_info : iterable of dict
        Entries with the keys ``"folder"`` (clip folder path) and ``"date"``
        (datetime, or None when the date could not be parsed).
    medicane_csv : str
        CSV file with the Medicane intervals, read by
        ``load_medicane_intervals``.
    out_csv : str
        Destination CSV path (overwritten if it exists).
    num_frames : int, optional
        Index of the last frame of each clip, written in the ``end`` column;
        the ``start`` column is always 1.

    Notes
    -----
    Entries whose ``"date"`` is None are skipped and produce no row.
    Uses the module-level ``csv`` import rather than re-importing locally.
    """
    # Load the Medicane intervals once, before iterating over the folders.
    intervals = load_medicane_intervals(medicane_csv)

    with open(out_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["path","start","end","label"])

        for item in subfolder_info:
            folder_path = item["folder"]
            dt = item["date"]
            if dt is None:
                # No date could be parsed for this folder: it cannot be labelled.
                continue
            label = 1 if is_in_medicane(dt, intervals) else 0
            writer.writerow([folder_path, 1, num_frames, label])

    print(f"Creato CSV supervisionato in: {out_csv}")



def load_medicane_intervals(medicane_csv):
    """
    Load the Medicane time intervals from a CSV file.

    The file must provide the columns ``Start_Date`` and ``End_Date``, both
    holding dates formatted as ``YYYY-MM-DD``.

    Parameters
    ----------
    medicane_csv : str
        Path of the CSV file to read.

    Returns
    -------
    list of (datetime, datetime)
        One ``(start, end)`` pair per row. ``start`` is midnight of
        ``Start_Date``; ``end`` is the last microsecond of ``End_Date`` so
        that an inclusive ``start <= t <= end`` check covers the whole final
        day rather than only its first instant.

    Raises
    ------
    KeyError
        If one of the two columns is missing.
    ValueError
        If a date does not match the ``YYYY-MM-DD`` format.
    """
    date_format = "%Y-%m-%d"
    df = pd.read_csv(medicane_csv)

    intervals = []
    for start_raw, end_raw in zip(df['Start_Date'], df['End_Date']):
        start_dt = datetime.strptime(start_raw, date_format)
        # Date-only values parse to 00:00; push the end to 23:59:59.999999 so
        # frames acquired during the last day still fall inside the interval.
        end_dt = datetime.strptime(end_raw, date_format).replace(
            hour=23, minute=59, second=59, microsecond=999999
        )
        intervals.append((start_dt, end_dt))
    return intervals

def is_in_medicane(date_to_check, intervals):
    """
    Tell whether ``date_to_check`` lies inside any of the given intervals.

    Parameters
    ----------
    date_to_check : datetime
        Timestamp to test.
    intervals : iterable of (datetime, datetime)
        ``(start, end)`` pairs; both bounds are inclusive.

    Returns
    -------
    bool
        True as soon as one interval contains the timestamp, else False.
    """
    for begin, finish in intervals:
        if begin <= date_to_check <= finish:
            return True
    return False


In [36]:
# Inputs and output of the supervised (labelled) dataset CSV.
# NOTE(review): folder_root is not used anywhere in the visible cells - confirm
# whether it is still needed.
folder_root = output_dir      # folder holding the part1, part2, ... subfolders
medicane_csv = "./medicane_validi.csv"    # CSV with the Medicane dates
out_csv = "./dataset.csv"
create_supervised_csv_from_info(subfolder_info, medicane_csv, out_csv)

Creato CSV supervisionato in: ./dataset.csv
