In [28]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import shutil
import csv
import re
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np

import sys
sys.path.append("./missing_data")

from mpl_toolkits.basemap import Basemap
from build_dataset import get_files_from_folder, extract_dates_pattern_airmass_rgb_20200101_0000
from build_dataset import load_cyclones_track_noheader, compute_pixel_scale, inside_tile, calc_tile_offsets, save_single_tile
from build_dataset import split_into_tiles_subfolders_and_track_dates, create_tile_videos, create_and_save_tile_from_complete_df
from medicane_utils.geo_const import latcorners, loncorners, x_center, y_center, basemap_obj
from medicane_utils.load_files import load_all_images

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Unpack the four values returned by compute_pixel_scale().
Xmin, Ymin, px_scale_x, px_scale_y = compute_pixel_scale()
# Size of a 224-pixel span along y and along x, divided by 1000
# (presumably the tile height/width in km, if the scales are pixels per metre -- TODO confirm units).
1/px_scale_y/1000 * 224, 1/px_scale_x/1000 * 224

(672.3640767222608, 672.0122685182674)

In [107]:
# Path to the folder containing the frames
input_dir = "../fromgcloud"
# Destination folder. The trailing slash matters: create_final_df_csv builds paths
# with plain string concatenation (output_dir + path).
output_dir = "../airmassRGB/supervised/" 

#### load all file names and dates

In [None]:
#os.makedirs(output_dir, exist_ok=True)
sorted_metadata_files = load_all_images(input_dir)

In [None]:
# Keep only the first 32 entries of the loaded file list.
f_sorted_metadata = sorted_metadata_files[:32]
#f_sorted_metadata

In [None]:
num_total_files = len(f_sorted_metadata)
# how many blocks of 16 consecutive frames
# (floor division: a trailing partial block is not counted)
num_frames = 16
num_subfolders = num_total_files // num_frames

### Dataframe di tutte le tiles etichettate - check

In [3]:
# Load the table of labelled tiles with explicit, compact dtypes.
# x_pix / y_pix use the nullable 'Int16' dtype so that empty cells are allowed.
df_data = pd.read_csv(
    "all_data.csv",
    parse_dates=['datetime'],
    dtype=dict(
        path='string',
        tile_offset_x='int16',
        tile_offset_y='int16',
        label='category',
        lat='float32',
        lon='float32',
        x_pix='Int16',
        y_pix='Int16',
        name='string',
    ),
).drop(columns="Unnamed: 0")  # presumably the index column written by an earlier to_csv

In [4]:
df_data

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label,lat,lon,x_pix,y_pix,name
0,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01,0,0,0,,,,,[]
1,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01,112,0,0,,,,,[]
2,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01,224,0,0,,,,,[]
3,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01,336,0,0,,,,,[]
4,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01,448,0,0,,,,,[]
...,...,...,...,...,...,...,...,...,...,...
2355275,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,560,112,0,,,,,[]
2355276,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,672,112,0,,,,,[]
2355277,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,784,112,0,,,,,[]
2355278,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,896,112,0,,,,,[]


In [5]:
# Split the table into temporally contiguous groups.
# Steps, in order:
#   1. sort by timestamp;
#   2. delta     = time difference from the previous row;
#   3. new_group = True where the gap exceeds the expected frequency
#                  (60 minutes here; widen the margin if needed);
#   4. gruppo    = running count of the breaks, used as the group id.
df_data = (
    df_data.sort_values('datetime')
    .assign(delta=lambda frame: frame['datetime'].diff())
    .assign(new_group=lambda frame: frame['delta'] > pd.Timedelta(minutes=60))
    .assign(gruppo=lambda frame: frame['new_group'].cumsum())
)

# One DataFrame per contiguous group, ordered by group id
gruppi_date = [chunk for _, chunk in df_data.groupby('gruppo')]

### verifico i gruppi di periodi contigui

In [5]:
# Sanity-check the contiguous groups built from df_data.
lista_date = []               # number of tiles for every (group, timestamp) pair
lungh_gruppi = []             # number of distinct timestamps in each group
date_inizio_fine_gruppi = []  # (first, last) timestamp of each group
for df_gruppo in gruppi_date:
    tiles_per_datetime = df_gruppo.groupby("datetime").size()
    lista_date.extend(tiles_per_datetime.tolist())
    # Count the timestamps directly. The previous version divided the row count by the
    # tile count of whichever timestamp the inner loop visited last, which is only
    # correct when every timestamp has the same number of tiles.
    lungh_gruppi.append(len(tiles_per_datetime))
    # First/last row are the earliest/latest timestamps only if the group is sorted by datetime.
    start_date = df_gruppo.datetime.iloc[0]
    end_date = df_gruppo.datetime.iloc[-1]
    date_inizio_fine_gruppi.append((start_date, end_date))



In [25]:
# Group lengths in units of 288 frames (presumably one day at a 5-minute cadence -- TODO confirm),
# together with the set of distinct tiles-per-timestamp counts.
np.array(lungh_gruppi)/288, set(lista_date) # we have 20 tiles for every date

(array([ 10.00347222,  10.00347222,   8.00347222,   8.00347222,
          4.98611111,   2.93402778,   5.37847222,  54.65625   ,
        277.73611111,   4.00347222,   5.37847222,   8.97916667,
          0.81597222,   8.02083333]),
 {20})

In [42]:
date_inizio_fine_gruppi

[(Timestamp('2011-11-01 00:00:00'), Timestamp('2011-11-11 00:00:00')),
 (Timestamp('2014-11-03 00:00:00'), Timestamp('2014-11-13 00:00:00')),
 (Timestamp('2016-10-26 00:00:00'), Timestamp('2016-11-03 00:00:00')),
 (Timestamp('2017-11-13 00:00:00'), Timestamp('2017-11-21 00:00:00')),
 (Timestamp('2018-09-25 00:00:00'), Timestamp('2018-09-29 23:35:00')),
 (Timestamp('2018-09-30 01:40:00'), Timestamp('2018-10-03 00:00:00')),
 (Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-06 09:00:00')),
 (Timestamp('2020-02-03 09:05:00'), Timestamp('2020-03-29 00:55:00')),
 (Timestamp('2020-03-29 02:00:00'), Timestamp('2020-12-31 23:55:00')),
 (Timestamp('2021-10-29 00:00:00'), Timestamp('2021-11-02 00:00:00')),
 (Timestamp('2021-11-04 00:00:00'), Timestamp('2021-11-09 09:00:00')),
 (Timestamp('2023-02-24 00:00:00'), Timestamp('2023-03-05 00:00:00')),
 (Timestamp('2023-09-03 00:00:00'), Timestamp('2023-09-03 19:30:00')),
 (Timestamp('2023-09-03 23:25:00'), Timestamp('2023-09-12 00:00:00'))]

### Sembra ben formato -> procedo a creare i video, per ogni blocco temporale

In [89]:
# Select the contiguous group at index 8. According to the date_inizio_fine_gruppi output
# above it spans 2020-03-29 02:00 -> 2020-12-31 23:55.
# NOTE(review): the index is hard-coded; it is apparently edited by hand to process another period.
df = gruppi_date[8]

In [90]:
df

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label,lat,lon,x_pix,y_pix,name,delta,new_group,gruppo
598872,../fromgcloud/airmass_rgb_20200329_0200.png,2020-03-29 02:00:00,224,112,0,,,,,[],0 days 01:05:00,True,8
598873,../fromgcloud/airmass_rgb_20200329_0200.png,2020-03-29 02:00:00,336,112,0,,,,,[],0 days 00:00:00,False,8
598874,../fromgcloud/airmass_rgb_20200329_0200.png,2020-03-29 02:00:00,448,112,0,,,,,[],0 days 00:00:00,False,8
598879,../fromgcloud/airmass_rgb_20200329_0200.png,2020-03-29 02:00:00,1008,112,0,,,,,[],0 days 00:00:00,False,8
598877,../fromgcloud/airmass_rgb_20200329_0200.png,2020-03-29 02:00:00,784,112,0,,,,,[],0 days 00:00:00,False,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2198603,../fromgcloud/airmass_rgb_20201231_2355.png,2020-12-31 23:55:00,336,0,0,,,,,[],0 days 00:00:00,False,8
2198602,../fromgcloud/airmass_rgb_20201231_2355.png,2020-12-31 23:55:00,224,0,0,,,,,[],0 days 00:00:00,False,8
2198601,../fromgcloud/airmass_rgb_20201231_2355.png,2020-12-31 23:55:00,112,0,0,,,,,[],0 days 00:00:00,False,8
2198600,../fromgcloud/airmass_rgb_20201231_2355.png,2020-12-31 23:55:00,0,0,0,,,,,[],0 days 00:00:00,False,8


In [None]:
#grouped = df.groupby(["tile_offset_x", "tile_offset_y"], group_keys=False)  # ottengo 20 gruppi  perché tante sono le tiles

In [91]:
# Build the video table for the selected period
# (presumably one row per tile video -- see create_tile_videos in build_dataset).
df_videos = create_tile_videos(df)

# Boolean masks on the label column: 1 -> "cicloni" (cyclones), 0 -> "non cicloni" (no cyclone)
m_cicloni = df_videos.label == 1
m_non_cicloni = df_videos.label == 0

df_cicloni = df_videos[m_cicloni]
df_non_cicloni = df_videos[m_non_cicloni]



In [92]:
print(len(df_cicloni), len(df_non_cicloni), len(df_videos))
#for idx, row in df_res[m_cicloni].iterrows():
#    print(row)

2705 97275 99980


In [93]:
# Undersample the non-cyclone videos down to the number of cyclone videos.
# A fixed random_state makes the draw reproducible: the sampled rows are the ones handed to
# create_and_save_tile_from_complete_df, so an unseeded draw would pick different videos on every run.
df_0_balanced = df_non_cicloni.sample(len(df_cicloni), random_state=478562)
len(df_0_balanced)

2705

In [94]:
# Process the sampled non-cyclone videos first, then all cyclone videos, with the same output_dir
# (presumably this writes the tile images to disk -- see build_dataset for details).
create_and_save_tile_from_complete_df(df_0_balanced, output_dir)
create_and_save_tile_from_complete_df(df_cicloni, output_dir)

In [96]:
# NOTE(review): this cell was apparently re-run once per time group, un-commenting a different
# target name each time. On a fresh kernel only df_creati_2020_3 is assigned; the other
# df_creati_* names referenced further down exist only as leftover kernel state.
#df_creati_2011 = pd.concat([df_cicloni, df_0_balanced])
#df_creati_2014 = pd.concat([df_cicloni, df_0_balanced])
#df_creati_2016 = pd.concat([df_cicloni, df_0_balanced])
#df_creati_2017 = pd.concat([df_cicloni, df_0_balanced])
#df_creati_2020_1 = pd.concat([df_cicloni, df_0_balanced])
#df_creati_2020_2 = pd.concat([df_cicloni, df_0_balanced])
df_creati_2020_3 = pd.concat([df_cicloni, df_0_balanced])

In [101]:
print(df_creati_2020_3.shape)
df_creati_2020_3.to_csv("df_test_2020.csv")

(5410, 8)


In [99]:
# Training table: concatenation of the per-period frames (2011, 2014, 2016, 2017, 2020_1, 2020_2).
# NOTE(review): these df_creati_* names are only assigned in commented-out lines of an earlier
# cell, so this cell raises NameError on a fresh kernel.
df_train = pd.concat([df_creati_2011,
df_creati_2014,
df_creati_2016,
df_creati_2017,
df_creati_2020_1,
df_creati_2020_2])

In [102]:
print(df_train.shape)
df_train.to_csv("df_train.csv")

(3376, 8)


### Creazione csv per il training

In [115]:
def create_final_df_csv(df_in, output_dir, start=1, end=16):
    """
    Build the (path, start, end, label) table used as the dataset CSV.

    Parameters:
    - df_in: DataFrame with at least the columns 'path' and 'label'; it is not modified
    - output_dir: string prepended to every 'path' value by plain concatenation,
      so it must already end with a path separator
    - start: value written in the 'start' column of every row (default: 1)
    - end: value written in the 'end' column of every row (default: 16)

    Returns a new DataFrame with the columns 'path', 'start', 'end', 'label', in this order.
    """
    # Work on an explicit copy: assigning columns to a column-subset of df_in
    # triggers pandas' SettingWithCopyWarning.
    df_dataset_csv = df_in[['path', 'label']].copy()
    df_dataset_csv['path'] = output_dir + df_dataset_csv['path']
    df_dataset_csv['start'] = start
    df_dataset_csv['end'] = end
    return df_dataset_csv[['path', 'start', 'end', 'label']]


In [117]:
df_dataset_csv_test = create_final_df_csv(df_creati_2020_3, output_dir)

In [113]:
# Build the training table before writing it: no visible cell assigns `df_dataset_csv`
# at notebook scope, so the bare to_csv call only worked with leftover kernel state.
# NOTE(review): df_train is assumed to be the intended input, mirroring the test-set cell -- confirm.
df_dataset_csv = create_final_df_csv(df_train, output_dir)
df_dataset_csv.to_csv("./train_supervised.csv", index=False)

In [118]:
df_dataset_csv_test.to_csv("./test_supervised.csv", index=False)

In [104]:
df_out[df_out.label ==1]

Unnamed: 0,path,start,end,label,start_time,end_time
22395,../airmassRGB/supervised/part316_336_0,1,16,1,2014-11-10 11:55:00,2014-11-10 13:10:00
22396,../airmassRGB/supervised/part317_336_0,1,16,1,2014-11-10 13:15:00,2014-11-10 14:30:00
23716,../airmassRGB/supervised/part1637_336_0,1,16,1,2020-03-15 23:05:00,2020-03-16 00:20:00
23717,../airmassRGB/supervised/part1638_336_0,1,16,1,2020-03-16 00:25:00,2020-03-16 01:40:00
23718,../airmassRGB/supervised/part1639_336_0,1,16,1,2020-03-16 01:45:00,2020-03-16 03:00:00
...,...,...,...,...,...,...
139242,../airmassRGB/supervised/part6763_896_112,1,16,1,2020-12-25 23:05:00,2020-12-26 00:20:00
139243,../airmassRGB/supervised/part6764_896_112,1,16,1,2020-12-26 00:25:00,2020-12-26 01:40:00
139244,../airmassRGB/supervised/part6765_896_112,1,16,1,2020-12-26 01:45:00,2020-12-26 03:00:00
139245,../airmassRGB/supervised/part6766_896_112,1,16,1,2020-12-26 03:05:00,2020-12-26 04:20:00


In [84]:
# Rename the 'folder' column to 'path'.
# NOTE(review): df_out is not created in any visible cell; presumably it comes from an
# earlier session or a deleted cell -- confirm its origin.
df_out = df_out.rename(columns={'folder':'path'})

In [85]:
df_out.to_csv("./df_videotiles_date.csv", index=False)

### Guardo alcuni video da cancellare dal disco per risparmiare spazio

In [128]:
# Sum of the labels over the rows whose path contains "/part43" followed by at least one digit
# (equals the number of positive rows, assuming labels are 0/1).
df_out[df_out['path'].str.contains(r"/part43\d+", regex=True)].label.sum()

0

In [129]:
# Keep only the rows whose 'path' is an existing directory on disk, then report
# how many rows survive compared with the original table.
df_filt = df_out[df_out['path'].apply(os.path.isdir)]
print(f"Numero di cartelle esistenti: {len(df_filt)}, righe totali precedenti: {df_out.shape[0]}")

Numero di cartelle esistenti: 126001, righe totali precedenti: 147200


In [130]:
df_filt[df_filt.label==1].shape[0], df_filt[df_filt.label==0].shape[0], round(df_filt.label.sum()/df_filt.shape[0], 3)

(1512, 124489, 0.012)

In [62]:
def balance_df_by_label(df, label_column="label", random_state=478562):
    """
    Balance a DataFrame by undersampling the negative class.

    All rows with label == 1 are kept and the same number of rows with
    label == 0 is drawn at random without replacement. The result is shuffled
    and its index is reset to 0..n-1.

    Parameters:
    - df: the original DataFrame
    - label_column: name of the column holding the label (default: 'label')
    - random_state: seed used for both the negative sampling and the final
      shuffle, for reproducibility

    Returns:
    - a new shuffled DataFrame with equal numbers of label 1 and label 0 rows

    Raises:
    - ValueError: if there are fewer label == 0 rows than label == 1 rows,
      since the negatives cannot then be sampled without replacement
    """
    # Split the rows by class
    df_pos = df[df[label_column] == 1]
    df_neg = df[df[label_column] == 0]

    num_pos = len(df_pos)
    num_neg = len(df_neg)
    # Fail early with an explicit message instead of the generic pandas sampling error
    if num_neg < num_pos:
        raise ValueError(
            f"Cannot balance: {num_pos} rows with {label_column}=1 "
            f"but only {num_neg} rows with {label_column}=0"
        )

    # Draw as many negatives as there are positives
    df_neg_sampled = df_neg.sample(n=num_pos, random_state=random_state)

    # Concatenate, shuffle and renumber
    df_balanced = pd.concat([df_pos, df_neg_sampled])
    df_balanced = df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_balanced

In [131]:
df_balanced = balance_df_by_label(df_filt)
print(len(df_balanced), df_balanced['label'].value_counts())

3024 label
1    1512
0    1512
Name: count, dtype: int64


In [137]:
df_balanced


Unnamed: 0,path,start,end,label,start_time,end_time
0,../airmassRGB/supervised/part6762_784_112,1,16,1,2020-12-25 21:45:00,2020-12-25 23:00:00
1,../airmassRGB/supervised/part6102_784_112,1,16,1,2020-11-19 04:50:00,2020-11-19 06:05:00
2,../airmassRGB/supervised/part6101_784_112,1,16,1,2020-11-19 03:30:00,2020-11-19 04:45:00
3,../airmassRGB/supervised/part3246_560_0,1,16,1,2020-06-13 11:20:00,2020-06-13 12:35:00
4,../airmassRGB/supervised/part1925_336_112,1,16,0,2020-04-01 00:05:00,2020-04-01 01:20:00
...,...,...,...,...,...,...
3019,../airmassRGB/supervised/part1653_560_0,1,16,1,2020-03-16 20:25:00,2020-03-16 21:40:00
3020,../airmassRGB/supervised/part1993_896_112,1,16,0,2020-04-04 18:45:00,2020-04-04 20:00:00
3021,../airmassRGB/supervised/part3207_448_112,1,16,1,2020-06-11 07:20:00,2020-06-11 08:35:00
3022,../airmassRGB/supervised/part2788_336_112,1,16,0,2020-05-18 23:05:00,2020-05-19 00:20:00


In [135]:
# Positional 70% / 20% / 10% split of the balanced table (already shuffled by balance_df_by_label).
l = len(df_balanced)
tr_p, val_p = int(l * 0.7), int(l * 0.9)  # end of the train slice, end of the validation slice
print(f"ranges train e validation:  {tr_p} - {val_p}")
df_train, df_val, df_test = (
    df_balanced.iloc[:tr_p],
    df_balanced.iloc[tr_p:val_p],
    df_balanced.iloc[val_p:],
)

ranges train e validation:  2116 - 2721


In [136]:
# Write the three splits next to the notebook, without the index column.
for csv_name, split_df in (
    ("train_supervised.csv", df_train),
    ("val_supervised.csv", df_val),
    ("test_supervised.csv", df_test),
):
    out_csv_label = os.path.join("./", csv_name)
    split_df.to_csv(out_csv_label, index=False)

### copia le immagini in sottocartelle .../partN

In [27]:
from build_supervised_dataset_no_tiles import split_into_subfolders_and_track_dates

In [None]:
#subfolders = split_into_subfolders(sorted_filenames, output_dir)
# NOTE(review): sorted_filenames is not defined in any visible cell (the loading cell near the
# top creates sorted_metadata_files) -- confirm which variable is intended.
subfolder_info = split_into_subfolders_and_track_dates(sorted_filenames, output_dir)

In [None]:
#subfolder_info

### Suddivide in train/test/val e scrive i CSV

In [28]:
output_dir

'E:/Medicanes_Data/airmassRGB'

In [30]:
from build_dataset import create_csv

In [33]:
create_csv(output_dir)

File CSV generati:
Train: E:/Medicanes_Data/airmassRGB\train.csv
Test: E:/Medicanes_Data/airmassRGB\test.csv
Validation: E:/Medicanes_Data/airmassRGB\val.csv


# Per il dataset supervisionato 
(senza tiles -> sbagliato perché ogni etichetta va associata alla singola tile)

In [None]:
from build_supervised_dataset_no_tiles import create_supervised_csv_from_info, is_in_medicane
from medicane_utils.load_files import load_medicane_intervals

In [36]:
folder_root = output_dir      # folder containing the subfolders part1, part2 ...
medicane_csv = "./medicane_validi.csv"    # the medicane dates
out_csv = "./dataset.csv"
# NOTE(review): folder_root is assigned above but is not passed to this call.
create_supervised_csv_from_info(subfolder_info, medicane_csv, out_csv)

Creato CSV supervisionato in: ./dataset.csv
