# Predict general data

In [2]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import shutil
import csv
import re
from time import time
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
import torch

import sys
sys.path.append("./missing_data")

from dataset.datasets import MedicanesClsDataset
from torch.utils.data import DataLoader
import models
from timm.models import create_model

from mpl_toolkits.basemap import Basemap
from build_dataset import get_files_from_folder, extract_dates_pattern_airmass_rgb_20200101_0000
from build_dataset import load_cyclones_track_noheader, compute_pixel_scale, inside_tile, calc_tile_offsets, save_single_tile
from build_dataset import split_into_tiles_subfolders_and_track_dates, create_and_save_tile_from_complete_df
from build_dataset import create_final_df_csv
from build_dataset import get_gruppi_date, create_tile_videos, group_df_by_offsets
from medicane_utils.geo_const import latcorners, loncorners, x_center, y_center, create_basemap_obj
from medicane_utils.load_files import load_all_images

from view_test_tiles import create_labeled_images_with_tiles, create_mediterranean_video
from model_analysis import predict_label, get_path_pred_label, create_df_predictions

from arguments import prepare_finetuning_args, Args


# Map object built once here; it is passed to the tile-preview helper further down.
basemap_obj = create_basemap_obj()
# CSV with the master tile table that the next cell loads into df_data.
file_master_df = "all_data_full_tiles.csv"

# Destination folder handed to the tile-saving and annotation-building helpers.
output_dir = "../airmassRGB/supervised/" 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

# Load the master tile table with explicit dtypes, parse the timestamp column,
# and discard the index column that was written out when the CSV was saved.
df_data = pd.read_csv(
    file_master_df,
    dtype={
        "path": 'string',
        "tile_offset_x": 'int16',
        "tile_offset_y": 'int16',
        "label": 'int16',
        "lat": 'object',
        "lon": 'object',
        "x_pix": 'object',
        "y_pix": 'object',
        "name": 'string',
        "source": 'string'
    },
    parse_dates=['datetime'],
).drop(columns="Unnamed: 0")

In [3]:
# Keep only the rows strictly after 2023-09-01 00:00.
# .copy() makes df_2023 an independent frame, so later modifications of it
# neither trigger SettingWithCopyWarning nor depend on df_data's internals.
df_2023 = df_data[df_data.datetime > datetime(2023,9,1)].copy()

In [4]:
# Remove the track-related columns that are not needed for inference.
# Reassigning (instead of inplace=True) avoids mutating a slice of df_data, and
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df_2023 = df_2023.drop(columns=['lat','lon','x_pix','y_pix','name'], errors="ignore")
df_2023

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label,source
1382628,../fromgcloud/airmass_rgb_20230903_0000.png,2023-09-03,0,0,0,[]
1382629,../fromgcloud/airmass_rgb_20230903_0000.png,2023-09-03,213,0,0,[]
1382630,../fromgcloud/airmass_rgb_20230903_0000.png,2023-09-03,426,0,0,[]
1382631,../fromgcloud/airmass_rgb_20230903_0000.png,2023-09-03,639,0,0,[]
1382632,../fromgcloud/airmass_rgb_20230903_0000.png,2023-09-03,852,0,0,[]
...,...,...,...,...,...,...
1413163,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,213,196,0,[]
1413164,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,426,196,0,[]
1413165,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,639,196,0,[]
1413166,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12,852,196,0,[]


In [5]:
def sub_select_frequency(df, freq='20min'):
    """Keep only the rows whose timestamp lies exactly on a ``freq`` boundary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``datetime`` column. It is NOT modified.
    freq : str, default '20min'
        Frequency string passed to ``Series.dt.floor``.

    Returns
    -------
    pandas.DataFrame
        New frame with the matching rows (original index preserved) plus a
        ``dt_floor`` column holding the floored timestamp, as before.
    """
    dt_floor = df['datetime'].dt.floor(freq)
    # A timestamp sits on the grid exactly when flooring leaves it unchanged.
    on_grid = df['datetime'] == dt_floor
    # assign() works on a copy, so the caller's frame never gains a column.
    return df.loc[on_grid].assign(dt_floor=dt_floor[on_grid])

In [6]:
# Sub-sample df_2023 with the default 20-minute frequency and show the number of rows kept.
df_2023_20m = sub_select_frequency(df_2023)
df_2023_20m.shape[0]

7644

In [None]:
# Build the per-tile video table from the September 2023 rows.
# NOTE(review): this uses df_2023, not the sub-sampled df_2023_20m — confirm that is intended.
df_offsets_groups = group_df_by_offsets(df_2023)
df_videos = create_tile_videos(df_offsets_groups, supervised=False)
# Placeholder label for every video. Item assignment always targets the column,
# whereas attribute assignment would set a plain attribute if 'label' were missing.
df_videos["label"] = 0
df_videos

Unnamed: 0,video_id,tile_offset_x,tile_offset_y,path,label,start_time,end_time,orig_paths
0,0,0,0,03-09-2023_0115_0_0,0,2023-09-03 00:00:00,2023-09-03 01:15:00,"[../fromgcloud/airmass_rgb_20230903_0000.png, ..."
1,1,0,0,03-09-2023_0235_0_0,0,2023-09-03 01:20:00,2023-09-03 02:35:00,"[../fromgcloud/airmass_rgb_20230903_0120.png, ..."
2,2,0,0,03-09-2023_0355_0_0,0,2023-09-03 02:40:00,2023-09-03 03:55:00,"[../fromgcloud/airmass_rgb_20230903_0240.png, ..."
3,3,0,0,03-09-2023_0515_0_0,0,2023-09-03 04:00:00,2023-09-03 05:15:00,"[../fromgcloud/airmass_rgb_20230903_0400.png, ..."
4,4,0,0,03-09-2023_0635_0_0,0,2023-09-03 05:20:00,2023-09-03 06:35:00,"[../fromgcloud/airmass_rgb_20230903_0520.png, ..."
...,...,...,...,...,...,...,...,...
1903,1903,1065,196,11-09-2023_1835_1065_196,0,2023-09-11 17:20:00,2023-09-11 18:35:00,"[../fromgcloud/airmass_rgb_20230911_1720.png, ..."
1904,1904,1065,196,11-09-2023_1955_1065_196,0,2023-09-11 18:40:00,2023-09-11 19:55:00,"[../fromgcloud/airmass_rgb_20230911_1840.png, ..."
1905,1905,1065,196,11-09-2023_2115_1065_196,0,2023-09-11 20:00:00,2023-09-11 21:15:00,"[../fromgcloud/airmass_rgb_20230911_2000.png, ..."
1906,1906,1065,196,11-09-2023_2235_1065_196,0,2023-09-11 21:20:00,2023-09-11 22:35:00,"[../fromgcloud/airmass_rgb_20230911_2120.png, ..."


In [35]:
# Create the per-video folders under output_dir and save the tile files for every row of df_videos.
create_and_save_tile_from_complete_df(df_videos, output_dir)

Creazione delle folder per i 1908 video...
Salvati 30528 file - Erano già presenti 0 file - File totali 30528


In [36]:
# Build the annotation table for the generated videos and save it without the index;
# the dataset in the inference section reads this same CSV path.
df_dataset_csv = create_final_df_csv(df_videos, output_dir)
df_dataset_csv.to_csv("./general_inference_set.csv", index=False)

In [None]:
# Bonus: render an animation of the whole period to inspect it visually.
# One group per source image path; the count is printed as a sanity check.
grouped = df_2023.groupby("path", dropna=False)
print(len(grouped))
# 213 and 196 match the spacing of the tile_offset_x / tile_offset_y values in the table above.
create_labeled_images_with_tiles(grouped, 'daniel_complete_tiles.gif', basemap_obj, 213, 196)

2545

# Inference on this set

In [40]:
# Load the data: fine-tuning arguments, then the dataset built from the annotation CSV saved above.
args = prepare_finetuning_args()

dataset_val = MedicanesClsDataset(
        anno_path="./general_inference_set.csv",
        data_root=args.data_root,
        mode='validation',  # or 'test'
        clip_len=args.num_frames,
        transform=None
    )
# Not read by any cell visible here; the model cell uses args.nb_classes instead.
nb_classes = 2

data_loader_val = DataLoader(
    dataset_val,
    batch_size=args.batch_size,
    shuffle=True,         # to draw random samples. NOTE(review): shuffling is not needed for inference — confirm
    num_workers=args.num_workers,
    pin_memory=args.pin_mem,
    drop_last=False
)


In [41]:

# Load the model.
# get_prediction is not read by any cell visible here.
get_prediction = True
args.test_mode = True

# Every attribute of args is also forwarded as a keyword argument via **args.__dict__,
# so args must not define any of the names passed explicitly below (duplicate keyword -> TypeError).
# The cell output shows a checkpoint being loaded while this cell runs.
model = create_model(
    args.model,
    num_classes=args.nb_classes,
    drop_rate=0.0,
    drop_path_rate=args.drop_path,
    #attn_drop_rate=0.0,
    drop_block_rate=None,
    **args.__dict__
)

model.to(args.device)
# Switch to evaluation mode; as the last expression this also displays the full module repr.
model.eval()   


Caricamento del checkpoint da: ./output/checkpoint-best.pth
Caricato state_dict con chiave: model
Checkpoint caricato con successo!


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 1408, kernel_size=(2, 14, 14), stride=(2, 14, 14))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1408, out_features=4224, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1408, out_features=1408, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1408, out_features=6144, bias=True)
        (act): GELU(approximate=none)
        (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
  

In [None]:
# Run the model over the loader and collect paths, predictions and labels,
# then gather them into a DataFrame (merged with df_videos on 'path' in the next cell).
all_paths, all_preds, all_labels = get_path_pred_label(model, data_loader_val)
df_predictions = create_df_predictions(all_paths, all_preds, all_labels)

In [44]:
# Attach the predictions to the video table; merge defaults to an inner join,
# so only videos whose 'path' appears in df_predictions are kept.
df_filtrato_on_video_path = df_videos.merge(df_predictions, on='path')

In [14]:
# Inspect which columns are available after the merge.
df_filtrato_on_video_path.columns

Index(['video_id', 'tile_offset_x', 'tile_offset_y', 'path', 'label',
       'start_time', 'end_time', 'orig_paths', 'predictions', 'labels'],
      dtype='object')

In [45]:
# Expand the per-video table into one record per original frame path, carrying
# along the video's prediction, its label and the tile offsets.
records = [
    {
        'path': frame_path,
        'predictions': prediction,
        'tmp_label': video_label,
        'tile_offset_x': offset_x,
        'tile_offset_y': offset_y
    }
    for frame_paths, prediction, video_label, offset_x, offset_y in zip(
        df_filtrato_on_video_path['orig_paths'],
        df_filtrato_on_video_path['predictions'],
        df_filtrato_on_video_path['labels'],
        df_filtrato_on_video_path['tile_offset_x'],
        df_filtrato_on_video_path['tile_offset_y'],
    )
    for frame_path in frame_paths
]

# Turn the records into a new DataFrame.
df_mapping = pd.DataFrame(records)
# Sanity check: a path combined with its offsets should never be duplicated (expected 0).
df_mapping[['path', 'tile_offset_x', 'tile_offset_y']].duplicated().sum()

0

In [16]:
# Display the expanded path/offset -> prediction mapping.
df_mapping

Unnamed: 0,path,predictions,tmp_label,tile_offset_x,tile_offset_y
0,../fromgcloud/airmass_rgb_20230903_0000.png,1,0,0,0
1,../fromgcloud/airmass_rgb_20230903_0020.png,1,0,0,0
2,../fromgcloud/airmass_rgb_20230903_0040.png,1,0,0,0
3,../fromgcloud/airmass_rgb_20230903_0100.png,1,0,0,0
4,../fromgcloud/airmass_rgb_20230903_0120.png,1,0,0,0
...,...,...,...,...,...
7483,../fromgcloud/airmass_rgb_20230911_1820.png,1,0,1065,196
7484,../fromgcloud/airmass_rgb_20230911_1840.png,1,0,1065,196
7485,../fromgcloud/airmass_rgb_20230911_1900.png,1,0,1065,196
7486,../fromgcloud/airmass_rgb_20230911_1920.png,1,0,1065,196


In [46]:
# Build the dataframe used to create the videos: left-join the master table onto
# the mapping, drop the master 'label' column, and expose 'tmp_label' as 'label'.
df_data_merg = (
    df_mapping
    .merge(df_data, on=['path', 'tile_offset_x', 'tile_offset_y'], how='left')
    .drop(columns='label')
    .rename(columns={'tmp_label':'label'})
)

In [47]:
# Count the distinct tile offsets along each axis in the merged table.
df_data_merg[['tile_offset_x','tile_offset_y']].nunique()#.path.nunique()

tile_offset_x    6
tile_offset_y    2
dtype: int64

In [25]:
def make_animation(df, nomefile='predictions_validation3.gif', tile_offset_x=224, tile_offset_y=196):
    """Render one animation frame group per image path and save the result as a GIF.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``path`` column; rows are grouped by it (NaN paths kept).
    nomefile : str
        Output file name, written with the 'pillow' writer.
    tile_offset_x, tile_offset_y : int
        Tile geometry forwarded to ``create_mediterranean_video``. The defaults
        keep the values that were previously hard-coded.
        NOTE(review): 224 differs from the 213 passed to
        ``create_labeled_images_with_tiles`` elsewhere in this notebook — confirm
        which value this helper expects.

    Prints the group count, the elapsed minutes and the output file name.
    """
    # Materialise the groups once; they are needed for both the count and the video.
    groups = list(df.groupby("path", dropna=False))
    print(f" abbiamo {len(groups)} gruppi", flush=True)
    start = time()
    video = create_mediterranean_video(groups, tile_offset_x=tile_offset_x, tile_offset_y=tile_offset_y)
    video.save(nomefile, writer='pillow')
    end = time()
    print(f"{round((end-start)/60.0, 2)} minuti")
    print(f"Video salvato: {nomefile}")

In [49]:
# Render the prediction animation for the merged table (about 11 minutes according to the output below).
make_animation(df_data_merg, nomefile='daniel_prediction.gif')

 abbiamo 2544 gruppi
11.17 minuti
Video salvato: daniel_prediction.gif


In [6]:
# NOTE(review): pandas is already imported in the setup cell at the top of the notebook.
import pandas as pd
# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

In [33]:
# Load the two master tables to compare. Both are read with default dtypes
# (no date parsing); the saved index column is dropped from the second one.
master_id_df = pd.read_csv('master_data_2020_wID.csv')
master_df = pd.read_csv('all_data_full_tiles.csv').drop(columns=['Unnamed: 0'])

In [34]:
#master_id_df.sort_values(by='datetime')


In [35]:
#master_df.sort_values(by='datetime')

In [36]:
# Count the rows of master_id_df whose path also occurs in master_df.
master_id_df.path.isin(master_df.path).sum()

279372

In [37]:
# Columns shared by both master tables. Defined here because this is the first
# cell that uses `cols`; without it the notebook fails on Restart & Run All.
cols = ['path','datetime','tile_offset_x','tile_offset_y','label']
master_df[cols]

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label
0,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01 00:00:00,0,0,0
1,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01 00:00:00,213,0,0
2,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01 00:00:00,426,0,0
3,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01 00:00:00,639,0,0
4,../fromgcloud/airmass_rgb_20111101_0000.png,2011-11-01 00:00:00,852,0,0
...,...,...,...,...,...
1413163,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12 00:00:00,213,196,0
1413164,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12 00:00:00,426,196,0
1413165,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12 00:00:00,639,196,0
1413166,../fromgcloud/airmass_rgb_20230912_0000.png,2023-09-12 00:00:00,852,196,0


In [38]:
# Same column subset for the table with IDs.
# NOTE(review): `cols` must already be defined when this cell runs; it is assigned in the merge cell further down.
master_id_df[cols]

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label
0,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,0,0
1,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,0,0
2,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,0,0
3,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,0,0
4,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,852,0,0
...,...,...,...,...,...
279367,../fromgcloud/airmass_rgb_20201228_0100.png,2020-12-28 01:00:00,213,196,0
279368,../fromgcloud/airmass_rgb_20201228_0100.png,2020-12-28 01:00:00,426,196,0
279369,../fromgcloud/airmass_rgb_20201228_0100.png,2020-12-28 01:00:00,639,196,0
279370,../fromgcloud/airmass_rgb_20201228_0100.png,2020-12-28 01:00:00,852,196,0


In [45]:
# Columns used both as the projection and as the join keys.
cols = ['path','datetime','tile_offset_x','tile_offset_y','label']
# Inner join on all five columns: keeps only rows that appear with identical values in both tables.
res = pd.merge(master_df[cols], master_id_df[cols], on=cols, how='inner')

In [46]:
# Spot check: rows of master_id_df for the 2020-01-01 12:00 image.
master_id_df[master_id_df.path.str.contains('airmass_rgb_20200101_1200')][cols]

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label
0,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,0,0
1,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,0,0
2,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,0,0
3,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,0,0
4,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,852,0,0
5,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,1065,0,0
6,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,196,0
7,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,196,0
8,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,196,0
9,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,196,0


In [47]:
# Spot check: rows of master_df for the same 2020-01-01 12:00 image.
master_df[master_df.path.str.contains('airmass_rgb_20200101_1200')][cols]

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label
153564,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,0,0
153565,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,0,0
153566,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,0,0
153567,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,0,0
153568,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,852,0,0
153569,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,1065,0,0
153570,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,196,0
153571,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,196,0
153572,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,196,0
153573,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,196,0


In [48]:
# Spot check: rows of the inner-join result for the same 2020-01-01 12:00 image.
res[res.path.str.contains('airmass_rgb_20200101_1200')]

Unnamed: 0,path,datetime,tile_offset_x,tile_offset_y,label
0,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,0,0
1,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,0,0
2,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,0,0
3,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,0,0
4,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,852,0,0
5,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,1065,0,0
6,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,0,196,0
7,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,213,196,0
8,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,426,196,0
9,../fromgcloud/airmass_rgb_20200101_1200.png,2020-01-01 12:00:00,639,196,0


In [None]:
# Compare the join result with master_id_df element by element and reduce over
# every cell. The builtin all() must not be used here: iterating a DataFrame
# yields its column labels (non-empty strings), so all(df) is True whatever the data.
bool((res == master_id_df[cols]).all().all())  # -> if True, the new master df is entirely contained in the old one

True