In [1]:
import pandas as pd

In [13]:
# CSV holding the medicane time windows, resolved relative to the notebook's working directory.
med_new_wind = './medicanes_new_windows.csv'
# Raw table; later cells group it by 'id_final'.
df = pd.read_csv(filepath_or_buffer=med_new_wind)

In [15]:
# One row per medicane ('id_final') carrying the first start/end time recorded for it.
# groupby already returns the groups ordered by key, so no separate sort by 'id_final'
# is needed before the final chronological ordering.
# Both time columns are parsed here so the table is consistently datetime-typed
# (previously only 'start_time' was parsed and 'end_time' stayed a plain string).
mapping = (
    df.groupby('id_final', as_index=False)
      .agg(start_time=('start_time', 'first'), end_time=('end_time', 'first'))
      .assign(
          start_time=lambda t: pd.to_datetime(t['start_time']),
          end_time=lambda t: pd.to_datetime(t['end_time']),
      )
      .sort_values('start_time')
      .reset_index(drop=True)
)
mapping

Unnamed: 0,id_final,start_time,end_time
0,7001283,2010-10-12 12:55:00,2010-10-13 13:25:00
1,7001328,2011-11-05 22:55:00,2011-11-08 08:05:00
2,7001358,2012-04-13 04:15:00,2012-04-14 21:00:00
3,7001421,2013-11-18 22:00:00,2013-11-20 09:00:00
4,7001461,2014-11-07 00:10:00,2014-11-08 14:00:00
5,7001466,2014-11-30 06:25:00,2014-12-03 06:30:00
6,7001500,2015-09-30 21:25:00,2015-10-01 21:00:00
7,7001521,2016-02-28 23:20:00,2016-03-01 08:05:00
8,7001542,2016-10-28 06:30:00,2016-10-31 01:50:00
9,7001575,2017-11-16 14:45:00,2017-11-18 12:40:00


In [16]:
# Make sure 'end_time' is datetime-typed, then store each window's length in hours as 'durata'.
mapping['end_time'] = pd.to_datetime(mapping['end_time'])
mapping['durata'] = (
    mapping['end_time']
    .sub(mapping['start_time'])
    .dt.total_seconds()
    .div(3600)
)
mapping

Unnamed: 0,id_final,start_time,end_time,durata
0,7001283,2010-10-12 12:55:00,2010-10-13 13:25:00,24.5
1,7001328,2011-11-05 22:55:00,2011-11-08 08:05:00,57.166667
2,7001358,2012-04-13 04:15:00,2012-04-14 21:00:00,40.75
3,7001421,2013-11-18 22:00:00,2013-11-20 09:00:00,35.0
4,7001461,2014-11-07 00:10:00,2014-11-08 14:00:00,37.833333
5,7001466,2014-11-30 06:25:00,2014-12-03 06:30:00,72.083333
6,7001500,2015-09-30 21:25:00,2015-10-01 21:00:00,23.583333
7,7001521,2016-02-28 23:20:00,2016-03-01 08:05:00,32.75
8,7001542,2016-10-28 06:30:00,2016-10-31 01:50:00,67.333333
9,7001575,2017-11-16 14:45:00,2017-11-18 12:40:00,45.916667


In [17]:
# Round 'durata' (hours) to one decimal place; note this overwrites the stored values,
# it does not only affect how they are displayed.
mapping['durata'] = mapping['durata'].round(decimals=1)
mapping

Unnamed: 0,id_final,start_time,end_time,durata
0,7001283,2010-10-12 12:55:00,2010-10-13 13:25:00,24.5
1,7001328,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2
2,7001358,2012-04-13 04:15:00,2012-04-14 21:00:00,40.8
3,7001421,2013-11-18 22:00:00,2013-11-20 09:00:00,35.0
4,7001461,2014-11-07 00:10:00,2014-11-08 14:00:00,37.8
5,7001466,2014-11-30 06:25:00,2014-12-03 06:30:00,72.1
6,7001500,2015-09-30 21:25:00,2015-10-01 21:00:00,23.6
7,7001521,2016-02-28 23:20:00,2016-03-01 08:05:00,32.8
8,7001542,2016-10-28 06:30:00,2016-10-31 01:50:00,67.3
9,7001575,2017-11-16 14:45:00,2017-11-18 12:40:00,45.9


In [10]:
# Build the period table from the manifests, which only carry 'datetime' (id_final is not used)
from pathlib import Path
root = Path('../manifests')
manifests = [root / fname for fname in ('./train.csv', './val.csv', './test.csv')]
found = [p for p in manifests if p.exists()]
if not found:
    raise FileNotFoundError(f"Nessun manifest trovato tra: {', '.join(str(p) for p in manifests)}")

# Read every available manifest, tagging each row with the file it came from
dfs = []
for p in found:
    tmp = pd.read_csv(p)
    tmp['__source'] = p.name
    dfs.append(tmp)
combined = pd.concat(dfs, ignore_index=True)

if 'datetime' not in combined.columns:
    raise ValueError("I file manifest devono contenere la colonna 'datetime'.")

# Parse the timestamps and put all rows in chronological order
combined = (
    combined.assign(datetime=pd.to_datetime(combined['datetime']))
            .sort_values('datetime')
            .reset_index(drop=True)
)

# Gap size above which two consecutive timestamps belong to different periods (tune if needed)
threshold = pd.Timedelta('5min')

# A new segment id starts at every row whose gap from the previous row exceeds the threshold;
# the first row has no predecessor, so its gap is treated as zero
diff = combined['datetime'].diff().fillna(pd.Timedelta(seconds=0))
segments = diff.gt(threshold).cumsum()
combined['segment'] = segments

# Collapse each segment into its start/end and the sorted, de-duplicated list of source files
periods = (
    combined.groupby('segment', group_keys=False)
            .agg(
                start_time=('datetime', 'min'),
                end_time=('datetime', 'max'),
                source_files=('__source', lambda names: ','.join(sorted(set(names)))),
            )
            .reset_index(drop=True)
)

# Segment length in hours, rounded to one decimal
periods['durata'] = (
    periods['end_time'].sub(periods['start_time']).dt.total_seconds().div(3600).round(1)
)

# Same column layout as the earlier tables
periods = periods[['start_time', 'end_time', 'durata', 'source_files']]
periods

Unnamed: 0,start_time,end_time,durata,source_files
0,2011-11-01 22:55:00,2011-11-11 00:00:00,217.1,"test.csv,train.csv,val.csv"
1,2014-11-03 00:10:00,2014-11-12 14:00:00,229.8,"test.csv,train.csv,val.csv"
2,2016-10-26 00:00:00,2016-11-03 00:00:00,192.0,"test.csv,train.csv,val.csv"
3,2017-11-13 00:00:00,2017-11-21 00:00:00,192.0,"test.csv,train.csv,val.csv"
4,2019-11-06 09:30:00,2019-11-11 13:55:00,124.4,"test.csv,train.csv,val.csv"
5,2019-11-11 14:35:00,2019-11-15 17:30:00,98.9,"test.csv,train.csv,val.csv"
6,2020-09-10 23:50:00,2020-09-22 19:40:00,283.8,"test.csv,train.csv,val.csv"
7,2020-12-09 08:20:00,2020-12-11 00:15:00,39.9,"test.csv,train.csv,val.csv"
8,2020-12-11 01:15:00,2020-12-20 13:45:00,228.5,"test.csv,train.csv,val.csv"
9,2020-12-21 10:35:00,2020-12-31 10:50:00,240.2,"test.csv,train.csv,val.csv"


In [11]:
# Build a separate period table (start_time, end_time, durata) for each manifest
from pathlib import Path
root = Path('../manifests')
manifests = [root / fname for fname in ('./train.csv', './val.csv', './test.csv')]
found = [p for p in manifests if p.exists()]
if not found:
    raise FileNotFoundError(f"Nessun manifest trovato tra: {', '.join(str(p) for p in manifests)}")

from IPython.display import display

# Gap size above which two consecutive timestamps belong to different periods
threshold = pd.Timedelta('5min')

# Maps 'periods_<manifest stem>' to that manifest's period table
results = {}
for p in found:
    dfp = pd.read_csv(p)
    if 'datetime' not in dfp.columns:
        raise ValueError(f"File {p.name} non contiene la colonna 'datetime'.")
    # Parse the timestamps and order the rows chronologically
    dfp = (
        dfp.assign(datetime=pd.to_datetime(dfp['datetime']))
           .sort_values('datetime')
           .reset_index(drop=True)
    )
    # A new segment id starts wherever the gap from the previous row exceeds the threshold
    diff = dfp['datetime'].diff().fillna(pd.Timedelta(seconds=0))
    segments = diff.gt(threshold).cumsum()
    periods = (
        dfp.groupby(segments, group_keys=False)
           .agg(start_time=('datetime', 'min'), end_time=('datetime', 'max'))
           .reset_index(drop=True)
    )
    # Segment length in hours, rounded to one decimal
    periods['durata'] = (
        periods['end_time'].sub(periods['start_time']).dt.total_seconds().div(3600).round(1)
    )
    periods = periods[['start_time', 'end_time', 'durata']]
    varname = f"periods_{p.stem}"
    results[varname] = periods

# Show the three tables one after the other, skipping any manifest that was not found
for name in ['periods_train', 'periods_val', 'periods_test']:
    if name not in results:
        continue
    print(name)
    display(results[name])

periods_train


Unnamed: 0,start_time,end_time,durata
0,2011-11-01 22:55:00,2011-11-01 23:00:00,0.1
1,2011-11-01 23:10:00,2011-11-01 23:10:00,0.0
2,2011-11-01 23:20:00,2011-11-02 00:05:00,0.8
3,2011-11-02 00:20:00,2011-11-02 00:50:00,0.5
4,2011-11-02 01:05:00,2011-11-02 01:25:00,0.3
...,...,...,...
6528,2023-09-12 06:35:00,2023-09-12 06:40:00,0.1
6529,2023-09-12 07:00:00,2023-09-12 07:05:00,0.1
6530,2023-09-12 07:20:00,2023-09-12 07:30:00,0.2
6531,2023-09-12 07:45:00,2023-09-12 09:05:00,1.3


periods_val


Unnamed: 0,start_time,end_time,durata
0,2011-11-01 23:05:00,2011-11-01 23:05:00,0.0
1,2011-11-01 23:15:00,2011-11-01 23:15:00,0.0
2,2011-11-02 00:15:00,2011-11-02 00:15:00,0.0
3,2011-11-02 00:55:00,2011-11-02 01:00:00,0.1
4,2011-11-02 01:30:00,2011-11-02 01:30:00,0.0
...,...,...,...
3931,2023-09-12 04:05:00,2023-09-12 04:05:00,0.0
3932,2023-09-12 06:05:00,2023-09-12 06:05:00,0.0
3933,2023-09-12 06:45:00,2023-09-12 06:55:00,0.2
3934,2023-09-12 07:35:00,2023-09-12 07:40:00,0.1


periods_test


Unnamed: 0,start_time,end_time,durata
0,2011-11-02 00:10:00,2011-11-02 00:10:00,0.0
1,2011-11-02 02:20:00,2011-11-02 02:20:00,0.0
2,2011-11-02 02:30:00,2011-11-02 02:30:00,0.0
3,2011-11-02 04:25:00,2011-11-02 04:25:00,0.0
4,2011-11-02 05:05:00,2011-11-02 05:05:00,0.0
...,...,...,...
3965,2023-09-12 05:40:00,2023-09-12 05:40:00,0.0
3966,2023-09-12 05:55:00,2023-09-12 05:55:00,0.0
3967,2023-09-12 06:15:00,2023-09-12 06:15:00,0.0
3968,2023-09-12 06:30:00,2023-09-12 06:30:00,0.0


In [18]:
# Join the medicane mapping with the per-manifest period tables to compare start/end times.
# Step 1: build mapping_med, a copy of mapping whose time/duration columns get a _medicanes suffix.
mapping['start_time'] = pd.to_datetime(mapping['start_time'])
mapping['end_time'] = pd.to_datetime(mapping['end_time'])
mapping['durata'] = mapping['durata'].round(decimals=1)
mapping_med = mapping.rename(
    columns=dict(
        start_time='start_time_medicanes',
        end_time='end_time_medicanes',
        durata='durata_medicanes',
    )
)

# Step 2: make sure 'results' (period table per manifest) exists; rebuild it if it does not.
try:
    results
except NameError:
    from pathlib import Path
    root = Path('../manifests')
    manifests = [root/'./train.csv', root/'./val.csv', root/'./test.csv']
    results = {}
    for p in manifests:
        # Missing files and files without a 'datetime' column are skipped silently here
        if not p.exists():
            continue
        dfp = pd.read_csv(p)
        if 'datetime' not in dfp.columns:
            continue
        dfp = (
            dfp.assign(datetime=pd.to_datetime(dfp['datetime']))
               .sort_values('datetime')
               .reset_index(drop=True)
        )
        # A new segment id starts wherever the gap from the previous row exceeds 5 minutes
        diff = dfp['datetime'].diff().fillna(pd.Timedelta(seconds=0))
        segments = diff.gt(pd.Timedelta('5min')).cumsum()
        periods = (
            dfp.groupby(segments, group_keys=False)
               .agg(start_time=('datetime', 'min'), end_time=('datetime', 'max'))
               .reset_index(drop=True)
        )
        # Segment length in hours, rounded to one decimal
        periods['durata'] = (
            periods['end_time'].sub(periods['start_time']).dt.total_seconds().div(3600).round(1)
        )
        periods = periods[['start_time', 'end_time', 'durata']]
        results[f'periods_{p.stem}'] = periods

# funzione che trova sovrapposizioni tra intervalli e crea tabella di confronto
import pandas as pd

def merge_mapping_with_periods(mapping_med, periods_df, manifest_name):
    """Pair every medicane window with the manifest periods that overlap it.

    A period overlaps a window when it starts no later than the window ends
    and ends no earlier than the window starts (both bounds inclusive), so a
    period that only touches a window boundary still counts as overlapping.

    Parameters
    ----------
    mapping_med : pandas.DataFrame
        Medicane windows. Must provide the columns 'start_time_medicanes',
        'end_time_medicanes' and 'durata_medicanes'; other columns are ignored.
    periods_df : pandas.DataFrame
        Manifest periods. Must provide the columns 'start_time', 'end_time'
        and 'durata'.
    manifest_name : str
        Label written to the 'manifest' column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per (window, overlapping period) pair, in window order and,
        within a window, in period order. A window with no overlapping period
        contributes a single row whose manifest fields are NaT / NaN. The seven
        output columns are always present, even when there are no rows.
    """
    out_columns = [
        'start_time_medicanes', 'end_time_medicanes', 'durata_medicanes',
        'start_time_manifest', 'end_time_manifest', 'durata_manifest',
        'manifest',
    ]
    med_cols = ['start_time_medicanes', 'end_time_medicanes', 'durata_medicanes']
    per_cols = ['start_time', 'end_time', 'durata']

    rows = []
    for med_start, med_end, med_hours in zip(*(mapping_med[c] for c in med_cols)):
        overlap_mask = (periods_df['start_time'] <= med_end) & (periods_df['end_time'] >= med_start)
        overlaps = periods_df.loc[overlap_mask, per_cols]
        if overlaps.empty:
            # Float NaN (not pd.NA) keeps 'durata_manifest' a numeric float column
            # instead of degrading it to object dtype.
            matches = [(pd.NaT, pd.NaT, float('nan'))]
        else:
            matches = zip(*(overlaps[c] for c in per_cols))
        for per_start, per_end, per_hours in matches:
            rows.append({
                'start_time_medicanes': med_start,
                'end_time_medicanes': med_end,
                'durata_medicanes': med_hours,
                'start_time_manifest': per_start,
                'end_time_manifest': per_end,
                'durata_manifest': per_hours,
                'manifest': manifest_name,
            })
    # Passing the column list guarantees a stable schema when `rows` is empty.
    return pd.DataFrame(rows, columns=out_columns)

# Build and show the merged comparison table for every manifest that has a period table
merged_results = {}
for name in ('periods_train', 'periods_val', 'periods_test'):
    if name not in results:
        continue
    # The manifest label is the table name without its 'periods_' prefix
    manifest_label = name.replace('periods_', '')
    merged = merge_mapping_with_periods(mapping_med, results[name], manifest_label)
    merged_results[f'merged_{name}'] = merged
    print(f"merged_{name}")
    display(merged)

merged_periods_train


Unnamed: 0,start_time_medicanes,end_time_medicanes,durata_medicanes,start_time_manifest,end_time_manifest,durata_manifest,manifest
0,2010-10-12 12:55:00,2010-10-13 13:25:00,24.5,NaT,NaT,,train
1,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 22:55:00,2011-11-05 22:55:00,0.0,train
2,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:05:00,2011-11-05 23:05:00,0.0,train
3,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:30:00,2011-11-05 23:35:00,0.1,train
4,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-06 00:00:00,2011-11-06 00:45:00,0.8,train
...,...,...,...,...,...,...,...
1773,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:05:00,2023-09-10 00:10:00,0.1,train
1774,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:20:00,2023-09-10 00:30:00,0.2,train
1775,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:45:00,2023-09-10 00:50:00,0.1,train
1776,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 01:05:00,2023-09-10 01:35:00,0.5,train


merged_periods_val


Unnamed: 0,start_time_medicanes,end_time_medicanes,durata_medicanes,start_time_manifest,end_time_manifest,durata_manifest,manifest
0,2010-10-12 12:55:00,2010-10-13 13:25:00,24.5,NaT,NaT,,val
1,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:10:00,2011-11-05 23:10:00,0.0,val
2,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:20:00,2011-11-05 23:20:00,0.0,val
3,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:40:00,2011-11-05 23:50:00,0.2,val
4,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-06 01:10:00,2011-11-06 01:10:00,0.0,val
...,...,...,...,...,...,...,...
1043,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-09 22:00:00,2023-09-09 22:00:00,0.0,val
1044,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-09 22:35:00,2023-09-09 22:40:00,0.1,val
1045,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-09 23:30:00,2023-09-09 23:30:00,0.0,val
1046,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-09 23:40:00,2023-09-09 23:40:00,0.0,val


merged_periods_test


Unnamed: 0,start_time_medicanes,end_time_medicanes,durata_medicanes,start_time_manifest,end_time_manifest,durata_manifest,manifest
0,2010-10-12 12:55:00,2010-10-13 13:25:00,24.5,NaT,NaT,,test
1,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:00:00,2011-11-05 23:00:00,0.0,test
2,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:15:00,2011-11-05 23:15:00,0.0,test
3,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:25:00,2011-11-05 23:25:00,0.0,test
4,2011-11-05 22:55:00,2011-11-08 08:05:00,57.2,2011-11-05 23:55:00,2011-11-05 23:55:00,0.0,test
...,...,...,...,...,...,...,...
1079,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:15:00,2023-09-10 00:15:00,0.0,test
1080,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:35:00,2023-09-10 00:40:00,0.1,test
1081,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 00:55:00,2023-09-10 01:00:00,0.1,test
1082,2023-09-05 03:20:00,2023-09-10 02:00:00,118.7,2023-09-10 01:40:00,2023-09-10 01:40:00,0.0,test
