In [2]:
import argparse
import logging
import os, sys
from copy import copy
from datetime import datetime
from pathlib import Path
import time
from functools import partial
import gzip
from collections import defaultdict
from itertools import product
from functools import wraps

import dask.bag as db
import geopandas as gpd
import pandas as pd
import polars as pl
from tqdm import tqdm
from sqlalchemy.orm import Session
import networkx as nx
import numpy as np

from pathlib import Path

%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Put the parent and grandparent of the working directory on the import path
# (presumably so the project-local `database` and `utils` packages imported
# below resolve -- confirm against the repository layout).
sys.path.extend(["..", "../.."])


from database import get_engine
from utils.config_utils import load_config

In [None]:
# Goal: find the timestamps that follow a long stretch without any storm
# cells, so that cell tracks can be cut at those points.

# One row per (timestamp, method) with the number of stored cells, ordered by
# timestamp.
# NOTE(review): rows are ordered by time only; if more than one `method` is
# stored, rows sharing a timestamp sit next to each other and their diff is
# zero -- confirm a single method is expected here.
SQL_QUERY = """
SELECT "timestamp" as time, method, COUNT(*)
	FROM raincells.stormcells
    group by time, method
    order by time
"""

# Database settings are read from
# <parent of the working directory>/config/database/database.yaml.
dbconf_file = Path(".").resolve().parent.joinpath("config/database/database.yaml")
dbconf = load_config(dbconf_file)
engine = get_engine(dbconf)

# The session is only used to hand its bound connectable to polars.
# `time_diff` is the distance of each row to the row before it (null for the
# first row).
with Session(engine) as session:
    df = pl.read_database(query=SQL_QUERY, connection=session.bind).with_columns(
        time_diff=pl.col("time").diff()
    )

# Show the rows that come after a gap longer than the threshold.
gap_threshold = pd.Timedelta("3h")
follows_long_gap = pl.col("time_diff") > gap_threshold
df.filter(follows_long_gap)

time,method,count,time_diff
datetime[μs],str,i64,duration[μs]
2021-05-02 10:00:00,"""opencv_vil_1.0:minArea_10:clus…",1,12h 45m
2021-05-05 01:40:00,"""opencv_vil_1.0:minArea_10:clus…",1,2d 35m
2021-05-05 11:50:00,"""opencv_vil_1.0:minArea_10:clus…",1,9h 10m
2021-05-06 15:20:00,"""opencv_vil_1.0:minArea_10:clus…",1,21h 40m
2021-05-07 04:25:00,"""opencv_vil_1.0:minArea_10:clus…",1,4h 45m
…,…,…,…
2023-09-15 23:40:00,"""opencv_vil_1.0:minArea_10:clus…",1,4h 40m
2023-09-17 13:45:00,"""opencv_vil_1.0:minArea_10:clus…",1,11h 20m
2023-09-19 15:50:00,"""opencv_vil_1.0:minArea_10:clus…",1,12h
2023-09-19 21:45:00,"""opencv_vil_1.0:minArea_10:clus…",1,4h 5m


In [43]:
# Display the per-timestamp cell counts together with `time_diff`, the gap to
# the previous row (the frame's own repr truncates the middle rows).
df

time,method,count,time_diff
datetime[μs],str,i64,duration[μs]
2021-05-01 11:35:00,"""opencv_vil_1.0:minArea_10:clus…",1,
2021-05-01 11:40:00,"""opencv_vil_1.0:minArea_10:clus…",1,5m
2021-05-01 11:45:00,"""opencv_vil_1.0:minArea_10:clus…",2,5m
2021-05-01 11:50:00,"""opencv_vil_1.0:minArea_10:clus…",2,5m
2021-05-01 11:55:00,"""opencv_vil_1.0:minArea_10:clus…",1,5m
…,…,…,…
2023-09-23 19:45:00,"""opencv_vil_1.0:minArea_10:clus…",2,5m
2023-09-23 19:50:00,"""opencv_vil_1.0:minArea_10:clus…",2,5m
2023-09-23 19:55:00,"""opencv_vil_1.0:minArea_10:clus…",1,5m
2023-09-23 20:00:00,"""opencv_vil_1.0:minArea_10:clus…",1,5m


In [51]:
# Build the table of cut intervals: each row runs from one break (`time`) to
# the next break (`next_time`).

# Gaps longer than this between consecutive rows count as a break.
# `break_str` is kept as its own name because the export file name reuses it.
break_str = "3h"
break_duration = pd.Timedelta(break_str)

# Closing bound for the final interval, which has no following break.
LAST_INTERVAL_END = pd.Timestamp("2023-10-01 00:00:00")

# A new interval starts at the very first row (its `time_diff` is null) and at
# every row whose gap to the previous row exceeds the threshold.
starts_new_interval = pl.col("time_diff").is_null() | (
    pl.col("time_diff") > break_duration
)

breaks = (
    df.filter(starts_new_interval)
    .sort("time")
    .select("time")
    # Pair every break with the one after it. The last row has no successor,
    # so the null produced by the shift is replaced with the closing bound
    # (assumes `time` itself contains no nulls).
    .with_columns(
        pl.col("time").shift(-1).fill_null(LAST_INTERVAL_END).alias("next_time")
    )
)
breaks

time,next_time
datetime[μs],datetime[μs]
2021-05-01 11:35:00,2021-05-02 10:00:00
2021-05-02 10:00:00,2021-05-05 01:40:00
2021-05-05 01:40:00,2021-05-05 11:50:00
2021-05-05 11:50:00,2021-05-06 15:20:00
2021-05-06 15:20:00,2021-05-07 04:25:00
…,…
2023-09-15 23:40:00,2023-09-17 13:45:00
2023-09-17 13:45:00,2023-09-19 15:50:00
2023-09-19 15:50:00,2023-09-19 21:45:00
2023-09-19 21:45:00,2023-09-20 04:10:00


In [40]:
# Check how pandas renders the threshold as text; this long form differs from
# the short `break_str` that the export file name uses.
str(break_duration)

'0 days 03:00:00'

In [52]:
# Export the intervals as a header-less CSV next to the working directory's
# parent; timestamps are written compactly as YYYYmmddHHMM and the file name
# carries the gap threshold.
breaks_csv_path = f"../dates_breaks_no_cells_for_{break_str}.csv"
breaks.write_csv(
    breaks_csv_path,
    datetime_format="%Y%m%d%H%M",
    include_header=False,
)