In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import json
import tqdm
import tqdm.notebook as tqdm
from datetime import timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(16, 3))

from flood_filters import *
from flood_filters.models import FloodDetector

In [3]:
# Event class names. A name's position in the array is its integer class id:
# load_events() maps the events' `Class` column through this array.
CLS_NAMES = np.array([
    'flood',        # 0
    'blip',         # 1
    'pulse-chain',  # 2
    'box',          # 3
    'snow',         # 4
    'something',    # 5
])

# load

def load_csvs(filenames, event_fname, pad=timedelta(minutes=1)):
    """Load sensor CSVs and tag every reading with the labelled event it falls in.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the sensor CSV files. Each file must provide at least a
        ``time`` and a ``deployment_id`` column.
    event_fname : str
        Path of the event-annotation CSV; it is parsed by ``load_events``.
    pad : datetime.timedelta, default 1 minute
        Margin added before the start and after the end of every event when
        matching readings to it. The same margin is applied to the overall
        time range that is kept, so the padding of the earliest and latest
        events is not cropped away.

    Returns
    -------
    pandas.DataFrame
        All readings, indexed by UTC ``time`` and sorted, restricted to
        deployments that have at least one event and to the padded time span
        covered by the events. Six columns are added: ``event_id``,
        ``class_id``, ``label``, ``duration``, ``start`` and ``end``. Readings
        outside every (padded) event keep missing values in those columns.
        When padded events of one deployment overlap, the event iterated last
        overwrites the earlier one.

    Raises
    ------
    ValueError
        If ``filenames`` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        # pd.concat on an empty list fails with an opaque message; say what is wrong.
        raise ValueError('load_csvs: no sensor CSV files to load (empty `filenames`)')

    df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
    df['time'] = pd.to_datetime(df['time'], utc=True)
    df = df.set_index('time').sort_index()

    event_df = load_events(event_fname)

    # Keep only annotated deployments and the time span the annotations cover.
    # The span is widened by `pad` so it agrees with the per-event window below.
    df = df[df.deployment_id.isin(event_df.Deployment_id.unique())]
    df = df.loc[event_df.start.min() - pad:event_df.end.max() + pad]

    # Declare every annotation column up front so that the schema (and the
    # tz-aware dtype of start/end) does not depend on `.loc` enlargement
    # happening inside the loop.
    df['event_id'] = pd.NA
    df['class_id'] = pd.NA
    df['label'] = pd.NA
    df['duration'] = np.nan
    for col in ('start', 'end'):
        df[col] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns, UTC]')

    if df.empty:
        # Nothing to annotate; return the empty frame with the full schema.
        return df

    for deployment_id, edf in tqdm.tqdm(event_df.groupby('Deployment_id'), desc='adding events...'):
        # Hoisted: the deployment match is the same for all of its events.
        is_dep = df.deployment_id == deployment_id
        for eid, row in edf.iterrows():
            # Half-open padded window: [start - pad, end + pad)
            mask = (
                is_dep &
                (df.index >= row.start - pad) &
                (df.index < row.end + pad)
            )
            df.loc[mask, 'event_id'] = eid
            df.loc[mask, 'class_id'] = row.Class
            df.loc[mask, 'label'] = row.label
            df.loc[mask, 'duration'] = row.duration
            df.loc[mask, 'start'] = row.start
            df.loc[mask, 'end'] = row.end
    return df


def load_events(event_fname):
    """Read the event-annotation CSV and derive the columns used for labelling.

    The CSV must provide ``Start_time`` and ``End_time`` (formatted as
    ``YYYY-MM-DD HH:MM:SS``, surrounding whitespace allowed) and an integer
    ``Class`` column that indexes into ``CLS_NAMES``.

    Returns the events sorted by start time, indexed by ``event_id`` (the
    row's position in the CSV), with these columns added:

    * ``start`` / ``end`` -- timezone-aware (UTC) timestamps,
    * ``duration``        -- ``end - start`` in seconds,
    * ``label``           -- class name looked up from ``CLS_NAMES``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'

    events = pd.read_csv(event_fname)
    # The event id is simply the row's position in the file.
    events.index = pd.Index(np.arange(len(events)), name='event_id')

    for target, source in (('start', 'Start_time'), ('end', 'End_time')):
        events[target] = pd.to_datetime(events[source].str.strip(), format=time_format, utc=True)

    events['duration'] = (events['end'] - events['start']).dt.total_seconds()
    events['label'] = CLS_NAMES[events['Class']]

    # No label-based filtering happens here; all classes are returned.
    return events.sort_values('start')

def print_block(*xs, c='#', nc=40):
    """Print the given items between two ruler lines, padded by blank lines.

    Each item goes on its own line:

    * a list or tuple is printed with its elements separated by spaces,
    * a dict is printed as ``key value key value ...`` on a single line,
    * anything else is printed as-is.

    ``c`` is the ruler character and ``nc`` how many times it is repeated.
    """
    ruler = c * nc

    def blank_lines(count=2):
        for _ in range(count):
            print()

    blank_lines()
    print(ruler)
    for item in xs:
        if isinstance(item, (list, tuple)):
            print(*item)
        elif isinstance(item, dict):
            # Flatten {k: v, ...} into k, v, k, v, ...
            flattened = [part for pair in item.items() for part in pair]
            print(*flattened)
        else:
            print(item)
    print(ruler)
    blank_lines()

In [4]:
# Input locations, relative to the notebook's working directory.
data_dir = '../data'
# glob returns paths in arbitrary order; sort them so every run loads the
# files in the same order.
fs = sorted(glob.glob(os.path.join(data_dir, 'deployments/*.csv')))
event_fname = os.path.join(data_dir, 'Events_523.csv')

# All readings of annotated deployments, labelled where they fall in an event.
full_data_df = load_csvs(fs, event_fname)
# Working frame: everything except rows labelled 'snow'. Unlabelled (NA) rows
# are kept, because isin() is False for them.
data_df = full_data_df[~full_data_df.label.isin(['snow'])]
print(data_df.shape)
data_df.head()

  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading...')])
  df = pd.concat([pd.read_csv(f) for f in tqdm.tqdm(filenames, desc='loading

(11518490, 65)


Unnamed: 0_level_0,index,deployment_id,depth_raw_mm,depth_filt_mm,depth_proc_mm,dev_id,batt_v,f_cnt,dist_mm,lat,...,mllw_mm,navd88_mm,binary_class,simple_class,full_multi_class,label,class_id,duration,start,end
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-11-16 01:32:58.250962+00:00,5070,daily_new_falcon,-3.0,0.0,0.0,,,,,,...,,,1,1,1,flood,0,12240.0,2020-11-16 01:30:00+00:00,2020-11-16 04:54:00+00:00
2020-11-16 01:37:08.059261+00:00,5071,daily_new_falcon,-1.0,0.0,0.0,,,,,,...,,,1,1,1,flood,0,12240.0,2020-11-16 01:30:00+00:00,2020-11-16 04:54:00+00:00
2020-11-16 01:41:17.980824+00:00,5072,daily_new_falcon,-23.0,0.0,0.0,,,,,,...,,,1,1,1,flood,0,12240.0,2020-11-16 01:30:00+00:00,2020-11-16 04:54:00+00:00
2020-11-16 01:45:28.202578+00:00,5073,daily_new_falcon,50.0,50.0,50.0,,,,,,...,,,1,1,1,flood,0,12240.0,2020-11-16 01:30:00+00:00,2020-11-16 04:54:00+00:00
2020-11-16 01:49:38.715987+00:00,5074,daily_new_falcon,90.0,90.0,90.0,,,,,,...,,,1,1,1,flood,0,12240.0,2020-11-16 01:30:00+00:00,2020-11-16 04:54:00+00:00


In [5]:
# Number of readings per (deployment, label); pairs that never occur show 0.
# NOTE(review): `.index` on the groupby presumably selects the data column
# named `index`, not the DatetimeIndex -- confirm.
(
    full_data_df
    .groupby(['deployment_id', 'label'])
    .index
    .count()
    .unstack()
    .fillna(0)
    .astype(int)
)

label,blip,box,flood,pulse-chain,snow,something
deployment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
asleep_apricot_bedbug,18,0,24,34,0,10
big_pink_elephant,132,712,301,372,0,0
blue_eyed_tiger,309,1281,33,1760,0,8
bored_blue_fish,78,16,157,0,0,0
boring_olive_stingray,1167,641,0,5444,0,28
clearly_bored_turtle,75,6,17,0,0,0
closed_wagon_snail,118,9,14,0,0,0
daily_gentle_beetle,336,2335,0,17192,0,22
daily_happy_satyr,726,3735,8065,10179,0,288
daily_mutual_gnat,6,5,0,0,0,0


In [6]:
# Row count per label in the working frame (snow already removed).
data_df['label'].value_counts()

label
pulse-chain    334074
flood           45808
box             43399
blip            35806
something       32985
Name: count, dtype: int64

In [7]:
# Shorthand for the `depth_proc_mm` column of the working frame.
x = data_df['depth_proc_mm']

In [8]:
# (share of rows where depth_proc_mm is exactly 0, share where it is missing)
(
    data_df['depth_proc_mm'].eq(0).mean(),
    data_df['depth_proc_mm'].isna().mean(),
)

(0.9120767565887542, 0.06634168193921251)

In [9]:
# (share of rows where depth_filt_mm is exactly 0, share where it is missing)
(
    data_df['depth_filt_mm'].eq(0).mean(),
    data_df['depth_filt_mm'].isna().mean(),
)

(0.8863315417211804, 0.08024984177613559)

In [10]:
# Peek at the label of the row at position 1000. The frame is indexed by
# timestamp, so an integer must go through .iloc: plain `[]` with an int only
# works through Series.__getitem__'s deprecated positional fallback.
data_df.label.iloc[1000]

<NA>

In [11]:
# Share of exact-zero depth_proc_mm readings among (a) unlabelled rows and
# (b) rows whose label is not 'flood'.
# NOTE(review): whether (b) also counts the unlabelled (NA) rows depends on how
# `!=` treats NA for this column's dtype -- confirm.
x = data_df['depth_proc_mm']
(
    x[data_df['label'].isna()].eq(0).mean(),
    x[data_df['label'].ne('flood')].eq(0).mean(),
)

(0.9285635643415658, 0.9154439214823527)

In [12]:
# Sanity check: a literal 0 is a value, not a missing marker.
pd.isna(0)

False

In [13]:
# Share of rows that are both missing and equal to 0 in depth_proc_mm; a
# non-zero result would mean the two categories overlap.
(data_df['depth_proc_mm'].isna() & data_df['depth_proc_mm'].eq(0)).mean()

0.0

In [14]:
# Share of rows with a non-zero reading (missing counted as zero), for
# depth_proc_mm and depth_filt_mm respectively.
(
    1 - data_df['depth_proc_mm'].fillna(0).eq(0).mean(),
    1 - data_df['depth_filt_mm'].fillna(0).eq(0).mean(),
)

(0.021581561472033273, 0.03341861650268396)