# Exploratory Data Analysis

## Load Required Libraries

In [1]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For data importing
import os

In [None]:
## (Optional chunk)
# Current session information
# NOTE(review): session_info is imported here rather than with the general
# packages at the top of the notebook — presumably so the rest of the notebook
# does not depend on it; confirm it is installed before running this cell.
import session_info
# NOTE(review): dependencies=False presumably leaves the dependency listing
# out of the report — TODO confirm against the session_info documentation.
session_info.show(dependencies=False)

## Load data

Files of interest:
- `weir_calibration.csv` includes calibration points for the weir
- `bci_lutzweir_combined.csv` includes the raw runoff measurement, the corrected runoff measurement, and the data source (measurements from the `CHART` source can be removed)
- `bci_cl_ra_elect2.CSV` has corrected rainfall (`ra`) in mm with measurements of `0` as `NA`s (`bci_cl_ra_elect.csv` has `0`s)
- `bci_lutz_deep_gsm_man.csv`, `bci_lutz_shallow_gsm_man.csv` have soil moisture measurements (water by wet weight and water by dry weight; one can be chosen for analysis as they are linearly related)
<!-- `bci_cl_ra_elect.csv` has corrected rainfall (`ra`) in mm, contains `0`s (large file) -->

All level values are in mm, and datetimes are in UTC-5 (Panama time zone).


### Import

In [None]:
## Calibrations dataset
# Only `datetime` and `weir_level` are read: `weir_hour` repeats the time that
# `datetime` already holds, so it is skipped.
# Timestamps are parsed as day/month/year followed by a 24-hour time.
data_calibrations = pd.read_csv(
    "data/weir_calibration.csv",
    date_format="%d/%m/%Y %H:%M:%S",
    parse_dates=["datetime"],
    usecols=["datetime", "weir_level"],
)

data_calibrations

In [None]:
# Preview: read just the first two rows of the combined file (all columns)
pd.read_csv(
    "data/bci_lutzweir_combined.csv",
    nrows=2,
)

In [None]:
# NOTE: this whole cell is disabled (commented out). It is the only place that
# would define `data_combined_sources`; the four columns it reads are also
# among the columns requested when `data_combined` is loaded.

# # Dataframe to filter out CHART sources
# data_combined_sources = pd.read_csv(
#     "data/bci_lutzweir_combined.csv",
#     usecols = ['datetime', 'source', 'chk_note', 'chk_fail'],
#     parse_dates=['datetime'],
#     date_format='%d/%m/%Y %H:%M:%S',
#     dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'category'}
#     # nrows = 2000000
# )

# print(data_combined_sources)
# print(data_combined_sources['chk_note'].cat.categories.tolist())
# print(data_combined_sources['chk_fail'].cat.categories.tolist())
# print(data_combined_sources['source'].cat.categories.tolist())

In [None]:
# NOTE: disabled (commented out) unconditional read of the combined file. It
# requests the same columns and dtypes as the guarded load of `data_combined`
# in the next code cell.

# # bci_lutzweir_combined.csv
# # ['CHART', 'CHART+AF', 'ISCO']

# data_combined = pd.read_csv(
#     "data/bci_lutzweir_combined.csv",
#     # nrows = 300000,
#     usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
#     parse_dates=['datetime'],
#     dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
#     date_format='%d/%m/%Y %H:%M:%S'
# )

In [12]:
# Load the combined weir dataset unless a non-empty copy is already in the
# session. The file holds several million rows, so re-reading it on every run
# of this cell is worth avoiding.
# A frame that exists but is empty is treated as "not loaded" and is re-read,
# instead of silently doing nothing.
if "data_combined" in globals() and not data_combined.empty:
    print("Data loaded, random sample shown below")
else:
    print("Data has not yet been read in, loading now...")
    data_combined = pd.read_csv(
        "data/bci_lutzweir_combined.csv",
        usecols=['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
        parse_dates=['datetime'],
        # `source` and `chk_note` are low-cardinality labels; the rest stay plain strings
        dtype={'source': 'category', 'chk_note': 'category', 'chk_fail': 'str', 'comment': 'str'},
        date_format='%d/%m/%Y %H:%M:%S'
    )

# Show a sample on both paths; n is capped so a frame with fewer than five
# rows does not make sample() raise.
print(data_combined.sample(n=min(5, len(data_combined))))

Data loaded, random sample shown below
                   datetime  level    raw chk_note chk_fail comment source
3646952 2022-09-10 09:55:00  51.65  51.65       nc      NaN     NaN    NaN
3718538 2023-05-16 23:25:00   7.80   7.80       nc      NaN     NaN    NaN
1839449 2005-05-05 03:40:00  31.00  31.00     good      NaN     NaN   ISCO
2272098 2009-06-16 20:40:00   9.10   9.10     good      NaN     NaN   ISCO
1816806 2005-02-15 12:45:00  26.80  26.80     good      NaN     NaN   ISCO


In [None]:
# Get earliest and latest dates of sources
def print_source_date_ranges(frame):
    """Print the earliest and latest `datetime` for every value of `source`.

    One line is printed per distinct `source` value, in order of first
    appearance, as: earliest, an empty separator, latest, source label.
    Rows whose `source` is missing are reported as their own group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a `source` column and a `datetime` column.
    """
    source_column = frame["source"]
    for source_label in source_column.unique().tolist():
        # A missing label never compares equal to itself, so it needs isna()
        if pd.isna(source_label):
            in_source = source_column.isna()
        else:
            in_source = source_column == source_label
        # Pull only the datetime column and use the vectorised min/max rather
        # than the Python builtins, which walk the Series element by element
        source_times = frame.loc[in_source, "datetime"]
        print(source_times.min(), "", source_times.max(), source_label)


# Working inside a function means the per-source masks and subsets are
# released on return, so no manual `del` of temporaries is needed.
print_source_date_ranges(data_combined)

1972-01-01 01:00:00  2015-03-18 14:15:00 CHART
1972-09-16 00:15:00  2025-08-01 13:00:00 nan
1989-07-19 11:55:00  1996-10-01 23:55:00 CHART+AF
1996-10-02 00:00:00  2013-01-13 05:50:00 ISCO
2012-04-23 08:30:00  2012-04-24 08:35:00 ESTIMATED
2014-08-22 10:30:00  2021-05-19 09:40:00 RADAR
2018-08-31 10:05:00  2018-09-05 12:55:00 TROLL


In [6]:
# Column dtypes followed by a 25-row random sample of the combined data
print(data_combined.dtypes, data_combined.sample(n=25), sep="\n")

# Distinct values of the provenance / quality-check columns:
# the two categorical columns list their categories, the string columns their unique values
print(f"Source: {data_combined['source'].cat.categories.tolist()}")
print(f"Notes: {data_combined['chk_note'].cat.categories.tolist()}")
print(f"Fail mode: {data_combined['chk_fail'].unique()}")
print(f"Comments: {data_combined['comment'].unique()}")

datetime    datetime64[ns]
level              float64
raw                float64
chk_note          category
chk_fail            object
comment             object
source            category
dtype: object
                   datetime  level     raw  chk_note  \
702829  1994-05-19 09:55:00  13.10   13.10      good   
607151  1993-06-17 03:20:00  25.70   25.70      good   
2505769 2011-09-07 03:40:00  63.60   63.60      good   
2692083 2013-07-16 09:57:49  38.32   38.32      good   
2755691 2014-03-18 18:45:00   7.19    5.98  adjusted   
368860  1991-03-05 16:05:00  75.20   75.20      good   
458178  1992-01-09 19:20:00  33.30   33.30      good   
1636447 2003-05-31 06:50:00  16.40   16.40      good   
2115081 2007-12-19 21:15:00  47.20   47.20      good   
3822535 2024-05-12 01:50:00   0.00 -133.40  adjusted   
2537630 2011-12-26 18:40:00  72.40   72.40      good   
2465957 2011-04-21 14:15:00   9.90    9.90      good   
3485065 2021-02-25 07:20:00  28.00   32.00  adjusted   
3452688 2020-

## General Variable Exploration

In [None]:
# Frequency of every value, missing ones included, for the check-note,
# comment and source columns (printed in that order)
for column_name in ("chk_note", "comment", "source"):
    print(data_combined[column_name].value_counts(dropna=False))


chk_note
good        2730307
adjusted     867014
nc           336129
missing       17668
bad               1
Name: count, dtype: int64
comment
NaN                                                                         3950937
Original data missing. Copied from 17/07/2013 (had a similar rain event)        103
Original data missing. Copied from 22/09/2013 (had a similar rain event)         63
Data missing                                                                     16
Name: count, dtype: int64
source
ISCO         1656120
CHART+AF      752435
RADAR         702704
NaN           525298
CHART         312844
TROLL           1428
ESTIMATED        290
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951119 entries, 0 to 3951118
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   datetime  datetime64[ns]
 1   level     float64       
 2   raw       float64       
 3   chk_note  category      
 4   chk_fail  object  

In [None]:
# get IDs (index labels) of the data points to filter out: rows whose source is CHART.
# The comparison has to be made on the column itself; comparing the two string
# literals "source" and "CHART" just yields False, which is not a valid column key.
data_combined.index[data_combined["source"] == "CHART"]

In [None]:
# Scratch check: report whether a name is already defined in the session
hello_world = 2

try:
    hello_world  # bare lookup; raises NameError when the name is undefined
except NameError:
    print("data NOT loaded")
else:
    # only a truthy value counts as loaded
    if hello_world:
        print("data loaded")

In [None]:
# Draw 100 random rows from the loaded combined dataset and show the ones that
# come from the CHART source. `data_combined` is used because it is the frame
# this notebook actually loads and it carries the `source` column.
data_subset = data_combined.sample(n=100)
data_subset[data_subset['source'] == 'CHART']


In [None]:
# NOTE: disabled (commented out) drafts for selecting CHART / CHART+AF rows.
# The first draft uses `=` where `==` is needed, so it would be a syntax error
# if uncommented as written.
# data_combined.loc[data_combined['source']='CHART']
# print(data_combined.loc[data_combined['source'] == 'CHART']) # Selects rows where column 'A' is greater than 1
# data_combined.loc[data_combined['source'].isin(['CHART', 'CHART+AF'])]
# len(data_combined['source'].isin(['CHART', 'CHART+AF']).index.tolist())
# data_combined['source']