In [1]:
def get_df_from_json(data_url, record_path=['Data'], timeout=300):
    """Download a JSON document and flatten one of its record lists into a DataFrame.

    Parameters
    ----------
    data_url : str
        URL whose response body is JSON.
    record_path : str or list of str, default ['Data']
        Path to the list of records inside the JSON body; forwarded unchanged
        to ``pd.json_normalize``. The default list is only read, never mutated.
    timeout : float or tuple, default 300
        Passed to ``requests.get`` so a stalled connection raises instead of
        hanging the notebook indefinitely. Pass ``None`` to wait forever.

    Returns
    -------
    pandas.DataFrame
        One row per record found under ``record_path``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(data_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()

    return pd.json_normalize(response.json(), record_path=record_path)

In [2]:
# Imports and date range for the data requests
import io
import requests
import pandas as pd
from datetime import date, timedelta

# Start date and end date of the requested period.
# end_date is yesterday; it is derived from `today` so both values come from
# a single clock reading instead of calling date.today() twice.
today = date.today()
start_date = '2019-01-01'
end_date = today - timedelta(days=1)

In [3]:
# Flow data: download the series, keep timestamp + value, parse the timestamps
flow_data_url = 'https://envdata.boprc.govt.nz/Data/DatasetGrid?dataset=35946&sort=TimeStamp-desc&page=1&group=&filter=&interval=Custom&timezone=720&date={}&endDate={}&calendar=1&alldata=false'.format(start_date, end_date)
flow_df_all = get_df_from_json(flow_data_url)
flow_df = (
    flow_df_all[["TimeStamp", "Value"]]
    .rename(columns={'Value': 'FlowRate'})
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp']))
)

print(flow_df.shape)
print(flow_df.head())

(399149, 2)
                  TimeStamp   FlowRate
0 2022-10-18 00:00:00+00:00  38.785383
1 2022-10-17 23:55:00+00:00  38.519381
2 2022-10-17 23:50:00+00:00  38.452985
3 2022-10-17 23:45:00+00:00  38.752096
4 2022-10-17 23:40:00+00:00  38.519381


In [4]:
# Lake levels: download the series, keep timestamp + value, parse the timestamps
# NOTE(review): the printed head renders LakeLevel like an object column rather
# than float — confirm the dtype and consider pd.to_numeric if numbers are expected.
lake_level_url = 'https://envdata.boprc.govt.nz/Data/DatasetGrid?dataset=32419&sort=TimeStamp-desc&page=1&group=&filter=&interval=Custom&timezone=720&date={}&endDate={}&calendar=1&alldata=false'.format(start_date, end_date)
lake_level_df_all = get_df_from_json(lake_level_url)
lake_level_df = (
    lake_level_df_all[["TimeStamp", "Value"]]
    .rename(columns={'Value': 'LakeLevel'})
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp']))
)
print(lake_level_df.shape)
print(lake_level_df.head())


(399246, 2)
                  TimeStamp LakeLevel
0 2022-10-18 00:00:00+00:00   279.183
1 2022-10-17 23:55:00+00:00   279.181
2 2022-10-17 23:50:00+00:00   279.183
3 2022-10-17 23:45:00+00:00   279.184
4 2022-10-17 23:40:00+00:00   279.184


In [5]:
# Get gate levels
# The three gates use the same query; only the dataset id and the name of the
# resulting value column differ, so the shared steps live in two small helpers.
GATE_URL_TEMPLATE = 'https://envdata.boprc.govt.nz/Data/DatasetGrid?dataset={}&sort=TimeStamp-desc&page=1&group=&filter=&interval=Custom&timezone=720&date={}&endDate={}&calendar=1&alldata=false'


def _gate_levels_url(dataset_id):
    """Return the DatasetGrid URL for one gate dataset between start_date and end_date."""
    return GATE_URL_TEMPLATE.format(dataset_id, start_date, end_date)


def _load_gate_levels(url, column_name):
    """Fetch one gate series and return TimeStamp plus Value renamed to column_name."""
    gate_df = get_df_from_json(url)
    return gate_df[["TimeStamp", "Value"]].rename(columns={"Value": column_name})


# Gate 1
gate_levels_url_1 = _gate_levels_url(38970)
gate_1_df = _load_gate_levels(gate_levels_url_1, "Gate1")

# Gate 2
gate_levels_url_2 = _gate_levels_url(38973)
gate_2_df = _load_gate_levels(gate_levels_url_2, "Gate2")

# Gate 3
gate_levels_url_3 = _gate_levels_url(38972)
gate_3_df = _load_gate_levels(gate_levels_url_3, "Gate3")

# Merge into a single dataframe keyed on TimeStamp.
# Gates 1 and 2 are inner-joined (only timestamps present in both survive);
# Gate 3 is left-joined, so timestamps it lacks keep NaN in the Gate3 column.
gate_levels_df_temp = pd.merge(gate_1_df, gate_2_df, on='TimeStamp')
gate_levels_df = pd.merge(gate_levels_df_temp, gate_3_df, on='TimeStamp', how='left')
gate_levels_df['TimeStamp'] = pd.to_datetime(gate_levels_df['TimeStamp'])

print(gate_levels_df.describe())
print(gate_levels_df.head())

               Gate1          Gate2          Gate3
count  397410.000000  397410.000000  397375.000000
mean      386.343327     388.321804     397.767165
std       301.967903     289.122161     291.343976
min         0.000000       0.000000       0.000000
25%       219.000000     225.000000     230.000000
50%       300.000000     299.000000     300.000000
75%       500.000000     499.000000     499.000000
max      1600.000000    1538.000000    1600.000000
                  TimeStamp   Gate1   Gate2   Gate3
0 2022-10-18 00:00:00+00:00  1500.0  1500.0  1500.0
1 2022-10-17 23:55:00+00:00  1500.0  1500.0  1500.0
2 2022-10-17 23:50:00+00:00  1500.0  1500.0  1500.0
3 2022-10-17 23:45:00+00:00  1500.0  1500.0  1500.0
4 2022-10-17 23:40:00+00:00  1500.0  1500.0  1500.0


In [6]:
# Rainfall at Lake Rotoiti: download the series, keep timestamp + value, parse the timestamps
rainfall_url = 'https://envdata.boprc.govt.nz/Data/DatasetGrid?dataset=32417&sort=TimeStamp-desc&page=1&group=&filter=&interval=Custom&timezone=720&date={}&endDate={}&calendar=1&alldata=false'.format(start_date, end_date)

rainfall_df_all = get_df_from_json(rainfall_url)
rainfall_df = (
    rainfall_df_all[["TimeStamp", "Value"]]
    .rename(columns={'Value': 'Rainfall'})
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp']))
)
print(rainfall_df.head())
print(rainfall_df.shape)

                  TimeStamp  Rainfall
0 2022-10-18 00:00:00+00:00       0.0
1 2022-10-17 23:00:00+00:00       0.0
2 2022-10-17 22:00:00+00:00       0.0
3 2022-10-17 21:00:00+00:00       0.0
4 2022-10-17 20:00:00+00:00       0.0
(132913, 2)


In [7]:
# Merge datasets into one dataframe
# Lake level, flow and gate data are inner-joined on TimeStamp; rainfall is
# left-joined, so timestamps without a rainfall reading get NaN in Rainfall.
df_temp = (
    lake_level_df
    .merge(flow_df, on='TimeStamp')
    .merge(gate_levels_df, on='TimeStamp')
)
df_combined = df_temp.merge(rainfall_df, on='TimeStamp', how='left')

# Use the parsed timestamp as the index
df_combined['TimeStamp'] = pd.to_datetime(df_combined['TimeStamp'])
df_combined = df_combined.set_index('TimeStamp')

# Drop every row with a missing value in any column (e.g. no rainfall reading),
# then order the rows by timestamp, oldest first
df_river_data = df_combined.dropna().sort_values('TimeStamp')

print(df_river_data)

                          LakeLevel   FlowRate   Gate1   Gate2   Gate3  \
TimeStamp                                                                
2019-01-01 00:00:00+00:00   279.192  36.970287  1200.0  1200.0  1200.0   
2019-01-01 00:10:00+00:00    279.19   37.13397  1200.0  1200.0  1200.0   
2019-01-01 00:20:00+00:00   279.191  36.904889  1200.0  1200.0  1200.0   
2019-01-01 00:30:00+00:00   279.191  36.872206  1200.0  1200.0  1200.0   
2019-01-01 00:40:00+00:00   279.193   37.13397  1200.0  1200.0  1200.0   
...                             ...        ...     ...     ...     ...   
2022-10-17 20:00:00+00:00   279.181  38.552594  1500.0  1500.0  1500.0   
2022-10-17 21:00:00+00:00   279.184  38.652297  1500.0  1500.0  1500.0   
2022-10-17 22:00:00+00:00   279.183  38.386632  1500.0  1500.0  1500.0   
2022-10-17 23:00:00+00:00   279.181  38.718819  1500.0  1500.0  1500.0   
2022-10-18 00:00:00+00:00   279.183  38.785383  1500.0  1500.0  1500.0   

                           Rainfall  

In [8]:
# Write the cleaned dataset to CSV; the TimeStamp index is written as the first column
df_river_data.to_csv(path_or_buf='kaituna_data.csv')