"""
Updated on Saturday Oct 05 2024

Purpose: Create aggregated data frame for seasons and time period 
@author: Siddharth Chaudhary
"""

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import parquet



In [3]:
# Specify the path to the extracted Parquet file
parquet_file = "/Volumes/Personal/streamflow/grouped_median_discharge.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)


In [4]:
df.head()
#df.shape

Unnamed: 0,time,station_name,scenario,discharge
0,1976-01-07,1104150.0,hist,83.478806
1,1976-01-07,1104200.0,hist,0.680449
2,1976-01-07,1104300.0,hist,4.381658
3,1976-01-07,1104450.0,hist,
4,1976-01-07,1104480.0,hist,0.244086


In [5]:
# Convert 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'])

# Extract year, month, and day into separate columns
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day

# Create 'time_period' column based on the year
conditions = [
    (df['year'] < 2006),
    (df['year'] >= 2006) & (df['year'] < 2040),
    (df['year'] >= 2040) & (df['year'] < 2070),
    (df['year'] >= 2070)
]
choices = ['historical', 'early', 'mid', 'end']

df['time_period'] = pd.cut(df['year'], bins=[-float('inf'), 2005, 2039, 2069, float('inf')], labels=choices, right=False)


In [6]:
df = df.dropna()
df.shape

(120614936, 8)

In [7]:
conditions = [
    (df['month'].isin([1, 2, 3])),
    (df['month'].isin([4, 5, 6])),
    (df['month'].isin([7, 8, 9])),
    (df['month'].isin([10, 11, 12]))
]

choices = ['Season 1', 'Season 2', 'Season 3', 'Season 4']

# Create new column 'season'
df['season'] = np.select(conditions, choices)

In [8]:
df.columns

Index(['time', 'station_name', 'scenario', 'discharge', 'year', 'month', 'day',
       'time_period', 'season'],
      dtype='object')

In [9]:
unique_scenario = df['scenario'].unique()

# Print unique values
print(unique_scenario)

['hist' 'rcp4p5' 'rcp8p5']


In [10]:
unique_time_period = df['time_period'].unique()

# Print unique values
print(unique_time_period)

['historical', 'early', 'mid', 'end']
Categories (4, object): ['historical' < 'early' < 'mid' < 'end']


In [11]:
df['time'] = pd.to_datetime(df['time'])

# Filter the DataFrame where scenario is 'hist'
filtered_df = df[df['scenario'] == 'hist']

# Define the percentiles to calculate
percentiles = [1, 5, 10, 90, 95, 99]

percentile_df = filtered_df.groupby('station_name')['discharge'].quantile([p/100 for p in percentiles]).unstack()

# Rename the columns for clarity
percentile_df.columns = [f'p{int(p)}' for p in percentiles]

# Merge the percentiles back into the original DataFrame

print(percentile_df)

                    p1        p5        p10         p90         p95  \
station_name                                                          
1104150.0     2.598351  6.308822  11.348228  224.553796  280.981632   
1104200.0     0.003896  0.026722   0.043520    1.659673    2.133056   
1104300.0     0.000133  0.001142   0.005574    7.366641    9.939234   
1104480.0     0.000000  0.000000   0.000000    0.280867    0.390582   
1104500.0     0.052046  0.105169   0.158477    1.145553    1.427213   
...                ...       ...        ...         ...         ...   
6984800.0     0.025626  0.065388   0.072911   90.388947  122.973452   
6986100.0     0.618985  1.391174   2.161176   18.560699   22.239927   
6987050.0     1.820842  2.247439   2.561722   11.106482   13.143107   
6987100.0     5.842051  8.441415  10.145994   34.617324   38.627123   
6987150.0     1.113012  1.569021   1.845787    6.090517    6.922308   

                     p99  
station_name              
1104150.0     395.2001

In [12]:
df = df.merge(percentile_df, on='station_name', how='left')

In [13]:
df.head

<bound method NDFrame.head of                 time  station_name scenario  discharge  year  month  day  \
0         1976-01-07     1104150.0     hist  83.478806  1976      1    7   
1         1976-01-07     1104200.0     hist   0.680449  1976      1    7   
2         1976-01-07     1104300.0     hist   4.381658  1976      1    7   
3         1976-01-07     1104480.0     hist   0.244086  1976      1    7   
4         1976-01-07     1104500.0     hist   0.671000  1976      1    7   
...              ...           ...      ...        ...   ...    ...  ...   
120614931 2099-12-30     6987050.0   rcp8p5   7.912493  2099     12   30   
120614932 2099-12-30     6987100.0   rcp4p5  50.412224  2099     12   30   
120614933 2099-12-30     6987100.0   rcp8p5  26.211447  2099     12   30   
120614934 2099-12-30     6987150.0   rcp4p5   8.691892  2099     12   30   
120614935 2099-12-30     6987150.0   rcp8p5   4.085582  2099     12   30   

          time_period    season        p1        p5      

In [14]:
# Create new columns based on the comparison criteria
df['p1_flag'] = df['discharge'] < df['p1']
df['p5_flag'] = df['discharge'] < df['p5']
df['p10_flag'] = df['discharge'] < df['p10']
df['p90_flag'] = df['discharge'] > df['p90']
df['p95_flag'] = df['discharge'] > df['p95']
df['p99_flag'] = df['discharge'] > df['p99']

# Convert boolean flags to integers (1 or 0)
df['p1_flag'] = df['p1_flag'].astype(int)
df['p5_flag'] = df['p5_flag'].astype(int)
df['p10_flag'] = df['p10_flag'].astype(int)
df['p90_flag'] = df['p90_flag'].astype(int)
df['p95_flag'] = df['p95_flag'].astype(int)
df['p99_flag'] = df['p99_flag'].astype(int)

# Display the resulting DataFrame
print(df)

                time  station_name scenario  discharge  year  month  day  \
0         1976-01-07     1104150.0     hist  83.478806  1976      1    7   
1         1976-01-07     1104200.0     hist   0.680449  1976      1    7   
2         1976-01-07     1104300.0     hist   4.381658  1976      1    7   
3         1976-01-07     1104480.0     hist   0.244086  1976      1    7   
4         1976-01-07     1104500.0     hist   0.671000  1976      1    7   
...              ...           ...      ...        ...   ...    ...  ...   
120614931 2099-12-30     6987050.0   rcp8p5   7.912493  2099     12   30   
120614932 2099-12-30     6987100.0   rcp4p5  50.412224  2099     12   30   
120614933 2099-12-30     6987100.0   rcp8p5  26.211447  2099     12   30   
120614934 2099-12-30     6987150.0   rcp4p5   8.691892  2099     12   30   
120614935 2099-12-30     6987150.0   rcp8p5   4.085582  2099     12   30   

          time_period    season        p1  ...        p10         p90  \
0          his

In [14]:
#df.to_csv('/Users/sidchaudhary/Documents/GitHub/Hydro-Seesaw/Results/station_weekly_flag_counts_early_mid_end.csv')

In [25]:
df_early_mid_end = df[(df['time_period'] == 'historical') |(df['time_period'] == 'early') | (df['time_period'] == 'mid')| (df['time_period'] == 'end')]
flag_counts = df_early_mid_end.groupby(['station_name','scenario','time_period','season'])[['p1_flag', 'p5_flag', 'p10_flag', 'p90_flag', 'p95_flag', 'p99_flag']].sum()
print(flag_counts)

  flag_counts = df_early_mid_end.groupby(['station_name','scenario','time_period','season'])[['p1_flag', 'p5_flag', 'p10_flag', 'p90_flag', 'p95_flag', 'p99_flag']].sum()


                                            p1_flag  p5_flag  p10_flag  \
station_name scenario time_period season                                 
1104150.0    hist     historical  Season 1        0        0         0   
                                  Season 2        0        1         2   
                                  Season 3       15       68       124   
                                  Season 4        1        9        30   
                      early       Season 1        0        0         0   
...                                             ...      ...       ...   
6987150.0    rcp8p5   mid         Season 4       10       46        64   
                      end         Season 1        0        1         2   
                                  Season 2        0        2        11   
                                  Season 3       56      170       236   
                                  Season 4       13       48        71   

                                     

In [28]:
annual_sums = flag_counts.groupby(['station_name', 'scenario', 'time_period']).sum()

# Create a DataFrame for the annual sums
annual_sums = annual_sums.reset_index()
annual_sums['season'] = 'Annual'

# Set the multi-index for annual_sums to match flag_counts
annual_sums.set_index(['station_name', 'scenario', 'time_period', 'season'], inplace=True)

# Concatenate the annual sums into the original flag_counts DataFrame
flag_counts = pd.concat([flag_counts, annual_sums])

# Sort the DataFrame if needed
flag_counts.sort_index(inplace=True)

# Print the updated DataFrame
print(flag_counts)

  annual_sums = flag_counts.groupby(['station_name', 'scenario', 'time_period']).sum()


                                            p1_flag  p5_flag  p10_flag  \
station_name scenario time_period season                                 
1104150.0    hist     historical  Annual         16       78       156   
                                  Season 1        0        0         0   
                                  Season 2        0        1         2   
                                  Season 3       15       68       124   
                                  Season 4        1        9        30   
...                                             ...      ...       ...   
6987150.0    rcp8p5   end         Annual         69      221       320   
                                  Season 1        0        1         2   
                                  Season 2        0        2        11   
                                  Season 3       56      170       236   
                                  Season 4       13       48        71   

                                     

In [29]:
flag_counts.columns

Index(['p1_flag', 'p5_flag', 'p10_flag', 'p90_flag', 'p95_flag', 'p99_flag'], dtype='object')

In [30]:
flag_counts.to_csv('/Users/sidchaudhary/Documents/GitHub/Hydro-Seesaw/Results/flag_counts_early_mid_end_season.csv')
#flag_counts_mid.to_parquet('file_name.parquet', engine='pyarrow')

In [31]:
df_flag_counts = pd.read_csv('/Users/sidchaudhary/Documents/GitHub/Hydro-Seesaw/Results/flag_counts_early_mid_end_season.csv')
df_station_name = pd.read_csv('/Users/sidchaudhary/Documents/GitHub/Hydro-Seesaw/data/staion_id.csv')
merged_df = pd.merge(df_flag_counts, df_station_name, on='station_name', how='inner')

In [32]:
merged_df.to_csv('/Users/sidchaudhary/Documents/GitHub/Hydro-Seesaw/Results/lat_lon_flag_counts_early_mid_end_season.csv')