In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import parquet

In [9]:
# Specify the path to the extracted Parquet file
parquet_file = "/Volumes/Personal/streamflow/grouped_median_discharge.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)


In [10]:
df.head()
#df.shape

Unnamed: 0,time,station_name,scenario,discharge
0,1976-01-07,1104150.0,hist,83.478806
1,1976-01-07,1104200.0,hist,0.680449
2,1976-01-07,1104300.0,hist,4.381658
3,1976-01-07,1104450.0,hist,
4,1976-01-07,1104480.0,hist,0.244086


In [11]:
df = df.dropna()
df.shape

(120614936, 4)

In [13]:
(df.shape[0]/121759976)*100

99.05959245589865

In [14]:
df['time'] = pd.to_datetime(df['time'])

# Filter the DataFrame where scenario is 'hist'
filtered_df = df[df['scenario'] == 'hist']

# Define the percentiles to calculate
percentiles = [1, 5, 10, 90, 95, 99]

percentile_df = filtered_df.groupby('station_name')['discharge'].quantile([p/100 for p in percentiles]).unstack()

# Rename the columns for clarity
percentile_df.columns = [f'p{int(p)}' for p in percentiles]

# Merge the percentiles back into the original DataFrame

print(percentile_df)

                    p1        p5        p10         p90         p95  \
station_name                                                          
1104150.0     2.598351  6.308822  11.348228  224.553796  280.981632   
1104200.0     0.003896  0.026722   0.043520    1.659673    2.133056   
1104300.0     0.000133  0.001142   0.005574    7.366641    9.939234   
1104480.0     0.000000  0.000000   0.000000    0.280867    0.390582   
1104500.0     0.052046  0.105169   0.158477    1.145553    1.427213   
...                ...       ...        ...         ...         ...   
6984800.0     0.025626  0.065388   0.072911   90.388947  122.973452   
6986100.0     0.618985  1.391174   2.161176   18.560699   22.239927   
6987050.0     1.820842  2.247439   2.561722   11.106482   13.143107   
6987100.0     5.842051  8.441415  10.145994   34.617324   38.627123   
6987150.0     1.113012  1.569021   1.845787    6.090517    6.922308   

                     p99  
station_name              
1104150.0     395.2001

In [15]:
df = df.merge(percentile_df, on='station_name', how='left')

In [16]:
df.head

<bound method NDFrame.head of                 time  station_name scenario  discharge        p1        p5  \
0         1976-01-07     1104150.0     hist  83.478806  2.598351  6.308822   
1         1976-01-07     1104200.0     hist   0.680449  0.003896  0.026722   
2         1976-01-07     1104300.0     hist   4.381658  0.000133  0.001142   
3         1976-01-07     1104480.0     hist   0.244086  0.000000  0.000000   
4         1976-01-07     1104500.0     hist   0.671000  0.052046  0.105169   
...              ...           ...      ...        ...       ...       ...   
120614931 2099-12-30     6987050.0   rcp8p5   7.912493  1.820842  2.247439   
120614932 2099-12-30     6987100.0   rcp4p5  50.412224  5.842051  8.441415   
120614933 2099-12-30     6987100.0   rcp8p5  26.211447  5.842051  8.441415   
120614934 2099-12-30     6987150.0   rcp4p5   8.691892  1.113012  1.569021   
120614935 2099-12-30     6987150.0   rcp8p5   4.085582  1.113012  1.569021   

                 p10         p90 

In [17]:
# Create new columns based on the comparison criteria
df['p1_flag'] = df['discharge'] < df['p1']
df['p5_flag'] = df['discharge'] < df['p5']
df['p10_flag'] = df['discharge'] < df['p10']
df['p90_flag'] = df['discharge'] > df['p90']
df['p95_flag'] = df['discharge'] > df['p95']
df['p99_flag'] = df['discharge'] > df['p99']

# Convert boolean flags to integers (1 or 0)
df['p1_flag'] = df['p1_flag'].astype(int)
df['p5_flag'] = df['p5_flag'].astype(int)
df['p10_flag'] = df['p10_flag'].astype(int)
df['p90_flag'] = df['p90_flag'].astype(int)
df['p95_flag'] = df['p95_flag'].astype(int)
df['p99_flag'] = df['p99_flag'].astype(int)

# Display the resulting DataFrame
print(df)

                time  station_name scenario  discharge        p1        p5  \
0         1976-01-07     1104150.0     hist  83.478806  2.598351  6.308822   
1         1976-01-07     1104200.0     hist   0.680449  0.003896  0.026722   
2         1976-01-07     1104300.0     hist   4.381658  0.000133  0.001142   
3         1976-01-07     1104480.0     hist   0.244086  0.000000  0.000000   
4         1976-01-07     1104500.0     hist   0.671000  0.052046  0.105169   
...              ...           ...      ...        ...       ...       ...   
120614931 2099-12-30     6987050.0   rcp8p5   7.912493  1.820842  2.247439   
120614932 2099-12-30     6987100.0   rcp4p5  50.412224  5.842051  8.441415   
120614933 2099-12-30     6987100.0   rcp8p5  26.211447  5.842051  8.441415   
120614934 2099-12-30     6987150.0   rcp4p5   8.691892  1.113012  1.569021   
120614935 2099-12-30     6987150.0   rcp8p5   4.085582  1.113012  1.569021   

                 p10         p90         p95         p99  p1_fl

In [18]:
flag_counts = df.groupby('station_name')[['p1_flag', 'p5_flag', 'p10_flag', 'p90_flag', 'p95_flag', 'p99_flag']].sum()

# Display the flag counts
print(flag_counts)

              p1_flag  p5_flag  p10_flag  p90_flag  p95_flag  p99_flag
station_name                                                          
1104150.0        1079     2445      3321       396       175        32
1104200.0         271      673      1051       537       255        46
1104300.0         639     2041      3306       156        78        16
1104480.0           0        0         0       156        78        16
1104500.0        2257     3186      3888       389       146        20
...               ...      ...       ...       ...       ...       ...
6984800.0         149      309       415       963       471       110
6986100.0          27      166       363      1071       516        73
6987050.0         359     1104      1785      1409       801       289
6987100.0         309     1088      1711      1483       945       293
6987150.0         221      990      1563      1409       824       197

[10647 rows x 6 columns]
