In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import parquet

In [2]:
# Specify the path to the extracted Parquet file
parquet_file = "/Volumes/Personal/streamflow/grouped_median_discharge.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)


In [3]:
df.head()
#df.shape

Unnamed: 0,time,station_name,scenario,discharge
0,1976-01-07,1104150.0,hist,83.478806
1,1976-01-07,1104200.0,hist,0.680449
2,1976-01-07,1104300.0,hist,4.381658
3,1976-01-07,1104450.0,hist,
4,1976-01-07,1104480.0,hist,0.244086


In [7]:
# Convert 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'])

# Extract year, month, and day into separate columns
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day

# Create 'time_period' column based on the year
conditions = [
    (df['year'] < 2020),
    (df['year'] >= 2020) & (df['year'] < 2060),
    (df['year'] >= 2060)
]
choices = ['historical', 'mid', 'end']

df['time_period'] = pd.Series(pd.Categorical(pd.cut(df['year'], bins=[-float('inf'), 2019, 2059, float('inf')], labels=choices, include_lowest=True)))


In [8]:
df = df.dropna()
df.shape

(119480792, 8)

In [9]:
unique_scenario = df['scenario'].unique()

# Print unique values
print(unique_scenario)

['hist' 'rcp4p5' 'rcp8p5']


In [10]:
unique_time_period = df['time_period'].unique()

# Print unique values
print(unique_time_period)

['historical', 'mid', 'end']
Categories (3, object): ['historical' < 'mid' < 'end']


In [5]:
(df.shape[0]/121759976)*100

99.05959245589865

In [11]:
df['time'] = pd.to_datetime(df['time'])

# Filter the DataFrame where scenario is 'hist'
filtered_df = df[df['scenario'] == 'hist']

# Define the percentiles to calculate
percentiles = [1, 5, 10, 90, 95, 99]

percentile_df = filtered_df.groupby('station_name')['discharge'].quantile([p/100 for p in percentiles]).unstack()

# Rename the columns for clarity
percentile_df.columns = [f'p{int(p)}' for p in percentiles]

# Merge the percentiles back into the original DataFrame

print(percentile_df)

                    p1        p5        p10         p90         p95  \
station_name                                                          
1104150.0     2.598351  6.308822  11.348228  224.553796  280.981632   
1104200.0     0.003896  0.026722   0.043520    1.659673    2.133056   
1104300.0     0.000133  0.001142   0.005574    7.366641    9.939234   
1104480.0     0.000000  0.000000   0.000000    0.280867    0.390582   
1104500.0     0.052046  0.105169   0.158477    1.145553    1.427213   
...                ...       ...        ...         ...         ...   
6984800.0     0.025626  0.065388   0.072911   90.388947  122.973452   
6986100.0     0.618985  1.391174   2.161176   18.560699   22.239927   
6987050.0     1.820842  2.247439   2.561722   11.106482   13.143107   
6987100.0     5.842051  8.441415  10.145994   34.617324   38.627123   
6987150.0     1.113012  1.569021   1.845787    6.090517    6.922308   

                     p99  
station_name              
1104150.0     395.2001

In [12]:
df = df.merge(percentile_df, on='station_name', how='left')

In [13]:
df.head

<bound method NDFrame.head of                 time  station_name scenario  discharge  year  month  day  \
0         1976-01-07     1104150.0     hist  83.478806  1976      1    7   
1         1976-01-07     1104200.0     hist   0.680449  1976      1    7   
2         1976-01-07     1104300.0     hist   4.381658  1976      1    7   
3         1976-01-07     1104480.0     hist   0.244086  1976      1    7   
4         1976-01-07     1104500.0     hist   0.671000  1976      1    7   
...              ...           ...      ...        ...   ...    ...  ...   
119480787 2098-12-23     5606172.0   rcp8p5   0.022447  2098     12   23   
119480788 2098-12-23     5606173.0   rcp4p5   0.890564  2098     12   23   
119480789 2098-12-23     5606173.0   rcp8p5   0.011531  2098     12   23   
119480790 2098-12-23     5606174.0   rcp4p5   0.214002  2098     12   23   
119480791 2098-12-23     5606174.0   rcp8p5   0.002629  2098     12   23   

          time_period        p1        p5        p10     

In [14]:
# Create new columns based on the comparison criteria
df['p1_flag'] = df['discharge'] < df['p1']
df['p5_flag'] = df['discharge'] < df['p5']
df['p10_flag'] = df['discharge'] < df['p10']
df['p90_flag'] = df['discharge'] > df['p90']
df['p95_flag'] = df['discharge'] > df['p95']
df['p99_flag'] = df['discharge'] > df['p99']

# Convert boolean flags to integers (1 or 0)
df['p1_flag'] = df['p1_flag'].astype(int)
df['p5_flag'] = df['p5_flag'].astype(int)
df['p10_flag'] = df['p10_flag'].astype(int)
df['p90_flag'] = df['p90_flag'].astype(int)
df['p95_flag'] = df['p95_flag'].astype(int)
df['p99_flag'] = df['p99_flag'].astype(int)

# Display the resulting DataFrame
print(df)

                time  station_name scenario  discharge  year  month  day  \
0         1976-01-07     1104150.0     hist  83.478806  1976      1    7   
1         1976-01-07     1104200.0     hist   0.680449  1976      1    7   
2         1976-01-07     1104300.0     hist   4.381658  1976      1    7   
3         1976-01-07     1104480.0     hist   0.244086  1976      1    7   
4         1976-01-07     1104500.0     hist   0.671000  1976      1    7   
...              ...           ...      ...        ...   ...    ...  ...   
119480787 2098-12-23     5606172.0   rcp8p5   0.022447  2098     12   23   
119480788 2098-12-23     5606173.0   rcp4p5   0.890564  2098     12   23   
119480789 2098-12-23     5606173.0   rcp8p5   0.011531  2098     12   23   
119480790 2098-12-23     5606174.0   rcp4p5   0.214002  2098     12   23   
119480791 2098-12-23     5606174.0   rcp8p5   0.002629  2098     12   23   

          time_period        p1        p5        p10         p90         p95  \
0      

In [27]:
df_mid_end = df[(df['time_period'] == 'mid') | (df['time_period'] == 'end')]
flag_counts_mid = df_mid_end.groupby(['station_name','scenario'])[['p1_flag', 'p5_flag', 'p10_flag', 'p90_flag', 'p95_flag', 'p99_flag']].sum()

# Display the flag counts
print(flag_counts_mid)
flag_counts_mid.to_csv('flag_counts_mid_end.csv')
#flag_counts_mid.to_parquet('file_name.parquet', engine='pyarrow')


                       p1_flag  p5_flag  p10_flag  p90_flag  p95_flag  \
station_name scenario                                                   
1104150.0    rcp4p5        289      830      1125        96        38   
             rcp8p5        702     1337      1712        37        14   
1104200.0    rcp4p5         37      153       279       166        74   
             rcp8p5        206      408       555        95        42   
1104300.0    rcp4p5        164      754      1215         0         0   
...                        ...      ...       ...       ...       ...   
6987050.0    rcp8p5        261      630       904       493       278   
6987100.0    rcp4p5         78      345       572       584       378   
             rcp8p5        191      549       801       529       357   
6987150.0    rcp4p5         56      307       509       547       316   
             rcp8p5        129      490       724       497       300   

                       p99_flag  
station_name sce