In [2]:
import pandas as pd
import os


In [6]:
rdrs_dir = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000\ProjectDischarge'
            r'\RdrsSample\LstmDatasets\10YrNhRun\Rdrs10yrCsv')
q_dir = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000\ProjectDischarge'
         r'\RdrsSample\LstmDatasets\basin_q')

rdrs_files = os.listdir(rdrs_dir)
q_files = os.listdir(q_dir)

for file in rdrs_files:
    rdrs_basin_id = file.split('_')[0]
    if file.endswith('.csv'):
        rdrs = os.path.join(rdrs_dir, file)
        df_rdrs = pd.read_csv(rdrs, parse_dates=['date'])

        for q in q_files:
            if q.endswith('.csv'):
                q_basin_id = q.split('_')[0]
                q = os.path.join(q_dir, q)
                if rdrs_basin_id == q_basin_id:
                    df_q = pd.read_csv(q, parse_dates=['Date'], skiprows=1)
                    # Rename the 'Date' column to 'date' in df_q.
                    df_q.rename(columns={'Date': 'date'}, inplace=True)

                    ##### Filter to desired date range #############################
                    # Find the earliest and latest dates in df_rdrs and use them to filter .
                    start_date = df_rdrs['date'].min()
                    end_date = df_rdrs['date'].max()
                    # Filter df_q based on the date range
                    df_q = df_q[(df_q['date'] >= start_date) & (df_q['date'] <= end_date)]
                    ##### Reorganize the data ######################################
                    df_q_filt_stage = df_q[df_q['PARAM'] == 1]
                    # Filter df_q to only include rows where 'PARAM'== 2 (discharge rate).
                    df_q_filt_q = df_q[df_q['PARAM'] == 2]
                    # For df_q_filt_stage, rename the 'Value' column to 'stage_m' and drop the 'PARAM' column.
                    df_q_filt_stage.rename(columns={'Value': 'stage_m'}, inplace=True)
                    df_q_filt_stage.drop(columns=['PARAM'], inplace=True)
                    # For df_q_filt_q, rename the 'Value' column to 'q_m3_s' and drop the 'PARAM' column.
                    df_q_filt_q.rename(columns={'Value': 'q_m3_s'}, inplace=True)
                    df_q_filt_q.drop(columns=['PARAM'], inplace=True)
                    # Merge df_q_filt_stage and df_q_filt_q on 'date' using an inner join.
                    df_q_joined = pd.merge(df_q_filt_stage, df_q_filt_q, on='date', how='inner')
                    # Interpolate missing values in  for columns 'q_m3_s'.
                    df_q_joined['q_m3_s'] = df_q_joined['q_m3_s'].interpolate()
                    df_q_joined['stage_m'] = df_q_joined['stage_m'].interpolate()
                    # Drop unnecessary columns from df_rdrs. Warning: there is a space in the ' ID' 
                    # and ' ID_y' column names.
                    df_q_joined.drop(columns=[' ID_y', 'SYM_x', 'SYM_y'], inplace=True)
                    df_q_joined.rename(columns={' ID_x': 'Id'}, inplace=True)
                    # Merge df_rdrs and df_q_joined on 'date' using an inner join.
                    df_q_joined = pd.merge(df_rdrs, df_q_joined, on='date', how='inner')
                    # Change date format to 'dd/mm/yyyy'.
                    df_q_joined['date'] = pd.to_datetime(df_q_joined['date']).dt.strftime('%d/%m/%Y')

                    ##### Check for missing values ################################
                    # Identify groups of consecutive missing values
                    df_q['group'] = df_q['Value'].isna().ne(df_q['Value'].shift().isna()).cumsum()

                    # Count the number of missing values in each group
                    counts = df_q.groupby('group')['Value'].apply(lambda x: x.isna().sum())

                    # Calculate the total number of rows for groups with more than 10 consecutive missing values
                    total_rows = counts[counts > 10].sum()

                    # Calculate 10% of the total original number of rows in df_q
                    ten_percent = df_q.shape[0] * 0.1

                    # If the total number of rows for groups with more than 10 consecutive missing values is greater than 10% of the total original number of rows in df_q
                    if total_rows > ten_percent:
                        # Calculate the percentage of the df_q that is groups with more than 10 consecutive missing values
                        percent = (total_rows / df_q.shape[0]) * 100
                        # print(f"{percent}% of {q_basin_id} is groups with more than 10 consecutive missing values.")
                        break
                    elif total_rows <= ten_percent:
                        # impute missing values in df_q
                        df_q['Value'] = df_q['Value'].interpolate()

                    # Print the maximum number of consecutive missing values in the 'Value' column
                    print(f"Max number of consecutive missing values in {q_basin_id}: {counts.max()}")
                    # Print the percent of missing values in the 'Value' column.
                    print(f"Percent of missing values in {q_basin_id}: {df_q['Value'].isna().mean() * 100}")
                    print()


                    output_dir = (r'C:\Users\daryl\OneDrive\Documents\GDAA3000'
                                  r'\ProjectDischarge\RdrsSample\LstmDatasets\10YrNhRun\Rdrs10yrsJoinedQCsv')
                    df_q_joined_dropped = df_q_joined.drop(columns=['Id'])
                    df_q_joined_dropped = df_q_joined_dropped.rename(columns={'RDRS_v2.1_A_PR0_SFC': 'A_PRO_SFC',
                                                                              'RDRS_v2.1_P_HR_1.5m': 'P_HR_1-5m',
                                                                              'RDRS_v2.1_P_HU_1.5m': 'P_HU_1-5m',
                                                                              'RDRS_v2.1_P_TD_1.5m': 'P_TD_1-5m',
                                                                              'RDRS_v2.1_P_TT_1.5m': 'P_TT_1-5m',
                                                                              'RDRS_v2.1_P_UUC_10m': 'P_UUC_10m',
                                                                              'RDRS_v2.1_P_VVC_10m': 'P_VVC_10m',
                                                                              'RDRS_v2.1_P_FB_SFC': 'P_FB_SFC',})
                    # Save the joined dataframe as a csv file in the output_dir.
                    df_q_joined_dropped.to_csv(os.path.join(output_dir, f'{q_basin_id}.csv'), index=False)
                    print(df_q_joined_dropped.columns)
                    print(df_q_joined_dropped.head())                

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_stage.rename(columns={'Value': 'stage_m'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_stage.drop(columns=['PARAM'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_q.rename(columns={'Value': 'q_m3_s'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Max number of consecutive missing values in 01DR001: 42
Percent of missing values in 01DR001: 0.0

Index(['date', 'basin', 'A_PRO_SFC', 'P_HR_1-5m', 'P_HU_1-5m', 'P_TD_1-5m',
       'P_TT_1-5m', 'P_UUC_10m', 'P_VVC_10m', 'P_FB_SFC', 'stage_m', 'q_m3_s'],
      dtype='object')
         date    basin  A_PRO_SFC  P_HR_1-5m  P_HU_1-5m  P_TD_1-5m  P_TT_1-5m  \
0  01/01/2011  01DR001   0.000486   0.907476   0.003406  -1.510888  -0.167569   
1  02/01/2011  01DR001   0.000244   0.920298   0.003576  -0.932975   0.283169   
2  03/01/2011  01DR001   0.019123   0.886419   0.003053  -4.020798  -2.385277   
3  04/01/2011  01DR001   0.000175   0.754222   0.001673 -10.928314  -7.213029   
4  05/01/2011  01DR001   0.000000   0.722910   0.001447 -12.929564  -8.620917   

   P_UUC_10m  P_VVC_10m   P_FB_SFC  stage_m  q_m3_s  
0   5.940687  -4.905020  41.276009     6.53   1.192  
1   2.051195  -2.407153  78.966560     5.65   1.151  
2   5.345839   1.127073  13.419610     6.65   1.189  
3   5.219585   5.020

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_stage.rename(columns={'Value': 'stage_m'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_stage.drop(columns=['PARAM'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q_filt_q.rename(columns={'Value': 'q_m3_s'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v