In [1]:
# Data acquisition 

# Find all files in each folder that have .csv extension and use pandas concat
import os
import pandas as pd

data_main_folders = ['2021', '2022', '2023', '2024', '2025']
months = [f'{i:02d}' for i in range(1, 13)]
# Sub folder example: 2021/202101, 2021/202102, ..., 2021/202112
# Fixed the path construction to match actual folder structure
sub_folders = [f'{folder}/{folder}{month}' for folder in data_main_folders for month in months]

print("Looking for files in these paths:")
for folder in sub_folders[:5]:  # Show first 5 paths as example
    print(f"  {folder}/CPY015.csv")
print("  ...")

# Read and concat all csv files with name "CPY015.csv" in each sub folder using pandas
master_out = 'master_CPY015.csv'
files_found = 0
dataframes = []

for sub_folder in sub_folders:
    file_path = f'{sub_folder}/CPY015.csv'
    if os.path.exists(file_path):
        files_found += 1
        print(f"Processing: {file_path}")
        try:
            # Read CSV file into DataFrame
            df = pd.read_csv(file_path)
            dataframes.append(df)
            print(f"  Added {len(df)} rows")
        except Exception as e:
            print(f"  Error reading file: {e}")
    else:
        print(f"File not found: {file_path}")

# Concatenate all DataFrames
if dataframes:
    master_df = pd.concat(dataframes, ignore_index=True)
    
    # Save to CSV
    master_df.to_csv(master_out, index=False)
    
    print(f"\nSummary:")
    print(f"Files found and processed: {files_found}")
    print(f"Total data rows in master file: {len(master_df)}")
    print(f"Total columns: {len(master_df.columns)}")
    print(f"Output file: {master_out}")
    print(f"Data shape: {master_df.shape}")
    
    # Display first few rows and basic info
    print(f"\nFirst 5 rows:")
    print(master_df.head())
    print(f"\nColumn names:")
    print(master_df.columns.tolist())
else:
    print("No data files found to concatenate!")

Looking for files in these paths:
  2021/202101/CPY015.csv
  2021/202102/CPY015.csv
  2021/202103/CPY015.csv
  2021/202104/CPY015.csv
  2021/202105/CPY015.csv
  ...
Processing: 2021/202101/CPY015.csv
  Added 4464 rows
Processing: 2021/202102/CPY015.csv
  Added 4032 rows
Processing: 2021/202103/CPY015.csv
  Added 4464 rows
Processing: 2021/202104/CPY015.csv
  Added 4320 rows
Processing: 2021/202105/CPY015.csv
  Added 4464 rows
Processing: 2021/202106/CPY015.csv
  Added 4320 rows
Processing: 2021/202107/CPY015.csv
  Added 4464 rows
Processing: 2021/202108/CPY015.csv
  Added 4464 rows
Processing: 2021/202109/CPY015.csv
  Added 4320 rows
Processing: 2021/202110/CPY015.csv
  Added 4464 rows
Processing: 2021/202111/CPY015.csv
  Added 4320 rows
Processing: 2021/202112/CPY015.csv
  Added 4464 rows
Processing: 2022/202201/CPY015.csv
  Added 4464 rows
Processing: 2022/202202/CPY015.csv
  Added 4032 rows
Processing: 2022/202203/CPY015.csv
  Added 4464 rows
Processing: 2022/202204/CPY015.csv
  Add

In [2]:
# Clean the data of master_CPY015.csv

# Read with low_memory=False to avoid dtype warnings
df_master = pd.read_csv("master_CPY015.csv", low_memory=False)
print("Original data info:")
df_master.info()
print(f"\nOriginal shape: {df_master.shape}")
print(f"\nColumn names: {df_master.columns.tolist()}")

# Display sample data to understand the structure
print(f"\nFirst 5 rows:")
print(df_master.head())
print(f"\nLast 5 rows:")
print(df_master.tail())

# Check for different data formats by looking at non-null patterns
print(f"\nData format analysis:")
print("Rows with old format (date, time, water_lv):", df_master[['date', 'time', 'water_lv']].dropna().shape[0])
print("Rows with new format (station_code, measure_datetime, water_level):", df_master[['station_code', 'measure_datetime', 'water_level']].dropna().shape[0])

# Check unique values in some columns
print(f"\nUnique station codes: {df_master['station_code'].dropna().unique()}")
print(f"Water_lv data type examples: {df_master['water_lv'].dropna().head()}")
print(f"Water_level data type examples: {df_master['water_level'].dropna().head()}")

# Check date formats
print(f"\nDate format examples:")
print("date column:", df_master['date'].dropna().head(3).tolist())
print("measure_datetime column:", df_master['measure_datetime'].dropna().head(3).tolist())

Original data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232128 entries, 0 to 232127
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   date              183888 non-null  object 
 1   time              183888 non-null  object 
 2   water_lv          183888 non-null  object 
 3   station_code      48240 non-null   object 
 4   measure_datetime  48240 non-null   object 
 5   water_level       48061 non-null   float64
 6   quality_flag      48061 non-null   object 
dtypes: float64(1), object(6)
memory usage: 12.4+ MB

Original shape: (232128, 7)

Column names: ['date', 'time', 'water_lv', 'station_code', 'measure_datetime', 'water_level', 'quality_flag']

First 5 rows:
         date      time water_lv station_code measure_datetime  water_level  \
0  2021-01-01  00:00:00    -0.66          NaN              NaN          NaN   
1  2021-01-01  00:10:00    -0.72          NaN              NaN          

In [3]:
df_master.isna().sum()

date                 48240
time                 48240
water_lv             48240
station_code        183888
measure_datetime    183888
water_level         184067
quality_flag        184067
dtype: int64

In [4]:
# Comprehensive data cleaning and standardization

import pandas as pd
import numpy as np
from datetime import datetime

# Create a copy for cleaning
df_clean = df_master.copy()

print("=== STEP 1: Separate and standardize different data formats ===")

# Identify old format data (has date, time, water_lv)
old_format_mask = df_clean[['date', 'time', 'water_lv']].notnull().all(axis=1)
old_format_data = df_clean[old_format_mask].copy()

# Identify new format data (has station_code, measure_datetime, water_level)
new_format_mask = df_clean[['station_code', 'measure_datetime', 'water_level']].notnull().all(axis=1)
new_format_data = df_clean[new_format_mask].copy()

print(f"Old format rows: {len(old_format_data)}")
print(f"New format rows: {len(new_format_data)}")

# Standardize old format data
if len(old_format_data) > 0:
    print("\n=== STEP 2: Processing old format data ===")
    
    # Combine date and time columns for old format
    old_format_data['datetime_combined'] = pd.to_datetime(
        old_format_data['date'] + ' ' + old_format_data['time'], 
        errors='coerce'
    )
    
    # Convert water_lv to numeric (it might be stored as string)
    old_format_data['water_level_clean'] = pd.to_numeric(old_format_data['water_lv'], errors='coerce')
    
    # Add station code (assuming all data is from CPY015)
    old_format_data['station_code_clean'] = 'CPY015'
    
    # Create standardized dataframe for old format
    old_standardized = pd.DataFrame({
        'station_code': old_format_data['station_code_clean'],
        'measure_datetime': old_format_data['datetime_combined'],
        'water_level': old_format_data['water_level_clean'],
        'quality_flag': 'old_format',  # Mark as old format
        'source_format': 'old'
    })
    
    print(f"Old format after processing: {len(old_standardized)} rows")
    print(f"Valid datetime records: {old_standardized['measure_datetime'].notnull().sum()}")
    print(f"Valid water level records: {old_standardized['water_level'].notnull().sum()}")

# Standardize new format data
if len(new_format_data) > 0:
    print("\n=== STEP 3: Processing new format data ===")
    
    # Convert measure_datetime to proper datetime
    new_format_data['datetime_clean'] = pd.to_datetime(new_format_data['measure_datetime'], errors='coerce')
    
    # Create standardized dataframe for new format
    new_standardized = pd.DataFrame({
        'station_code': new_format_data['station_code'],
        'measure_datetime': new_format_data['datetime_clean'],
        'water_level': new_format_data['water_level'],
        'quality_flag': new_format_data['quality_flag'],
        'source_format': 'new'
    })
    
    print(f"New format after processing: {len(new_standardized)} rows")
    print(f"Valid datetime records: {new_standardized['measure_datetime'].notnull().sum()}")
    print(f"Valid water level records: {new_standardized['water_level'].notnull().sum()}")

print("\n=== STEP 4: Combine and clean final dataset ===")

# Combine both formats
dataframes_to_combine = []
if len(old_format_data) > 0:
    dataframes_to_combine.append(old_standardized)
if len(new_format_data) > 0:
    dataframes_to_combine.append(new_standardized)

if dataframes_to_combine:
    df_combined = pd.concat(dataframes_to_combine, ignore_index=True)
else:
    df_combined = pd.DataFrame(columns=['station_code', 'measure_datetime', 'water_level', 'quality_flag', 'source_format'])

print(f"Combined data shape: {df_combined.shape}")

# Remove rows with missing essential data
df_final = df_combined.copy()

# Remove duplicate records (same datetime and station)
df_final = df_final.drop_duplicates(subset=['station_code', 'measure_datetime'], keep='first')
print(f"After removing duplicates: {df_final.shape}")

# Sort by datetime
df_final = df_final.sort_values('measure_datetime').reset_index(drop=True)

# Add additional time-based columns for analysis
df_final['year'] = df_final['measure_datetime'].dt.year
df_final['month'] = df_final['measure_datetime'].dt.month
df_final['day'] = df_final['measure_datetime'].dt.day
df_final['hour'] = df_final['measure_datetime'].dt.hour

print("\n=== STEP 5: Data quality summary ===")
print(f"Final clean dataset shape: {df_final.shape}")
print(f"Date range: {df_final['measure_datetime'].min()} to {df_final['measure_datetime'].max()}")
print(f"Water level range: {df_final['water_level'].min():.2f} to {df_final['water_level'].max():.2f}")
print(f"Years covered: {sorted(df_final['year'].unique())}")
print(f"Data by source format:")
print(df_final['source_format'].value_counts())

print(f"\nFirst 5 rows of cleaned data:")
print(df_final.head())

# Save cleaned data
df_final.to_csv('master_CPY015_cleaned.csv', index=False)
print(f"\nCleaned data saved to: master_CPY015_cleaned.csv")

=== STEP 1: Separate and standardize different data formats ===
Old format rows: 183888
New format rows: 48061

=== STEP 2: Processing old format data ===
Old format after processing: 183888 rows
Valid datetime records: 183888
Valid water level records: 168767

=== STEP 3: Processing new format data ===
New format after processing: 48061 rows
Valid datetime records: 48061
Valid water level records: 48061

=== STEP 4: Combine and clean final dataset ===
Combined data shape: (231949, 5)
After removing duplicates: (231949, 5)

=== STEP 5: Data quality summary ===
Final clean dataset shape: (231949, 9)
Date range: 2021-01-01 00:00:00 to 2025-05-31 23:50:00
Water level range: -1.79 to 11.54
Years covered: [np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024), np.int32(2025)]
Data by source format:
source_format
old    183888
new     48061
Name: count, dtype: int64

First 5 rows of cleaned data:
  station_code    measure_datetime  water_level quality_flag source_format  \
0       C

In [5]:
# Display cleaned data header
display(df_final.shape)
df_final.head()

(231949, 9)

Unnamed: 0,station_code,measure_datetime,water_level,quality_flag,source_format,year,month,day,hour
0,CPY015,2021-01-01 00:00:00,-0.66,old_format,old,2021,1,1,0
1,CPY015,2021-01-01 00:10:00,-0.72,old_format,old,2021,1,1,0
2,CPY015,2021-01-01 00:20:00,-0.75,old_format,old,2021,1,1,0
3,CPY015,2021-01-01 00:30:00,-0.81,old_format,old,2021,1,1,0
4,CPY015,2021-01-01 00:40:00,-0.82,old_format,old,2021,1,1,0


In [6]:
# Check for missing values
missing_values = df_final.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
station_code            0
measure_datetime        0
water_level         15121
quality_flag            0
source_format           0
year                    0
month                   0
day                     0
hour                    0
dtype: int64


In [7]:
# Display water_level missing values dataset
df[df['water_level'].isna()]

Unnamed: 0,station_code,measure_datetime,water_level,quality_flag
574,CPY015,2025-05-04 23:40:00,,
774,CPY015,2025-05-06 09:00:00,,
790,CPY015,2025-05-06 11:40:00,,
821,CPY015,2025-05-06 16:50:00,,
1157,CPY015,2025-05-09 00:50:00,,
1282,CPY015,2025-05-09 21:40:00,,
1711,CPY015,2025-05-12 21:10:00,,
1971,CPY015,2025-05-14 16:30:00,,
2852,CPY015,2025-05-20 19:20:00,,
3038,CPY015,2025-05-22 02:20:00,,


In [8]:
# Fill the null value with the forward fill method
df_final['water_level'] = df_final['water_level'].fillna(method='ffill')

  df_final['water_level'] = df_final['water_level'].fillna(method='ffill')


In [9]:
# Data quality assessment and basic analysis

print("=== DETAILED DATA QUALITY REPORT ===")

# Check for missing values
print(f"\nMissing values:")
print(df_final.isnull().sum())

# Check data distribution by year and month
print(f"\nData distribution by year:")
year_counts = df_final['year'].value_counts().sort_index()
print(year_counts)

print(f"\nData distribution by month:")
month_counts = df_final['month'].value_counts().sort_index()
print(month_counts)

# Check for outliers in water level
print(f"\nWater level statistics:")
print(df_final['water_level'].describe())

# Check time frequency (should be mostly 10-minute intervals)
df_final_sorted = df_final.sort_values('measure_datetime')
time_diffs = df_final_sorted['measure_datetime'].diff()
print(f"\nTime interval analysis:")
print(f"Most common intervals:")
print(time_diffs.value_counts().head())

# Check for data gaps
print(f"\nData completeness by year:")
for year in sorted(df_final['year'].unique()):
    year_data = df_final[df_final['year'] == year]
    print(f"{year}: {len(year_data):,} records")
    
# Basic statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total records: {len(df_final):,}")
print(f"Date range: {df_final['measure_datetime'].min()} to {df_final['measure_datetime'].max()}")
print(f"Average water level: {df_final['water_level'].mean():.2f} meters")
print(f"Water level std dev: {df_final['water_level'].std():.2f} meters")
print(f"Data spans {(df_final['measure_datetime'].max() - df_final['measure_datetime'].min()).days} days")

print(f"\n=== CLEANED DATA IS READY FOR ANALYSIS ===")
print(f"Use 'df_final' variable for further analysis")
print(f"Cleaned file saved as: 'master_CPY015_cleaned.csv'")

=== DETAILED DATA QUALITY REPORT ===

Missing values:
station_code        0
measure_datetime    0
water_level         0
quality_flag        0
source_format       0
year                0
month               0
day                 0
hour                0
dtype: int64

Data distribution by year:
year
2021    52560
2022    52560
2023    52560
2024    52608
2025    21661
Name: count, dtype: int64

Data distribution by month:
month
1     22310
2     20292
3     22306
4     21592
5     22281
6     17280
7     17843
8     17838
9     17264
10    17837
11    17263
12    17843
Name: count, dtype: int64

Water level statistics:
count    231949.000000
mean          0.310201
std           0.681064
min          -1.790000
25%          -0.170000
50%           0.440000
75%           0.810000
max          11.540000
Name: water_level, dtype: float64

Time interval analysis:
Most common intervals:
measure_datetime
0 days 00:10:00    231776
0 days 00:20:00       165
0 days 00:30:00         7
Name: count, dt

In [10]:
df_final.head()

Unnamed: 0,station_code,measure_datetime,water_level,quality_flag,source_format,year,month,day,hour
0,CPY015,2021-01-01 00:00:00,-0.66,old_format,old,2021,1,1,0
1,CPY015,2021-01-01 00:10:00,-0.72,old_format,old,2021,1,1,0
2,CPY015,2021-01-01 00:20:00,-0.75,old_format,old,2021,1,1,0
3,CPY015,2021-01-01 00:30:00,-0.81,old_format,old,2021,1,1,0
4,CPY015,2021-01-01 00:40:00,-0.82,old_format,old,2021,1,1,0


In [11]:
df = df_final.copy()
# Remove station_code column if exists
if 'station_code' in df.columns:
    df = df.drop(columns=['station_code'])

# Change index to measure_datetime
if not np.issubdtype(df['measure_datetime'].dtype, np.datetime64):
    df['measure_datetime'] = pd.to_datetime(df['measure_datetime'], errors='coerce')
df = df.set_index('measure_datetime')

In [12]:
df.head()

Unnamed: 0_level_0,water_level,quality_flag,source_format,year,month,day,hour
measure_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01 00:00:00,-0.66,old_format,old,2021,1,1,0
2021-01-01 00:10:00,-0.72,old_format,old,2021,1,1,0
2021-01-01 00:20:00,-0.75,old_format,old,2021,1,1,0
2021-01-01 00:30:00,-0.81,old_format,old,2021,1,1,0
2021-01-01 00:40:00,-0.82,old_format,old,2021,1,1,0


In [13]:
df_water = df['water_level']

In [14]:
df_water.head()

measure_datetime
2021-01-01 00:00:00   -0.66
2021-01-01 00:10:00   -0.72
2021-01-01 00:20:00   -0.75
2021-01-01 00:30:00   -0.81
2021-01-01 00:40:00   -0.82
Name: water_level, dtype: float64

In [15]:
# Save water level series to CSV
df_water.to_csv('water_level_series.csv', header=['water_level'], index_label='measure_datetime')
print("Water level series saved to 'water_level_series.csv'")

Water level series saved to 'water_level_series.csv'


In [16]:
# Resample to hourly frequency (mean)
df_hourly = df_water.resample('H').mean()
df_hourly.head()


  df_hourly = df_water.resample('H').mean()


measure_datetime
2021-01-01 00:00:00   -0.770000
2021-01-01 01:00:00   -0.953333
2021-01-01 02:00:00   -1.015000
2021-01-01 03:00:00   -0.795000
2021-01-01 04:00:00   -0.216667
Freq: h, Name: water_level, dtype: float64

In [17]:
# Request data from open-meteo.com API for weather data
import requests
import pandas as pd
import numpy as np
import pprint

# Define API Endpoint for Past Temperature (URL)
url_1 = "https://archive-api.open-meteo.com/v1/archive"

# Define query parameters
params_1 = {
    "latitude": 13.700287,
    "longitude": 100.492805,
    "hourly": ["temperature_2m", "rain", "showers", "cloud_cover", "relative_humidity_2m", "dew_point_2m", "precipitation", "weather_code", "pressure_msl", "surface_pressure", "wind_speed_10m", "wind_direction_10m", "wind_gusts_10m", "et0_fao_evapotranspiration"],
    "start_date": "2021-01-01",
    "end_date": "2025-05-31",
    "timezone": "Asia/Bangkok"
}

# Make the GET request and assign the response to "r_1"
r_1 = requests.get(url_1, params=params_1, timeout=15) 
r_1.raise_for_status()
js_1 = r_1.json() # transform to JSON response

# Make it dataframe
df_weather = pd.DataFrame(js_1['hourly'])
df_weather['time'] = pd.to_datetime(df_weather['time'])
df_weather = df_weather.set_index('time')


In [18]:
df_weather.head()

Unnamed: 0_level_0,temperature_2m,rain,showers,cloud_cover,relative_humidity_2m,dew_point_2m,precipitation,weather_code,pressure_msl,surface_pressure,wind_speed_10m,wind_direction_10m,wind_gusts_10m,et0_fao_evapotranspiration
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-01-01 00:00:00,21.2,0.0,0.0,11,55,11.9,0.0,0,1015.4,1015.3,8.9,43,20.9,0.05
2021-01-01 01:00:00,20.3,0.0,0.0,34,61,12.5,0.0,1,1015.2,1015.1,6.7,36,14.8,0.03
2021-01-01 02:00:00,19.5,0.0,0.0,48,66,13.0,0.0,1,1014.6,1014.5,5.8,30,10.8,0.02
2021-01-01 03:00:00,18.8,0.0,0.0,0,71,13.5,0.0,0,1014.1,1014.0,5.6,15,9.0,0.01
2021-01-01 04:00:00,18.2,0.0,0.0,0,76,14.0,0.0,0,1014.1,1014.0,6.5,6,7.9,0.01


In [19]:
df_weather.isna().sum()

temperature_2m                0
rain                          0
showers                       0
cloud_cover                   0
relative_humidity_2m          0
dew_point_2m                  0
precipitation                 0
weather_code                  0
pressure_msl                  0
surface_pressure              0
wind_speed_10m                0
wind_direction_10m            0
wind_gusts_10m                0
et0_fao_evapotranspiration    0
dtype: int64

In [20]:
# Check the shape of both datasets
print(f"Water level series shape: {df_hourly.shape}")
print(f"Weather data shape: {df_weather.shape}")

Water level series shape: (38688,)
Weather data shape: (38688, 14)


In [21]:
# Merge water level and weather data on datetime index
df_merged = pd.merge(df_hourly, df_weather, left_index=True, right_index=True, how='inner')
df_merged.head()

Unnamed: 0_level_0,water_level,temperature_2m,rain,showers,cloud_cover,relative_humidity_2m,dew_point_2m,precipitation,weather_code,pressure_msl,surface_pressure,wind_speed_10m,wind_direction_10m,wind_gusts_10m,et0_fao_evapotranspiration
measure_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-01 00:00:00,-0.77,21.2,0.0,0.0,11,55,11.9,0.0,0,1015.4,1015.3,8.9,43,20.9,0.05
2021-01-01 01:00:00,-0.953333,20.3,0.0,0.0,34,61,12.5,0.0,1,1015.2,1015.1,6.7,36,14.8,0.03
2021-01-01 02:00:00,-1.015,19.5,0.0,0.0,48,66,13.0,0.0,1,1014.6,1014.5,5.8,30,10.8,0.02
2021-01-01 03:00:00,-0.795,18.8,0.0,0.0,0,71,13.5,0.0,0,1014.1,1014.0,5.6,15,9.0,0.01
2021-01-01 04:00:00,-0.216667,18.2,0.0,0.0,0,76,14.0,0.0,0,1014.1,1014.0,6.5,6,7.9,0.01


In [22]:
# Request data from open-meteo.com API for flood data
# Define API Endpoint for Past Temperature (URL)
url_2 = "https://flood-api.open-meteo.com/v1/flood"

# Define query parameters
params_2 = {
	"latitude": 13.700287,
	"longitude": 100.492805,
	"daily": "river_discharge",
    "start_date": "2021-01-01",
    "end_date": "2025-05-31",
    "timezone": "Asia/Bangkok"
}

# Make the GET request and assign the response to "r_1"
r_2 = requests.get(url_2, params=params_2, timeout=15) 
r_2.raise_for_status()
js_2 = r_2.json() # transform to JSON response

# Make it dataframe
df_flood = pd.DataFrame(js_2['daily'])
df_flood['time'] = pd.to_datetime(df_flood['time'])
df_flood = df_flood.set_index('time')

# Display flood data
df_flood.head()


Unnamed: 0_level_0,river_discharge
time,Unnamed: 1_level_1
2021-01-01,686.07
2021-01-02,681.34
2021-01-03,678.2
2021-01-04,513.04
2021-01-05,266.92


In [23]:
# Resample to hourly frequency for flood data
# Ensure daily DateTimeIndex and sort
s = df_flood["river_discharge"].sort_index().asfreq("D")

# Build an hourly index that reaches the last day's 23:00
hidx = pd.date_range(s.index.min(),
                     s.index.max() + pd.Timedelta(hours=23),
                     freq="H")

# Interpolate only inside the known span; then carry the last day's value across its remaining hours
df_flood_hourly = (
    s.reindex(hidx)
     .interpolate(method="time", limit_area="inside")
     .ffill(limit=23)                                  # fills 01:00â€“23:00 of the last day only
     .to_frame(name="river_discharge")
)
print(f"Shape of flood data after resampling to hourly: {df_flood_hourly.shape}")
df_flood_hourly.head()

Shape of flood data after resampling to hourly: (38688, 1)


  hidx = pd.date_range(s.index.min(),


Unnamed: 0,river_discharge
2021-01-01 00:00:00,686.07
2021-01-01 01:00:00,685.872917
2021-01-01 02:00:00,685.675833
2021-01-01 03:00:00,685.47875
2021-01-01 04:00:00,685.281667


In [24]:
# Merge water level, weather, flood data on datetime index
df_full_merged = df_merged.merge(df_flood_hourly, left_index=True, right_index=True, how="outer", suffixes=("_weather", "_flood"))
print(f"Shape of the full merge: {df_full_merged.shape}")
df_full_merged.head()

Shape of the full merge: (38688, 16)


Unnamed: 0_level_0,water_level,temperature_2m,rain,showers,cloud_cover,relative_humidity_2m,dew_point_2m,precipitation,weather_code,pressure_msl,surface_pressure,wind_speed_10m,wind_direction_10m,wind_gusts_10m,et0_fao_evapotranspiration,river_discharge
measure_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-01-01 00:00:00,-0.77,21.2,0.0,0.0,11,55,11.9,0.0,0,1015.4,1015.3,8.9,43,20.9,0.05,686.07
2021-01-01 01:00:00,-0.953333,20.3,0.0,0.0,34,61,12.5,0.0,1,1015.2,1015.1,6.7,36,14.8,0.03,685.872917
2021-01-01 02:00:00,-1.015,19.5,0.0,0.0,48,66,13.0,0.0,1,1014.6,1014.5,5.8,30,10.8,0.02,685.675833
2021-01-01 03:00:00,-0.795,18.8,0.0,0.0,0,71,13.5,0.0,0,1014.1,1014.0,5.6,15,9.0,0.01,685.47875
2021-01-01 04:00:00,-0.216667,18.2,0.0,0.0,0,76,14.0,0.0,0,1014.1,1014.0,6.5,6,7.9,0.01,685.281667


In [25]:
df_full_merged.isna().sum()

water_level                   0
temperature_2m                0
rain                          0
showers                       0
cloud_cover                   0
relative_humidity_2m          0
dew_point_2m                  0
precipitation                 0
weather_code                  0
pressure_msl                  0
surface_pressure              0
wind_speed_10m                0
wind_direction_10m            0
wind_gusts_10m                0
et0_fao_evapotranspiration    0
river_discharge               0
dtype: int64

In [26]:
# Export full merged dataset to .csv
df_full_merged.to_csv('full_merged.csv')

In [27]:
# Display dataset that contain missing values
df_full_merged[df_full_merged.isna().any(axis=1)]

Unnamed: 0_level_0,water_level,temperature_2m,rain,showers,cloud_cover,relative_humidity_2m,dew_point_2m,precipitation,weather_code,pressure_msl,surface_pressure,wind_speed_10m,wind_direction_10m,wind_gusts_10m,et0_fao_evapotranspiration,river_discharge
measure_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [28]:

df_hourly.head()

measure_datetime
2021-01-01 00:00:00   -0.770000
2021-01-01 01:00:00   -0.953333
2021-01-01 02:00:00   -1.015000
2021-01-01 03:00:00   -0.795000
2021-01-01 04:00:00   -0.216667
Freq: h, Name: water_level, dtype: float64