In [18]:
import h5py

# Open the HDF5 file
with h5py.File('deven_wind.h5', 'r') as f:
    # Print all top-level groups/datasets
    def printname(name):
        print(name)
    f.visit(printname)


data
data/18116
data/18116/windspeed-aws_instant_30minute
data/18116/windspeed-aws_instant_30minute/_i_table
data/18116/windspeed-aws_instant_30minute/_i_table/index
data/18116/windspeed-aws_instant_30minute/_i_table/index/abounds
data/18116/windspeed-aws_instant_30minute/_i_table/index/bounds
data/18116/windspeed-aws_instant_30minute/_i_table/index/indices
data/18116/windspeed-aws_instant_30minute/_i_table/index/indicesLR
data/18116/windspeed-aws_instant_30minute/_i_table/index/mbounds
data/18116/windspeed-aws_instant_30minute/_i_table/index/mranges
data/18116/windspeed-aws_instant_30minute/_i_table/index/ranges
data/18116/windspeed-aws_instant_30minute/_i_table/index/sorted
data/18116/windspeed-aws_instant_30minute/_i_table/index/sortedLR
data/18116/windspeed-aws_instant_30minute/_i_table/index/zbounds
data/18116/windspeed-aws_instant_30minute/table
data/18116/windspeed-synoptic_instant_3hour
data/18116/windspeed-synoptic_instant_3hour/_i_table
data/18116/windspeed-synoptic_instant_3

In [3]:
import h5py
import pandas as pd

file_path = 'deven_wind.h5'

with h5py.File(file_path, 'r') as f:
    # Navigate to the core datasets
    airtemp_30min = f['data/42112/airtemp-aws_instant_30minute/table'][:]
    airtemp_3hour = f['data/42112/airtemp-thp_instant_3hour/table'][:]
    rh_30min = f['data/42112/relativehumidity-aws_instant_30minute/table'][:]
    rh_3hour = f['data/42112/relativehumidity-thp_instant_3hour/table'][:]

    # Print shape and dtype to understand structure
    print("Air Temp 30min:", airtemp_30min.shape, airtemp_30min.dtype)
    print("Air Temp 3hour:", airtemp_3hour.shape, airtemp_3hour.dtype)
    print("RH 30min:", rh_30min.shape, rh_30min.dtype)
    print("RH 3hour:", rh_3hour.shape, rh_3hour.dtype)


Air Temp 30min: (6815,) [('index', '<i8'), ('values_block_0', '<f8', (1,)), ('values_block_1', 'S3', (1,))]
Air Temp 3hour: (1124,) [('index', '<i8'), ('values_block_0', '<f8', (1,)), ('values_block_1', 'S1', (1,))]
RH 30min: (6815,) [('index', '<i8'), ('values_block_0', '<f8', (1,)), ('values_block_1', 'S3', (1,))]
RH 3hour: (1124,) [('index', '<i8'), ('values_block_0', '<f8', (1,)), ('values_block_1', 'S1', (1,))]


In [19]:
import h5py
import pandas as pd

file_path = "deven_wind.h5"

with h5py.File(file_path, 'r') as f:
    data = f['data/40922/windspeed-aws_instant_30minute/table'][:]

# Convert to DataFrame
df_airtemp_3hour = pd.DataFrame({
    'timestamp_ns': data['index'],
    'windspeed': data['values_block_0'].flatten(),
    'flag': [x.decode('utf-8') for x in data['values_block_1'].flatten()]
})

# Convert nanosecond timestamps to datetime
df_airtemp_3hour['time'] = pd.to_datetime(df_airtemp_3hour['timestamp_ns'], unit='ns')

# View first few rows
df_airtemp_3hour.head()


Unnamed: 0,timestamp_ns,windspeed,flag,time
0,1672531200000000000,4.611111,N,2023-01-01 00:00:00
1,1672533000000000000,4.111111,N,2023-01-01 00:30:00
2,1672534800000000000,4.111111,N,2023-01-01 01:00:00
3,1672536600000000000,3.611111,N,2023-01-01 01:30:00
4,1672538400000000000,4.611111,N,2023-01-01 02:00:00


In [21]:
import h5py
import pandas as pd

# Step 1: Load the HDF5 file
file_path = "deven_wind.h5"  # Update this path if needed

with h5py.File(file_path, 'r') as f:
    data = f['data/40922/windspeed-aws_instant_30minute/table'][:]

# Step 2: Convert to DataFrame
df_airtemp_3hour = pd.DataFrame({
    'timestamp_ns': data['index'],
    'windspeed': data['values_block_0'].flatten(),
    'flag': [x.decode('utf-8') for x in data['values_block_1'].flatten()]
})

# Step 3: Convert nanosecond timestamps to datetime and extract date
df_airtemp_3hour['time'] = pd.to_datetime(df_airtemp_3hour['timestamp_ns'], unit='ns')
df_airtemp_3hour['date'] = df_airtemp_3hour['time'].dt.date

# Step 4: Aggregate by date to get max/min temperatures
daily_summary = df_airtemp_3hour.groupby('date').agg(
    max_temp=('windspeed', 'max'),
    min_temp=('windspeed', 'min')
).reset_index()

# Step 5: Export to Excel
output_file = "windspeed_3hour_daily_summary.xlsx"
daily_summary.to_excel(output_file, sheet_name="QLD1", index=False)

print(f"Saved daily summary to {output_file}")


Saved daily summary to windspeed_3hour_daily_summary.xlsx


In [22]:
print("Earliest date:", df_airtemp_3hour['time'].min())
print("Latest date:", df_airtemp_3hour['time'].max())


Earliest date: 2023-01-01 00:00:00
Latest date: 2023-05-22 23:00:00


In [13]:
import h5py
import numpy as np
from datetime import datetime
import pandas as pd

def find_station_with_longest_airtemp_record(filename):
    """
    Scans through an H5 file to find the BOM station with the longest
    historical record for airtemp-thp_instant_3hour data.
    
    Args:
        filename (str): Path to the H5 file
        
    Returns:
        tuple: (station_id, start_date, end_date, duration_in_days, df)
    """
    longest_duration = 0
    longest_station_id = None
    longest_start_date = None
    longest_end_date = None
    longest_df = None
    station_data = {}
    
    with h5py.File(filename, 'r') as h5file:
        # Get all station IDs (directories under 'data')
        station_ids = []
        if 'data' in h5file:
            station_ids = list(h5file['data'].keys())
        else:
            # Search for station IDs at the root level
            station_ids = [key for key in h5file.keys() if isinstance(h5file[key], h5py.Group)]
        
        print(f"Found {len(station_ids)} stations in the H5 file")
        
        # Check each station for airtemp-thp_instant_3hour data
        for station_id in station_ids:
            try:
                # Path to the table with temperature data
                path = f"data/{station_id}/airtemp-thp_instant_3hour/table"
                
                # Alternative path if the structure is different
                if path not in h5file:
                    path = f"{station_id}/airtemp-thp_instant_3hour/table"
                    if path not in h5file:
                        continue
                
                # Get the dataset
                dataset = h5file[path]
                
                # Get column names
                column_names = dataset.dtype.names
                print(f"Station {station_id} columns: {column_names}")
                
                # Look for time-related columns based on the sample data
                # First try specific columns we know about
                date_col = None
                timestamp_col = None
                temp_col = None
                flag_col = None
                
                # Check for the timestamp_ns column (from sample data)
                if 'timestamp_ns' in column_names:
                    timestamp_col = 'timestamp_ns'
                else:
                    # Look for any timestamp column
                    timestamp_candidates = [col for col in column_names if 'timestamp' in col.lower()]
                    if timestamp_candidates:
                        timestamp_col = timestamp_candidates[0]
                
                # Check for the formatted time column
                time_candidates = [col for col in column_names if 'time' in col.lower() and not 'timestamp' in col.lower()]
                if time_candidates:
                    date_col = time_candidates[0]
                
                # Look for temperature column
                temp_candidates = [col for col in column_names if 'temp' in col.lower() or 'air' in col.lower()]
                if temp_candidates:
                    temp_col = temp_candidates[0]
                
                # Look for flag column
                flag_candidates = [col for col in column_names if 'flag' in col.lower()]
                if flag_candidates:
                    flag_col = flag_candidates[0]
                
                # If we don't have a date or timestamp column, we can't process this station
                if not (date_col or timestamp_col):
                    print(f"Station {station_id}: No time column found")
                    continue
                
                # Create a dataframe from the dataset
                df = pd.DataFrame()
                
                # Extract data based on available columns
                if timestamp_col:
                    timestamp_data = dataset[timestamp_col][:]
                    # Convert nanosecond timestamps to datetime
                    df['timestamp'] = pd.to_datetime(timestamp_data, unit='ns')
                
                if date_col:
                    date_data = dataset[date_col][:]
                    # Convert to strings if they're not already
                    if date_data.dtype.kind == 'S':  # bytes strings
                        date_strings = [d.decode('utf-8') for d in date_data]
                    else:
                        date_strings = date_data
                    
                    # Try to parse the dates
                    try:
                        df['formatted_time'] = pd.to_datetime(date_strings)
                    except:
                        print(f"Station {station_id}: Could not parse date strings")
                
                # Use either parsed datetime or timestamp for date range calculation
                if 'formatted_time' in df.columns:
                    dates = df['formatted_time']
                elif 'timestamp' in df.columns:
                    dates = df['timestamp']
                else:
                    print(f"Station {station_id}: No usable date column")
                    continue
                
                # Extract temperature and flag data if available
                if temp_col:
                    df['temperature'] = dataset[temp_col][:]
                
                if flag_col:
                    df['flag'] = dataset[flag_col][:]
                
                # Get start and end dates
                start_date = min(dates)
                end_date = max(dates)
                
                # Calculate the duration in days
                duration = (end_date - start_date).total_seconds() / (24 * 3600)
                
                # Store the result for this station
                station_data[station_id] = {
                    'start_date': start_date,
                    'end_date': end_date,
                    'duration_days': duration,
                    'record_count': len(dates)
                }
                
                # Update if this is the longest record so far
                if duration > longest_duration:
                    longest_duration = duration
                    longest_station_id = station_id
                    longest_start_date = start_date
                    longest_end_date = end_date
                    longest_df = df.copy()
                
                print(f"Station {station_id}: {start_date} to {end_date} ({duration:.2f} days, {len(dates)} records)")
                
            except Exception as e:
                print(f"Error processing station {station_id}: {str(e)}")
    
    # Create a summary dataframe
    if station_data:
        df = pd.DataFrame.from_dict(
            {k: {
                'start_date': v['start_date'],
                'end_date': v['end_date'],
                'duration_days': v['duration_days'],
                'record_count': v['record_count']
            } for k, v in station_data.items()
        }, orient='index')
        
        # Sort by duration (longest first)
        df = df.sort_values('duration_days', ascending=False)
        print("\nAll stations with airtemp-thp_instant_3hour data, sorted by duration:")
        print(df)
    
    return longest_station_id, longest_start_date, longest_end_date, longest_duration, longest_df

def extract_station_temperature_data(filename, station_id):
    """
    Extracts the temperature data for a specific station from the H5 file
    and saves it to a CSV file.
    
    Args:
        filename (str): Path to the H5 file
        station_id (str): ID of the station to extract data for
        
    Returns:
        DataFrame: The extracted temperature data
    """
    with h5py.File(filename, 'r') as h5file:
        # Path to the table with temperature data
        path = f"data/{station_id}/airtemp-thp_instant_3hour/table"
        
        # Alternative path if the structure is different
        if path not in h5file:
            path = f"{station_id}/airtemp-thp_instant_3hour/table"
            if path not in h5file:
                print(f"Error: airtemp-thp_instant_3hour data not found for station {station_id}")
                return None
        
        # Get the dataset
        dataset = h5file[path]
        column_names = dataset.dtype.names
        print(f"Available columns for station {station_id}: {column_names}")
        
        # Create a dataframe to hold all data
        df = pd.DataFrame()
        
        # Copy all columns to the dataframe
        for col in column_names:
            df[col] = dataset[col][:]
            
        # Look for timestamp columns
        timestamp_col = next((col for col in column_names if 'timestamp' in col.lower()), None)
        if timestamp_col:
            df['datetime'] = pd.to_datetime(df[timestamp_col], unit='ns')
        
        # Look for formatted time columns
        time_col = next((col for col in column_names if 'time' in col.lower() and 'timestamp' not in col.lower()), None)
        if time_col:
            if df[time_col].dtype.kind == 'S':  # If it's a bytes string
                df['formatted_time'] = [t.decode('utf-8') if isinstance(t, bytes) else t for t in df[time_col]]
            else:
                df['formatted_time'] = df[time_col]
        
        # Look for temperature column
        temp_col = next((col for col in column_names if 'temp' in col.lower() or 'air' in col.lower()), None)
        if temp_col:
            df['temperature'] = df[temp_col]
        
        # Save to CSV
        csv_filename = f"station_{station_id}_airtemp_thp_instant_3hour.csv"
        df.to_csv(csv_filename, index=False)
        print(f"Saved temperature data for station {station_id} to {csv_filename}")
        
        return df

if __name__ == "__main__":
    h5_file = "deven_solar.h5"  # Change this to your file path
    
    station_id, start, end, duration, df = find_station_with_longest_airtemp_record(h5_file)
    
    if station_id:
        print("\n===== Result =====")
        print(f"Station with longest airtemp-thp_instant_3hour record: {station_id}")
        print(f"Date range: {start} to {end}")
        print(f"Duration: {duration:.2f} days ({duration/365.25:.2f} years)")
        
        # Extract and save the data for this station
        print("\nExtracting data for the station with the longest record...")
        extract_station_temperature_data(h5_file, station_id)
    else:
        print("No stations with airtemp-thp_instant_3hour data found")

Found 37 stations in the H5 file
Station 23885 columns: ('index', 'values_block_0', 'values_block_1')
Station 23885: No time column found
Station 24511 columns: ('index', 'values_block_0', 'values_block_1')
Station 24511: No time column found
Station 24584 columns: ('index', 'values_block_0', 'values_block_1')
Station 24584: No time column found
Station 30022 columns: ('index', 'values_block_0', 'values_block_1')
Station 30022: No time column found
Station 32195 columns: ('index', 'values_block_0', 'values_block_1')
Station 32195: No time column found
Station 33002 columns: ('index', 'values_block_0', 'values_block_1')
Station 33002: No time column found
Station 33327 columns: ('index', 'values_block_0', 'values_block_1')
Station 33327: No time column found
Station 34035 columns: ('index', 'values_block_0', 'values_block_1')
Station 34035: No time column found
Station 35134 columns: ('index', 'values_block_0', 'values_block_1')
Station 35134: No time column found
Station 36031 columns: