In [1]:
import pandas as pd 
import xarray as xr
import numpy as np 
import matplotlib.pyplot as plt 
from glob import glob
from datetime import datetime, timezone
from zoneinfo import ZoneInfo # Python 3.9
import pytz
from multiprocess import Pool
from tqdm import tqdm
from geopy.geocoders import Nominatim

In [2]:
import os 
## Work from the author's own project directory so that relative paths used
## later in the notebook (e.g. "./data/BoM_daily_stations_state.csv") resolve.
## NOTE(review): this absolute path is machine-specific; change it before
## running the notebook anywhere else.
os.chdir("/g/data/k10/dl6968/Semi-variogram_AU/")

In [3]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so problems that will become errors in future versions go unseen.
warnings.filterwarnings('ignore')

In [4]:
def lat_lon_tz(lat,lon):
    """Return the time zone at a latitude/longitude as a ``ZoneInfo`` object.

    Parameters
    ----------
    lat, lon :
        Coordinates passed straight to ``tzwhere``'s ``tzNameAt``
        (presumably decimal degrees -- TODO confirm).

    Returns
    -------
    ZoneInfo
        Zone built from the name that ``tzNameAt`` returns.

    NOTE(review): ``tzwhere`` is not imported anywhere in the visible part of
    this notebook, so calling this function as written raises ``NameError``.
    It presumably needs ``from tzwhere import tzwhere`` -- confirm, or remove
    the function if it is no longer used.
    """
    # A fresh tzwhere instance is constructed on every call.
    tz = tzwhere.tzwhere()
    # forceTZ=True presumably makes tzwhere fall back to the closest zone when
    # the point lies outside every zone polygon -- verify against tzwhere docs.
    timezone_str = tz.tzNameAt(lat, lon,forceTZ=True) 
    LocalZone = ZoneInfo(timezone_str)
    return LocalZone

def local_to_utc(local_time, local_tz, dt_format = "%Y%m%d %H:%M"):
    """Convert local wall-clock timestamp strings to timezone-aware UTC datetimes.

    Parameters
    ----------
    local_time : iterable of str
        Timestamps written in ``dt_format``.
    local_tz : tzinfo
        Time zone the strings are expressed in. It is attached with
        ``datetime.replace``, so it must be a tzinfo for which that is valid
        (e.g. ``ZoneInfo`` or a fixed-offset ``timezone``).
    dt_format : str
        ``strptime`` pattern used to parse every entry.

    Returns
    -------
    list of datetime
        One UTC-aware ``datetime`` per input string, in input order.
    """
    utc_stamps = []
    for stamp in local_time:
        # Parse as naive, then label it with the local zone (no clock shift yet).
        localized = datetime.strptime(stamp, dt_format).replace(tzinfo=local_tz)
        # Now shift the wall clock to UTC.
        utc_stamps.append(localized.astimezone(timezone.utc))
    return utc_stamps

def find_aus_tz(state):
    """Return the ``ZoneInfo`` time zone for an Australian state or territory.

    Parameters
    ----------
    state : str
        State/territory abbreviation (``"WA"``, ``"NSW"``, ...) or full name
        (``"Western Australia"``, ``"New South Wales"``, ...). Matching ignores
        whitespace and letter case, so ``" nsw "`` and ``"NewSouthWales"`` are
        accepted as well.

    Returns
    -------
    ZoneInfo
        Time zone of the state's capital; ACT shares the NSW zone.

    Raises
    ------
    ValueError
        If ``state`` is not a recognised state or territory.

    Notes
    -----
    * Canonical IANA zone names are used instead of the legacy aliases
      (``Australia/West``, ``Australia/NSW``, ...). The aliases are links to
      these zones, so the rules are identical, but some tzdata installations
      no longer ship the legacy names.
    * NOTE(review): ``"ANT"`` maps to ``Antarctica/McMurdo`` (New Zealand
      time). Confirm this is the intended zone for Antarctic stations.
    """
    zone_by_state = {
        "WA": "Australia/Perth",
        "Western Australia": "Australia/Perth",
        "SA": "Australia/Adelaide",
        "South Australia": "Australia/Adelaide",
        "VIC": "Australia/Melbourne",
        "Victoria": "Australia/Melbourne",
        "TAS": "Australia/Hobart",
        "Tasmania": "Australia/Hobart",
        "ANT": "Antarctica/McMurdo",
        "Antarctica": "Antarctica/McMurdo",
        "NSW": "Australia/Sydney",
        "New South Wales": "Australia/Sydney",
        "ACT": "Australia/Sydney",
        "Australian Capital Territory": "Australia/Sydney",
        "QLD": "Australia/Brisbane",
        "Queensland": "Australia/Brisbane",
        "NT": "Australia/Darwin",
        "Northern Territory": "Australia/Darwin",
    }

    def _normalise(name):
        # Drop all whitespace and case so that "New South Wales",
        # "NewSouthWales" and " nsw " all compare equal to their table key.
        return "".join(name.split()).casefold()

    lookup = {_normalise(name): zone for name, zone in zone_by_state.items()}
    tz_string = lookup.get(_normalise(state))
    if tz_string is None:
        # Fail with a clear message instead of falling through to an
        # unbound-variable error.
        raise ValueError(f"wrong state: {state!r}")
    return ZoneInfo(tz_string)

In [5]:
def lat_lon_state(latitude, longitude):
    """Reverse-geocode a coordinate to an Australian state/territory abbreviation.

    The lookup is a network call to Nominatim. The ``state`` field of the
    returned address is used, falling back to ``territory`` when ``state`` is
    empty.

    Returns
    -------
    str
        The abbreviation (``"NSW"``, ``"VIC"``, ...), an empty string when the
        geocoded name is not in the abbreviation table, or the string ``"NaN"``
        when Nominatim returns no location.
    """
    abbreviations = {
        'New South Wales': 'NSW',
        'Victoria': 'VIC',
        'Queensland': 'QLD',
        'South Australia': 'SA',
        'Western Australia': 'WA',
        'Tasmania': 'TAS',
        'Northern Territory': 'NT',
        'Australian Capital Territory': 'ACT'
    }

    # Reverse geocoding through Nominatim with a fixed user agent.
    geocoder = Nominatim(user_agent="geoapi")
    hit = geocoder.reverse((latitude, longitude), exactly_one=True)

    # Guard clause: nothing came back for this coordinate.
    if not hit:
        print("Location not found")
        return "NaN"

    address_fields = hit.raw['address']
    region = address_fields.get('state', '')
    if not region:
        # Some results carry the name under 'territory' instead of 'state'.
        region = address_fields.get('territory', '')

    return abbreviations.get(region, '')

In [5]:
# Every per-station daily rainfall CSV that will be converted to NetCDF.
csv_dir = "/g/data/w40/dl6968/BoM_daily_stations/all/csv"
files = glob(f"{csv_dir}/*.csv")

In [6]:
# Station metadata table. txt_to_nc looks stations up by the "ID" column, reads
# "State" to choose the time zone, and copies every column into the NetCDF
# global attributes. The relative path depends on the os.chdir call above.
df_notes = pd.read_csv("./data/BoM_daily_stations_state.csv")

In [8]:

# Integer codes for the CSV "Quality" column: Y -> 0, N -> 1.
qflag_mapping = dict(Y=0, N=1)

In [9]:
def txt_to_nc(file, out_dir="/g/data/w40/dl6968/BoM_daily_stations/netcdf"):
    """Convert one BoM daily-rainfall station CSV into a per-station NetCDF file.

    Relies on module-level names defined in other cells: ``df_notes`` (station
    metadata), ``qflag_mapping``, ``find_aus_tz`` and ``local_to_utc``.

    Parameters
    ----------
    file : str
        Path to the station CSV. The station number is taken from fixed
        character positions counted from the end of the path, so the file name
        must follow the ``..._<6-digit station>_1800_Data.csv`` pattern.
    out_dir : str, optional
        Directory that receives ``<station_id>.nc``. Defaults to the location
        originally hard-coded here.

    Returns
    -------
    None
        The result is the NetCDF file written to disk. Files that cannot be
        read, or whose station is absent from ``df_notes``, are reported with
        ``print`` and skipped so that one bad file does not abort a batch run.

    Notes
    -----
    Each daily record is stamped 09:00 local time in the station's state time
    zone and converted to UTC. Stations with no state in ``df_notes`` are
    treated as QLD, which is recorded in the ``State_note`` attribute.
    """
    data_header = ["product_code", "Station_Number", "Year", "Month", "Day", "Rainfall", "Period", "Quality"]
    try:
        df = pd.read_csv(file)
    except Exception:
        # Best effort: report unreadable/empty files and carry on, without
        # swallowing KeyboardInterrupt/SystemExit the way a bare except would.
        print("check file: "+file)
        return None

    # Columns are renamed by position, not by their original header text.
    df = df.rename(columns=dict(zip(df, data_header)))
    station_id = file[-20:-14]
    out_file = f"{out_dir}/{station_id}.nc"

    # Look the station up once and reuse the row for every attribute below.
    station_meta = df_notes.loc[df_notes["ID"] == int(station_id)]
    if station_meta.empty:
        print("check file: "+file+" (station not found in df_notes)")
        return None

    df["local_time"] = (
        df["Year"].astype(str)
        + df["Month"].astype(str).str.zfill(2)
        + df["Day"].astype(str).str.zfill(2)
        + " "
        + "09"
        + ":"
        + "00"
    )

    state = str(station_meta["State"].values[0])
    state_note = ""
    if state=="nan":
        # Missing state in the metadata: fall back to QLD and record that we did.
        state_note = "No state forced to QLD"
        state = "QLD"
    local_tz = find_aus_tz(state)
    local_time = df["local_time"].tolist()
    utc_time = local_to_utc(local_time, local_tz)
    # np.datetime64 has no time zone; passing aware datetimes is deprecated in
    # NumPy, so drop tzinfo now that the values are already expressed in UTC.
    out_time = np.array(
        [np.datetime64(utc_ts.replace(tzinfo=None)) for utc_ts in utc_time],
        dtype="datetime64[us]",
    )

    # Entries that cannot be parsed as numbers become NaN.
    lapse = pd.to_numeric(df["Period"], errors="coerce").to_numpy(dtype=float)
    prcp = pd.to_numeric(df["Rainfall"], errors="coerce").to_numpy(dtype=float)
    # Quality values other than Y/N map to NaN.
    quality = df["Quality"].map(qflag_mapping).values

    nc_output = xr.Dataset()
    for key in df_notes.keys():
        nc_output.attrs[key] = station_meta[key].values[0]
    # These override the values copied from df_notes above.
    nc_output.attrs["State"] = state
    nc_output.attrs["State_note"] = state_note
    nc_output.attrs["Station_Number"] = station_id
    nc_output.attrs['author'] = 'Dongqi Lin (dongqi.lin@monash.edu)'

    nc_output['time'] = xr.DataArray(out_time, dims=['time'])
    nc_output['time'].encoding['units'] = "seconds since 1970-01-01 00:00:00"
    nc_output['time'].encoding['calendar'] = "proleptic_gregorian"
    nc_output['time'].attrs["notes"] = "UTC time"
    nc_output['prcp'] = xr.DataArray(prcp, dims=['time'],
             attrs={'description':'Precipitation since last AWS observation','units':"mm"})
    nc_output['local_time'] = xr.DataArray(local_time, dims=['time'],
             attrs={'description':'Local time','units':""})

    # The description states the period is measured in days, so the units
    # attribute must say days as well (it previously said "minutes").
    nc_output['Time_lapse'] = xr.DataArray(lapse, dims=['time'],
             attrs={'description':'Period over which precipitation since last (AWS) observation is measured in days',
                    'units':"days"})

    nc_output['quality_flag'] = xr.DataArray(quality, dims=['time'],
             attrs={'description':'Quality flag: Y-0, N-1',
                    'Y': 'quality controlled',
                    'N': 'could be wrong if additionall quality control is done in future',
                    'units':""})
    nc_output.to_netcdf(out_file)

In [10]:
# Convert all station files in parallel.
# max_pool is the number of worker processes (CPUs) to use.
max_pool = 20

with Pool(processes=max_pool) as p:
    # imap yields results in input order, so the progress bar advances as
    # each file finishes converting.
    progress = tqdm(p.imap(txt_to_nc, files), total=len(files), position=0, leave=True)
    pool_outputs = list(progress)
# Wait for the worker processes to exit after the with-block has shut the pool down.
p.join()

  2%|▏         | 350/17830 [00:10<05:59, 48.64it/s] 

check file: /g/data/w40/dl6968/BoM_daily_stations/all/csv/IDCJAC0009_035285_1800_Data.csv


 92%|█████████▏| 16449/17830 [05:45<00:31, 43.21it/s]

check file: /g/data/w40/dl6968/BoM_daily_stations/all/csv/IDCJAC0009_040780_1800_Data.csv


100%|██████████| 17830/17830 [06:14<00:00, 47.56it/s]
