This code is heavily based on code provided to us by Dr. Harding, Alex Toohey, Tommy Duong, and Sabrina Nazarzai for a research project using the pyaurorax package. The part that downloads the data is essentially the same as theirs, with a few changes to variable names and control flow. The multi-day download code is also based on their code, with changes to the logic and to the parameters to allow more control over what is downloaded. The code that saves the frames as image files (PNG) is original.

In [None]:
import h5py
import numpy as np
from datetime import datetime, timedelta
import pyaurorax as auro
import os
import shutil
import matplotlib.pyplot as plt

In [None]:
def _first_frame_per_minute(images, times):
    """Return the first frame of every run of timestamps sharing an (hour, minute).

    ``images`` is indexed in lockstep with ``times``; only consecutive
    timestamps are compared, so the result keeps the input order.
    """
    picked = []
    last_minute = None
    for i, t in enumerate(times):
        minute_key = (t.hour, t.minute)
        if minute_key != last_minute:
            picked.append(images[i])
            last_minute = minute_key
    return picked


def _remove_download_root(filenames):
    """Best-effort removal of the 'pyaurorax_data' directory containing the downloads.

    The directory is located by looking for a 'pyaurorax_data' component in
    the path of the first downloaded file. Nothing happens if that component
    is absent; a failed removal only prints a warning.
    """
    path_parts = str(filenames[0]).split(os.sep)
    if 'pyaurorax_data' not in path_parts:
        return
    idx = path_parts.index('pyaurorax_data')
    root_dir = os.sep.join(path_parts[:idx+1])
    try:
        shutil.rmtree(root_dir) #removes the whole directory tree
    except OSError as e:
        # Cleanup is a convenience; never let it discard the frames already read.
        print(f"Warning: Could not remove directory {root_dir}: {e}")
    else:
        print(f"Removed download directory: {root_dir}")


def download_data(start_time,end_time,site_id,cleanup=True):
    """Download TREX RGB raw data for one site and keep one frame per minute.

    Args:
        start_time: Start of the download window, passed as ``start`` to
            ``aurorax.data.ucalgary.download``.
        end_time: End of the download window, passed as ``end``.
        site_id: Site identifier, passed as ``site_uid``.
        cleanup: When truthy (and at least one file was downloaded), the
            'pyaurorax_data' directory containing the downloaded files is
            deleted once the frames have been read.

    Returns:
        ``np.array`` of the sampled frames. Each frame is one slice along the
        first axis of the transposed image stack (per the original author's
        note the stack is [N, H, W, C] after the transpose). If nothing was
        sampled the result is an empty array of shape ``(0,)``.
    """
    aurorax = auro.PyAuroraX() #creating class instance
    dataset_name = "TREX_RGB_RAW_NOMINAL" #name of dataset that will be used for extracting data
    download = aurorax.data.ucalgary.download(dataset_name=dataset_name,start=start_time,end=end_time,site_uid=site_id) #downloads the data, takes a while

    frames = [] #number of frames is not known up front, so collect in a list

    for filename in download.filenames: #loops over each downloaded file
        with h5py.File(filename,'r') as f:
            #move the last axis to the front so that indexing the first axis yields one frame
            images = f['data/images'][:].transpose(3,0,1,2)
            timestamp_data = f['data/timestamp'][:] #one timestamp per frame
            #each timestamp is bytes: decode, strip the ' UTC' suffix, parse,
            #then make it timezone-naive and drop the fractional seconds
            times = [datetime.fromisoformat(t.decode('utf-8').replace(' UTC','')).replace(tzinfo=None,microsecond=0) for t in timestamp_data]

            #sample so that we don't keep many near-identical frames; the
            #minute tracking restarts for every file
            frames.extend(_first_frame_per_minute(images, times))

    #delete the download directory once the frames have been sampled
    if cleanup and download.filenames:
        _remove_download_root(download.filenames)
    return np.array(frames)

In [None]:
"""start = datetime(2024, 1, 15, 0, 0, 0)
end = datetime(2024, 1, 15, 23, 59, 59)
site_uid = "gill" 
frames, timestamps, locations = download_data(start, end, site_uid)

print(frames.shape)"""
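# Disabled manual test: the snippet above is wrapped in a string so it never runs.
# NOTE(review): download_data returns a single array, so the three-name unpacking
# in the snippet would need updating before it is re-enabled.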

In [None]:
def run_download(start_date, end_date, site, start_hour = 0, end_hour=4):
    """Download sampled frames for every day in a date range and every requested site.

    For each calendar day from ``start_date`` to ``end_date`` (inclusive) and
    for each site, ``download_data`` is called for the window
    ``start_hour:00:00`` to ``end_hour:59:59`` of that day. The default hours
    are preset to the time when auroras are usually expected.

    Args:
        start_date: ``datetime`` of the first day to download.
        end_date: ``datetime`` of the last day to download (inclusive).
        site: A single site id string or an iterable of site id strings.
        start_hour: Hour (int) at which each daily window starts.
        end_hour: Hour (int) in which each daily window ends.

    Returns:
        A tuple ``(all_frames, all_sites)``: ``all_frames`` is a list with one
        frames array per (day, site) download, in download order;
        ``all_sites`` is a NumPy array holding one site id per frame, in the
        same order as the frames.
    """
    # isinstance also catches str subclasses, which would otherwise be
    # iterated character by character below.
    if isinstance(site, str):
        site = [site]

    all_frames = [] #total size is unknown up front, so collect in lists
    all_sites = []

    # Compare calendar dates only, so a time-of-day on start_date/end_date
    # cannot cause the final day to be skipped.
    last_day = end_date.date()
    current_date = start_date
    while current_date.date() <= last_day: #loop over days
        #keep year, month, day and replace only the time of day
        day_start = current_date.replace(hour=start_hour, minute=0, second=0, microsecond=0)
        day_end = current_date.replace(hour=end_hour, minute=59, second=59)

        for loc in site: #loops over multiple sites
            frames = download_data(day_start,day_end,site_id=loc,cleanup=True)
            all_frames.append(frames)
            all_sites.extend([loc]*frames.shape[0]) #one site label per frame

        current_date += timedelta(days=1)

    return all_frames, np.array(all_sites)

In [None]:
def save_frames(frames, sites, dir_name="observation_images"):
    """Write each frame to ``dir_name`` as ``<site>_frame_<index>.png``.

    Args:
        frames: Array of frames; ``frames.shape[0]`` gives the number of
            images and ``frames[i]`` is the i-th image.
        sites: Sequence indexed like ``frames``, giving the site label used
            in each file name.
        dir_name: Output directory; created if it does not already exist.
    """
    os.makedirs(dir_name, exist_ok=True)  # no error if the directory is already there
    frame_count = frames.shape[0]
    for idx in range(frame_count):
        # zero-padded index keeps the files in order when sorted by name
        file_name = f"{sites[idx]}_frame_{idx:07d}.png"
        plt.imsave(os.path.join(dir_name, file_name), frames[idx])

In [None]:
# Test cell.
# datetime arguments follow the pattern (year, month, day, hour, minute, second);
# the time fields may be left out.
# Baseline event: the May 10-13, 2024 geomagnetic storm.
start_date = datetime(2024, 5, 10)
end_date = datetime(2024, 5, 11)
start_hour = 0
end_hour = 4
# Alternative site lists:
# site = ["gill","gill","atha"]
# site = ["fsmi","luck","pina","rabb","yknf","gill","atha"]
site = ["luck", "pina"]

all_frames, all_loc = run_download(
    start_date=start_date,
    end_date=end_date,
    site=site,
    start_hour=start_hour,
    end_hour=end_hour,
)

In [None]:
# Quick look at the first two download batches: their frame counts, the total
# number of site labels, and the batch array shapes.
(
    len(all_frames[0]),
    len(all_frames[1]),
    len(all_loc),
    all_frames[0].shape,
    all_frames[1].shape,
)

In [None]:
# Flatten the per-download batches into one array with a single frame per entry.
list_of_frames = np.array(
    [frame for batch in all_frames for frame in batch]
)

In [None]:
# Pass the flattened array, not the all_frames list of batches.
save_frames(frames=list_of_frames, sites=all_loc)

In [None]:
# Print the available datasets, then the uid of each 'trex_rgb' observatory.
test = auro.PyAuroraX()
print(test.data.ucalgary.list_datasets())
obs = test.data.ucalgary.list_observatories('trex_rgb')
for observatory in obs:
    print(observatory.uid)