This Notebook serves as a starting point for formatting and inputting data sets necessary for the ACE_RNN

In [1]:
import sys
import os
import numpy as np
from astropy.table import Table, vstack
from astropy.io import misc, ascii
from pathlib import Path
import h5py
import requests
import zipfile
import io
import urllib.request
from datetime import datetime
from tqdm.notebook import tqdm

In [2]:
#Format data around h5py library and astropy table
ARC_DIR = Path("/proj/sot/ska/data/arc3")
ACE_H5_FILE = h5py.File(ARC_DIR.joinpath("ACE.h5"))
GOES_H5_FILE = h5py.File(ARC_DIR.joinpath("GOES_X.h5"))

In [3]:
ACE_TABLE = misc.hdf5.read_table_hdf5(ACE_H5_FILE)
GOES_TABLE = misc.hdf5.read_table_hdf5(GOES_H5_FILE)

In [4]:
GOES_TABLE

year,month,dom,hhmm,mjd,secs,short,long,ratio,time,satellite
int64,int64,int64,int64,int64,int64,float64,float64,float64,float64,int64
2013,5,3,5,56415,300,7.4e-09,8.33e-07,0.00889,483926767.1839997,15
2013,5,3,10,56415,600,6.83e-09,8.35e-07,0.00817,483927067.1840001,15
2013,5,3,15,56415,900,6.98e-09,8.36e-07,0.00835,483927367.1839998,15
2013,5,3,20,56415,1200,8.16e-09,8.23e-07,0.00991,483927667.18400013,15
2013,5,3,25,56415,1500,5.59e-09,8.12e-07,0.00688,483927967.18399984,15
2013,5,3,30,56415,1800,2.91e-08,9.94e-07,0.0251,483928267.1840002,15
2013,5,3,35,56415,2100,6.87e-08,1.6e-06,0.0419,483928567.1839999,15
2013,5,3,40,56415,2400,1.45e-08,1.02e-06,0.0141,483928867.18400025,15
2013,5,3,45,56415,2700,9.55e-09,8.92e-07,0.0107,483929167.18399996,15
2013,5,3,50,56415,3000,2.77e-08,1.11e-06,0.025,483929467.1839997,15


In [2]:
#Data Directories for set fetched online
DATA_DIR = Path("/data/mta4/ACE_RNN/Data")
CELIAS_PATH = DATA_DIR.joinpath("CELIAS.h5")

In [3]:
#Web Links for fetching Data
#mtof = f"https://l1.umd.edu/data/{this_year}_CELIAS_Proton_Monitor_5min.zip"
CELIAS_LINK = f"https://l1.umd.edu/data"

In [5]:
#
#-- Function to fetch CELIAS Data
#
def pull_celias(start,stop):
    for year in tqdm(range(start,stop+1)):
        fetch_link = f"{CELIAS_LINK}/{year}_CELIAS_Proton_Monitor_5min.zip"
        r = requests.get(fetch_link, stream=True)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(DATA_DIR.joinpath("CELIAS_txt"))

Pulling only since 2013 even though 1996 is available, in order to match with current ACE and GOES timeframes. Can be adjusted. Note that this data fetch was already run and is here for reference

In [6]:
#pull_celias(1996,2024)

In [7]:
CELIAS_UNITS = [None, None, None, None, 'km/s', 'cm-3', 'km/s', 'deg',
                'km/s', 'Re', 'Re', 'Re', 'Mkm', 'deg','deg','#']

CELIAS_COLNAMES = ['YY',
 'MON',
 'DY',
 'DOY:HH:MM:SS',
 'SPEED',
 'Np',
 'Vth',
 'N/S',
 'V_He',
 'GSE_X',
 'GSE_Y',
 'GSE_Z',
 'RANGE',
 'HGLAT',
 'HGLONG',
 'CRN(E)']

CELIAS_DTYPE = [np.dtype('int64'),
 np.dtype('<U3'),
 np.dtype('int64'),
 np.dtype('<U12'),
 np.dtype('int64'),
 np.dtype('float64'),
 np.dtype('int64'),
 np.dtype('float64'),
 np.dtype('int64'),
 np.dtype('float64'),
 np.dtype('float64'),
 np.dtype('float64'),
 np.dtype('float64'),
 np.dtype('float64'),
 np.dtype('float64'),
 np.dtype('int64')]

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [8]:
#
# -- Function converting CELIAS textfiles to Astropy fits tables
#
def convert_celias(start,stop):
    celias_table = Table(names = CELIAS_COLNAMES, units = CELIAS_UNITS, dtype = CELIAS_DTYPE)
    for year in tqdm(range(start,stop+1)):
        subtable = ascii.read(DATA_DIR.joinpath("CELIAS_txt", f"{year}_CELIAS_Proton_Monitor_5min.txt"),
                         header_start=20)
        celias_table = vstack([celias_table, subtable])
    return celias_table

Using all text formatting. Note that this data fetch was already run and is here for reference.

In [9]:
#CELIAS_TABLE = convert_celias(1996,2024)

Reformat date information into a singular ordered time column in a datetime stirng format

In [18]:
CELIAS_TABLE

YY,MON,DY,DOY:HH:MM:SS,SPEED,Np,Vth,N/S,V_He,GSE_X,GSE_Y,GSE_Z,RANGE,HGLAT,HGLONG,CRN(E)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,km / s,1 / cm3,km / s,deg,km / s,Re,Re,Re,Mkm,deg,deg,#
int64,str3,int64,str12,int64,float64,int64,float64,int64,float64,float64,float64,float64,float64,float64,int64
96,Jan,20,020:20:18:00,446,10.14,44,1.9,466,211.2,-100.2,-10.9,145.9,-5.1,309.7,1905
96,Jan,20,020:20:23:04,445,10.16,44,1.5,465,211.2,-100.2,-10.9,145.9,-5.1,309.6,1905
96,Jan,20,020:20:28:04,448,10.06,44,1.5,468,211.2,-100.2,-10.9,145.9,-5.1,309.6,1905
96,Jan,20,020:20:33:07,450,10.65,44,1.4,470,211.2,-100.3,-10.9,145.9,-5.1,309.5,1905
96,Jan,20,020:20:38:08,449,10.56,43,1.6,468,211.2,-100.3,-10.9,145.9,-5.1,309.5,1905
96,Jan,20,020:20:43:10,448,10.78,43,1.6,468,211.2,-100.3,-10.9,145.9,-5.1,309.4,1905
96,Jan,20,020:20:48:13,446,10.53,42,1.5,465,211.2,-100.3,-10.9,145.9,-5.1,309.4,1905
96,Jan,20,020:20:53:16,448,8.92,44,1.7,469,211.2,-100.3,-10.9,145.9,-5.1,309.4,1905
96,Jan,20,020:20:58:16,446,7.87,43,2.1,466,211.2,-100.3,-10.9,145.9,-5.1,309.4,1905
96,Jan,20,020:21:03:19,442,8.81,44,2.0,464,211.2,-100.3,-10.9,145.9,-5.1,309.3,1905


In [21]:
#
#--- Function to combined date and time columns into a single datetime column
#
selection = CELIAS_TABLE[CELIAS_COLNAMES[:4]]

In [22]:
selection

YY,MON,DY,DOY:HH:MM:SS
int64,str3,int64,str12
96,Jan,20,020:20:18:00
96,Jan,20,020:20:23:04
96,Jan,20,020:20:28:04
96,Jan,20,020:20:33:07
96,Jan,20,020:20:38:08
96,Jan,20,020:20:43:10
96,Jan,20,020:20:48:13
96,Jan,20,020:20:53:16
96,Jan,20,020:20:58:16
96,Jan,20,020:21:03:19


Saving astropy table to a .h5 file for better storage methods.

In [23]:
#misc.hdf5.write_table_hdf5(CELIAS_TABLE, str(CELIAS_PATH), 
#                           serialize_meta=True, overwrite=True)



Pulling from the CELIUS_TABLE from the saved h5 file

In [59]:
CELIAS_TABLE = misc.hdf5.read_table_hdf5(str(CELIAS_PATH))

Table generation for complete list of CME events

In [3]:
CME_LINK = "https://cdaw.gsfc.nasa.gov/CME_list/UNIVERSAL_ver1/text_ver/univ_all.txt"

In [9]:
#! cd /data/mta4/ACE_RNN/Data/CME_txt ; wget https://cdaw.gsfc.nasa.gov/CME_list/UNIVERSAL_ver1/text_ver/univ_all.txt

--2024-01-24 15:58:07--  https://cdaw.gsfc.nasa.gov/CME_list/UNIVERSAL_ver1/text_ver/univ_all.txt
Resolving cdaw.gsfc.nasa.gov (cdaw.gsfc.nasa.gov)... 129.164.179.220, 2001:4d0:2310:150::220
Connecting to cdaw.gsfc.nasa.gov (cdaw.gsfc.nasa.gov)|129.164.179.220|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4066209 (3.9M) [text/plain]
Saving to: ‘univ_all.txt’


2024-01-24 15:58:08 (9.22 MB/s) - ‘univ_all.txt’ saved [4066209/4066209]



In [5]:
CME_TEXT_FILE = DATA_DIR.joinpath('CME_txt', 'univ_all.txt')

Because the text file is formatted too heavily for human readability rather than astropy function readability, will self parse the text file input to generate the data table

In [6]:
#
#--- Text file parsing for CME_TEXT_FILE
#
with open(CME_TEXT_FILE,'r') as f:
    f.readline()
    first_col_info = f.readline()
    second_col_info = f.readline()
    f.readline()
    data = [line.strip().split() for line in f.readlines()]

In [7]:
print(first_col_info)
print(second_col_info)

 Date       Time     Central  Width  Linear   2nd order speed      Accel     Mass      Kinetic    MPA   Remarks

                        PA           Speed  initial final   20R                        Energy                 



In [8]:
first_col_info.split()

['Date',
 'Time',
 'Central',
 'Width',
 'Linear',
 '2nd',
 'order',
 'speed',
 'Accel',
 'Mass',
 'Kinetic',
 'MPA',
 'Remarks']

In [9]:
CME_COLNAMES = ['Date',
 'Time',
 'Central_PA',
 'Width',
 'Lin_Speed',
 'ord2_Speed_Init',
 'ord2_Final',
 'ord2_Speed_20R',
 'Accel',
 'Mass',
 'KE',
 'MPA',
 'Remarks']

In [52]:
#
#-- Manual correction to ill-delimited remarks
#
corrected_data = []
for i,ent in enumerate(data):
    remark = " ".join(ent[12:])
    if remark == "":
        remark = "---"
    values = ent[:12]
#
#--- TODO: Manual correction to the uncertainty markers listed for 
#--- acceleration, mass, and kinetic, energy
#
    values.append(remark)
    corrected_data.append(values)

In [57]:
np_data = np.array(corrected_data,dtype=str)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (34557,) + inhomogeneous part.

In [58]:
np_data

array([list(['1996/01/11', '00:14:36', '267', '18', '499', '571', '426', '0', '-64.3*', '-------', '-------', '272', 'Only C3']),
       list(['1996/01/13', '22:08:30', '265', '16', '290', '278', '303', '372', '2.8*', '-------', '-------', '266', 'Only C3']),
       list(['1996/01/15', '07:01:10', '262', '43', '525', '600', '454', '0', '-31.1', '-------', '-------', '272', 'Only C3']),
       ...,
       list(['2023/08/30', '22:00:05', '143', '263', '634', '596', '671', '664', '3.6*', '-------', '-------', '79', 'Partial Halo']),
       list(['2023/08/31', '02:48:05', '292', '34', '492', '502', '483', '479', '-1.0', '-------', '-------', '288', '---']),
       list(['2023/08/31', '04:12:05', '296', '112', '750', '538', '945', '846', '17.7', '-------', '-------', '303', '---'])],
      dtype=object)

In [60]:
CELIAS_TABLE

YY,MON,DY,DOY:HH:MM:SS,SPEED,Np,Vth,N/S,V_He,GSE_X,GSE_Y,GSE_Z,RANGE,HGLAT,HGLONG,CRN(E)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,km / s,1 / cm3,km / s,deg,km / s,Re,Re,Re,Mkm,deg,deg,#
int64,bytes3,int64,bytes12,int64,float64,int64,float64,int64,float64,float64,float64,float64,float64,float64,int64
13,Jan,1,001:00:00:09,352,4.23,18,-1.6,354,257.2,2.9,-19.3,145.5,-3.1,327.0,2132
13,Jan,1,001:00:04:57,353,4.0,18,-1.2,354,257.2,2.9,-19.3,145.5,-3.1,327.0,2132
13,Jan,1,001:00:09:57,354,3.86,20,-1.6,357,257.2,2.9,-19.3,145.5,-3.1,327.0,2132
13,Jan,1,001:00:15:00,355,3.78,20,-1.5,358,257.2,2.9,-19.3,145.5,-3.1,326.9,2132
13,Jan,1,001:00:20:02,355,3.74,20,-1.5,358,257.2,2.8,-19.3,145.5,-3.1,326.8,2132
13,Jan,1,001:00:25:04,356,3.68,20,-1.2,358,257.2,2.8,-19.3,145.5,-3.1,326.8,2132
13,Jan,1,001:00:30:05,355,3.69,20,-1.2,358,257.2,2.8,-19.3,145.5,-3.1,326.7,2132
13,Jan,1,001:00:35:11,355,3.84,20,-1.3,357,257.2,2.8,-19.3,145.5,-3.1,326.7,2132
13,Jan,1,001:00:40:11,360,3.66,20,-1.3,363,257.2,2.8,-19.3,145.5,-3.1,326.6,2132
13,Jan,1,001:00:45:12,356,3.75,20,-0.9,358,257.2,2.8,-19.3,145.5,-3.1,326.6,2132
