In [None]:
import progressbar
import h5py
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import pytz

In [None]:
# Load the training table; the two interval-boundary columns are parsed as
# datetimes so they can be compared against file timestamps later on.
df = pd.read_csv(
    'data/train.csv',
    parse_dates=['start_time', 'end_time'],
)
df.head()

In [None]:
# Show the 'Experiment Notes' dataset from the 'Metadata' group of the file
# referenced by the first row of df.
# The context manager closes the HDF5 handle as soon as the dataset has been
# copied into memory, so no open file is left behind in the kernel.
with h5py.File(df.iloc[0].filename, 'r') as hdf:
    experiment_notes = hdf['Metadata']['Experiment Notes'][:]
experiment_notes

# Read and Visualize Interval

In [None]:
class ReadFile:
    """In-memory copy of the arrays stored in one HDF5 data file.

    Every attribute is read from the ``Data/Array Layout`` group:

    * ``t`` - numpy array of UTC-aware ``datetime`` objects, one per entry of
      ``timestamps`` (interpreted as seconds since 1970-01-01 UTC).
    * ``ch_energy`` - contents of ``ch_energy``.
    * ``mlat`` / ``mlt`` - contents of ``1D Parameters/mlat`` and
      ``1D Parameters/mlt``.
    * ``ion_d_flux`` / ``ion_d_ener`` - contents of
      ``2D Parameters/ion_d_flux`` and ``2D Parameters/ion_d_ener``.
    """

    def __init__(self, filename):
        """Read all arrays from ``filename`` and close the file again.

        Args:
            filename: path of the HDF5 file; passed straight to ``h5py.File``.

        Raises:
            KeyError: if an expected group or dataset is missing.
        """
        epoch = datetime(1970, 1, 1, tzinfo=pytz.utc)

        # The context manager releases the handle even when a dataset is
        # missing or a read fails part-way through.
        with h5py.File(filename, 'r') as hdf:
            layout = hdf['Data']['Array Layout']
            params_1d = layout['1D Parameters']
            params_2d = layout['2D Parameters']

            # float() keeps integer-typed timestamp datasets working:
            # timedelta() does not accept numpy integer scalars.
            self.t = np.array(
                [epoch + timedelta(seconds=float(ts))
                 for ts in layout['timestamps'][:]]
            )
            self.ch_energy = layout['ch_energy'][:]
            self.mlat = params_1d['mlat'][:]
            self.mlt = params_1d['mlt'][:]
            self.ion_d_flux = params_2d['ion_d_flux'][:]
            self.ion_d_ener = params_2d['ion_d_ener'][:]

In [None]:
# Read the HDF5 file named in the first row of the training table and echo
# that row so its fields are visible alongside the loaded data.
first_row = df.iloc[0]
file = ReadFile(first_row.filename)
print(first_row)

In [None]:
import pylab as plt
%matplotlib inline
from matplotlib.colors import LogNorm

In [None]:
# Spectrogram of `ion_d_ener` for the interval described by the first row of df.
first_row = df.iloc[0]

# searchsorted turns the interval's start/end times into indices of file.t
# (assumes file.t is sorted ascending -- TODO confirm).
i, j = (file.t.searchsorted(edge)
        for edge in (first_row.start_time, first_row.end_time))

plt.figure(figsize=(18, 4))
colour_scale = LogNorm(vmin=1e3, vmax=1e8)
plt.pcolor(
    file.t[i:j],
    np.log10(file.ch_energy),
    file.ion_d_ener[:, i:j],
    norm=colour_scale,
)
colour_bar = plt.colorbar()
colour_bar.set_label('Log Energy Flux')
axes = plt.gca()
axes.invert_yaxis()
plt.ylabel('Log Energy [eV] - Ions')

# Lengths of Intervals

In [None]:
# Histogram of interval durations (in minutes) for each class, side by side.
without_dispersion = df[df['class'] == 0]
with_dispersion = df[df['class'] == 1]

# Bin edges at 0, 1, ..., 9 minutes. Durations above 9 minutes fall outside
# these edges and are not counted by plt.hist.
minute_bins = np.arange(10)

plt.figure(figsize=(12, 4))
panels = (
    (121, 'With Dispersion', with_dispersion),
    (122, 'Without Dispersion', without_dispersion),
)
# Both panels share the same recipe; only the subset and the title differ.
for position, label, subset in panels:
    durations_min = [
        delta_t.total_seconds() / 60
        for delta_t in subset.end_time - subset.start_time
    ]
    plt.subplot(position)
    plt.title(f'{label} (N = {len(subset)})')
    plt.hist(durations_min, bins=minute_bins)
    plt.ylabel('Bin Count')
    plt.xlabel('Sample Length (Minutes)')


# Which Satellites?

In [None]:
# Number of labelled intervals per satellite, for each class.
without_dispersion = df[df['class'] == 0]
with_dispersion = df[df['class'] == 1]

# Assumes `sat` holds integer satellite numbers in 16..19 (the range the tick
# labels cover) -- TODO confirm against the data.
# One unit-wide bin is centred on each number. Using the integers themselves
# as edges yields only three bins, and because the last histogram bin is
# closed on the right, satellites 18 and 19 would be counted together.
sat_numbers = np.arange(16, 20)
bins = np.arange(15.5, 20)  # edges 15.5, 16.5, ..., 19.5

plt.figure(figsize=(12, 4))
panels = (
    (121, 'With Dispersion', with_dispersion),
    (122, 'Without Dispersion', without_dispersion),
)
for position, label, subset in panels:
    plt.subplot(position)
    plt.title(f'{label} (N = {len(subset)})')
    plt.hist(subset.sat, bins=bins)
    plt.xticks(sat_numbers)  # ticks sit under the centre of each bar
    plt.ylabel('Bin Count')
    plt.xlabel('Satellite')

# Which Year?

In [None]:
# Number of labelled intervals per calendar year (of start_time), per class.
without_dispersion = df[df['class'] == 0]
with_dispersion = df[df['class'] == 1]

# One unit-wide bin centred on each year 2010..2017. Using the years
# themselves as edges gives one bin too few, and because the last histogram
# bin is closed on the right, 2016 and 2017 would be counted together.
years = np.arange(2010, 2018)
bins = np.arange(2009.5, 2018)  # edges 2009.5, 2010.5, ..., 2017.5

plt.figure(figsize=(12, 4))
panels = (
    (121, 'With Dispersion', with_dispersion),
    (122, 'Without Dispersion', without_dispersion),
)
for position, label, subset in panels:
    plt.subplot(position)
    plt.title(f'{label} (N = {len(subset)})')
    plt.hist([t.year for t in subset.start_time], bins=bins)
    plt.xticks(years)  # ticks sit under the centre of each bar
    plt.ylabel('Bin Count')
    plt.xlabel('Year')

# What are the Magnetic Latitudes?

In [None]:
def get_mlats(subset_df):
    """Gather the ``mlat`` samples that fall inside every row's interval.

    For each row, the file named by ``filename`` is loaded with ``ReadFile``
    and the samples whose index lies between the positions of ``start_time``
    and ``end_time`` in the file's time axis are kept (start inclusive, end
    exclusive, as given by ``searchsorted``).

    Args:
        subset_df: DataFrame with ``filename``, ``start_time`` and
            ``end_time`` columns.

    Returns:
        A flat list with the selected ``mlat`` values of all rows, in row
        order.
    """
    collected = []
    for _, record in subset_df.iterrows():
        data = ReadFile(record['filename'])
        # Assumes data.t is sorted ascending -- TODO confirm.
        first = data.t.searchsorted(record['start_time'])
        last = data.t.searchsorted(record['end_time'])
        collected += list(data.mlat[first:last])
    return collected

In [None]:
# Collect the MLAT samples separately for class 1 and class 0 rows.
# Each call loads every file referenced by its subset of df.
mlats_by_class = {
    label: get_mlats(df[df['class'] == label])
    for label in (1, 0)
}
with_dispersion_mlats = mlats_by_class[1]
without_dispersion_mlats = mlats_by_class[0]

In [None]:
# Distribution of MLAT samples for each class, side by side.
# The event counts in the titles are recomputed here so the cell does not
# depend on variables left over from an earlier plotting cell.
without_dispersion = df[df['class'] == 0]
with_dispersion = df[df['class'] == 1]

bins = np.arange(-90, 91, 1)          # one-degree-wide bins from -90 to 90
major_ticks = np.arange(-90, 96, 15)  # a tick every 15 degrees, -90 ... 90

plt.figure(figsize=(12, 4))
panels = (
    (121, 'With Dispersion', with_dispersion, with_dispersion_mlats),
    (122, 'Without Dispersion', without_dispersion, without_dispersion_mlats),
)
for position, label, events, mlat_samples in panels:
    plt.subplot(position)
    plt.title(f'{label} (N = {len(events)} events)')
    plt.hist(mlat_samples, bins=bins)
    # Ticks are set once; labelling all 181 bin edges would be unreadable.
    plt.xticks(major_ticks)
    plt.ylabel('Bin Count')
    plt.xlabel('MLAT (deg)')

# What are the Magnetic Local Times (MLTs)?

In [None]:
def get_mlts(subset_df):
    """Gather the ``mlt`` samples that fall inside every row's interval.

    Each row's ``filename`` is loaded with ``ReadFile``; the positions of
    ``start_time`` and ``end_time`` in the file's time axis (found with
    ``searchsorted``) bound the slice of ``mlt`` that is kept.

    Args:
        subset_df: DataFrame with ``filename``, ``start_time`` and
            ``end_time`` columns.

    Returns:
        A flat list with the selected ``mlt`` values of all rows, in row
        order.
    """
    chunks = []
    for _, row in subset_df.iterrows():
        reader = ReadFile(row.filename)
        # Assumes reader.t is sorted ascending -- TODO confirm.
        begin = reader.t.searchsorted(row.start_time)
        finish = reader.t.searchsorted(row.end_time)
        chunks.append(reader.mlt[begin:finish])
    # Flatten the per-row slices into a single list of scalars.
    return [value for chunk in chunks for value in chunk]

In [None]:
# MLAT and MLT samples of every interval in df, as numpy arrays.
# Both helpers walk df in the same row order; the two arrays are presumably
# the same length and index-aligned (the hemisphere split below relies on
# that) -- TODO confirm.
mlats, mlts = (
    np.array(collect(df))
    for collect in (get_mlats, get_mlts)
)

In [None]:
# Split the MLT samples by the sign of the matching MLAT sample.
# Samples with an MLAT of exactly 0 end up in neither group.
in_south = mlats < 0
in_north = mlats > 0
mlts_south = mlts[in_south]
mlts_north = mlts[in_north]

In [None]:
# Distribution of MLT samples for each hemisphere, side by side.
bins = np.arange(0, 25, .25)     # 0.25-wide bins, edges 0 ... 24.75
major_ticks = np.arange(0, 25, 2)  # a tick every 2 units, 0 ... 24

plt.figure(figsize=(12, 4))
panels = (
    (121, 'North Examples', mlts_north),
    (122, 'South Examples', mlts_south),
)
for position, title, mlt_samples in panels:
    plt.subplot(position)
    plt.title(title)
    plt.hist(mlt_samples, bins=bins)
    # Ticks are set once; labelling every bin edge would be unreadable.
    plt.xticks(major_ticks)
    plt.ylabel('Bin Count')
    plt.xlabel('MLT')