This jupyter notebook clean and merge the initial datasets into the one to be used with the geomagnetic storms forecasting algorithms.
It process a single dataset: OMNI dataset.

# Filtering the data to use in forecasting geomagnetic storms
***


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load dataframes
df_omni = pd.read_csv("../data/omni_data/omni_data.csv")

# OMNI dataset
***

In [None]:
print(df_omni.info())
print(df_omni.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262969 entries, 0 to 262968
Data columns (total 55 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   datetime           262969 non-null  object 
 1   Rot#               262969 non-null  int64  
 2   IMF                262969 non-null  int64  
 3   PLS                262969 non-null  int64  
 4   IMF_PTS            262969 non-null  int64  
 5   PLS_PTS            262969 non-null  int64  
 6   ABS_B              262217 non-null  float64
 7   F                  262217 non-null  float64
 8   THETA_AV           262217 non-null  float64
 9   PHI_AV             262217 non-null  float64
 10  BX_GSE             262217 non-null  float64
 11  BY_GSE             262217 non-null  float64
 12  BZ_GSE             262217 non-null  float64
 13  BY_GSM             262191 non-null  float64
 14  BZ_GSM             262191 non-null  float64
 15  SIGMA-ABS_B        262217 non-null  float64
 16  SI

In [None]:
print(df_omni.head())
print(df_omni.tail())

              datetime  Rot#  IMF  PLS  IMF_PTS  PLS_PTS  ABS_B    F  \
0  1995-01-01 00:00:00  2204   51   52       56       38    4.0  3.9   
1  1995-01-01 01:00:00  2204   51   52       60       42    3.0  2.9   
2  1995-01-01 02:00:00  2204   51   52       60       40    3.2  1.9   
3  1995-01-01 03:00:00  2204   51   52       59       41    4.3  4.0   
4  1995-01-01 04:00:00  2204   51   52       59       39    4.8  4.8   

   THETA_AV  PHI_AV  ...  F10_INDEX  KP  DST  AE  AP_INDEX  AL_INDEX  \
0     -32.9   100.8  ...       72.9  10   -2  32         4        -6   
1     -27.1   110.7  ...       72.9  10    3  38         4       -11   
2     -13.0    75.1  ...       72.9  10    6  34         4       -12   
3       4.4   142.1  ...       72.9   0    5  36         0       -11   
4       3.9   154.3  ...       72.9   0    3  31         0       -10   

   AU_INDEX  PC_N_INDEX  Solar_Lyman_alpha  Proton_QI  
0        25         0.3           0.006201     0.0047  
1        26         0.

In [None]:
# convert datetime to pandas datetime format
df_omni['datetime'] = pd.to_datetime(df_omni['datetime'])
print(df_omni.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262969 entries, 0 to 262968
Data columns (total 55 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   datetime           262969 non-null  datetime64[ns]
 1   Rot#               262969 non-null  int64         
 2   IMF                262969 non-null  int64         
 3   PLS                262969 non-null  int64         
 4   IMF_PTS            262969 non-null  int64         
 5   PLS_PTS            262969 non-null  int64         
 6   ABS_B              262217 non-null  float64       
 7   F                  262217 non-null  float64       
 8   THETA_AV           262217 non-null  float64       
 9   PHI_AV             262217 non-null  float64       
 10  BX_GSE             262217 non-null  float64       
 11  BY_GSE             262217 non-null  float64       
 12  BZ_GSE             262217 non-null  float64       
 13  BY_GSM             262191 non-null  float64 

In [None]:
# some null values are represented as '*', -1, or 9999
df_omni.replace('*', np.nan, inplace=True)
df_omni.replace(-1, np.nan, inplace=True)
df_omni.replace('-999', np.nan, inplace=True)

print(df_omni.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262969 entries, 0 to 262968
Data columns (total 55 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   datetime           262969 non-null  datetime64[ns]
 1   Rot#               262969 non-null  int64         
 2   IMF                262969 non-null  int64         
 3   PLS                262969 non-null  int64         
 4   IMF_PTS            262969 non-null  int64         
 5   PLS_PTS            262969 non-null  int64         
 6   ABS_B              262217 non-null  float64       
 7   F                  262217 non-null  float64       
 8   THETA_AV           261871 non-null  float64       
 9   PHI_AV             262217 non-null  float64       
 10  BX_GSE             259775 non-null  float64       
 11  BY_GSE             259645 non-null  float64       
 12  BZ_GSE             257671 non-null  float64       
 13  BY_GSM             259487 non-null  float64 

In [None]:
# search for NaNs

# count NaNs per column
nan_count = df_omni.isna().sum()
print(nan_count)

datetime                  0
Rot#                      0
IMF                       0
PLS                       0
IMF_PTS                   0
PLS_PTS                   0
ABS_B                   752
F                       752
THETA_AV               1098
PHI_AV                  752
BX_GSE                 3194
BY_GSE                 3324
BZ_GSE                 5298
BY_GSM                 3482
BZ_GSM                 5107
SIGMA-ABS_B             752
SIGMA-B                 752
SIGMA-Bx                752
SIGMA-By                752
SIGMA-Bz                752
T                      5383
N                      5936
V                      1254
PHI-V                  5743
THETA-V                6332
Ratio                 24094
Pressure               5944
SIGMA-T                5383
SIGMA-N                5936
SIGMA-V                1254
SIGMA-PHI-V            1254
SIGMA-THETA-V          1279
SIGMA-ratio           24094
E                      1855
Beta                   6112
Mach_num            

In [None]:
# fraction between 0 and 1
nan_fraction = df_omni.isna().mean()
print(nan_fraction.sort_values(ascending=False))

PR-FLX_4             0.768475
PR-FLX_2             0.768471
PR-FLX_1             0.768452
MFLX                 0.701805
PR-FLX_60            0.298427
PR-FLX_30            0.298374
PR-FLX_10            0.298252
Ratio                0.091623
SIGMA-ratio          0.091623
PC_N_INDEX           0.036757
DST                  0.029380
THETA-V              0.024079
Beta                 0.023242
Mgs_mach_num         0.023223
Mach_num             0.022961
Proton_QI            0.022934
Pressure             0.022603
SIGMA-N              0.022573
N                    0.022573
PHI-V                0.021839
SIGMA-T              0.020470
T                    0.020470
BZ_GSE               0.020147
BZ_GSM               0.019421
BY_GSM               0.013241
BY_GSE               0.012640
BX_GSE               0.012146
E                    0.007054
AL_INDEX             0.006354
SIGMA-THETA-V        0.004864
V                    0.004769
SIGMA-V              0.004769
SIGMA-PHI-V          0.004769
THETA_AV  

In [None]:
# Search for infinities
print(np.isinf(df_omni).sum().sort_values(ascending=False))

datetime             0
PR-FLX_30            0
SIGMA-PHI-V          0
SIGMA-THETA-V        0
SIGMA-ratio          0
E                    0
Beta                 0
Mach_num             0
Mgs_mach_num         0
PR-FLX_1             0
PR-FLX_2             0
PR-FLX_4             0
PR-FLX_10            0
PR-FLX_60            0
SIGMA-N              0
MFLX                 0
R                    0
F10_INDEX            0
KP                   0
DST                  0
AE                   0
AP_INDEX             0
AL_INDEX             0
AU_INDEX             0
PC_N_INDEX           0
Solar_Lyman_alpha    0
SIGMA-V              0
SIGMA-T              0
Rot#                 0
BY_GSM               0
IMF                  0
PLS                  0
IMF_PTS              0
PLS_PTS              0
ABS_B                0
F                    0
THETA_AV             0
PHI_AV               0
BX_GSE               0
BY_GSE               0
BZ_GSE               0
BZ_GSM               0
Pressure             0
SIGMA-ABS_B

In [None]:
print(df_omni.columns)

Index(['datetime', 'Rot#', 'IMF', 'PLS', 'IMF_PTS', 'PLS_PTS', 'ABS_B', 'F',
       'THETA_AV', 'PHI_AV', 'BX_GSE', 'BY_GSE', 'BZ_GSE', 'BY_GSM', 'BZ_GSM',
       'SIGMA-ABS_B', 'SIGMA-B', 'SIGMA-Bx', 'SIGMA-By', 'SIGMA-Bz', 'T', 'N',
       'V', 'PHI-V', 'THETA-V', 'Ratio', 'Pressure', 'SIGMA-T', 'SIGMA-N',
       'SIGMA-V', 'SIGMA-PHI-V', 'SIGMA-THETA-V', 'SIGMA-ratio', 'E', 'Beta',
       'Mach_num', 'Mgs_mach_num', 'PR-FLX_1', 'PR-FLX_2', 'PR-FLX_4',
       'PR-FLX_10', 'PR-FLX_30', 'PR-FLX_60', 'MFLX', 'R', 'F10_INDEX', 'KP',
       'DST', 'AE', 'AP_INDEX', 'AL_INDEX', 'AU_INDEX', 'PC_N_INDEX',
       'Solar_Lyman_alpha', 'Proton_QI'],
      dtype='object')


In [None]:
# clumns to use
columns_to_use = [
    'datetime', 
    'ABS_B', 
    'F', 
    'BX_GSE', 
    'BY_GSE', 
    'BZ_GSE',
    'SIGMA-ABS_B', 
    'SIGMA-B', 
    'SIGMA-Bx', 
    'SIGMA-By',
    'SIGMA-Bz', 
    'T', 
    'N', 
    'V', 
    'Ratio', 
    'Pressure',
    'R', 
    'DST',
]

df_omni_small = df_omni[columns_to_use].copy()
print(df_omni_small.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262969 entries, 0 to 262968
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   datetime     262969 non-null  datetime64[ns]
 1   ABS_B        262217 non-null  float64       
 2   F            262217 non-null  float64       
 3   BX_GSE       259775 non-null  float64       
 4   BY_GSE       259645 non-null  float64       
 5   BZ_GSE       257671 non-null  float64       
 6   SIGMA-ABS_B  262217 non-null  float64       
 7   SIGMA-B      262217 non-null  float64       
 8   SIGMA-Bx     262217 non-null  float64       
 9   SIGMA-By     262217 non-null  float64       
 10  SIGMA-Bz     262217 non-null  float64       
 11  T            257586 non-null  float64       
 12  N            257033 non-null  float64       
 13  V            261715 non-null  float64       
 14  Ratio        238875 non-null  float64       
 15  Pressure     257025 non-null  floa

In [None]:
# include label for geomagnetic storms (DsT < -50 nT. -100 is for intense storms, and -250 for super-storm)
df_omni_small['storm_now'] = ((df_omni_small['DST'] < -50)).astype(int)

# check on how many hours there were storms
print(df_omni_small['storm_now'].value_counts()/len(df_omni_small))

storm_now
0    0.963916
1    0.036084
Name: count, dtype: float64


In [None]:
# interpolate some missing values
df_omni_small = df_omni_small.set_index('datetime') # set datetime as index for time interpolation

df_omni_small = (
    df_omni_small
    .interpolate(method="time", limit=3)  # interpolate up to 3 consecutive hours
    .fillna(df_omni_small.rolling(6, min_periods=1, center=True).mean())  # fallback to rolling average as interpolation
)

In [None]:
nan_fraction = df_omni_small.isna().mean()
print(nan_fraction.sort_values(ascending=False))

Ratio          0.060817
Pressure       0.015542
N              0.015542
T              0.014112
V              0.002723
BX_GSE         0.001692
BY_GSE         0.001692
ABS_B          0.001688
F              0.001688
SIGMA-Bz       0.001688
SIGMA-By       0.001688
SIGMA-Bx       0.001688
SIGMA-B        0.001688
SIGMA-ABS_B    0.001688
BZ_GSE         0.001688
DST            0.000015
R              0.000000
storm_now      0.000000
dtype: float64


In [None]:
# reset index
df_omni_small = df_omni_small.reset_index()

nan_count = df_omni_small.isna().sum()
print(nan_count)

print(df_omni_small.head())

datetime           0
ABS_B            444
F                444
BX_GSE           445
BY_GSE           445
BZ_GSE           444
SIGMA-ABS_B      444
SIGMA-B          444
SIGMA-Bx         444
SIGMA-By         444
SIGMA-Bz         444
T               3711
N               4087
V                716
Ratio          15993
Pressure        4087
R                  0
DST                4
storm_now          0
dtype: int64
             datetime  ABS_B    F  BX_GSE  BY_GSE  BZ_GSE  SIGMA-ABS_B  \
0 1995-01-01 00:00:00    4.0  3.9    -0.6     3.2    -2.1          0.6   
1 1995-01-01 01:00:00    3.0  2.9    -0.9     2.4    -1.3          0.3   
2 1995-01-01 02:00:00    3.2  1.9     0.5     1.8    -0.4          0.7   
3 1995-01-01 03:00:00    4.3  4.0    -3.2     2.5     0.3          0.4   
4 1995-01-01 04:00:00    4.8  4.8    -4.3     2.1     0.3          0.1   

   SIGMA-B  SIGMA-Bx  SIGMA-By  SIGMA-Bz        T     N      V  Ratio  \
0      1.2       0.5       0.9       0.5  15816.0  16.3  315.0  0.016 

In [None]:
df_omni_small = df_omni_small.dropna()

nan_count = df_omni_small.isna().sum()
print(nan_count)

print(df_omni_small.head())

datetime       0
ABS_B          0
F              0
BX_GSE         0
BY_GSE         0
BZ_GSE         0
SIGMA-ABS_B    0
SIGMA-B        0
SIGMA-Bx       0
SIGMA-By       0
SIGMA-Bz       0
T              0
N              0
V              0
Ratio          0
Pressure       0
R              0
DST            0
storm_now      0
dtype: int64
             datetime  ABS_B    F  BX_GSE  BY_GSE  BZ_GSE  SIGMA-ABS_B  \
0 1995-01-01 00:00:00    4.0  3.9    -0.6     3.2    -2.1          0.6   
1 1995-01-01 01:00:00    3.0  2.9    -0.9     2.4    -1.3          0.3   
2 1995-01-01 02:00:00    3.2  1.9     0.5     1.8    -0.4          0.7   
3 1995-01-01 03:00:00    4.3  4.0    -3.2     2.5     0.3          0.4   
4 1995-01-01 04:00:00    4.8  4.8    -4.3     2.1     0.3          0.1   

   SIGMA-B  SIGMA-Bx  SIGMA-By  SIGMA-Bz        T     N      V  Ratio  \
0      1.2       0.5       0.9       0.5  15816.0  16.3  315.0  0.016   
1      0.8       0.2       0.3       0.8  15601.0  18.8  315.0  0.013   


In [None]:
# save final datafreme
df_omni_small.to_csv('../data/data_storms.csv', index=False)