In [53]:
import pandas as pd
from datetime import datetime, timedelta
import time

In [54]:
# Load dataframes
df_sharp = pd.read_csv("./sharp_data/sharp_data.csv")
df_goes = pd.read_csv("./goes_data/goes_all_probes.csv")
df_solar = pd.read_csv("./other_data/daily_solar_data.csv")
df_noaa_radio = pd.read_csv("./other_data/noaa_radio_flux.csv")
df_penticton_radio = pd.read_csv("./other_data/penticton_radio_flux.csv")
df_xrt = pd.read_csv("./other_data/xrt_flarecat.csv")

# SHARP dataset
***

In [55]:
print(df_sharp.info())
print(df_sharp.shape)
print(df_sharp.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2754143 entries, 0 to 2754142
Data columns (total 21 columns):
 #   Column    Dtype  
---  ------    -----  
 0   DATE-OBS  object 
 1   T_OBS     object 
 2   T_REC     object 
 3   USFLUX    float64
 4   MEANGAM   float64
 5   MEANGBT   float64
 6   MEANGBZ   float64
 7   MEANGBH   float64
 8   MEANJZD   float64
 9   TOTUSJZ   float64
 10  MEANALP   float64
 11  MEANJZH   float64
 12  TOTUSJH   float64
 13  ABSNJZH   float64
 14  SAVNCPP   float64
 15  MEANPOT   float64
 16  TOTPOT    float64
 17  MEANSHR   float64
 18  SHRGT45   float64
 19  R_VALUE   float64
 20  HARPNUM   int64  
dtypes: float64(17), int64(1), object(3)
memory usage: 441.3+ MB
None
(2754143, 21)
                  DATE-OBS                    T_OBS                    T_REC  \
0  2010-04-30T23:58:22.20Z  2010.05.01_00:00:04_TAI  2010.05.01_00:00:00_TAI   
1  2010-05-01T00:10:22.20Z  2010.05.01_00:12:04_TAI  2010.05.01_00:12:00_TAI   
2  2010-05-01T00:22:22.20Z  2010.0

In [56]:
# Function to convert TAI to UTC
def tai_to_utc(dt):
    # dt is naive datetime (parsed from _TAI)
    if dt < pd.Timestamp("2012-07-01"):
        offset = 34
    elif dt < pd.Timestamp("2015-07-01"):
        offset = 35
    elif dt < pd.Timestamp("2017-01-01"):
        offset = 36
    else:
        offset = 37
    return dt - pd.Timedelta(seconds=offset)

In [57]:
# Convert the time labels to datetime objects
df_sharp["DATE-OBS_time"] = pd.to_datetime(df_sharp["DATE-OBS"], format="%Y-%m-%dT%H:%M:%S.%fZ", utc=True)

df_sharp["T_OBS_time"] = pd.to_datetime(df_sharp["T_OBS"].str.replace("_TAI",""), format="%Y.%m.%d_%H:%M:%S")
df_sharp["T_REC_time"] = pd.to_datetime(df_sharp["T_REC"].str.replace("_TAI",""), format="%Y.%m.%d_%H:%M:%S")

df_sharp["T_OBS_time"] = df_sharp["T_OBS_time"].apply(tai_to_utc).dt.tz_localize("UTC")
df_sharp["T_REC_time"] = df_sharp["T_REC_time"].apply(tai_to_utc).dt.tz_localize("UTC")

print(df_sharp.head())

                  DATE-OBS                    T_OBS                    T_REC  \
0  2010-04-30T23:58:22.20Z  2010.05.01_00:00:04_TAI  2010.05.01_00:00:00_TAI   
1  2010-05-01T00:10:22.20Z  2010.05.01_00:12:04_TAI  2010.05.01_00:12:00_TAI   
2  2010-05-01T00:22:22.20Z  2010.05.01_00:24:04_TAI  2010.05.01_00:24:00_TAI   
3  2010-05-01T00:34:22.20Z  2010.05.01_00:36:04_TAI  2010.05.01_00:36:00_TAI   
4  2010-05-01T00:46:22.20Z  2010.05.01_00:48:04_TAI  2010.05.01_00:48:00_TAI   

         USFLUX  MEANGAM  MEANGBT  MEANGBZ  MEANGBH   MEANJZD       TOTUSJZ  \
0  6.510776e+21   28.337   66.808   84.497   32.193 -0.131873  5.777592e+12   
1  6.521054e+21   29.678   68.349   90.781   32.345 -0.113589  5.654726e+12   
2  6.917875e+21   28.441   67.682   89.127   32.411  0.061197  6.488687e+12   
3  6.973706e+21   28.031   67.166   85.321   31.966  0.053302  6.193157e+12   
4  7.228647e+21   26.980   64.805   76.349   32.647  0.011571  5.797055e+12   

   ...       SAVNCPP   MEANPOT        TOTPOT

In [58]:
# Create a new data with only day for the aggeregation
# floor to day
df_sharp["DATE"] = df_sharp["DATE-OBS_time"].dt.floor("D")

# Many measureaments start just before midnight of the last day, but should be counted as part of the next day
# if very close to midnight, push to next day
# e.g. within last 5 minutes of the day
mask = df_sharp["DATE-OBS_time"].dt.hour == 23
mask &= df_sharp["DATE-OBS_time"].dt.minute >= 57
df_sharp.loc[mask, "DATE"] += pd.Timedelta(days=1)

print(df_sharp.head())

                  DATE-OBS                    T_OBS                    T_REC  \
0  2010-04-30T23:58:22.20Z  2010.05.01_00:00:04_TAI  2010.05.01_00:00:00_TAI   
1  2010-05-01T00:10:22.20Z  2010.05.01_00:12:04_TAI  2010.05.01_00:12:00_TAI   
2  2010-05-01T00:22:22.20Z  2010.05.01_00:24:04_TAI  2010.05.01_00:24:00_TAI   
3  2010-05-01T00:34:22.20Z  2010.05.01_00:36:04_TAI  2010.05.01_00:36:00_TAI   
4  2010-05-01T00:46:22.20Z  2010.05.01_00:48:04_TAI  2010.05.01_00:48:00_TAI   

         USFLUX  MEANGAM  MEANGBT  MEANGBZ  MEANGBH   MEANJZD       TOTUSJZ  \
0  6.510776e+21   28.337   66.808   84.497   32.193 -0.131873  5.777592e+12   
1  6.521054e+21   29.678   68.349   90.781   32.345 -0.113589  5.654726e+12   
2  6.917875e+21   28.441   67.682   89.127   32.411  0.061197  6.488687e+12   
3  6.973706e+21   28.031   67.166   85.321   31.966  0.053302  6.193157e+12   
4  7.228647e+21   26.980   64.805   76.349   32.647  0.011571  5.797055e+12   

   ...   MEANPOT        TOTPOT  MEANSHR  SHR

In [59]:
# Deal with NaNs

# count NaNs per column
nan_count = df_sharp.isna().sum()
print(nan_count)

# fraction between 0 and 1
nan_fraction = df_sharp.isna().mean()
print(nan_fraction)

DATE-OBS             0
T_OBS                0
T_REC                0
USFLUX               0
MEANGAM          34764
MEANGBT          34100
MEANGBZ          34090
MEANGBH          34090
MEANJZD          34090
TOTUSJZ              0
MEANALP          34100
MEANJZH          34095
TOTUSJH              0
ABSNJZH              0
SAVNCPP              0
MEANPOT          34090
TOTPOT               0
MEANSHR          36765
SHRGT45          34090
R_VALUE           4318
HARPNUM              0
DATE-OBS_time        0
T_OBS_time           0
T_REC_time           0
DATE                 0
dtype: int64
DATE-OBS         0.000000
T_OBS            0.000000
T_REC            0.000000
USFLUX           0.000000
MEANGAM          0.012622
MEANGBT          0.012381
MEANGBZ          0.012378
MEANGBH          0.012378
MEANJZD          0.012378
TOTUSJZ          0.000000
MEANALP          0.012381
MEANJZH          0.012380
TOTUSJH          0.000000
ABSNJZH          0.000000
SAVNCPP          0.000000
MEANPOT          0.012

In [60]:
# count NaNs per HARPNUM per day
nan_counts = (
    df_sharp.groupby(["HARPNUM", "DATE"])
      .apply(lambda g: g.isna().sum(), include_groups=False)
)
print(nan_counts)

# also total rows, to compute fractions
nan_fractions = (
    df_sharp.groupby(["HARPNUM", "DATE"])
      .apply(lambda g: g.isna().mean(), include_groups=False)
)
print(nan_fractions)

                                   DATE-OBS  T_OBS  T_REC  USFLUX  MEANGAM  \
HARPNUM DATE                                                                 
1       2010-05-01 00:00:00+00:00         0      0      0       0        0   
        2010-05-02 00:00:00+00:00         0      0      0       0        0   
        2010-05-03 00:00:00+00:00         0      0      0       0        0   
        2010-05-04 00:00:00+00:00         0      0      0       0        0   
        2010-05-05 00:00:00+00:00         0      0      0       0        0   
...                                     ...    ...    ...     ...      ...   
7528    2020-12-31 00:00:00+00:00         0      0      0       0        0   
7529    2020-12-31 00:00:00+00:00         0      0      0       0        0   
        2021-01-01 00:00:00+00:00         0      0      0       0        0   
7530    2020-12-31 00:00:00+00:00         0      0      0       0        0   
        2021-01-01 00:00:00+00:00         0      0      0       

In [61]:
# check if NaNs are correlated with HARPNUM or DATE
nan_fractions = nan_fractions.reset_index()
summary = nan_fractions.melt(id_vars=["HARPNUM", "DATE"], 
                             var_name="column", value_name="nan_fraction")

# filter only cases with >0
summary = summary[summary["nan_fraction"] > 0].sort_values("nan_fraction", ascending=False)

print(summary.head(100))

        HARPNUM                      DATE   column  nan_fraction
207447     1576 2012-04-15 00:00:00+00:00  MEANGBH           1.0
153552     2480 2013-02-14 00:00:00+00:00  MEANGBT           1.0
314036     6972 2017-04-05 00:00:00+00:00  MEANALP           1.0
185686     3341 2013-11-01 00:00:00+00:00  MEANGBZ           1.0
527363     2472 2013-02-10 00:00:00+00:00  SHRGT45           1.0
...         ...                       ...      ...           ...
153859     2554 2013-03-12 00:00:00+00:00  MEANGBT           1.0
153856     2554 2013-03-09 00:00:00+00:00  MEANGBT           1.0
230345       78 2010-06-30 00:00:00+00:00  MEANJZD           1.0
153838     2546 2013-03-23 00:00:00+00:00  MEANGBT           1.0
153793     2533 2013-03-18 00:00:00+00:00  MEANGBT           1.0

[100 rows x 4 columns]


In [62]:
# NaNs are not correlated with HARPNUM or DATE. Entire days or HARPNUMs are not missing.
# We can delete rows with NaNs without introducing bias.

df_sharp_clean = df_sharp.dropna()

In [80]:
# average over days and spots

# 1. daily average per spot
df_daily_per_spot = (
    df_sharp_clean.groupby(["HARPNUM", "DATE"])
      .mean(numeric_only=True)
      .reset_index()
)
print(df_daily_per_spot.head())

# 2. daily average across spots (ignore HARPNUM)
df_daily_all_spots = (
    df_sharp_clean.groupby("DATE")
      .mean(numeric_only=True)
      .reset_index()
)
print(df_daily_all_spots.head())

   HARPNUM                      DATE        USFLUX    MEANGAM     MEANGBT  \
0        1 2010-05-01 00:00:00+00:00  6.703258e+21  25.728111   72.767846   
1        1 2010-05-02 00:00:00+00:00  9.855321e+21  25.814383   79.668617   
2        1 2010-05-03 00:00:00+00:00  9.420379e+21  28.120769   94.159909   
3        1 2010-05-04 00:00:00+00:00  6.441307e+21  29.274752  111.013884   
4        1 2010-05-05 00:00:00+00:00  5.100163e+21  30.340793  120.692843   

      MEANGBZ    MEANGBH   MEANJZD       TOTUSJZ   MEANALP   MEANJZH  \
0   82.501701  32.511701 -0.091966  6.021028e+12 -0.001429 -0.000457   
1   81.215375  33.471042 -0.181589  8.825466e+12  0.000624  0.000202   
2   94.493545  37.063504 -0.002756  9.878426e+12  0.007282  0.002162   
3  110.362157  43.795851  0.243887  7.983555e+12  0.002613  0.000733   
4  119.220595  48.880603  0.358636  7.487984e+12 -0.004161 -0.001062   

      TOTUSJH    ABSNJZH       SAVNCPP      MEANPOT        TOTPOT    MEANSHR  \
0  293.488923  19.927444

# GOES dataset
***

In [64]:
print(df_goes.info())
print(df_goes.shape)
print(df_goes.head())
print(df_goes.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11399040 entries, 0 to 11399039
Data columns (total 25 columns):
 #   Column               Dtype  
---  ------               -----  
 0   datetime             object 
 1   g10_xrs_xs           float64
 2   g10_xrs_xl           float64
 3   g11_xrs_xs           float64
 4   g11_xrs_xl           float64
 5   g12_xrs_xs           float64
 6   g12_xrs_xl           float64
 7   g13_xrs_A_QUAL_FLAG  float64
 8   g13_xrs_A_NUM_PTS    float64
 9   g13_xrs_A_AVG        float64
 10  g13_xrs_B_QUAL_FLAG  float64
 11  g13_xrs_B_NUM_PTS    float64
 12  g13_xrs_B_AVG        float64
 13  g14_xrs_A_QUAL_FLAG  float64
 14  g14_xrs_A_NUM_PTS    float64
 15  g14_xrs_A_AVG        float64
 16  g14_xrs_B_QUAL_FLAG  float64
 17  g14_xrs_B_NUM_PTS    float64
 18  g14_xrs_B_AVG        float64
 19  g15_xrs_A_QUAL_FLAG  float64
 20  g15_xrs_A_NUM_PTS    float64
 21  g15_xrs_A_AVG        float64
 22  g15_xrs_B_QUAL_FLAG  float64
 23  g15_xrs_B_NUM_PTS    float64
 

In [None]:
# Convert the time labels to datetime objects
df_goes["datetime_time"] = pd.to_datetime(df_goes["datetime"], utc=True)

# Create a new data with only day for the aggeregation
# floor to day
df_goes["DATE"] = df_goes["datetime_time"].dt.floor("D")

In [73]:
# some values are not NaN but are invalid, e.g. -1 or 9999
for col in df_goes.columns:
    if df_goes[col].dtype in [float, int]:
        df_goes.loc[df_goes[col] < 0, col] = pd.NA

In [74]:
# count NaNs per column
nan_count = df_goes.isna().sum()
print(nan_count)

# fraction between 0 and 1
nan_fraction = df_goes.isna().mean()
print(nan_fraction)

DATE                          0
g10_xrs_xs              5747222
g10_xrs_xl              5747253
g11_xrs_xs             10506767
g11_xrs_xl             10506781
g12_xrs_xs              9197341
g12_xrs_xl              9197349
g13_xrs_A_QUAL_FLAG     9866634
g13_xrs_A_NUM_PTS       9866634
g13_xrs_A_AVG           9925929
g13_xrs_B_QUAL_FLAG     9866634
g13_xrs_B_NUM_PTS       9866634
g13_xrs_B_AVG           9925929
g14_xrs_A_QUAL_FLAG     9249120
g14_xrs_A_NUM_PTS       9249120
g14_xrs_A_AVG           9412571
g14_xrs_B_QUAL_FLAG     9249120
g14_xrs_B_NUM_PTS       9249120
g14_xrs_B_AVG           9412571
g15_xrs_A_QUAL_FLAG     6393600
g15_xrs_A_NUM_PTS       6393600
g15_xrs_A_AVG           6547104
g15_xrs_B_QUAL_FLAG     6393600
g15_xrs_B_NUM_PTS       6393600
g15_xrs_B_AVG           6547104
datetime_time                 0
xrs_A                   2337912
xrs_B                   2337956
dtype: int64
DATE                   0.000000
g10_xrs_xs             0.504185
g10_xrs_xl             0.50

In [75]:
# count NaNs per day
nan_counts = (
    df_goes.groupby(["DATE"])
      .apply(lambda g: g.isna().sum(), include_groups=False)
)
print(nan_counts)

# also total rows, to compute fractions
nan_fractions = (
    df_goes.groupby(["DATE"])
      .apply(lambda g: g.isna().mean(), include_groups=False)
)
print(nan_fractions)

                           g10_xrs_xs  g10_xrs_xl  g11_xrs_xs  g11_xrs_xl  \
DATE                                                                        
1998-07-01 00:00:00+00:00           0           0        1440        1440   
1998-07-02 00:00:00+00:00           1           1        1440        1440   
1998-07-03 00:00:00+00:00         121         121        1440        1440   
1998-07-04 00:00:00+00:00           0           0        1440        1440   
1998-07-05 00:00:00+00:00           0           0        1440        1440   
...                               ...         ...         ...         ...   
2020-02-29 00:00:00+00:00        1440        1440        1440        1440   
2020-03-01 00:00:00+00:00        1440        1440        1440        1440   
2020-03-02 00:00:00+00:00        1440        1440        1440        1440   
2020-03-03 00:00:00+00:00        1440        1440        1440        1440   
2020-03-04 00:00:00+00:00        1440        1440        1440        1440   

In [76]:
# Sometimes, entire days are missing. Better not to filter them out. Better to interpolate them.
# First, later GOES satellites have better coverage, so we can use them first and fill in missing data using earlier satellites only when necessary.

# define priority list: latest satellite first
xrs_A_cols = ['g15_xrs_A_AVG','g14_xrs_A_AVG','g13_xrs_A_AVG', 'g12_xrs_xs', 'g11_xrs_xs', 'g10_xrs_xs']
xrs_B_cols = ['g15_xrs_B_AVG','g14_xrs_B_AVG','g13_xrs_B_AVG', 'g12_xrs_xl', 'g11_xrs_xl', 'g10_xrs_xl']

# create a single column per channel by filling NaNs from newest to oldest
df_goes['xrs_A'] = df_goes[xrs_A_cols].bfill(axis=1).iloc[:,0]
df_goes['xrs_B'] = df_goes[xrs_B_cols].bfill(axis=1).iloc[:,0]

print(df_goes.head())

                       DATE    g10_xrs_xs    g10_xrs_xl  g11_xrs_xs  \
0 1998-07-01 00:00:00+00:00  3.350000e-09  1.270000e-07         NaN   
1 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
2 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
3 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
4 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   

   g11_xrs_xl  g12_xrs_xs  g12_xrs_xl  g13_xrs_A_QUAL_FLAG  g13_xrs_A_NUM_PTS  \
0         NaN         NaN         NaN                  NaN                NaN   
1         NaN         NaN         NaN                  NaN                NaN   
2         NaN         NaN         NaN                  NaN                NaN   
3         NaN         NaN         NaN                  NaN                NaN   
4         NaN         NaN         NaN                  NaN                NaN   

   g13_xrs_A_AVG  ...  g14_xrs_B_AVG  g15_xrs_A_QUAL_FLAG  g15_xrs_A_NUM_PTS  \
0     

In [71]:
# Interpolating on the short and long x-rays channels

# set datetime as index (required for method='time')
df_goes = df_goes.set_index('DATE')

# interpolate only selected columns
df_goes[['xrs_A', 'xrs_B']] = df_goes[['xrs_A', 'xrs_B']].interpolate(method='time')

# optional: reset index if needed
df_goes = df_goes.reset_index()

print(df_goes.head())

                       DATE    g10_xrs_xs    g10_xrs_xl  g11_xrs_xs  \
0 1998-07-01 00:00:00+00:00  3.350000e-09  1.270000e-07         NaN   
1 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
2 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
3 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   
4 1998-07-01 00:00:00+00:00  3.330000e-09  1.270000e-07         NaN   

   g11_xrs_xl  g12_xrs_xs  g12_xrs_xl  g13_xrs_A_QUAL_FLAG  g13_xrs_A_NUM_PTS  \
0         NaN         NaN         NaN                  NaN                NaN   
1         NaN         NaN         NaN                  NaN                NaN   
2         NaN         NaN         NaN                  NaN                NaN   
3         NaN         NaN         NaN                  NaN                NaN   
4         NaN         NaN         NaN                  NaN                NaN   

   g13_xrs_A_AVG  ...  g14_xrs_B_AVG  g15_xrs_A_QUAL_FLAG  g15_xrs_A_NUM_PTS  \
0     

In [81]:
# Average over days and keeping also min and max of the day
df_daily_goes = (
    df_goes.groupby("DATE")
      .agg(
          xrs_A_mean=('xrs_A', 'mean'),
          xrs_A_min=('xrs_A', 'min'),
          xrs_A_max=('xrs_A', 'max'),
          xrs_B_mean=('xrs_B', 'mean'),
          xrs_B_min=('xrs_B', 'min'),
          xrs_B_max=('xrs_B', 'max'),
      )
      .reset_index()
)

print(df_daily_goes.head())

                       DATE    xrs_A_mean     xrs_A_min     xrs_A_max  \
0 1998-07-01 00:00:00+00:00  5.222819e-09  1.450000e-09  1.410000e-07   
1 1998-07-02 00:00:00+00:00  8.807846e-09  1.570000e-09  4.510000e-08   
2 1998-07-03 00:00:00+00:00  1.686650e-08  1.470000e-09  5.210000e-07   
3 1998-07-04 00:00:00+00:00  6.156368e-09  1.440000e-09  1.380000e-07   
4 1998-07-05 00:00:00+00:00  2.525917e-08  1.450000e-09  7.450000e-07   

     xrs_B_mean     xrs_B_min     xrs_B_max  
0  2.292431e-07  1.270000e-07  1.370000e-06  
1  4.277818e-07  2.140000e-07  8.990000e-07  
2  5.265921e-07  2.370000e-07  5.780000e-06  
3  3.646153e-07  2.480000e-07  1.430000e-06  
4  5.796479e-07  2.280000e-07  7.660000e-06  


# Solar data
***

In [78]:
print(df_solar.info())
print(df_solar.shape)
print(df_solar.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10330 entries, 0 to 10329
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Date                                10330 non-null  object
 1   Radio Flux 10.7cm                   10330 non-null  int64 
 2   Sunspot Number                      10330 non-null  int64 
 3   Sunspot Area (10^6 Hemis.)          10330 non-null  int64 
 4   New Regions                         10330 non-null  int64 
 5   Stanford Mean Solar Field (GOES15)  10330 non-null  object
 6   Stanford Background X-Ray Flux      10330 non-null  object
 7   Flares: C                           10330 non-null  int64 
 8   Flares: M                           10330 non-null  int64 
 9   Flares: X                           10330 non-null  int64 
 10  Flares: S                           10330 non-null  int64 
 11  Flares: 1                           10330 non-null  in

In [79]:
# Convert the time labels to datetime objects
df_solar["DATE"] = pd.to_datetime(df_solar["Date"], utc=True)

print(df_solar.head())

         Date  Radio Flux 10.7cm  Sunspot Number  Sunspot Area (10^6 Hemis.)  \
0  1997-01-01                 72               0                           0   
1  1997-01-02                 72               0                           0   
2  1997-01-03                 73               0                           0   
3  1997-01-04                 74              13                          10   
4  1997-01-05                 74              15                          20   

   New Regions Stanford Mean Solar Field (GOES15)  \
0            0                                  *   
1            0                                  *   
2            0                                  3   
3            1                                  *   
4            0                                  5   

  Stanford Background X-Ray Flux  Flares: C  Flares: M  Flares: X  Flares: S  \
0                           A0.5          0          0          0         -1   
1                           A0.5        

# Creating final dataframe
***

In [88]:
# Merge dataframes

# merge GOES data
df_merged = df_daily_all_spots.merge(
    df_daily_goes, on='DATE', how='left')

# merge solar indices
df_merged = df_merged.merge(
    df_solar, on='DATE', how='left')

In [89]:
# Check columns
print(df_merged.columns)

Index(['DATE', 'USFLUX', 'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'MEANJZD',
       'TOTUSJZ', 'MEANALP', 'MEANJZH', 'TOTUSJH', 'ABSNJZH', 'SAVNCPP',
       'MEANPOT', 'TOTPOT', 'MEANSHR', 'SHRGT45', 'R_VALUE', 'HARPNUM',
       'xrs_A_mean', 'xrs_A_min', 'xrs_A_max', 'xrs_B_mean', 'xrs_B_min',
       'xrs_B_max', 'Date', 'Radio Flux 10.7cm', 'Sunspot Number',
       'Sunspot Area (10^6 Hemis.)', 'New Regions',
       'Stanford Mean Solar Field (GOES15)', 'Stanford Background X-Ray Flux',
       'Flares: C', 'Flares: M', 'Flares: X', 'Flares: S', 'Flares: 1',
       'Flares: 2', 'Flares: 3'],
      dtype='object')


In [90]:
columns = [
    'DATE', 
    'USFLUX', 
    'MEANGAM', 
    'MEANGBT', 
    'MEANGBZ', 
    'MEANGBH', 
    'MEANJZD',
    'TOTUSJZ', 
    'MEANALP', 
    'MEANJZH', 
    'TOTUSJH', 
    'ABSNJZH', 
    'SAVNCPP',
    'MEANPOT', 
    'TOTPOT', 
    'MEANSHR', 
    'SHRGT45', 
    'R_VALUE',
    'xrs_A_mean', 
    'xrs_A_min', 
    'xrs_A_max', 
    'xrs_B_mean', 
    'xrs_B_min',
    'xrs_B_max',
    'Radio Flux 10.7cm', 
    'Sunspot Number',
    'Sunspot Area (10^6 Hemis.)', 
    'New Regions',
    'Flares: C', 
    'Flares: M', 
    'Flares: X', 
    'Flares: S', 
    'Flares: 1',
    'Flares: 2', 
    'Flares: 3']

In [91]:
df_flares = df_merged[columns]
print(df_flares.head())

                       DATE        USFLUX    MEANGAM     MEANGBT     MEANGBZ  \
0 2010-05-01 00:00:00+00:00  2.325578e+21  29.580376  121.361911  123.196711   
1 2010-05-02 00:00:00+00:00  2.696859e+21  28.751549  122.228577  122.295676   
2 2010-05-03 00:00:00+00:00  2.736471e+21  32.679652  114.202538  116.841407   
3 2010-05-04 00:00:00+00:00  3.459664e+21  29.875355  111.631345  113.176893   
4 2010-05-05 00:00:00+00:00  4.498598e+21  32.720370  114.997301  116.687383   

     MEANGBH   MEANJZD       TOTUSJZ   MEANALP   MEANJZH  ...  Sunspot Number  \
0  51.438868  0.322770  2.560842e+12 -0.003638 -0.001203  ...              13   
1  49.115366  0.482847  2.705160e+12 -0.007909 -0.001914  ...              47   
2  50.343133  0.221901  3.094572e+12 -0.002675 -0.000645  ...              61   
3  47.272120  0.142148  3.900520e+12 -0.015443 -0.005443  ...              70   
4  53.344742  0.286059  5.273561e+12 -0.019879 -0.008015  ...              77   

   Sunspot Area (10^6 Hemis.)  N

In [92]:
# save final datafreme
df_flares.to_csv('./Data/data_flares.csv', index=False)