# MagPySV example workflow - European observatories

# Setup

In [None]:
# Setup python paths and import some modules
from IPython.display import Image
import sys
sys.path.append('..')
import os
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Import all of the MagPySV modules
import magpysv.denoise as denoise
import magpysv.io as io
import magpysv.model_prediction as model_prediction
import magpysv.plots as plots
import magpysv.tools as tools

In [None]:
# IPython magic: select matplotlib's 'notebook' backend for the figures below
%matplotlib notebook

# Data download

In [None]:
from gmdata_webinterface import consume_webservices as cws

# Observatories of interest and the output path for their data
observatory_list = ['CLF', 'NGK', 'WNG']
download_dir = 'data'

# First and last day of the data download
start_date = dt.date(1960, 1, 1)
end_date = dt.date(2010, 12, 31)

# Required dataset - only the hourly WDC dataset is currently supported
service = 'WDC'
cadence = 'hour'

# Fetch the requested files and save them under download_dir
cws.fetch_data(
    start_date=start_date,
    end_date=end_date,
    station_list=observatory_list,
    cadence=cadence,
    service=service,
    saveroot=download_dir,
)

# Initial processing

Extract all data from the WDC files, convert into the proper hourly means using the tabular base and save the X, Y and Z components to CSV files.  This may take a few minutes.

In [None]:
# Hourly CSV files are written to an 'hourly' sub-directory of the download path
write_dir = os.path.join(download_dir, 'hourly')

# Convert the WDC files of every listed observatory, printing each one as it is
# processed (print_obs=True)
io.wdc_to_hourly_csv(
    wdc_path=download_dir,
    write_dir=write_dir,
    obs_list=observatory_list,
    print_obs=True,
)

In [None]:
# Read the baseline discontinuity information from the 'baseline_records' file;
# the result is passed to tools.correct_baseline_change in the next cell
baseline_data = tools.get_baseline_info(fname='baseline_records')

In [None]:
# Input/output directories used for every observatory (independent of the loop)
hourly_dir = os.path.join(download_dir, 'hourly')
monthly_mf_dir = os.path.join(download_dir, 'monthly_mf')
fdmm_sv_dir = os.path.join(download_dir, 'monthly_sv', 'fdmm')

# For each observatory, build the SV series as first differences of monthly
# means (FDMM) and store both the monthly field and the SV on disk
for observatory in observatory_list:
    print(observatory)

    # Hourly magnetic field values written by the initial processing step
    data_file = observatory + '.csv'
    hourly_data = io.read_csv_data(fname=os.path.join(hourly_dir, data_file),
                                   data_type='mf')

    # Monthly means of the hourly values
    resampled_field_data = tools.data_resampling(
        hourly_data, sampling='MS', average_date=True)

    # Correct documented baseline changes. The return value is not used, so the
    # correction is presumably applied to resampled_field_data in place - TODO confirm
    tools.correct_baseline_change(
        observatory=observatory,
        field_data=resampled_field_data,
        baseline_data=baseline_data,
        print_data=True)

    # Save the monthly means of the magnetic field
    io.write_csv_data(data=resampled_field_data, write_dir=monthly_mf_dir,
                      obs_name=observatory)

    # SV from monthly means separated by one month (mean_spacing=1)
    sv_data = tools.calculate_sv(resampled_field_data, mean_spacing=1)

    # Save the SV series
    io.write_csv_data(data=sv_data, write_dir=fdmm_sv_dir, obs_name=observatory)

# Concatenate the data for our selected observatories

Besides the Setup section, everything preceding this cell only needs to be run once.

In [None]:
# Where the data are stored, and the observatories of interest
download_dir = 'data'
observatory_list = ['CLF', 'NGK', 'WNG']

# Analysis window; arguments are (year, month, day)
start = dt.datetime(1960, 1, 1)
end = dt.datetime(2010, 12, 31)

# Combine the FDMM files of the selected observatories with the model
# predictions stored in 'model_predictions'
fdmm_path = os.path.join(download_dir, 'monthly_sv', 'fdmm')
obs_data, model_sv_data, model_mf_data = io.combine_csv_data(
    start_date=start,
    end_date=end,
    obs_list=observatory_list,
    data_path=fdmm_path,
    model_path='model_predictions',
    day_of_month=1,
)

# Keep the dates on their own; they are passed separately to the plotting and
# denoising functions below
dates = obs_data['date']

In [None]:
# Display the combined observatory SV table returned by io.combine_csv_data
obs_data

# SV plots

In [None]:
# One figure per observatory: observed SV together with the COV-OBS prediction
for observatory in observatory_list:
    # Columns whose names contain the observatory code
    observed_sv = obs_data.filter(regex=observatory)
    predicted_sv = model_sv_data.filter(regex=observatory)
    fig = plots.plot_sv(
        dates=dates,
        sv=observed_sv,
        model=predicted_sv,
        obs=observatory,
        model_name='COV-OBS',
        plot_legend=False,
        fig_size=(6, 6),
        font_size=10,
        label_size=16,
    )

# Residuals

To calculate SV residuals, we need SV predictions from a geomagnetic field model. This example uses output from the COV-OBS model by Gillet et al. (2013, Geochem. Geophys. Geosyst.,
https://doi.org/10.1002/ggge.20041; 2015, Earth, Planets and Space,
https://doi.org/10.1186/s40623-015-0225-z) to obtain model
predictions for these observatory locations. The code can be obtained from
http://www.spacecenter.dk/files/magnetic-models/COV-OBSx1/ and no modifications
are necessary to run it using functions found in MagPySV's model_prediction module. For convenience, model output for the locations used in this notebook is included in the examples directory.

In [None]:
# SV residuals, calculated from the observed SV and the model SV predictions
residuals = tools.calculate_residuals(obs_data=obs_data, model_data=model_sv_data)

In [None]:
# Remove the 'date' column from both tables in place; the dates themselves are
# still available in the separate `dates` series.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'date' has already been dropped.
model_sv_data.drop(['date'], axis=1, inplace=True, errors='ignore')
obs_data.drop(['date'], axis=1, inplace=True, errors='ignore')

# External noise removal

Compute covariance matrix of the residuals (for all observatories combined) and its eigenvalues and eigenvectors. Since the residuals represent signals present in the data, but not the internal field model, we use them to find a proxy for external magnetic fields (Wardinski & Holme, 2011, GJI, https://doi.org/10.1111/j.1365-246X.2011.04988.x). 

In [None]:
# Eigenvalue analysis of the residuals and denoising of the SV data.
# proxy_number=2: the two noisiest eigendirections are used to denoise in this
# example (see the eigenvector discussion further down).
# Outputs used later: denoised (plots, outlier detection), proxy (Dst comparison),
# eigenvals / eigenvecs (eigenvalue and eigenvector plots), corrected_residuals
# (comparison plots).
denoised, proxy, eigenvals, eigenvecs, projected_residuals, corrected_residuals = denoise.eigenvalue_analysis(
    dates=dates, obs_data=obs_data, model_data=model_sv_data, residuals=residuals,
    proxy_number=2)

# Denoised SV plots

Plots showing the original SV data, the denoised data (optionally with a running average) and the field model predictions.

In [None]:
# Settings shared by every comparison figure
comparison_style = dict(model_name='COV-OBS', fig_size=(6, 6), font_size=10,
                        label_size=14, plot_rms=True)

for observatory in observatory_list:
    # Columns of each table that belong to this observatory
    station_series = {
        'denoised_sv': denoised.filter(regex=observatory),
        'residuals': residuals.filter(regex=observatory),
        'corrected_residuals': corrected_residuals.filter(regex=observatory),
        'noisy_sv': obs_data.filter(regex=observatory),
        'model': model_sv_data.filter(regex=observatory),
    }
    xratio, yratio, zratio = plots.plot_sv_comparison(
        dates=dates, obs=observatory, **station_series, **comparison_style)

Plots showing the denoised data (optionally with a running average) and the field model predictions.

In [None]:
# One figure per observatory: denoised SV together with the COV-OBS prediction
for observatory in observatory_list:
    station_denoised = denoised.filter(regex=observatory)
    station_model = model_sv_data.filter(regex=observatory)
    plots.plot_sv(
        dates=dates,
        sv=station_denoised,
        model=station_model,
        obs=observatory,
        model_name='COV-OBS',
        plot_legend=False,
        fig_size=(6, 6),
        font_size=10,
        label_size=14,
    )

# Plot proxy signal, eigenvalues and eigenvectors

Compare the proxy signal used to denoise the data with the Dst index, which measures the intensity of the equatorial ring current. Files for the ap (ap_fdmm.csv) and AE (ae_fdmm.csv) indices are also included.

In [None]:
# Compare the proxy signal with the Dst index stored in the index_data directory.
# The path is built with os.path.join, matching how the ap index files are
# located further down in this notebook.
dst_file = os.path.join('index_data', 'dst_fdmm.csv')
plots.plot_index_dft(index_file=dst_file, dates=denoised.date, signal=proxy,
                     fig_size=(6, 6), font_size=10, label_size=14,
                     plot_legend=True, index_name='Dst')

Plot the eigenvalues of the covariance matrix of the residuals

In [None]:
# Plot the eigenvalues returned by denoise.eigenvalue_analysis
plots.plot_eigenvalues(values=eigenvals, font_size=12, label_size=16, fig_size=(6, 3))

Plot the eigenvectors corresponding to the three largest eigenvalues. The noisiest direction (used to denoise in this example) is mostly X, with some Z, which is consistent with the ring current for European observatories. The second noisiest direction (also used to denoise in this example) is predominantly Z, with some X, and has a large semi-annual contribution that is likely of external origin. However, the third noisiest direction is a coherent Y signal across Europe, which does not correspond to a known direction of external signal. We did not remove this direction during denoising as it could be a real internal field variation that is not captured by the field model. However, its DFT shows a significant semi-annual contribution so this eigendirection is likely to be in part of external origin.

In [None]:
# eigenvecs[:,0:3] passes only the first three columns of the eigenvector
# matrix, i.e. the three eigenvectors discussed in the text above
plots.plot_eigenvectors(obs_names=observatory_list, eigenvecs=eigenvecs[:,0:3], fig_size=(6, 4),
                          font_size=10, label_size=14)

# Outlier detection

Remove remaining spikes in the time series.

In [None]:
# Take the dates out so that only SV columns are looped over
denoised.drop(['date'], axis=1, inplace=True)

# Settings shared by every column
outlier_options = dict(threshold=5, window_length=120, plot_fig=False,
                       fig_size=(10, 3), font_size=10, label_size=14)

# Replace each SV column by its outlier-screened version
for column in denoised.columns:
    screened = denoise.detect_outliers(dates=dates, signal=denoised[column],
                                       obs_name=column, **outlier_options)
    denoised[column] = screened

# Put the dates back as the first column
denoised.insert(0, 'date', dates)

# Write denoised data to file

In [None]:
# All denoised files go to the same directory
denoised_dir = os.path.join(download_dir, 'denoised', 'european')

for observatory in observatory_list:
    print(observatory)
    # Columns of this observatory, with the dates placed in front
    sv_data = denoised.filter(regex=observatory)
    sv_data.insert(loc=0, column='date', value=dates)
    # Replace the observatory-specific column names with generic ones
    sv_data.columns = ["date", "dX", "dY", "dZ"]
    io.write_csv_data(data=sv_data, write_dir=denoised_dir,
                      obs_name=observatory, decimal_dates=False)

# Averaging data over Europe

Select denoised data for each SV component at all observatories

In [None]:
# Denoised data, one table per SV component (columns from all observatories)
obs_X, obs_Y, obs_Z = (
    denoised.filter(regex=component) for component in ('dX', 'dY', 'dZ')
)

# Model predictions, split by component in the same way
model_X, model_Y, model_Z = (
    model_sv_data.filter(regex=component) for component in ('dX', 'dY', 'dZ')
)

Average data and model for each component

In [None]:
# Row-wise mean over the observatory columns of the denoised data, stored as
# single-column tables named after the component
mean_X = pd.DataFrame(obs_X.values.mean(axis=1), columns=['dX'])
mean_Y = pd.DataFrame(obs_Y.values.mean(axis=1), columns=['dY'])
mean_Z = pd.DataFrame(obs_Z.values.mean(axis=1), columns=['dZ'])

# Row-wise mean over the observatory columns of the model predictions
mean_model_X = model_X.mean(axis=1)
mean_model_Y = model_Y.mean(axis=1)
mean_model_Z = model_Z.mean(axis=1)

Remove outliers from averaged data

In [None]:
# Settings shared by the three components
mean_outlier_options = dict(threshold=2.5, window_length=72, plot_fig=False,
                            fig_size=(10, 3), font_size=10, label_size=14)

# Screen each averaged component in turn (X, then Y, then Z) and rebind the
# names to the screened series
mean_X, mean_Y, mean_Z = (
    denoise.detect_outliers(dates=dates, signal=averaged, obs_name=component,
                            **mean_outlier_options)
    for component, averaged in (('X', mean_X), ('Y', mean_Y), ('Z', mean_Z))
)

Look at model predictions for all observatories, and the averaged model, to see if the average is representative of the trend at all locations

In [None]:
# Legend entries follow observatory_list instead of a hand-written copy of it,
# so the legend stays correct when the list of observatories is changed.
# Assumes the columns of model_X are ordered like observatory_list - TODO confirm
legend_labels = list(observatory_list) + ['Average']

plt.figure(figsize=(6,6))

# X component: one line per observatory, dashed black line for the average
plt.subplot(3, 1, 1)
plt.plot(dates, model_X)
plt.plot(dates, mean_model_X, 'k--')
# Single-row legend placed above the top panel
plt.legend(legend_labels, frameon=False, fontsize=10, loc=(0.1,1.04),
           ncol=len(legend_labels))

# Y component (carries the shared y-axis label)
plt.subplot(3, 1, 2)
plt.plot(dates, model_Y)
plt.plot(dates, mean_model_Y, 'k--')
plt.ylabel('SV (nT/yr)',  fontsize=14)

# Z component (carries the x-axis label)
plt.subplot(3, 1, 3)
plt.plot(dates, model_Z)
plt.plot(dates, mean_model_Z, 'k--')
plt.xlabel('Year',  fontsize=14)

Plot the averaged data and model

In [None]:
# Averaged data (blue) against the averaged model (red), one panel per component.
# The model averages were computed in the averaging cell above and are reused here.
plt.figure(figsize=(6, 6))

# X component
plt.subplot(3,1,1)
plt.plot(dates, mean_X, 'b')
plt.plot(dates, mean_model_X, 'r')

# Y component, with the shared y-axis label
plt.subplot(3,1,2)
plt.plot(dates, mean_Y, 'b')
plt.plot(dates, mean_model_Y, 'r')
plt.ylabel('SV (nT/yr)', fontsize=14)

# Z component, with the x-axis label and the legend
plt.subplot(3,1,3)
plt.plot(dates, mean_Z, 'b', label='Averaged data')
plt.plot(dates, mean_model_Z, 'r', label='Averaged COV-OBS')
plt.xlabel('Year',  fontsize=14)
plt.legend(loc='best', fontsize=10, frameon=False)

## Data selection using the ap index

Select an observatory, load its hourly magnetic field data and correct documented baseline changes

In [None]:
# Observatory used for the ap-index example
observatory = 'CLF'
data_file = observatory + '.csv'

# Hourly magnetic field data of that observatory
hourly_path = os.path.join(download_dir, 'hourly', data_file)
hourly_data = io.read_csv_data(fname=hourly_path, data_type='mf')

# Baseline discontinuity information read from the 'baseline_records' file
baseline_data = tools.get_baseline_info(fname='baseline_records')

# Correct documented baseline changes in the hourly data
tools.correct_baseline_change(
    observatory=observatory,
    field_data=hourly_data,
    baseline_data=baseline_data,
    print_data=True,
)

Apply an ap criterion to discard noisy data

In [None]:
# Same threshold for both criteria; the index files live in one directory
ap_threshold = 7.0
index_dir = 'index_data'

# Hourly criterion: discard hours with ap > threshold
ap_hourly_applied = tools.apply_Ap_threshold(
    obs_data=hourly_data,
    Ap_file=os.path.join(index_dir, 'ap_hourly.csv'),
    threshold=ap_threshold)

# Daily criterion: discard days with Ap > threshold (where Ap is the daily
# average of the 3-hourly ap values)
ap_daily_applied = tools.apply_Ap_threshold(
    obs_data=hourly_data,
    Ap_file=os.path.join(index_dir, 'ap_daily.csv'),
    threshold=ap_threshold)

In [None]:
# Display the hourly data table that was passed to tools.apply_Ap_threshold
hourly_data

Calculate the percentage of data remaining after applying the ap threshold

In [None]:
# Number of X values in the unfiltered data is the reference for both ratios
total_count = hourly_data.X.count()

# Share of X values that survive each criterion, in percent
hourly_remaining = ap_hourly_applied.X.count()/total_count * 100
daily_remaining = ap_daily_applied.X.count()/total_count * 100

print('Hourly ap threshold applied: ', hourly_remaining, '% remaining')
print('Daily Ap threshold applied: ', daily_remaining, '% remaining')

Compare the hourly magnetic field data before and after applying the ap threshold

In [None]:
# Three stacked panels (X, Y, Z): all hourly data in blue, data kept by the
# hourly ap criterion in red, data kept by the daily Ap criterion in cyan.
# All series are plotted against hourly_data.date.
# NOTE(review): the x-limits stop at 2010-01-01, whereas the download and
# analysis windows above run to 2010-12-31 - confirm this is intended.
plt.figure(figsize=(6, 6))
# X component
plt.subplot(3, 1, 1)
plt.plot(hourly_data.date, hourly_data.X, 'b')
plt.plot(hourly_data.date, ap_hourly_applied.X, 'r')
plt.plot(hourly_data.date, ap_daily_applied.X, 'c')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
# Y component (carries the shared y-axis label)
plt.subplot(3, 1, 2)
plt.plot(hourly_data.date, hourly_data.Y, 'b')
plt.plot(hourly_data.date, ap_hourly_applied.Y, 'r')
plt.plot(hourly_data.date, ap_daily_applied.Y, 'c')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
plt.ylabel('Magnetic Field (nT)', fontsize=16)
# Z component (carries the x-axis label and the legend)
plt.subplot(3, 1, 3)
plt.plot(hourly_data.date, hourly_data.Z, 'b', label='All data')
plt.plot(hourly_data.date, ap_hourly_applied.Z, 'r', label='ap ≤ 7')
plt.plot(hourly_data.date, ap_daily_applied.Z, 'c', label='Ap ≤ 7')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
plt.xlabel('Year', fontsize=16)
plt.legend(frameon=False)
plt.tight_layout()

In [None]:
# Set the dates aside so that only the field components are looped over
d = hourly_data['date']
hourly_data.drop(['date'], axis=1, inplace=True)

# Settings shared by every component; the window is 24*365*10 samples long
hourly_outlier_options = dict(threshold=10, signal_type='MF',
                              window_length=24*365*10, plot_fig=True,
                              fig_size=(7, 4), font_size=10, label_size=14)

# Replace each component by its outlier-screened version
for column in hourly_data.columns:
    screened = denoise.detect_outliers(dates=d, signal=hourly_data[column],
                                       obs_name=column, **hourly_outlier_options)
    hourly_data[column] = screened

# Put the dates back as the first column
hourly_data.insert(0, 'date', d)

Compare the SV obtained when calculated using all hourly data and using hourly data with the ap threshold applied

# Comparing FDMM and ADMM

In [None]:
# Monthly means of the hourly data prepared above
resampled_field_data = tools.data_resampling(
    hourly_data, sampling='MS', average_date=True)

# SV from the monthly means: a spacing of 1 month gives the FDMM series and a
# spacing of 12 months gives the ADMM series
sv_fdmm, sv_admm = (
    tools.calculate_sv(resampled_field_data, mean_spacing=spacing)
    for spacing in (1, 12)
)

In [None]:
# Plot the SV calculated as FDMM (blue) and ADMM (red), one panel per component.
# NOTE(review): the columns are read as lowercase dx/dy/dz here, while the
# combined and denoised tables earlier in the notebook use dX/dY/dZ - confirm
# against the output of tools.calculate_sv.
# NOTE(review): the x-limits stop at 2010-01-01, whereas the analysis window
# above runs to 2010-12-31 - confirm this is intended.
plt.figure(figsize=(7, 6))
# X component
plt.subplot(3, 1, 1)
plt.plot(sv_fdmm.date, sv_fdmm.dx, 'b')
plt.plot(sv_admm.date, sv_admm.dx, 'r')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
# Y component (carries the shared y-axis label)
plt.subplot(3, 1, 2)
plt.plot(sv_fdmm.date, sv_fdmm.dy, 'b')
plt.plot(sv_admm.date, sv_admm.dy, 'r')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
plt.ylabel('SV (nT/yr)', fontsize=16)
# Z component (carries the x-axis label and the legend)
plt.subplot(3, 1, 3)
plt.plot(sv_fdmm.date, sv_fdmm.dz, 'b', label='FDMM')
plt.plot(sv_admm.date, sv_admm.dz, 'r', label = 'ADMM')
plt.xlim([dt.date(1960, 1, 1), dt.date(2010, 1, 1)])
# Treat the x-axis of the current (bottom) panel as dates
plt.gca().xaxis_date()
plt.xlabel('Year', fontsize=16)
plt.legend(frameon=False)