In [1]:
from src.arctic import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Given Data
- from Institute of Atmospheric Physics Kühlungsborn (IAP)
- Differentiated between ERA5 and UA-ICON(NWP) data

1. ERA5: daily means at pressure levels and analysis
    - Intercomparison with Hannachi et al. (2011)
    - SWM1.tar: contains major warming diagnostics
    - SWE1.tar: elliptic diagnostics of Z10
    - SWX1.tar: extended diagnostics (reachable via VPN)
    - SWC1.tar: classification synthesis (reachable via VPN)
2. NWP4A60: data at pressure levels and analysis
    - SWM2.tar: major warming diagnostics
    - SWE2.tar: elliptic diagnostics of Z10
    - SWX2.tar: extended diagnostics (VPN)
    - SWC2.tar: classification synthesis (VPN)

#### File structure
The netCDF files hold 3-dimensional data and were inspected with ncdump. The .tar files contain all output from Christoph Zülicke's diagnostics, in particular:
- SWM: ASCII values in 2 files, 
    - a) *cen.csv with the major warming events, each identified by its central day (when the wind turned from westerly to easterly), along with some characteristics, and
    - b) *msw.csv with daily lists with wind and a marker mw for major warming
- SWE: 1 csv file
    - *d.csv: daily list of indices, including the aspect ratio of the mother vortex (ar), central latitude (latcent) and area (area)

In [30]:
# Load the diagnostic CSV tables for both data sets.
# Preliminary: ideally this is replaced by the Climate Data Store (CDS) API.
# Each entry pairs a file path with the number of leading lines handed to
# `skiprows` when reading it.
era5_d, era5_cen, era5_msw = (
    pd.read_csv(csv_path, skiprows=n_skip)
    for csv_path, n_skip in (
        ("data/SWXClust/ERA5/e5e1_d.csv", 75),
        ("data/SWXClust/ERA5/e5m1t-cen.csv", 30),
        ("data/SWXClust/ERA5/e5m1t-msw.csv", 25),
    )
)

nwp_d, nwp_cen, nwp_msw = (
    pd.read_csv(csv_path, skiprows=n_skip)
    for csv_path, n_skip in (
        ("data/SWXClust/NWP4A60/i4a60e2_d.csv", 75),
        ("data/SWXClust/NWP4A60/i4a60m2t-cen.csv", 30),
        ("data/SWXClust/NWP4A60/i4a60m2t-msw.csv", 25),
    )
)

In [38]:
# delete space in front of strings
def no_white_space(df, sep = ' '):
    """Strip separator padding from the column names of ``df`` in place.

    Every string column name is split on ``sep`` and replaced by its last
    non-empty piece, e.g. ``' counter'`` becomes ``'counter'``. Names with a
    trailing separator (``' time '``) therefore keep their text instead of
    collapsing to an empty string.

    Parameters
    ----------
    df : object with an assignable ``columns`` attribute (e.g. a DataFrame)
        Its ``columns`` are overwritten with the cleaned names.
    sep : str, default ' '
        Separator the names are split on.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    cleaned = []
    for c in df.columns:
        if not isinstance(c, str):
            # Non-string labels (e.g. integer positions) have nothing to strip.
            cleaned.append(c)
            continue
        # Discard empty pieces so leading/trailing separators cannot win.
        pieces = [piece for piece in c.split(sep) if piece]
        # A name made only of separators still maps to '' as before.
        cleaned.append(pieces[-1] if pieces else '')
    df.columns = cleaned

# Clean the column names of every loaded table (in place).
for table in (era5_d, era5_cen, era5_msw, nwp_d, nwp_cen, nwp_msw):
    no_white_space(table)

# Drop the first four rows of the two daily tables.
# NOTE: the names are rebound to the sliced frames, so re-running this cell
# removes another four rows each time; reload the data before re-running.
era5_d = era5_d.iloc[4:]
nwp_d = nwp_d.iloc[4:]

# Show the cleaned column names of the ERA5 daily table.
era5_d.columns

Index(['D', 'counter', 'time', 'string', 'year', 'month', 'day', 'hour',
       'level', 'edge', 'form', 'area', 'obj_area', 'latcent', 'loncent',
       'theta', 'ar', 'kurtosis', 'area1', 'obj_area1', 'latcent1', 'loncent1',
       'theta1', 'ar1', 'area2', 'obj_area2', 'latcent2', 'loncent2', 'theta2',
       'ar2', 'amp0', 'amp1', 'amp2', 'lon1', 'lon2'],
      dtype='object')

### Focus and To Do
- ERA5 Data, replicate results from Hannachi
- descriptive statistics:
    - PSA
    - correlation matrix/chi2
- hierarchical Clustering, k-means, k-nearest neighbours


In [8]:
# preprocessing

In [None]:
# clustering

In [3]:
# visualisation