In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pgf import FigureCanvasPgf

# Route PDF output through the pgf canvas so that saved figures are typeset by LaTeX.
matplotlib.backend_bases.register_backend('pdf', FigureCanvasPgf)
# The seaborn theme is applied first; the rcParams update below runs after it.
sns.set_theme()
plt.rcParams.update({
    'pgf.texsystem': 'pdflatex',
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'pgf.preamble': '\\usepackage{lmodern}',
})

# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
vienna: pd.DataFrame = pd.read_pickle('zamg_vienna.pickle')
linz: pd.DataFrame = pd.read_pickle('zamg_linz.pickle')
# Captured before the 'Station' column is added below, so `columns` holds only the Linz columns as loaded.
columns = linz.columns

# Give the Vienna frame exactly the Linz columns, in the same order.
vienna = vienna.reindex(columns=columns)
vienna.loc[:, 'Station'] = 'Vienna Inner City'
linz.loc[:, 'Station'] = 'Linz City'
# The `copy` keyword is intentionally not passed: copying was already the default, and the keyword
# is deprecated in recent pandas releases.
df = pd.concat([vienna, linz])
df

In [None]:
# Summary statistics per station, followed by the per-column correlation between the two stations.
print('Vienna:')
print(vienna.describe())
print('Linz:')
print(linz.describe())

# The values are paired by position, so both frames are assumed to hold the same rows in the
# same order - TODO confirm.
for col in columns:
    linz_values = linz[col].values
    vienna_values = vienna[col].values
    pearson_r = np.corrcoef(linz_values, vienna_values)[0, 1]
    print(f'{col} Pearson correlation coefficient: {pearson_r}')

In [None]:
# Then look at every column in closer detail:
# For the sunshine duration (SO), the plotted values do not make sense as they are; the daily sum
# would have to be computed first. However, it is sufficient here to compare the distribution
# between the datasets.
import os

# Directory that receives the PDF figures saved by the plotting helpers.
out_dir = 'analysis'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(out_dir, exist_ok=True)


def plot_daily_averages(data: pd.DataFrame, y: str, hue: str, y_label: str = None, group=True):
    """Plot column ``y`` over the days of a year, one line per ``hue`` value, and save it as a PDF.

    The figure is written to ``<out_dir>/daily_avg_<y>.pdf``; nothing is returned.

    :param data: Frame holding the ``y`` and ``hue`` columns. With ``group=True`` its index must
        expose ``.month`` and ``.day`` (e.g. a ``DatetimeIndex``).
    :param y: Name of the column to plot.
    :param hue: Name of the column whose distinct values each get their own line.
    :param y_label: Optional y-axis label; the label is left untouched when ``None``.
    :param group: If ``True``, ``y`` is averaged per (month, day) before plotting. If ``False``,
        the values of each ``hue`` group are plotted as they are and are expected to already
        be one value per calendar day, in calendar order.
    :raises ValueError: If ``group`` is ``False`` and a group has neither 365 nor 366 values.
    """
    # All lines share one reference year on the x-axis. A leap year is used so that Feb 29 has a
    # slot; the year itself never shows because the tick format below is month/day only.
    ref_year = 2020
    all_days = pd.date_range(f'{ref_year}-01-01', f'{ref_year}-12-31')
    is_leap_day = (all_days.month == 2) & (all_days.day == 29)

    fig = plt.Figure(figsize=(5, 4))
    ax: plt.Axes = fig.add_subplot(1, 1, 1)
    for label, group_data in data.groupby(hue):
        if group:
            daily: pd.Series = group_data[y].groupby(
                [group_data.index.month, group_data.index.day]).mean()
            # Build the x positions from the (month, day) keys that are actually present, so data
            # without a Feb 29 (or covering only part of a year) no longer causes a length mismatch.
            days = pd.to_datetime(pd.DataFrame({
                'year': ref_year,
                'month': np.asarray(daily.index.get_level_values(0)),
                'day': np.asarray(daily.index.get_level_values(1)),
            })).values
        else:
            daily = group_data[y]
            if len(daily) == len(all_days):
                days = all_days
            elif len(daily) == len(all_days) - 1:
                # 365 values: presumably a series without a Feb 29 entry, so skip that day.
                days = all_days[~is_leap_day]
            else:
                raise ValueError(
                    f'expected 365 or 366 daily values for {label!r} when group=False, '
                    f'got {len(daily)}'
                )
        ax.plot(days, daily.values, label=label)
    ax.legend()
    if y_label is not None:
        ax.set_ylabel(y_label)
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %d'))
    fig.savefig(os.path.join(out_dir, f'daily_avg_{y}.pdf'), bbox_inches='tight', pad_inches=0.1)


def plot_boxplot(data: pd.DataFrame, x: str, y: str, y_label: str = None):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` and save it as a PDF.

    The figure is written to ``<out_dir>/boxplot_<y>.pdf``; nothing is returned.

    :param data: Frame holding the ``x`` and ``y`` columns.
    :param x: Name of the column that defines the boxes.
    :param y: Name of the column whose distribution is drawn.
    :param y_label: Optional y-axis label; the label seaborn sets is kept when ``None``.
    """
    figure = plt.Figure(figsize=(5, 5))
    axes: plt.Axes = figure.add_subplot(1, 1, 1)
    sns.boxplot(data=data, x=x, y=y, ax=axes)
    if y_label is not None:
        axes.set_ylabel(y_label)
    target = os.path.join(out_dir, f'boxplot_{y}.pdf')
    figure.savefig(target, bbox_inches='tight', pad_inches=0.1)

In [None]:
# Axis label (quantity and unit) for every column that is plotted directly.
col_to_labels = {
    'TL': 'TL [°C]',
    'P': 'P [hPa]',
    'RF': 'RF [%]',
}
# One box plot and one daily-average plot per column, both split by station.
for col in col_to_labels:
    label = col_to_labels[col]
    plot_boxplot(data=df, x='Station', y=col, y_label=label)
    plot_daily_averages(data=df, y=col, hue='Station', y_label=label)

In [None]:
# Corrected sunshine (SO) averages:
# The per-station pieces are collected in lists and concatenated once after the loop. Seeding the
# result with an empty frame and concatenating inside the loop is slower and lets the empty
# object-dtype frame influence the resulting dtypes.
daily_frames = []
yearly_frames = []
daily_by_station = {}
for label, data in df.groupby('Station'):
    # First calculate the total sunshine duration (in hours) per day.
    # NOTE(review): dividing by 60 * 60 presumes SO is recorded in seconds - confirm.
    daily_sum = data['SO'].groupby([data.index.year, data.index.month, data.index.day]).sum() / (60 * 60)
    daily_by_station[label] = daily_sum
    print(f'Station {label}:')
    print(daily_sum.describe())

    daily_frames.append(pd.DataFrame({'SO': daily_sum.values, 'Station': label}))

    # Then compute the mean over all years (levels 1 and 2 of the index are month and day).
    label_means = daily_sum.groupby(level=[1, 2]).mean()
    yearly_frames.append(pd.DataFrame({'SO': label_means.values, 'Station': label}))

# ignore_index=True gives each combined frame a fresh 0..n-1 index instead of repeated labels.
sunshine_daily = pd.concat(daily_frames, ignore_index=True)
sunshine_yearly = pd.concat(yearly_frames, ignore_index=True)
linz_daily = daily_by_station['Linz City']
vienna_daily = daily_by_station['Vienna Inner City']

plot_boxplot(sunshine_daily, x='Station', y='SO', y_label='SO [h]')
plot_daily_averages(sunshine_yearly, y='SO', hue='Station', y_label='SO [h]', group=False)
# The daily sums are paired by position, so both stations are assumed to cover the same days.
print(f'SO Pearson correlation coefficient: {np.corrcoef(linz_daily.values, vienna_daily.values)[0, 1]}')

In [None]:
# Mean TL for Vienna over the rows labelled 2010 through 2019 (label slices include both ends).
vienna.loc['2010':'2019', 'TL'].mean()