In [None]:
import datetime
from pathlib import Path
from typing import Union

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pgf import FigureCanvasPgf

# Register the PGF canvas for the 'pdf' format so saved PDFs are rendered through LaTeX.
matplotlib.backend_bases.register_backend('pdf', FigureCanvasPgf)

# The seaborn theme is applied first; the LaTeX-related settings are written afterwards
# so that they take precedence over whatever the theme configured.
sns.set_theme()

latex_rc = {
    'pgf.texsystem': 'pdflatex',
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'pgf.preamble': '\\usepackage{lmodern}',
}
plt.rcParams.update(latex_rc)

# The raw measurements are spread over three CSV exports (2010-2013, 2014-2017, 2018-2021 by file name).
csv_files = (
    'data/linz_20100101_20131231.csv',
    'data/linz_20140101_20171231.csv',
    'data/linz_20180101_20211231.csv',
)
frames = [
    pd.read_csv(Path.cwd() / csv_file, index_col='time', parse_dates=True)
    for csv_file in csv_files
]
df = pd.concat(frames)
# Strip the timezone from the index so that all later lookups use timezone-naive timestamps.
df.index = df.index.tz_convert(None)
df

In [None]:
def get_unique_column_values(df: pd.DataFrame) -> list:
    """Return the names of all columns in which every row holds the same value.

    Each matching column is also reported on stdout together with its value.

    Args:
        df: The frame to inspect. It is not modified.

    Returns:
        A list of column labels (in column order) whose values are all equal.
        An empty frame yields an empty list, because a column without rows has
        no value to report. NaN never compares equal to itself, so a column
        consisting only of NaN is not reported.
    """
    # cf. https://stackoverflow.com/a/54405767
    def is_unique(s: pd.Series) -> bool:
        a = s.to_numpy()
        # Guard the empty case: a[0] would raise IndexError on a column without rows.
        return len(a) > 0 and bool((a[0] == a).all())

    result = []
    for col in df.columns:
        if is_unique(df[col]):
            # .iloc[0] is explicitly positional; a plain [0] is a label lookup and only falls
            # back to positional access (deprecated) on non-integer indices such as a DatetimeIndex.
            print(f'Column {col} has only a single value: {df[col].iloc[0]}')
            result.append(col)

    return result


def remove_duplicate_indices(df: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
    """Drop rows whose index label occurs more than once, keeping the last occurrence.

    The duplicated index labels are printed before they are removed. When no
    label is duplicated, a short message is printed and the input object itself
    is returned unchanged.
    """
    duplicated_rows = df[df.index.duplicated(keep=False)]
    if not duplicated_rows.empty:
        print('Duplicated indices:')
        print(duplicated_rows.index)

        # Every occurrence except the last one of each label is marked for removal.
        superseded = df.index.duplicated(keep='last')
        return df[~superseded]

    print('There are no duplicate indices')
    return df


# Columns holding a single constant value carry no information and are dropped;
# for duplicated timestamps only the last row is kept.
df = remove_duplicate_indices(df.drop(columns=get_unique_column_values(df)))
df

We will predict the air temperature measurements 2 m above ground ("TL").
Possible input attributes are air temperature (TL), air pressure (P), reduced air pressure (P0), wind direction (DD), mean wind speed (FFAM), relative humidity (RF), precipitation (RR), sunshine duration (SO), and dew point (TP).
In the parameter study we only use air temperature, air pressure, relative humidity, and sunshine duration.
Therefore, we remove the other columns.

In [None]:
# First check whether the index is complete, i.e. holds exactly one row for every 10-minute step
# between its first and last timestamp.
# Index.equals is used because an element-wise `==` raises when the two indices differ in length,
# which is precisely the "incomplete" case this check is meant to report as False.
expected_index = pd.date_range(df.index.min(), df.index.max(), freq='10min')
complete = df.index.equals(expected_index)
print(f'Dataset ranging from {df.index.min()} to {df.index.max()} in 10-minute steps: {complete}')
# Keep only the attributes used in the parameter study, each together with its flag column.
df = df.reindex(columns=['TL', 'TL_FLAG', 'P', 'P_FLAG', 'RF', 'RF_FLAG', 'SO', 'SO_FLAG'], copy=False)
df

In [None]:
# Visual check of the full air temperature series.
df.loc[:, 'TL'].plot.line()
plt.show()

In [None]:
# Only a small share of the rows (~0.1%) has a missing value in at least one column.
has_missing_value = df.isna().any(axis='columns')
df.loc[has_missing_value]

In [None]:
# Small gaps (noted for the sunshine duration) are closed by linear interpolation.
# At most 11 consecutive 10-minute samples are filled, i.e. the surrounding valid values are up to 2 hours apart.
# The interpolation runs over every column of the frame, including the *_FLAG columns.
# NOTE(review): `limit` caps the number of values filled per gap — check whether gaps longer than the
# limit end up partially filled rather than left untouched.
max_filled_samples = 11
df = df.interpolate(method='linear', limit_area='inside', limit=max_filled_samples)
gaps = df.loc[df.isna().any(axis=1)]
gaps

In [None]:
# There is an outlier in the air pressure column ('P'); the box plot is drawn to make it visible.
# NOTE(review): the Series is passed positionally; which axis seaborn maps it to may depend on the
# installed seaborn version — confirm before changing this call.
sns.boxplot(df['P'])
# Overwrite the pressure value at this single timestamp with a hard-coded replacement.
# NOTE(review): 987.8 is presumably derived from the neighbouring measurements — verify against the raw data.
df.loc['2016-11-24 14:50:00', 'P'] = 987.8

In [None]:
# Some larger gaps remain, concentrated on a few days.
# Interpolation is not sufficient for them, as a gap of multiple days cannot be interpolated.
# Collect the distinct calendar days on which a row with missing values occurs (order of first appearance).
gap_days = pd.unique(gaps.index.date)
print('Missing values on :')
for day in gap_days:
    # Partial-string indexing selects every row of that calendar day.
    daily = df.loc[str(day)]
    rows_with_nan = daily.isna().any(axis=1)
    missing = daily.loc[rows_with_nan]
    # A full day holds 60 / 10 * 24 = 144 ten-minute samples.
    print(f'{day}: {len(missing)}\t(={len(missing) / (60 / 10 * 24) * 100:.2f}%)')

In [None]:
# let's look at the gaps one after another
from datetime import datetime, timedelta


def extend_gap(gap: slice, delta: timedelta):
    """Widen a gap by ``delta`` on both sides.

    ``gap.start`` and ``gap.stop`` must be ISO-formatted timestamp strings; the
    returned slice carries ``datetime`` objects as its bounds.
    """
    widened_start = datetime.fromisoformat(gap.start) - delta
    widened_stop = datetime.fromisoformat(gap.stop) + delta
    return slice(widened_start, widened_stop)


# In the first gap only the sunshine duration is missing.
gap = slice('2012-07-24 17:10:00', '2012-07-25 06:40:00')
# Plot the sunshine duration with 48 hours of context on either side of the gap.
context_window = extend_gap(gap, timedelta(hours=48))
df['SO'].loc[context_window].plot()
plt.show()
# It is reasonable that the sun stopped shining at 17:10, similar to the previous day,
# so the whole gap is filled with zero sunshine.
df.loc[gap.start:gap.stop, 'SO'] = 0

In [None]:
# and then we have one big gap where all data is missing
# %matplotlib qt
gap = slice('2017-04-06 01:50:00', '2017-04-09 23:50:00')
# Plot all columns with 144 hours (6 days) of context on either side of the gap.
df.loc[extend_gap(gap, timedelta(hours=144))].plot()
plt.show()
# We cannot identify a significant weather change in these 4 days, hence every missing row is
# replaced by the average of the rows at the same time of day 24, 48 and 72 hours earlier.
# The gap is longer than 24 hours, so later rows are built from earlier, already filled rows;
# the rows therefore have to be processed in chronological order.
past_hours = [24, 48, 72]
for dt in pd.date_range(gap.start, gap.stop, freq='10min'):
    # The unit must be spelled out: timedelta's first positional argument is *days*,
    # so timedelta(h) would look back 24/48/72 days instead of hours.
    df.loc[dt] = sum(df.loc[dt - timedelta(hours=h)] for h in past_hours) / len(past_hours)
# %matplotlib inline

In [None]:
# Simple plausibility check of the final data: list every row whose temperature lies outside
# [-15, 40] or whose TL/RF/P flag exceeds 300 (flag values above 300 indicate a potentially
# faulty measurement).
temperature_implausible = df['TL'].lt(-15) | df['TL'].gt(40)
flag_suspicious = df[['TL_FLAG', 'RF_FLAG', 'P_FLAG']].gt(300).any(axis=1)
df.loc[temperature_implausible | flag_suspicious]

In [None]:
# The flag attributes are no longer needed; without them the dataset should be free of null values.
flag_columns = ['TL_FLAG', 'P_FLAG', 'RF_FLAG', 'SO_FLAG']
df = df.drop(columns=flag_columns)
print(f'Remaining NaN values: {df[df.isna().any(axis=1)]}')
# The high-level summary statistics serve as a final sanity check.
df.describe()

In [None]:
# The correlation matrix shows that all attributes are quite unique.
# The correlation between relative humidity, sunshine duration and temperature is plausible.
df.corr(method='pearson')

In [None]:
# Persist the cleaned 10-minute dataset in the current working directory.
df.to_pickle(Path.cwd() / 'zamg_linz.pickle')
df

In [None]:
# For hourly sampling the sunshine duration has to be summed over one hour:
# each row receives the sum of its own value and the five preceding 10-minute values.
hourly_sunshine = df['SO'].rolling(window=6).sum()
# The rolling window leaves NaN entries at the start; the first six rows are dropped
# so that the data starts with the next full hour.
df = df.assign(SO=hourly_sunshine).iloc[6:]
df.to_pickle(Path.cwd() / 'zamg_linz_hourly.pickle')
df

In [None]:
# Plot the mean temperature per calendar day, averaged over all years in the data.
calendar_day_mean: pd.Series = df['TL'].groupby([df.index.month, df.index.day]).mean()
# The (month, day) groups are mapped onto the days of 2020 as a plotting axis.
# The lengths must match, i.e. the data has to contain a 29 February.
reference_year = pd.date_range('2020-01-01', '2020-12-31')
climate: pd.DataFrame = pd.DataFrame({'TL': calendar_day_mean.to_numpy()}, index=reference_year)
climate.plot()

In [None]:
# Total sunshine duration per calendar date; dividing by 6 undoes the rolling hourly sum,
# in which every 10-minute value is counted six times.
by_date = [df.index.year, df.index.month, df.index.day]
daily_sunshine: pd.Series = df['SO'].groupby(by_date).sum() / 6
# Average the daily totals over all years, grouped by (month, day).
sunshine = daily_sunshine.groupby(level=[1, 2]).mean()
sunshine.plot()
plt.show()
sunshine