# Explore Weather Data

This notebook is used to explore the DWD weather data, both with and without 
accompanying waiting time information. Its main purpose is to find out which weather
attributes are redundant or not descriptive.

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px

## First overview of the weather data and selection of relevant attributes

In [None]:
# Load the pre-processed weather tables of both stations.
# Both tables are indexed by "date" so that they share the same layout: the
# date strings stay out of the attribute columns (pair plots, correlation
# matrices) and rows can be selected or dropped by their date label.
koelnbonn_df = pd.read_csv("../../data/processed/weather_station02667_Koeln-Bonn.csv", index_col="date")
# NOTE(review): assumes the Lommersum file has the same "date" column as the
# Köln/Bonn file -- confirm against the processed data.
lommersum_df = pd.read_csv("../../data/processed/weather_station01327_Lommersum.csv", index_col="date")

In [None]:
# Preview the first five rows to check the available columns and the index.
koelnbonn_df.head(n=5)


In [None]:
# Pairwise scatter plots of the unfiltered Köln/Bonn table as a first look.
sns.pairplot(data=koelnbonn_df)

In [None]:
# Build a reduced table without the quality_*, snow depth and min/max
# temperature columns. `drop` returns a new DataFrame, so `koelnbonn_df`
# itself stays untouched.
excluded_columns = [
    "quality_wind",
    "quality_other",
    "snow_depth",
    "min_temperature_5cm",
    "max_temperature_2m",
    "min_temperature_2m",
]
short_df = koelnbonn_df.drop(columns=excluded_columns)

In [None]:
# Pairwise scatter plots of the reduced attribute set.
sns.pairplot(data=short_df)

In [None]:
# Show the rows whose precipitation_height exceeds 30 to inspect outliers.
short_df.loc[short_df["precipitation_height"] > 30]

=> Drop the precipitation outlier on 2021-07-14 (the July 2021 flood disaster, German: "Flutkatastrophe").

In [None]:
# Remove the 2021-07-14 row (precipitation outlier) from the reduced table.
# Raises KeyError if that date label is not (or no longer) in the index.
short_df = short_df.drop(index=["2021-07-14"])

In [None]:
# Keep only precipitation, sunshine and mean temperature in an independent
# copy, so later column additions do not affect `short_df`.
pst_columns = ["precipitation_height", "sunshine_duration", "mean_temperature"]
pst_df = short_df.loc[:, pst_columns].copy()

In [None]:
# Pairwise scatter plots of the three selected weather attributes.
sns.pairplot(data=pst_df)

## Load and process waiting time data

In [None]:
# Load the pre-processed waiting time records, indexed by their "id" column.
waiting_times_df = pd.read_csv(
    "../../data/processed/waiting_times.csv",
    index_col="id",
)

In [None]:
# Average the waiting times per date:
#   1. keep only records with a non-negative waiting_time
#      (NOTE(review): negative values presumably mark invalid entries -- confirm),
#   2. group by "date" and take the mean of the remaining columns,
#   3. drop 2021-07-14, the date that is also removed from the weather data.
mean_waiting_times_df = (
    waiting_times_df.loc[waiting_times_df["waiting_time"] >= 0]
    .groupby("date")
    .mean()
    .drop(index=["2021-07-14"])
)

In [None]:
# Histogram of the per-date means with a kernel density estimate overlaid.
sns.histplot(data=mean_waiting_times_df, kde=True)

## Plot weather data enriched with waiting time data

In [None]:
# Attach the mean waiting time per date (aligned on the index labels) and
# keep only the rows that are complete in every column.
pst_df = pst_df.assign(mean_waiting_time=mean_waiting_times_df["waiting_time"]).dropna(axis="index")

# Two-class label for colouring the plots: below 15 vs. 15 and above.
is_short_wait = pst_df["mean_waiting_time"] < 15
pst_df["bin_waiting_time"] = is_short_wait.map({True: "<15m", False: ">=15m"})

In [None]:
# Pairwise scatter plots coloured by the continuous mean waiting time.
sns.pairplot(data=pst_df, hue="mean_waiting_time")

In [None]:
# Pairwise scatter plots coloured by the two waiting time classes. The
# continuous waiting time is left out so only weather attributes are plotted;
# semi-transparent markers keep overlapping points visible.
binned_pst_df = pst_df.drop(columns=["mean_waiting_time"])
sns.pairplot(data=binned_pst_df, hue="bin_waiting_time", plot_kws={"alpha": 0.5})

In [None]:
# One scatter plot per weather attribute against the mean waiting time,
# side by side with a shared y axis so the panels are directly comparable.
fig, axs = plt.subplots(ncols=3, sharey=True, figsize=(20, 7))
precipitation_ax, sunshine_ax, temperature_ax = axs
shared_kwargs = {"data": pst_df, "y": "mean_waiting_time"}
sns.scatterplot(x="precipitation_height", ax=precipitation_ax, **shared_kwargs)
sns.scatterplot(x="sunshine_duration", ax=sunshine_ax, **shared_kwargs)
sns.scatterplot(x="mean_temperature", ax=temperature_ax, **shared_kwargs)

In [None]:
# Label every date with its waiting time class (below 15 vs. 15 and above);
# dates whose mean waiting time is NaN keep NaN as their label.
mean_wait = mean_waiting_times_df["waiting_time"]
wait_bins = (mean_wait < 15).map({True: "<15m", False: ">=15m"}).where(mean_wait.notna())

# The assignment aligns on the index labels, so rows of `short_df` without a
# matching label become NaN and are removed together with every other
# incomplete row.
short_df["bin_waiting_time"] = wait_bins
short_df = short_df.dropna(axis="index")

In [None]:
# Pairwise scatter plots of the reduced attribute set, coloured by the
# waiting time class; semi-transparent markers keep overlaps visible.
sns.pairplot(data=short_df, hue="bin_waiting_time", plot_kws={"alpha": 0.5})

## Pairwise correlation of weather attributes

In [None]:
# Pairwise correlation of the Lommersum weather attributes
# (`DataFrame.corr` uses the Pearson method by default).
# The quality_* and precipitation_form columns are left out, as are columns
# that contain no values at all.
ignored_columns = ["quality_wind", "quality_other", "precipitation_form"]
lommersum_attributes_df = lommersum_df.drop(columns=ignored_columns).dropna(axis="columns", how="all")
weather_pairwise_corr_df = lommersum_attributes_df.corr()

# Heatmap of the correlation matrix with the colour scale centred on 0.
lommersum_corr_fig = px.imshow(
    weather_pairwise_corr_df,
    width=800,
    height=800,
    title="pairwise Pearson correlation between weather attributes (Lommersum station)",
    color_continuous_scale=px.colors.sequential.thermal,
    color_continuous_midpoint=0.0,
)
lommersum_corr_fig.show()


In [None]:
# Pairwise correlation of the Köln/Bonn weather attributes
# (`DataFrame.corr` uses the Pearson method by default).
# The quality_* and precipitation_form columns are left out, as are columns
# that contain no values at all.
ignored_columns = ["quality_wind", "quality_other", "precipitation_form"]
koelnbonn_attributes_df = koelnbonn_df.drop(columns=ignored_columns).dropna(axis="columns", how="all")
weather_pairwise_corr_df = koelnbonn_attributes_df.corr()

# Heatmap of the correlation matrix with the colour scale centred on 0.
koelnbonn_corr_fig = px.imshow(
    weather_pairwise_corr_df,
    width=800,
    height=800,
    title="pairwise Pearson correlation between weather attributes (Köln/Bonn station)",
    color_continuous_scale=px.colors.sequential.thermal,
    color_continuous_midpoint=0.0,
)
koelnbonn_corr_fig.show()
