# EDA

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
from dask_ml.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import dask_xgboost
from collections import OrderedDict
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime

sns.set_theme(style="white")
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})
%matplotlib inline

In [None]:
# Location of the parquet dataset read by `dd.read_parquet` in the "Load data" section.
PATH = 'SMC01/2021-05-11-ppdataset'

## 0.0 Start Cluster

In [None]:
# Create a SLURM-backed dask cluster using the configuration section named 'slurm'.
cluster = dask_jobqueue.SLURMCluster(config_name='slurm')

In [None]:
# Ask the cluster for two jobs' worth of workers.
cluster.scale(jobs=2)

In [None]:
# Connect this notebook session to the cluster so dask work runs on its workers.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client summary.
client

## 1.0 Load data

In [None]:
# Open the parquet dataset as a (lazy) dask DataFrame.
df = dd.read_parquet(PATH)

In [None]:
# Display the dask DataFrame's structure.
df

In [None]:
# Keep the loaded dataset in the workers' memory so later computations reuse it
# instead of re-reading the parquet files.
df = df.persist()

In [None]:
# Peek at the first rows.
df.head()

In [None]:
# Total number of rows (triggers a computation over the partitions).
len(df)

## 2.0 Preparation

In [None]:
# Derived columns:
#   error_2t  - `gdps_2t` minus `obs_2t`; used as the prediction target below
#   step_hour - `step` divided by 3600 (seconds -> hours, given the unit used for step_td)
#   step_td   - `step` expressed as a timedelta
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
df['step_hour'] = df['step'] / 3600
# Lowercase 's' is the supported spelling for seconds; the uppercase 'S' alias is
# deprecated in recent pandas releases.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')

In [None]:
# Check the derived columns on the first rows.
df.head()

In [None]:
# Prediction target and feature groups used to build the train/valid/test frames.
target = 'error_2t'

# No categorical features are used.
cat_columns = []

# Continuous model fields, one per line.
cont_columns = [
    'gdps_prate',
    'gdps_prmsl',
    'gdps_2t',
    'gdps_2d',
    'gdps_2r',
    'gdps_10u',
    'gdps_10v',
    'gdps_10si',
    'gdps_10wdir',
    'gdps_al',
    'gdps_t_850',
    'gdps_t_500',
    'gdps_gh_1000',
    'gdps_gh_850',
    'gdps_gh_500',
    'gdps_u_500',
    'gdps_v_500',
    'gdps_q_850',
    'gdps_q_500',
    'gdps_thick',
]

In [None]:
# Date range covered by the dataset. Both reductions are handed to dask together
# so they are evaluated in a single pass instead of two separate computes.
first_date, last_date = dask.compute(df.date.min(), df.date.max())
print(f'min: {first_date}')
print(f'max: {last_date}')

In [None]:
# Chronological split boundaries (each is the inclusive upper bound of its split).
TRAIN_END = '2019-12-02'
VALID_END = '2020-01-01'

# Each row mask is built once and shared by the X_* frame and its y_* series.
train_mask = df.date <= TRAIN_END
valid_mask = (df.date > TRAIN_END) & (df.date <= VALID_END)
test_mask = df.date > VALID_END

# NOTE(review): the X_* frames keep the target column (the baselines further down
# group on it); it would need to be dropped before fitting a model on these frames.
split_columns = [target, 'date', 'station'] + cat_columns + cont_columns

X_train, y_train = df[train_mask][split_columns], df[train_mask][target]
X_valid, y_valid = df[valid_mask][split_columns], df[valid_mask][target]
X_test, y_test = df[test_mask][split_columns], df[test_mask][target]

# len() on a dask DataFrame triggers a computation, so each length is taken once
# (the original recomputed len(df) for every ratio).
n_total = len(df)
n_train, n_valid, n_test = len(X_train), len(X_valid), len(X_test)
print(f'train: {n_train/n_total}, valid: {n_valid/n_total}, test: {n_test/n_total}')

# Rows per date in each split, evaluated together.
train_counts, valid_counts, test_counts = dask.compute(
    X_train.date.value_counts(),
    X_valid.date.value_counts(),
    X_test.date.value_counts(),
)

fig, ax = plt.subplots()
ax.scatter(train_counts.index, train_counts, s=1, c='g', label='train')
ax.scatter(valid_counts.index, valid_counts, s=1, c='b', label='valid')
ax.scatter(test_counts.index, test_counts, s=1, c='r', label='test')
ax.set(title='Rows per date in each split', xlabel='date', ylabel='rows')
ax.legend();

* Check for missing data: each split is expected to contain no NaN values

In [None]:
# Every split must be free of missing values. The lazy "any NaN?" reduction is
# computed explicitly before being tested.
for split_frame in (X_train, X_valid, X_test):
    has_nan = split_frame.isnull().values.any().compute()
    assert not has_nan, "There are NaN values in the dataframe"

* Not all stations seen in the training set appear in the validation and test sets

In [None]:
# Lazy per-split collections of distinct station identifiers.
train_stations = X_train.station.unique()
valid_stations = X_valid.station.unique()
test_stations = X_test.station.unique()

# Materialise each lazy result exactly once. The original wrapped the lazy series in
# set() directly, which evaluated the training stations twice (once per comparison).
train_station_set, valid_station_set, test_station_set = (
    set(stations)
    for stations in dask.compute(train_stations, valid_stations, test_stations)
)
print(f"{len(train_station_set - valid_station_set)} stations not in valid set")
print(f"{len(train_station_set - test_station_set)} stations not in test set")

## 3.0 Training

### 3.0.0 Baseline

In [None]:
# Registry of baselines keyed by name; the cells below store a scalar, a per-station
# Series or a lookup DataFrame under each key. Iteration follows insertion order.
baselines = OrderedDict()

In [None]:
# Summary statistics of the absolute training error.
abs(y_train).describe().compute()

* Predicting no error (`error_2t = 0`) is a valid baseline: without `abs()`, the mean error is essentially zero.

In [None]:
# Constant prediction of zero error.
baselines['baseline_zero'] = 0
# Summary statistics of the signed training error (displayed as the cell output).
y_train.describe().compute()

* La moyenne/médiane sur toutes les stations (train set) est un premier baseline

In [None]:
# Global constant baselines: +/- the mean and +/- the median of the absolute
# training error. The mean and the summary table are evaluated together in one
# dask pass; the original triggered four separate computes for the same numbers.
abs_train_error = abs(y_train)
abs_error_mean, abs_error_stats = dask.compute(
    abs_train_error.mean(),
    abs_train_error.describe(),
)
# The median is read from describe()'s '50%' row.
# NOTE(review): dask estimates percentiles approximately - confirm this is acceptable.
abs_error_median = abs_error_stats.loc['50%']

baselines['baseline_mean_pos'] = abs_error_mean
baselines['baseline_mean_neg'] = -abs_error_mean
baselines['baseline_median_pos'] = abs_error_median
baselines['baseline_median_neg'] = -abs_error_median

* La moyenne/médiane par station (train set) est un deuxième baseline

In [None]:
# Per-station baselines: +/- the magnitude of each station's mean signed training
# error. The groupby is evaluated once and negated, instead of being computed twice.
# NOTE(review): this is |mean(error)| per station, whereas the global baselines use
# mean(|error|) - confirm which one is intended.
station_mean_abs = abs(X_train.groupby('station').error_2t.mean().compute())
baselines['baseline_station_mean_pos'] = station_mean_abs
baselines['baseline_station_mean_neg'] = -station_mean_abs
# TODO: per-station median baselines are disabled - the attempt below does not work
# with the parallelised (dask) data.
# baselines['baseline_station_median_pos'] = abs(df.iloc[X_train.index].groupby('station').error_2t.median())
# baselines['baseline_station_median_neg'] = -abs(df.iloc[X_train.index].groupby('station').error_2t.median())

* Prendre l'erreur de l'année précédente pour un tuple (station, date, step)

In [None]:
# Month-day key ('%m-%d'): rows from different years that fall on the same
# calendar day share the same key.
X_train['date_min'] = X_train.date.dt.strftime('%m-%d')

# Mean training error per (station, month-day), flattened into a lookup table
# with the columns station / date_min / error_2t.
baseline_last_year = (
    X_train
    .groupby(['station', 'date_min'])
    .error_2t
    .mean()
    .compute()
    .reset_index()
)
baselines['baseline_last_year'] = baseline_last_year

* Compute metrics for each baseline

In [None]:
def _lookup_predictions(frame, lookup, keys):
    """Left-join `lookup` onto `frame` by `keys` and return the computed `error_2t_pred` column.

    Rows of `frame` without a match in `lookup` come out of the left join as NaN;
    their count is printed and they are replaced by 0 (i.e. "no error").
    """
    # A named Series keyed by its index is flattened into a two-column frame so
    # the join key is an ordinary column on both sides of the merge.
    if isinstance(lookup, pd.Series):
        lookup = lookup.reset_index()
    merged = dd.merge(frame, lookup, on=keys, suffixes=('', '_pred'), how='left')
    predictions = merged.error_2t_pred.compute()
    print(f'{predictions.isna().sum()} NaN values: fill with 0.')
    return predictions.fillna(0)


# Materialise the test target once. The original passed the lazy dask Series to
# scikit-learn, which re-evaluated it for every metric of every baseline.
y_test_values = y_test.compute().to_numpy()

# Report MAE and RMSE on the test set for every registered baseline.
# NOTE(review): the metrics compare values positionally; this assumes the left join
# returns rows in the same order as y_test - confirm.
for k, v in baselines.items():
    print(k)
    if 'station' in k:
        # Per-station lookup, joined on the station identifier.
        predictions_ = _lookup_predictions(X_test, v, 'station')
    elif 'year' in k:
        # Lookup keyed by (station, month-day); the test frame needs the same key column.
        X_test['date_min'] = X_test.date.dt.strftime('%m-%d')
        predictions_ = _lookup_predictions(X_test, v, ['station', 'date_min'])
    else:
        # Scalar baseline: the same value predicted for every test row.
        predictions_ = np.full(len(y_test_values), v)
    print(f'\tMAE: {mean_absolute_error(y_test_values, predictions_)}')
    # RMSE via an explicit square root: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn releases.
    print(f'\tRMSE: {np.sqrt(mean_squared_error(y_test_values, predictions_))}')