# EDA

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
from dask_ml.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import dask_xgboost
from collections import OrderedDict
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime

sns.set_theme(style="white")
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})
%matplotlib inline

In [None]:
# Location of the parquet dataset passed to `dd.read_parquet` below.
# NOTE(review): left as an empty placeholder here — it must be set to a real
# dataset path before the notebook can run end to end.
PATH = ''

## 0.0 Start Cluster

In [None]:
# Create a SLURM-backed dask cluster using the 'slurm' section of the
# dask-jobqueue configuration.
cluster = dask_jobqueue.SLURMCluster(config_name='slurm')

In [None]:
# Request 4 jobs from the cluster.
cluster.scale(jobs=4)

In [None]:
# Connect a distributed client to the cluster; it is passed explicitly to the
# xgboost dask calls further down.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client summary as the cell output.
client

## 1.0 Load data

In [None]:
# Read the parquet dataset at PATH as a dask dataframe.
df = dd.read_parquet(PATH)

In [None]:
# Display the dask dataframe's repr as the cell output.
df

In [None]:
# Persist the dataframe so that dask keeps the loaded partitions on the
# cluster and the cells below can reuse them instead of re-reading the files.
df = df.persist()

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Total number of rows in the dataset.
len(df)

## 2.0 Preparation

In [None]:
# Target: difference between the gdps_2t and obs_2t columns.
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
# `step` is interpreted as seconds (see the timedelta conversion below), so
# dividing by 3600 expresses it in hours.
df['step_hour'] = df['step'] / 3600
# Use the lowercase 's' unit: the uppercase 'S' alias is deprecated in recent
# pandas releases, while 's' is accepted by every version.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')

In [None]:
# Preview the dataframe again, now including the derived columns.
df.head()

In [None]:
# Name of the column the model is trained to predict.
target = 'error_2t'

# No categorical features are used at the moment.
cat_columns = []

# Continuous features fed to the model, in the order they are presented to it.
cont_columns = [
    'gdps_prate',
    'gdps_prmsl',
    'gdps_2t',
    'gdps_2d',
    'gdps_2r',
    'gdps_10u',
    'gdps_10v',
    'gdps_10si',
    'gdps_10wdir',
    'gdps_al',
    'gdps_t_850',
    'gdps_t_500',
    'gdps_gh_1000',
    'gdps_gh_850',
    'gdps_gh_500',
    'gdps_u_500',
    'gdps_v_500',
    'gdps_q_850',
    'gdps_q_500',
    'gdps_thick',
]

* Let's keep 2019-01-01 to 2020-07-01 as train set (18 months)
* 2020-07-01 to 2020-10-01 as valid set (3 months)
* 2020-10-01 to 2020-12-31 as test set (3 months)

In [None]:
# Date range covered by the dataset, used to sanity-check the split boundaries.
earliest_date = df.date.min().compute()
latest_date = df.date.max().compute()
print(f'min: {earliest_date}')
print(f'max: {latest_date}')

In [None]:
# Temporal split: build each boolean mask once and reuse it for both the
# feature frame and the target, so X and y are guaranteed to use the same rows.
split_columns = ['date', 'station'] + cat_columns + cont_columns
is_train = df.date <= '2020-07-01'
is_valid = (df.date > '2020-07-01') & (df.date <= '2020-10-01')
is_test = df.date > '2020-10-01'

X_train = df[is_train][split_columns]
y_train = df[is_train][target]
X_valid = df[is_valid][split_columns]
y_valid = df[is_valid][target]
X_test = df[is_test][split_columns]
y_test = df[is_test][target]

# Fraction of rows in each split; the total is computed once instead of
# once per split.
n_total = len(df)
print(f'train: {len(X_train)/n_total}, valid: {len(X_valid)/n_total}, test: {len(X_test)/n_total}')

# Number of rows per distinct date value in each split.
train_counts = X_train.date.value_counts().compute()
valid_counts = X_valid.date.value_counts().compute()
test_counts = X_test.date.value_counts().compute()

# Visual check that the three splits do not overlap in time.
fig, ax = plt.subplots()
ax.scatter(train_counts.index, train_counts, s=1, c='g', label='train')
ax.scatter(valid_counts.index, valid_counts, s=1, c='b', label='valid')
ax.scatter(test_counts.index, test_counts, s=1, c='r', label='test')
ax.set_xlabel('date')
ax.set_ylabel('rows per date')
ax.set_title('Rows per date in each split')
ax.legend();

# Evaluate all six boundary dates in a single dask.compute call rather than
# six separate .compute() round-trips.
(train_start, train_end,
 valid_start, valid_end,
 test_start, test_end) = dask.compute(
    X_train.date.min(), X_train.date.max(),
    X_valid.date.min(), X_valid.date.max(),
    X_test.date.min(), X_test.date.max(),
)
print(f'Train: {train_start} -> {train_end}, Valid: {valid_start} -> {valid_end}, Test: {test_start} -> {test_end}')

* Check for missing data: none of the splits may contain NaN values

In [None]:
# Every split must be free of missing values before it is handed to XGBoost.
for split_frame in (X_train, X_valid, X_test):
    has_missing = split_frame.isnull().values.any()
    assert not has_missing, "There are NaN values in the dataframe"

* Not every station seen in the train set also appears in the valid and test sets

In [None]:
# Lazy (dask) collections of the distinct stations in each split.
train_stations = X_train.station.unique()
valid_stations = X_valid.station.unique()
test_stations = X_test.station.unique()

# Materialise the three collections in one dask.compute call and turn them
# into plain Python sets. Previously the lazy collections were iterated
# directly inside set(), which evaluated the train stations twice.
train_station_set, valid_station_set, test_station_set = (
    set(stations)
    for stations in dask.compute(train_stations, valid_stations, test_stations)
)

print(f"{len(train_station_set - valid_station_set)} stations not in valid set")
print(f"{len(train_station_set - test_station_set)} stations not in test set")

## 3.0 Training

### 3.0.2 Train XGBoost model with a temporal split by date

In [None]:
# Keep only the model features, leaving out the 'date' and 'station' columns
# that were needed for the split diagnostics. Selecting the feature columns
# explicitly (instead of dropping from a self-reassigned name) keeps this cell
# idempotent: re-running it no longer raises a KeyError for columns that were
# already removed.
model_features = cat_columns + cont_columns
X_train = X_train[model_features]
X_test = X_test[model_features]
X_valid = X_valid[model_features]

* DMatrix is an internal data structure that is used by XGBoost.

In [None]:
# Wrap each split in a distributed DMatrix. Train and valid carry their labels;
# the test matrix holds features only because it is used purely for prediction.
dtrain = xgb.dask.DaskDMatrix(client, data=X_train, label=y_train)
dvalid = xgb.dask.DaskDMatrix(client, data=X_valid, label=y_valid)
dtest = xgb.dask.DaskDMatrix(client, data=X_test)

In [None]:
# Upper bound on the number of boosting iterations.
num_boost_round = 100

# Booster configuration: squared-error regression with the histogram tree
# method, reporting both RMSE and MAE on every evaluation set.
params = dict(
    objective='reg:squarederror',
    tree_method='hist',
    eval_metric=['rmse', 'mae'],
    eta=0.3,
)

# Evaluation sets monitored during training, with the names used as keys in
# the training history.
watchlist = [
    (dtrain, 'train'),
    (dvalid, 'valid'),
]

In [None]:
# Train on the cluster. The result is indexed below with 'booster' (the trained
# model) and 'history' (per-iteration metrics for each watchlist entry).
model = xgb.dask.train(
    client,
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10,
)

* Why is loss lower for Valid?

In [None]:
# RMSE per boosting iteration for the train and valid evaluation sets.
history = model['history']
fig, ax = plt.subplots(figsize=(10, 6))
for split_key, split_label in (('train', 'Train'), ('valid', 'Valid')):
    ax.plot(history[split_key]['rmse'], label=split_label)
ax.legend()
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('RMSE Loss')
plt.show()

In [None]:
# Plot the feature importances of the trained booster; the trailing ';'
# suppresses the returned object's repr in the cell output.
xgb.plot_importance(model['booster']);

* But loss is higher for test, as expected?

In [None]:
# Distributed prediction on the held-out test features (lazy dask collection).
predictions = xgb.dask.predict(client, model, dtest)

# scikit-learn metrics operate on in-memory arrays, so materialise the labels
# and predictions explicitly (in one pass) instead of relying on implicit
# conversion of lazy dask collections.
y_true, y_pred = dask.compute(y_test, predictions)

print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
# Take the square root ourselves: the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')