# EDA

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
from dask_ml.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import dask_xgboost
from collections import OrderedDict
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime

# Single plotting-theme call: the previous `sns.set_theme(style="white")` was immediately
# overridden by a second `sns.set(...)` call (`sns.set` is an alias of `set_theme`), so the
# two calls are merged into the preferred API with the same final settings.
sns.set_theme(style='white', context='notebook', rc={'figure.figsize': (14, 10)})
%matplotlib inline

In [None]:
# Path handed to dd.read_parquet in the "Load data" section.
# NOTE(review): empty placeholder here - presumably must be set to the dataset location before running.
PATH = ''

## 0.0 Start Cluster

In [None]:
# Create a SLURM-backed Dask cluster; settings are read from the 'slurm' entry of the
# dask-jobqueue configuration rather than being spelled out in the notebook.
cluster = dask_jobqueue.SLURMCluster(config_name='slurm')

In [None]:
# Ask the scheduler for two SLURM jobs' worth of workers.
cluster.scale(jobs=2)

In [None]:
# Connect a distributed client to the cluster; the XGBoost cells below pass this
# `client` to xgb.dask (DaskDMatrix / train / predict).
client = dask.distributed.Client(cluster)

In [None]:
client

## 1.0 Load data

In [None]:
# Lazily open the parquet dataset at PATH as a dask DataFrame (no data is loaded yet).
df = dd.read_parquet(PATH)

In [None]:
df

In [None]:
# Start loading the dataframe into cluster memory so the repeated computations
# further down reuse it instead of re-reading the parquet files.
df = df.persist()

In [None]:
df.head()

In [None]:
len(df)

## 2.0 Preparation

In [None]:
# Derived columns (lazy - nothing is computed here).
# error_2t: model value minus observed value (gdps_2t - obs_2t); this is the column
# selected as `target` for training below.
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
# step expressed in hours; assumes `step` is in seconds, consistent with unit='s' below.
df['step_hour'] = df['step'] / 3600
# Same step as a timedelta. Lower-case 's' is used because the upper-case 'S' alias
# is deprecated in recent pandas releases.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')

In [None]:
df.head()

In [None]:
# Feature configuration shared by the split/training cells below.
target = 'error_2t'  # column the models are trained to predict
cat_columns = []     # no categorical features are selected
# Continuous predictor columns, one per line, in the order they are fed to the model.
cont_columns = [
    'gdps_prate',
    'gdps_prmsl',
    'gdps_2t',
    'gdps_2d',
    'gdps_2r',
    'gdps_10u',
    'gdps_10v',
    'gdps_10si',
    'gdps_10wdir',
    'gdps_al',
    'gdps_t_850',
    'gdps_t_500',
    'gdps_gh_1000',
    'gdps_gh_850',
    'gdps_gh_500',
    'gdps_u_500',
    'gdps_v_500',
    'gdps_q_850',
    'gdps_q_500',
    'gdps_thick',
]

* Split temporel naïf : on prend les X premières semaines pour train, les X suivantes pour valid, les X suivantes pour test
* 70/10/20
* shuffle=False pour garder l'ordre temporel
* **Even with shuffle=False, it seems that the way dask parallelizes the task still shuffles the instances.**

In [None]:
# Chained 70/10/20 split: hold out 30 % of the rows, then cut that hold-out into
# 1/3 validation and 2/3 test.
X = df[['date'] + cat_columns + cont_columns]
y = df[target]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.30, shuffle=False)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=2/3, shuffle=False)

# Rows per date in each subset, evaluated together in one dask.compute call
# instead of three separate blocking computes.
train_counts, valid_counts, test_counts = dask.compute(
    X_train.date.value_counts(),
    X_valid.date.value_counts(),
    X_test.date.value_counts(),
)

# Visual check of the split: a clean temporal split would show three
# non-overlapping date ranges.
fig, ax = plt.subplots()
ax.scatter(train_counts.index, train_counts, s=1, c='g', label='train')
ax.scatter(valid_counts.index, valid_counts, s=1, c='b', label='valid')
ax.scatter(test_counts.index, test_counts, s=1, c='r', label='test')
ax.set(xlabel='date', ylabel='rows per date', title='Rows per date in each split')
ax.legend(markerscale=10);

In [None]:
# `date` was only kept to inspect the split; remove it from the three feature matrices.
X_train, X_valid, X_test = (
    features.drop('date', axis=1) for features in (X_train, X_valid, X_test)
)

* Deal with missing data

In [None]:
# Guard: stop here if any selected column of X contains a missing value.
has_missing = X.isnull().values.any()
assert not has_missing, "There are NaN values in the dataframe"

* We do not find all stations that we trained on in the valid and test sets

In [None]:
# Distinct stations present in each subset (still lazy at this point).
train_stations = df.loc[X_train.index].station.unique()
valid_stations = df.loc[X_valid.index].station.unique()
test_stations = df.loc[X_test.index].station.unique()

# Materialize the three results together and only once: calling set() on the lazy
# series directly would re-evaluate the training stations for each print below.
train_set, valid_set, test_set = (
    set(stations) for stations in dask.compute(train_stations, valid_stations, test_stations)
)
print(f"{len(train_set - valid_set)} stations not in valid set")
print(f"{len(train_set - test_set)} stations not in test set")

## 3.0 Training

### 3.0.0 Baseline

In [None]:
# Registry of baseline predictors, filled by the cells below and evaluated in the
# "Compute metrics for each baseline" cell. Values are either a scalar or a
# per-station pandas Series.
baselines = OrderedDict()

* En moyenne, la prédiction est off de 2.98 degC par rapport à la valeur observée. La médiane est de 3.01 degC d'erreur.

In [None]:
# Signed error_2t values restricted to the training rows; reused by the baseline cells below.
errors = df.loc[X_train.index].error_2t
# Summary statistics of the absolute error (triggers a computation on the cluster).
errors.abs().describe().compute()

* No error (error_2t=0) can be a valid baseline, without abs() the mean is basically zero.

In [None]:
# Baseline that always predicts "no error".
baselines['baseline_zero'] = 0
# Summary statistics of the signed error, displayed as the cell output.
errors.describe().compute()

* La moyenne/médiane sur toutes les stations (train set) est une première baseline

In [None]:
# Global baselines: +/- the mean and +/- the median of the absolute training error.
# Both statistics are evaluated in a single dask.compute call; previously the same
# mean and the same describe() were each computed twice.
abs_errors = abs(errors)
mean_abs_error, abs_error_summary = dask.compute(abs_errors.mean(), abs_errors.describe())
# describe() reports the median under the '50%' label
# (NOTE(review): dask quantiles are approximate - confirm that is acceptable here).
median_abs_error = abs_error_summary.loc['50%']

baselines['baseline_mean_pos'] = mean_abs_error
baselines['baseline_mean_neg'] = -mean_abs_error
baselines['baseline_median_pos'] = median_abs_error
baselines['baseline_median_neg'] = -median_abs_error

* La moyenne/médiane par station (train set) est une deuxième baseline

In [None]:
# Per-station baselines: absolute value of each station's mean signed training error,
# computed once and reused for both signs (the same groupby was previously run twice).
# The result is a pandas Series named 'error_2t' indexed by station, which is what the
# metrics cell merges on.
# NOTE(review): this is |mean(error)| per station, whereas the global baselines use
# mean(|error|) - confirm the difference is intended.
station_mean_abs_error = abs(df.loc[X_train.index].groupby('station').error_2t.mean().compute())
baselines['baseline_station_mean_pos'] = station_mean_abs_error
baselines['baseline_station_mean_neg'] = -station_mean_abs_error
# Per-station median baselines are intentionally absent: the groupby median was
# reported as not working with parallelization.

* Prendre l'erreur de l'année précédente pour un tuple (station, date, step)

In [None]:
# Not enough data analyzed for now because of memory issues

* Compute metrics for each baseline

In [None]:
# Evaluate every registered baseline on the test set.
# The test target and the test rows are loop-invariant, so they are prepared once
# instead of being re-evaluated for every baseline and every metric.
y_true = y_test.compute()
test_rows = df.loc[X_test.index]

for k, v in baselines.items():
    if 'station' in k:
        # Per-station baseline: look up each test row's station in the Series `v`.
        # NOTE(review): relies on the merge returning rows in the same order as y_test - confirm.
        predictions_ = dd.merge(test_rows, v, on='station', suffixes=('', '_pred'), how='left').error_2t_pred.compute()
        # Stations absent from the training set have no baseline value.
        print(f'{predictions_.isna().sum()} NaN values: fill with 0.')
        predictions_ = predictions_.fillna(0)
    else:
        # Scalar baseline: the same constant prediction for every test row.
        predictions_ = np.full(len(y_true), v)
    print(k)
    print(f'\tMAE: {mean_absolute_error(y_true, predictions_)}')
    # RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
    # scikit-learn releases and removed in the newest ones.
    print(f'\tRMSE: {np.sqrt(mean_squared_error(y_true, predictions_))}')

### 3.0.1 Train XGboost model

In [None]:
# Wrap the dask collections in XGBoost's distributed data container.
# Train and valid carry features and labels; dtest carries features only because
# it is used solely for prediction.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dvalid = xgb.dask.DaskDMatrix(client, X_valid, y_valid)
dtest = xgb.dask.DaskDMatrix(client, X_test)

In [None]:
# Upper bound on boosting iterations (training may stop earlier, see early_stopping_rounds in the train call).
num_boost_round = 100
# Booster configuration: squared-error regression, histogram tree method,
# RMSE and MAE tracked at every iteration, learning rate (eta) 0.3.
params = {'objective': 'reg:squarederror', 
          'tree_method': 'hist',
          'eval_metric': ['rmse', 'mae'],
          'eta': 0.3
         }
# Datasets evaluated after every round; the names become the keys of model['history'].
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [None]:
# Distributed training; progress is logged every 10 rounds. The returned dict is read
# below through model['history'] (per-round metrics) and model['booster'].
# NOTE(review): per the XGBoost docs, early stopping watches the last metric of the last
# eval set (here 'mae' on 'valid'), not the RMSE plotted below - confirm this is intended.
model = xgb.dask.train(client, params, dtrain, num_boost_round, evals=watchlist, verbose_eval=10, early_stopping_rounds=10)

In [None]:
# Learning curves: RMSE per boosting round on the train and validation sets.
fig, ax = plt.subplots(figsize=(10, 6))
history = model['history']
for split_name, curve_label in (('train', 'Train'), ('valid', 'Valid')):
    ax.plot(history[split_name]['rmse'], label=curve_label)
ax.legend()
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('RMSE Loss')
plt.show()

In [None]:
# Feature-importance chart of the trained booster; the trailing ';' hides the Axes repr.
xgb.plot_importance(model['booster']);

In [None]:
# Lazy test-set predictions from the trained model.
predictions = xgb.dask.predict(client, model, dtest)
# Materialize target and predictions together, once, rather than implicitly inside
# each metric call.
y_true, y_pred = dask.compute(y_test, predictions)
print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
# scikit-learn releases and removed in the newest ones.
print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')

### 3.0.2 Train XGboost model, split temporal by dates

* Let's keep 2019-01-01 to 2020-07-01 as train set (18 months)
* 2020-07-01 to 2020-10-01 as valid set (3 months)
* 2020-10-01 to 2020-12-31 as test set (3 months)

In [None]:
# First and last date in the dataset, obtained in a single pass over the data.
date_min, date_max = dask.compute(df.date.min(), df.date.max())
print(f'min: {date_min}')
print(f'max: {date_max}')

In [None]:
X = df[['date'] + cat_columns + cont_columns]
y = df[target]

# Calendar cut-offs of the temporal split (train <= train_end < valid <= valid_end < test).
train_end = '2020-07-01'
valid_end = '2020-10-01'

# Rows are selected with boolean masks rather than `.loc[<index labels>]`: label lookup
# is only correct when index labels are unique, which is not guaranteed for a dataframe
# read from parquet, and the mask also avoids a label-based lookup.
train_mask = df.date <= train_end
valid_mask = (df.date > train_end) & (df.date <= valid_end)
test_mask = df.date > valid_end

# Index objects kept under their original names for any cell that still refers to them.
train_idx = df[train_mask].index
valid_idx = df[valid_mask].index
test_idx = df[test_mask].index

X_train, y_train = X[train_mask], y[train_mask]
X_valid, y_valid = X[valid_mask], y[valid_mask]
X_test, y_test = X[test_mask], y[test_mask]

# Share of rows in each subset; the total row count is computed once.
n_total = len(X)
print(f'train: {len(X_train)/n_total}, valid: {len(X_valid)/n_total}, test: {len(X_test)/n_total}')

# Rows per date in each subset, evaluated together in one dask.compute call.
train_counts, valid_counts, test_counts = dask.compute(
    X_train.date.value_counts(),
    X_valid.date.value_counts(),
    X_test.date.value_counts(),
)

# Visual check: the three subsets must occupy disjoint, consecutive date ranges.
fig, ax = plt.subplots()
ax.scatter(train_counts.index, train_counts, s=1, c='g', label='train')
ax.scatter(valid_counts.index, valid_counts, s=1, c='b', label='valid')
ax.scatter(test_counts.index, test_counts, s=1, c='r', label='test')
ax.set(xlabel='date', ylabel='rows per date', title='Rows per date in each split')
ax.legend(markerscale=10);

In [None]:
# `date` was only kept to build and inspect the split; remove it from the three feature matrices.
X_train, X_valid, X_test = (
    features.drop('date', axis=1) for features in (X_train, X_valid, X_test)
)

* We do not find all stations that we trained on in the valid and test sets

In [None]:
# Distinct stations present in each subset (still lazy at this point).
train_stations = df.loc[X_train.index].station.unique()
valid_stations = df.loc[X_valid.index].station.unique()
test_stations = df.loc[X_test.index].station.unique()

# Materialize the three results together and only once: calling set() on the lazy
# series directly would re-evaluate the training stations for each print below.
train_set, valid_set, test_set = (
    set(stations) for stations in dask.compute(train_stations, valid_stations, test_stations)
)
print(f"{len(train_set - valid_set)} stations not in valid set")
print(f"{len(train_set - test_set)} stations not in test set")

* DMatrix is an internal data structure that is used by XGBoost.

In [None]:
# Wrap the date-based subsets in XGBoost's distributed data container.
# Train and valid carry features and labels; dtest carries features only because
# it is used solely for prediction.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dvalid = xgb.dask.DaskDMatrix(client, X_valid, y_valid)
dtest = xgb.dask.DaskDMatrix(client, X_test)

In [None]:
# Same training configuration as the previous experiment, repeated so this section runs on its own.
# Upper bound on boosting iterations (training may stop earlier, see early_stopping_rounds in the train call).
num_boost_round = 100
# Booster configuration: squared-error regression, histogram tree method,
# RMSE and MAE tracked at every iteration, learning rate (eta) 0.3.
params = {'objective': 'reg:squarederror', 
          'tree_method': 'hist',
          'eval_metric': ['rmse', 'mae'],
          'eta': 0.3
         }
# Datasets evaluated after every round; the names become the keys of model['history'].
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [None]:
# Distributed training on the date-based split; progress is logged every 10 rounds.
# The returned dict is read below through model['history'] and model['booster'].
# NOTE(review): per the XGBoost docs, early stopping watches the last metric of the last
# eval set (here 'mae' on 'valid'), not the RMSE plotted below - confirm this is intended.
model = xgb.dask.train(client, params, dtrain, num_boost_round, evals=watchlist, verbose_eval=10, early_stopping_rounds=10)

In [None]:
# Learning curves: RMSE per boosting round on the train and validation sets.
fig, ax = plt.subplots(figsize=(10, 6))
history = model['history']
for split_name, curve_label in (('train', 'Train'), ('valid', 'Valid')):
    ax.plot(history[split_name]['rmse'], label=curve_label)
ax.legend()
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('RMSE Loss')
plt.show()

In [None]:
# Feature-importance chart of the trained booster; the trailing ';' hides the Axes repr.
xgb.plot_importance(model['booster']);

In [None]:
# Lazy test-set predictions from the trained model.
predictions = xgb.dask.predict(client, model, dtest)
# Materialize target and predictions together, once, rather than implicitly inside
# each metric call.
y_true, y_pred = dask.compute(y_test, predictions)
print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in recent
# scikit-learn releases and removed in the newest ones.
print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')

### 3.0.3 Train XGboost model, backtesting

* Let's train with X months, validate with 3 months and test with 3 months

In [None]:
# First and last date in the dataset, obtained in a single pass over the data.
date_min, date_max = dask.compute(df.date.min(), df.date.max())
print(f'min: {date_min}')
print(f'max: {date_max}')

In [None]:
# Defaults for backtest(), kept outside the signature so it carries no mutable default arguments.
_BACKTEST_CONT_COLUMNS = (
    'gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r', 'gdps_10u', 'gdps_10v', 'gdps_10si',
    'gdps_10wdir', 'gdps_al', 'gdps_t_850', 'gdps_t_500', 'gdps_gh_1000', 'gdps_gh_850', 'gdps_gh_500',
    'gdps_u_500', 'gdps_v_500', 'gdps_q_850', 'gdps_q_500', 'gdps_thick',
)
_BACKTEST_PARAMS = {'objective': 'reg:squarederror', 'tree_method': 'hist', 'eval_metric': ['rmse', 'mae'], 'eta': 0.3}
_BACKTEST_DAYS_PER_MONTH = 30  # month length used for every window boundary


def backtest(df, train_start=3, period=3, valid_split=3, test_split=3,
             cat_columns=None, cont_columns=None, target='error_2t',
             num_boost_round=100, params=None):
    """Run expanding-window backtests of an XGBoost regressor over ``df``.

    Every backtest trains on ``[first date, train_end]``, validates on the following
    ``valid_split`` months and tests on the ``test_split`` months after that. For the
    first backtest ``train_end`` lies ``train_start`` months after the first date; it
    moves forward by ``period`` months for each subsequent backtest. All boundaries
    count a month as 30 days.

    For each backtest the function prints the share of rows in each subset, shows the
    rows-per-date scatter, the RMSE learning curves and the feature importances, and
    prints MAE/RMSE on the test window.

    The dask ``client`` is not a parameter: the notebook-level ``client`` is used.

    Parameters
    ----------
    df : dask DataFrame containing a ``date`` column, the feature columns and ``target``.
    train_start : length, in months, of the first training window.
    period : number of months the training window grows between backtests.
    valid_split, test_split : length, in months, of the validation and test windows.
    cat_columns, cont_columns : feature column names; ``None`` selects no categorical
        column and the default list of continuous columns respectively.
    target : name of the column to predict.
    num_boost_round : maximum number of boosting rounds.
    params : XGBoost parameter dict; ``None`` selects the default configuration.

    Returns
    -------
    dict
        Maps the backtest number (0, 1, ...) to a dict with the window boundaries
        (``train_start``, ``train_end_date``, ``valid_start``, ``valid_end_date``,
        ``test_start``, ``test_end_date``) and the test metrics (``mae``, ``rmse``).
    """
    # Resolve defaults into fresh objects so nothing is shared between calls.
    cat_columns = [] if cat_columns is None else list(cat_columns)
    cont_columns = list(_BACKTEST_CONT_COLUMNS if cont_columns is None else cont_columns)
    params = dict(_BACKTEST_PARAMS if params is None else params)

    # Isolate selected features; `date` is kept for now to build and inspect the windows.
    X = df[['date'] + cat_columns + cont_columns]
    y = df[target]
    n_total = len(X)  # loop-invariant, so computed once

    # Date span of the data, both bounds obtained in one pass.
    start_date, end_date = dask.compute(df.date.min(), df.date.max())
    # Span in months, using the mean calendar month (365.2425 / 12 days). Dividing by
    # np.timedelta64(1, 'M') is rejected by recent pandas because 'M' is not a fixed duration.
    total_months = round((end_date - start_date) / pd.Timedelta(days=365.2425 / 12))

    # Offsets (in months) added to the first training window. The initial training
    # window is subtracted along with the valid/test windows so that no backtest
    # extends past the last date; '+ 1' keeps a final window that fits exactly.
    first_window_months = train_start + valid_split + test_split
    split_idx = np.arange(0, total_months - first_window_months + 1, period)

    split_definitions = {}
    for backtest_id, idx in enumerate(split_idx):
        print(f'Backtest {backtest_id}...')

        # Window boundaries
        train_end_date = (start_date
                          + datetime.timedelta(days=train_start * _BACKTEST_DAYS_PER_MONTH)
                          + datetime.timedelta(days=int(idx * _BACKTEST_DAYS_PER_MONTH)))
        valid_end_date = train_end_date + datetime.timedelta(days=valid_split * _BACKTEST_DAYS_PER_MONTH)
        test_end_date = valid_end_date + datetime.timedelta(days=test_split * _BACKTEST_DAYS_PER_MONTH)

        # Save backtest split dates
        bt_dict = {
            'train_start': start_date,
            'train_end_date': train_end_date,
            'valid_start': train_end_date,
            'valid_end_date': valid_end_date,
            'test_start': valid_end_date,
            'test_end_date': test_end_date,
        }
        split_definitions[backtest_id] = bt_dict

        # Select rows with boolean masks rather than `.loc[<index labels>]`: label lookup
        # is only correct when index labels are unique, which is not guaranteed here.
        train_mask = (df.date >= start_date) & (df.date <= train_end_date)
        valid_mask = (df.date > train_end_date) & (df.date <= valid_end_date)
        test_mask = (df.date > valid_end_date) & (df.date <= test_end_date)
        X_train, y_train = X[train_mask], y[train_mask]
        X_valid, y_valid = X[valid_mask], y[valid_mask]
        X_test, y_test = X[test_mask], y[test_mask]

        # Rows per date in each subset, evaluated together in one dask.compute call.
        train_counts, valid_counts, test_counts = dask.compute(
            X_train.date.value_counts(),
            X_valid.date.value_counts(),
            X_test.date.value_counts(),
        )
        # Every selected row has a non-null date (it passed a date comparison), so the
        # per-date counts add up to the subset sizes without extra computes.
        n_train, n_valid, n_test = (int(c.sum()) for c in (train_counts, valid_counts, test_counts))
        print(f'train: {n_train/n_total}, valid: {n_valid/n_total}, test: {n_test/n_total}')

        # Split validation plot: the three subsets must occupy consecutive date ranges.
        fig, ax = plt.subplots()
        ax.scatter(train_counts.index, train_counts, s=1, c='g', label='train')
        ax.scatter(valid_counts.index, valid_counts, s=1, c='b', label='valid')
        ax.scatter(test_counts.index, test_counts, s=1, c='r', label='test')
        ax.set(xlabel='date', ylabel='rows per date', title=f'Backtest {backtest_id}: rows per date')
        ax.legend(markerscale=10)
        plt.show()
        plt.close(fig)  # figures are created in a loop; release each one explicitly

        X_train = X_train.drop('date', axis=1)
        X_valid = X_valid.drop('date', axis=1)
        X_test = X_test.drop('date', axis=1)

        # Convert to DMatrix for xgboost (dtest holds features only)
        dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
        dvalid = xgb.dask.DaskDMatrix(client, X_valid, y_valid)
        dtest = xgb.dask.DaskDMatrix(client, X_test)

        # Train XGBoost
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        model = xgb.dask.train(client, params, dtrain, num_boost_round, evals=watchlist,
                               verbose_eval=10, early_stopping_rounds=10)

        # Train validation: RMSE learning curves
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(model['history']['train']['rmse'], label='Train')
        ax.plot(model['history']['valid']['rmse'], label='Valid')
        ax.legend()
        ax.set(xlabel='Iterations', ylabel='RMSE', title='RMSE Loss')
        plt.show()
        plt.close(fig)

        # Feature importance
        xgb.plot_importance(model['booster'])
        plt.show()

        # Performance on the test window; target and predictions are materialized once.
        predictions = xgb.dask.predict(client, model, dtest)
        y_true, y_pred = dask.compute(y_test, predictions)
        mae = mean_absolute_error(y_true, y_pred)
        # sqrt(MSE) instead of the deprecated/removed `squared=False` argument.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        print(f'MAE: {mae}')
        print(f'RMSE: {rmse}')
        # Keep the scores with the window definition so callers can compare backtests.
        bt_dict['mae'] = mae
        bt_dict['rmse'] = rmse

    return split_definitions

In [None]:
# Run the backtests with the default windows and features; `sd` holds the
# per-backtest split definitions returned by backtest().
sd = backtest(df)