# EDA

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
from dask_ml.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import dask_xgboost
from collections import OrderedDict
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime

# Configure style, context and default figure size in one call. `sns.set` is an
# alias of `sns.set_theme`, so the previous pair of calls applied the theme
# twice and the first call had no lasting effect.
sns.set_theme(style='white', context='notebook', rc={'figure.figsize': (14, 10)})
%matplotlib inline

In [None]:
# Location of the parquet dataset passed to `dd.read_parquet` below.
# NOTE(review): left empty here — presumably must be filled in before running.
PATH = ''

## 0.0 Start Cluster

In [None]:
# Create a SLURM-backed Dask cluster, reading its settings from the
# jobqueue configuration section named 'slurm'.
cluster = dask_jobqueue.SLURMCluster(config_name='slurm')

In [None]:
# Ask the cluster to scale to two SLURM jobs.
cluster.scale(jobs=2)

In [None]:
# Connect a distributed client to the cluster; `backtest` below relies on
# this module-level `client`.
client = dask.distributed.Client(cluster)

In [None]:
client

## 1.0 Load data

In [None]:
# Open the parquet dataset at PATH as a (lazy) Dask dataframe.
df = dd.read_parquet(PATH)

In [None]:
df

In [None]:
# Persist the dataframe on the cluster so later cells reuse the loaded data
# instead of re-reading it.
df = df.persist()

In [None]:
df.head()

In [None]:
len(df)

## 2.0 Preparation

In [None]:
# Modelling target: signed difference gdps_2t - obs_2t
# (presumably forecast minus observation — confirm against the dataset docs).
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
# `step` is interpreted as seconds (see the timedelta conversion below), so
# dividing by 3600 expresses it in hours.
df['step_hour'] = df['step'] / 3600
# Use the lowercase 's' alias for seconds: the uppercase 'S' unit alias is
# deprecated in recent pandas releases, while 's' is accepted everywhere.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')

In [None]:
# Preview the first rows, now including the derived columns.
df.head()

In [None]:
# Feature configuration for the model.
# No categorical predictors are used.
cat_columns = list()

# Continuous predictors, one per line to keep diffs readable.
cont_columns = [
    'gdps_prate',
    'gdps_prmsl',
    'gdps_2t',
    'gdps_2d',
    'gdps_2r',
    'gdps_10u',
    'gdps_10v',
    'gdps_10si',
    'gdps_10wdir',
    'gdps_al',
    'gdps_t_850',
    'gdps_t_500',
    'gdps_gh_1000',
    'gdps_gh_850',
    'gdps_gh_500',
    'gdps_u_500',
    'gdps_v_500',
    'gdps_q_850',
    'gdps_q_500',
    'gdps_thick',
]

# Column the model is trained to predict.
target = 'error_2t'

## 3.0 Training

### 3.0.0 Train XGBoost model, backtesting

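* Let's train on an expanding window of X months (3 months initially, growing by 3 months with each backtest), validate on the following 3 months and test on the 3 months after that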

In [None]:
def _select_dates(frame, lower, upper, include_lower=False):
    """Return the rows of *frame* whose `date` is in (lower, upper], or [lower, upper] if *include_lower*."""
    after_lower = frame.date >= lower if include_lower else frame.date > lower
    return frame[after_lower & (frame.date <= upper)]


def backtest(df, train_start=3, period=3, valid_split=3, test_split=3,
             cat_columns=None, cont_columns=None, target='error_2t',
             num_boost_round=100, params=None):
    """Run expanding-window backtests of an XGBoost regressor on a Dask dataframe.

    For each backtest the training window starts at the earliest date in *df*
    and ends `train_start + k * period` 30-day "months" later; the validation
    and test windows follow immediately after it. Each iteration prints split
    diagnostics, plots per-date row counts, the RMSE learning curves and the
    feature importances, and prints MAE/RMSE on the test window.

    Relies on the module-level `client` (a `dask.distributed.Client`).

    Args:
        df: Dask dataframe with `date` and `station` columns, the feature
            columns and the *target* column.
        train_start: Length of the first training window, in 30-day months.
        period: Number of 30-day months the training window grows by between
            consecutive backtests.
        valid_split: Length of the validation window, in 30-day months.
        test_split: Length of the test window, in 30-day months.
        cat_columns: Categorical feature column names. Defaults to none.
        cont_columns: Continuous feature column names. Defaults to the 20
            `gdps_*` predictors.
        target: Name of the column to predict.
        num_boost_round: Maximum number of boosting rounds.
        params: XGBoost training parameters. Defaults to squared-error
            regression with the `hist` tree method, RMSE and MAE evaluation
            metrics and `eta=0.3`.

    Returns:
        dict mapping backtest id to a dict with the keys `train_start`,
        `train_end_date`, `valid_start`, `valid_end_date`, `test_start` and
        `test_end_date`.
    """
    # Defaults are resolved here instead of in the signature so that no mutable
    # object is shared between calls; the caller's containers are copied.
    cat_columns = [] if cat_columns is None else list(cat_columns)
    if cont_columns is None:
        cont_columns = ['gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r',
                        'gdps_10u', 'gdps_10v', 'gdps_10si', 'gdps_10wdir', 'gdps_al',
                        'gdps_t_850', 'gdps_t_500', 'gdps_gh_1000', 'gdps_gh_850',
                        'gdps_gh_500', 'gdps_u_500', 'gdps_v_500', 'gdps_q_850',
                        'gdps_q_500', 'gdps_thick']
    else:
        cont_columns = list(cont_columns)
    if params is None:
        params = {'objective': 'reg:squarederror', 'tree_method': 'hist',
                  'eval_metric': ['rmse', 'mae'], 'eta': 0.3}
    else:
        params = dict(params)

    # Isolate selected features
    X = df[['date', 'station'] + cat_columns + cont_columns]
    y = df[['date', target]]

    # Get min and max date, and the total row count (loop-invariant, so it is
    # computed once rather than on every backtest).
    start_date = df.date.min().compute()
    end_date = df.date.max().compute()
    total_rows = len(X)

    # A calendar month is not a fixed-length duration, and recent pandas
    # versions reject np.timedelta64(1, 'M') in Timedelta arithmetic. Divide by
    # the mean month length numpy used for that unit (30.436875 days) instead.
    total_months = round((end_date - start_date) / datetime.timedelta(days=30.436875))

    # Dict of splits
    # NOTE(review): the upper bound does not subtract `train_start`, so the
    # last backtests can reach past `end_date` and get a short or empty test
    # window — confirm whether that is intended.
    split_idx = np.arange(0, total_months - test_split - valid_split, period)
    split_definitions = dict.fromkeys(range(len(split_idx)))

    for backtest_id, idx in enumerate(split_idx):
        print(f'Backtest {backtest_id}...')

        # Get splits
        train_end_date = (start_date
                          + datetime.timedelta(days=train_start * 30)
                          + datetime.timedelta(days=int(idx * 30)))
        valid_end_date = train_end_date + datetime.timedelta(days=valid_split * 30)
        test_end_date = valid_end_date + datetime.timedelta(days=test_split * 30)

        X_train = _select_dates(X, start_date, train_end_date, include_lower=True)
        y_train = _select_dates(y, start_date, train_end_date, include_lower=True).drop('date', axis=1)
        X_valid = _select_dates(X, train_end_date, valid_end_date)
        y_valid = _select_dates(y, train_end_date, valid_end_date).drop('date', axis=1)
        X_test = _select_dates(X, valid_end_date, test_end_date)
        y_test = _select_dates(y, valid_end_date, test_end_date).drop('date', axis=1)

        # Per-date row counts. They feed the scatter plot below and also give
        # the split sizes and date ranges without extra cluster round-trips
        # (rows that pass the date filter always have a non-null date).
        train_counts = X_train.date.value_counts().compute()
        valid_counts = X_valid.date.value_counts().compute()
        test_counts = X_test.date.value_counts().compute()
        n_train = int(train_counts.sum())
        n_valid = int(valid_counts.sum())
        n_test = int(test_counts.sum())

        print(f'train: {n_train/total_rows}, valid: {n_valid/total_rows}, test: {n_test/total_rows}')

        # Save backtest split dates
        split_definitions[backtest_id] = {
            'train_start': start_date,
            'train_end_date': train_end_date,
            'valid_start': train_end_date,
            'valid_end_date': valid_end_date,
            'test_start': valid_end_date,
            'test_end_date': test_end_date,
        }

        # Training or scoring on an empty window cannot succeed; report it and
        # move on instead of failing deep inside XGBoost or the metrics.
        if n_train == 0 or n_valid == 0 or n_test == 0:
            print(f'Backtest {backtest_id} skipped: empty train, valid or test split')
            continue

        # Split validation: materialise the unique stations once so the set
        # differences run locally on plain Python sets.
        train_stations = set(X_train.station.unique().compute())
        valid_stations = set(X_valid.station.unique().compute())
        test_stations = set(X_test.station.unique().compute())
        print(f"{len(train_stations - valid_stations)} stations not in valid set")
        print(f"{len(train_stations - test_stations)} stations not in test set")

        fig, ax = plt.subplots()
        ax.scatter(train_counts.index, train_counts, s=1, c='g')
        ax.scatter(valid_counts.index, valid_counts, s=1, c='b')
        ax.scatter(test_counts.index, test_counts, s=1, c='r')
        plt.show()

        print(f'Train: {train_counts.index.min()} -> {train_counts.index.max()}, Valid: {valid_counts.index.min()} -> {valid_counts.index.max()}, Test: {test_counts.index.min()} -> {test_counts.index.max()}')

        # Convert to DMatrix for xgboost
        X_train = X_train.drop(['date', 'station'], axis=1)
        X_test = X_test.drop(['date', 'station'], axis=1)
        X_valid = X_valid.drop(['date', 'station'], axis=1)
        dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
        dvalid = xgb.dask.DaskDMatrix(client, X_valid, y_valid)
        dtest = xgb.dask.DaskDMatrix(client, X_test)

        # Train XGBoost
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        model = xgb.dask.train(client, params, dtrain, num_boost_round, evals=watchlist,
                               verbose_eval=10, early_stopping_rounds=10)

        # Train validation
        plt.figure(figsize=(10, 6))
        plt.plot(model['history']['train']['rmse'], label='Train')
        plt.plot(model['history']['valid']['rmse'], label='Valid')
        plt.legend()
        plt.xlabel('Iterations')
        plt.ylabel('RMSE')
        plt.title('RMSE Loss')
        plt.show()

        # Feature importance
        xgb.plot_importance(model['booster'])
        plt.show()

        # Performance: bring targets and predictions into local memory before
        # handing them to scikit-learn, which expects in-memory arrays.
        predictions = xgb.dask.predict(client, model, dtest)
        y_true = y_test[target].compute()
        y_pred = predictions.compute()
        print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
        # The `squared=False` flag is deprecated/removed in newer scikit-learn;
        # taking the square root of the MSE works on every version.
        print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')

    return split_definitions

In [None]:
# Run the backtests with default settings; `sd` holds the split date
# boundaries returned for each backtest id.
sd = backtest(df)