# Analysis

In [None]:
# Enable autoreload (mode 2) so edits to imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
# Standard library
import datetime
import logging
import os
import sys
from collections import OrderedDict
from pathlib import Path

# Third-party
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import dask_xgboost
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from dask_ml.model_selection import train_test_split
from mlflow.utils.mlflow_tags import MLFLOW_USER
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm

sys.path.insert(0, '../smc01')
from utils.splitter import BacktestSplitter
from utils.mlflow import *
from utils.logger import *

# One theme call is enough: `sns.set` is only an alias of `sns.set_theme`, and the
# second call used to override everything the first one had configured.
sns.set_theme(style='white', context='notebook', rc={'figure.figsize': (14, 10)})
%matplotlib inline

## 1.0 Setup

In [None]:
# Location of the parquet dataset read by the loading cell
PATH = ''

# Experiment / run naming
bench = 'backtest'
exp_suffix = 'test'
model = 'xgboost'
label = 'test'

# Output directory and MLflow tracking
rootdir = ''
user = ''
mlflow_tracking_uri = 'mongodb://localhost:27017/mlflow'

# Logging
verbose = True
logger = get_logger(__name__)
# Disable the matplotlib font-manager logger
logging.getLogger('matplotlib.font_manager').disabled = True

In [None]:
# Start a SLURM-backed Dask cluster, request 3 jobs and attach a client to it
cluster = dask_jobqueue.SLURMCluster(
    config_name='slurm',
    cores=10,
    processes=10,
)
cluster.scale(jobs=3)

client = dask.distributed.Client(cluster)
client

In [None]:
# Load the parquet dataset as a Dask DataFrame and persist it
df = dd.read_parquet(PATH).persist()

In [None]:
# Feature engineering
#   error_2t  : forecast minus observation (the regression target)
#   step_hour : `step` divided by 3600
#   step_td   : `step` as a timedelta; lowercase 's' is the seconds alias accepted by
#               every pandas version (uppercase 'S' is deprecated in recent releases)
df = df.assign(
    error_2t=df['gdps_2t'] - df['obs_2t'],
    step_hour=df['step'] / 3600,
    step_td=dd.to_timedelta(df['step'], unit='s'),
)

In [None]:
# Features selection: the target column, plus categorical and continuous predictors
target = 'error_2t'
cat_columns = []
cont_columns = [
    'gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r',
    'gdps_10u', 'gdps_10v', 'gdps_10si', 'gdps_10wdir', 'gdps_al',
    'gdps_t_850', 'gdps_t_500',
    'gdps_gh_1000', 'gdps_gh_850', 'gdps_gh_500',
    'gdps_u_500', 'gdps_v_500',
    'gdps_q_850', 'gdps_q_500',
    'gdps_thick',
]

In [None]:
def start_experiment(bench, exp_suffix, model, label,
                     rootdir='',
                     verbose=True,
                     user='',
                     mlflow_tracking_uri=''):
    """Create the run directory, configure logging and open an MLflow run.

    The MLflow server must be started before calling this function.

    Parameters
    ----------
    bench, exp_suffix
        Passed to ``gen_exp_name`` to build the experiment name (e.g. ``backtest_test``).
    model, label
        Passed to ``gen_run_id`` to build the run name
        (e.g. ``2021-05-18_163000-YMW-xgboost-test``).
    rootdir
        Parent directory in which a directory named after the run is created.
    verbose
        Log at DEBUG level when true, INFO otherwise.
    user
        Value stored in the MLflow user tag and in the returned metadata.
    mlflow_tracking_uri
        URI handed to ``mlflow.set_tracking_uri``.

    Returns
    -------
    dict
        Run metadata: experiment/run names, log path, run directory, user,
        an empty ``metrics`` dict and the tracking URIs.

    Raises
    ------
    FileExistsError
        If the run directory already exists (it is created with ``exist_ok=False``).
    """
    # Names of the run and of the experiment it belongs to
    run_name = gen_run_id(model_name=model, label=label)
    experiment_name = gen_exp_name(bench=bench, suffix=exp_suffix)

    # Per-run output directory
    run_dir = Path(rootdir) / run_name
    run_dir.mkdir(parents=True, exist_ok=False)

    # Logger writes to a file inside the run directory
    log_file = run_dir / "run.log"
    configure_logger(path=log_file, level=logging.DEBUG if verbose else logging.INFO)
    logger.info(f"'{bench}' run started")

    # Point the MLflow client at the tracking server and read back the effective URI
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracking_uri = mlflow.tracking.get_tracking_uri()

    metadata = {
        'exp_name': experiment_name,
        'job_id': run_name,
        'log_path': log_file,
        'job_rootdir': str(run_dir),
        'mlflow_user': user,
        'metrics': {},
        'mlflow_tracking_uri': mlflow_tracking_uri,
        'user': user,
        'mlflow_server': tracking_uri,
    }
    logger.info(f"MLFlow tracking enabled: {tracking_uri}")

    # Select the experiment, open the run and tag it with the user
    logger.info(f"MLFlow Experiment name = {experiment_name}")
    logger.info(f"MLFlow Run name = {run_name}")
    mlflow.set_experiment(experiment_name)
    mlflow.start_run(run_name=run_name)
    mlflow.set_tag(MLFLOW_USER, user)

    return metadata

In [None]:
def backtest(df, metadata, valid_duration='30D', test_frequency='2MS', test_n_periods=11, test_gap='0h',
             cat_columns=[], cont_columns=['gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r', 'gdps_10u', 'gdps_10v', 'gdps_10si', 'gdps_10wdir', 'gdps_al', 'gdps_t_850', 'gdps_t_500', 'gdps_gh_1000', 'gdps_gh_850', 'gdps_gh_500', 'gdps_u_500', 'gdps_v_500', 'gdps_q_850', 'gdps_q_500', 'gdps_thick'], target='error_2t',
             num_boost_round=100, params={'objective': 'reg:squarederror', 'tree_method': 'hist', 'eval_metric': ['rmse', 'mae'], 'eta': 0.3},
             verbose=True):
    """Run a temporal backtest of an XGBoost model and log it to the active MLflow run.

    For every fold produced by ``BacktestSplitter`` the last ``valid_duration`` of the
    training window is held out for validation, a model is trained with early stopping
    on the remaining training data, and MAE/RMSE are computed on the test window.
    Parameters, per-fold metrics, fold-averaged metrics and the log file are sent to
    MLflow, and the MLflow run is ended before returning.

    The function reads the notebook-level ``client`` (Dask distributed client) and
    ``logger``; both must exist before it is called.

    Parameters
    ----------
    df
        Dask DataFrame with ``date`` and ``station`` columns plus the feature and
        target columns.
    metadata
        Dict returned by ``start_experiment``; ``job_rootdir`` and ``log_path`` are read.
    valid_duration
        Length of the validation window (parsed with ``pd.to_timedelta``).
    test_frequency, test_n_periods, test_gap
        Forwarded to ``BacktestSplitter``.
    cat_columns, cont_columns
        Feature column names. The list defaults are shared objects; this function
        does not modify them itself.
    target
        Name of the target column.
    num_boost_round, params
        Forwarded to ``xgb.dask.train``.
    verbose
        When true, also log split boundaries and station coverage, and save
        diagnostic plots under ``<job_rootdir>/fold_<i>`` and as MLflow artifacts.

    Returns
    -------
    tuple
        ``(bs, X, y, split_definitions, model, predictions, all_metrics)`` where
        ``model`` and ``predictions`` belong to the last fold (``None`` if the
        splitter produced no fold).
    """
    # Log mlflow parameters
    mlflow_log_params(metadata)

    # Isolate selected features
    X = df[['date', 'station'] + cat_columns + cont_columns]
    y = df[['date', target]]
    mlflow_log_params({'categoricals': cat_columns, 'continuous': cont_columns})

    # Instantiate backtest splitter
    bs = BacktestSplitter(date_time_column='date', test_frequency=test_frequency, test_n_periods=test_n_periods, test_gap=test_gap)
    mlflow_log_params(bs.get_params(), prefix='splitter')
    n_splits = bs.get_n_splits(df)
    mlflow_log_params({'n_splits': n_splits})
    splits = list(enumerate(bs.get_split_indices(df)))
    split_definitions = dict.fromkeys([str(i) for i in range(n_splits)])
    logger.info(f'Processing {n_splits} backtest splits.')

    all_metrics = dict.fromkeys(range(n_splits))
    # Bound up front so the final `return` cannot hit an unbound name when there is no fold
    model = None
    predictions = None
    for fold_i, (train_start, train_end, test_start, next_test_end) in splits:
        logger.info(f'Backtest {fold_i}...')
        if verbose:
            artifact_path = f"fold_{fold_i}"
            out_dir = '/'.join((metadata['job_rootdir'], artifact_path))
            os.makedirs(out_dir, exist_ok=True)

        # Define valid start and end date: the validation window is the tail of the
        # splitter's training window, and training stops where validation starts
        valid_start = train_end - pd.to_timedelta(valid_duration)
        valid_end = train_end
        train_end = valid_start

        # Get splits (half-open date intervals)
        X_train = X[(X.date >= train_start) & (X.date < train_end)]
        y_train = y[(y.date >= train_start) & (y.date < train_end)].drop('date', axis=1)

        X_valid = X[(X.date >= valid_start) & (X.date < valid_end)]
        y_valid = y[(y.date >= valid_start) & (y.date < valid_end)].drop('date', axis=1)

        X_test = X[(X.date >= test_start) & (X.date < next_test_end)]
        y_test = y[(y.date >= test_start) & (y.date < next_test_end)].drop('date', axis=1)

        if verbose:
            logger.info(f"TRAIN=[{train_start}, {train_end}[, VALID=[{valid_start}, {valid_end}[, TEST= [{test_start} - {next_test_end}[")

        # Save backtest split dates
        bt_dict = dict()
        bt_dict['train_start'] = train_start
        bt_dict['train_end_date'] = train_end
        bt_dict['valid_start'] = valid_start
        bt_dict['valid_end_date'] = valid_end
        bt_dict['test_start'] = test_start
        bt_dict['test_end_date'] = next_test_end
        split_definitions[str(fold_i)] = bt_dict

        if verbose:
            # Split validation: stations seen in training but absent from valid/test
            train_stations = X_train.station.unique()
            valid_stations = X_valid.station.unique()
            test_stations = X_test.station.unique()
            logger.info(f"{len(set(train_stations) - set(valid_stations))} stations not in valid set")
            logger.info(f"{len(set(train_stations) - set(test_stations))} stations not in test set")

            train_counts = X_train.date.value_counts().compute()
            valid_counts = X_valid.date.value_counts().compute()
            test_counts = X_test.date.value_counts().compute()

            # Rows per date, coloured by split (train=green, valid=blue, test=red)
            fig, ax = plt.subplots()
            ax.scatter(train_counts.index, train_counts, s=1, c='g')
            ax.scatter(valid_counts.index, valid_counts, s=1, c='b')
            ax.scatter(test_counts.index, test_counts, s=1, c='r')
            ax.tick_params(axis='x', labelrotation = 45)
            backtest_splits_path = str('/'.join((out_dir, 'backtest_splits.png')))
            fig.savefig(backtest_splits_path, format='png')
            mlflow.log_artifact(local_path=backtest_splits_path, artifact_path=artifact_path)
            # The figure is persisted on disk and in MLflow; close it so figures do not
            # pile up across folds
            plt.close(fig)

        # Convert to DMatrix for xgboost
        X_train = X_train.drop(['date', 'station'], axis=1)
        X_test = X_test.drop(['date', 'station'], axis=1)
        X_valid = X_valid.drop(['date', 'station'], axis=1)
        dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
        dvalid = xgb.dask.DaskDMatrix(client, X_valid, y_valid)
        dtest = xgb.dask.DaskDMatrix(client, X_test)

        # Train XGBoost
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        mlflow_log_params(params, prefix='model')
        model = xgb.dask.train(client, params, dtrain, num_boost_round, evals=watchlist, verbose_eval=10, early_stopping_rounds=10)

        if verbose:
            # Train validations
            fig = plt.figure(figsize=(10,6))
            plt.plot(model['history']['train']['rmse'], label='Train')
            plt.plot(model['history']['valid']['rmse'], label='Valid')
            plt.legend()
            plt.xlabel('Iterations')
            plt.ylabel('RMSE')
            plt.title('RMSE Loss')
            losses_path = str('/'.join((out_dir, 'losses.png')))
            fig.savefig(losses_path, format='png')
            mlflow.log_artifact(local_path=losses_path, artifact_path=artifact_path)
            plt.close(fig)

            # Feature importance
            ax = xgb.plot_importance(model['booster'])
            feature_importance_path = str('/'.join((out_dir, 'feature_importance.png')))
            ax.figure.savefig(feature_importance_path, format='png')
            mlflow.log_artifact(local_path=feature_importance_path, artifact_path=artifact_path)
            plt.close(ax.figure)

        # Performance
        predictions = xgb.dask.predict(client, model, dtest)
        mae = mean_absolute_error(y_test, predictions)
        rmse = mean_squared_error(y_test, predictions, squared=False)
        logger.info(f'MAE: {mae}')
        logger.info(f'RMSE: {rmse}')
        metrics = {'MAE': mae, 'RMSE': rmse}
        mlflow.log_metrics({f"fold_{k}": v for k, v in metrics.items()}, step=fold_i)
        all_metrics[fold_i] = metrics

    # Log split boundaries, fold-averaged metrics and the run log
    mlflow_log_params({f'{fold}_{date}': value for fold, dates in split_definitions.items() for date, value in dates.items()}, prefix='split')
    runs_avg_metrics = {f'avg_{metric}': v for metric, v in pd.DataFrame(all_metrics).mean(axis=1).to_dict().items()}
    mlflow.log_metrics(runs_avg_metrics)
    mlflow.log_artifact(metadata['log_path'])

    # End run
    mlflow.end_run()

    return bs, X, y, split_definitions, model, predictions, all_metrics

## 2.0 Training

In [None]:
# Forward the settings from the setup cell; previously they were ignored and the run was
# opened with an empty tracking URI instead of the configured one.
metadata = start_experiment(
    bench='temporal',
    exp_suffix='analysis',
    model='xgboost',
    label='test',
    rootdir=rootdir,
    verbose=verbose,
    user=user,
    mlflow_tracking_uri=mlflow_tracking_uri,
)

In [None]:
# Continuous predictors for this run: `step` plus the gdps_* fields
cont_columns = [
    'step',
    'gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r',
    'gdps_10u', 'gdps_10v', 'gdps_10si', 'gdps_10wdir', 'gdps_al',
    'gdps_t_850', 'gdps_t_500',
    'gdps_gh_1000', 'gdps_gh_850', 'gdps_gh_500',
    'gdps_u_500', 'gdps_v_500',
    'gdps_q_850', 'gdps_q_500',
    'gdps_thick',
]

# Single yearly test period, up to 1000 boosting rounds, no diagnostic plots
bs, X, y, split_definitions, model, predictions, all_metrics = backtest(
    df,
    metadata,
    test_frequency='1YS',
    test_n_periods=1,
    num_boost_round=1000,
    verbose=False,
    cont_columns=cont_columns,
)

## 3.0 Analysis

In [None]:
# Splitter settings
valid_duration = '30D'
test_frequency = '1YS'
test_n_periods = 1
test_gap = '0h'

# Features and target
cat_columns = []
cont_columns = [
    'step',
    'gdps_prate', 'gdps_prmsl', 'gdps_2t', 'gdps_2d', 'gdps_2r',
    'gdps_10u', 'gdps_10v', 'gdps_10si', 'gdps_10wdir', 'gdps_al',
    'gdps_t_850', 'gdps_t_500',
    'gdps_gh_1000', 'gdps_gh_850', 'gdps_gh_500',
    'gdps_u_500', 'gdps_v_500',
    'gdps_q_850', 'gdps_q_500',
    'gdps_thick',
]
target = 'error_2t'

# Model settings
num_boost_round = 1000
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': ['rmse', 'mae'],
    'eta': 0.3,
}
verbose = True

In [None]:
# Enumerate the splitter's folds and unpack the boundaries of the first one
splits = [(index, bounds) for index, bounds in enumerate(bs.get_split_indices(df))]
fold_i, (train_start, train_end, test_start, next_test_end) = splits[0]

In [None]:
# Re-read the fold boundaries from `splits` so this cell gives the same result every time
# it is run (overwriting `train_end` in place used to move the validation window back by
# another `valid_duration` on each re-run).
fold_i, (train_start, fold_train_end, test_start, next_test_end) = splits[0]

# Define valid start and end date: validation is the last `valid_duration` of the
# splitter's training window, and training stops where validation starts
valid_end = fold_train_end
valid_start = valid_end - pd.to_timedelta(valid_duration)
train_end = valid_start

# Get splits (half-open date intervals)
df_train = df[(df.date >= train_start) & (df.date < train_end)]
df_valid = df[(df.date >= valid_start) & (df.date < valid_end)]
df_test = df[(df.date >= test_start) & (df.date < next_test_end)]

In [None]:
# Materialise the test rows and attach the model's predictions as a new column
results = df_test.compute().assign(error_2t_predictions=predictions.compute())

* Similar distribution between real errors and predicted errors
* Less variance for predictions
* Predictions slightly more on the negative side, like the real errors

In [None]:
# Overlay the histograms of the real and predicted errors on shared bins
error_bins = np.arange(-20, 20, 1)
fig, ax = plt.subplots()
for column, legend_label in (('error_2t', 'Ground Truth'), ('error_2t_predictions', 'Predictions')):
    ax.hist(results[column], bins=error_bins, alpha=0.5, label=legend_label)
fig.suptitle('Distributions of real errors and predicted errors')
ax.legend();

* Most of the errors are within a couple of degrees Celsius

In [None]:
# Histogram of |real error - predicted error|
abs_difference = (results['error_2t'] - results['error_2t_predictions']).abs()
fig, ax = plt.subplots()
ax.hist(abs_difference, bins=np.arange(0, 20, 1), alpha=0.5)
fig.suptitle('Distribution of absolute difference between ground truth and predictions');

* The prediction error seems lower generally when the observed temperature is higher.
* **Try with other features**

In [None]:
# Absolute prediction error, one histogram per quartile of observed temperature
quantiles = results['obs_2t'].quantile([0, 0.25, 0.5, 0.75, 1])
fig, ax = plt.subplots(4, 1, figsize=(10,15), sharex=True, sharey=True)
for i, (bot, top) in enumerate([(0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1)]):
    lower, upper = quantiles.loc[bot], quantiles.loc[top]
    # Buckets are half-open, except the last one, which is closed on the right so that
    # rows at the maximum observed temperature are not dropped
    below_upper = (results.obs_2t <= upper) if top == 1 else (results.obs_2t < upper)
    sub_df = results[(results.obs_2t >= lower) & below_upper]
    errors = abs(sub_df['error_2t'] - sub_df['error_2t_predictions'])
    ax[i].hist(errors, bins=np.arange(0, 20, 1), alpha=0.5);
    ax[i].text(1.1, 0.5, f'Mean: {np.around(errors.mean(), 2)}', transform=ax[i].transAxes)
    ax[i].set_title(f'{round(lower)} - {round(upper)} (degC)')
plt.tight_layout();
fig.suptitle('Error distribution based on observed temperature', y=1.01);

* There does not seem to be any significant trend (over- or undershooting predictions) from east to west or from south to north
* Mean absolute error seems higher for northern stations and for eastern stations
* Predictions seem worse where the density of weather stations is low

In [None]:
# Signed and absolute prediction error per row
results['error'] = results['error_2t'] - results['error_2t_predictions']
results['abs_error'] = (results['error_2t'] - results['error_2t_predictions']).abs()

# Mean signed error per (longitude, latitude) pair
lon_lat_impact = (
    results
    .groupby(['longitude', 'latitude'])['error']
    .mean()
    .reset_index()
    .sort_values(['longitude', 'latitude'])
)

# One panel per coordinate, with a zero reference line
fig, ax = plt.subplots(1, 2, figsize=(18,6))
for panel, coordinate, panel_title in zip(ax, ('longitude', 'latitude'), ('Longitude', 'Latitude')):
    panel.grid(axis='y')
    panel.axhline(0, color='black', lw=0.75)
    panel.scatter(x=lon_lat_impact[coordinate], y=lon_lat_impact['error'], alpha=0.1)
    panel.set_title(panel_title)

In [None]:
# Mean absolute error per (longitude, latitude) pair
lon_lat_impact = (
    results
    .groupby(['longitude', 'latitude'])['abs_error']
    .mean()
    .reset_index()
    .sort_values(['longitude', 'latitude'])
)

# One panel per coordinate: degree-1 polynomial fit (dashed) over the scatter
fig, ax = plt.subplots(1, 2, figsize=(18,6))
for panel, coordinate, panel_title in zip(ax, ('longitude', 'latitude'), ('Longitude', 'Latitude')):
    coefficients = np.polyfit(lon_lat_impact[coordinate], lon_lat_impact['abs_error'], 1)
    trend = np.poly1d(coefficients)
    panel.grid(axis='y')
    panel.plot(lon_lat_impact[coordinate], trend(lon_lat_impact[coordinate]), "b--")
    panel.scatter(x=lon_lat_impact[coordinate], y=lon_lat_impact['abs_error'], alpha=0.1)
    panel.set_title(panel_title)

In [None]:
# Scatter of station locations coloured by mean absolute error (colour scale capped at 5)
map_ax = lon_lat_impact.plot.scatter(
    x='longitude',
    y='latitude',
    c='abs_error',
    figsize=(16,8),
    vmax=5,
)
map_ax.set_xlim(-200, 0);

* Mean absolute error per station is between 1 and 4.
* Distribution of absolute difference between ground truth and predictions (**grouped by stations**)

In [None]:
# Mean absolute error of each station, shown as a histogram
station_mean = results.groupby('station')['abs_error'].mean()
station_bins = np.arange(0, 10, 0.5)

fig, ax = plt.subplots()
ax.hist(station_mean, bins=station_bins, alpha=0.5)
fig.suptitle('Distribution of absolute difference between ground truth and predictions');

* Having a single model per station does not seem to give better results than having a single model for all stations.

In [None]:
# Fetch the runs of experiment '2' from the tracking server.
# `experiment_ids` is documented as a list of IDs, so pass a list rather than a bare string.
mlflow_tracking_uri = 'mongodb://localhost:27017/mlflow'
mlflow.set_tracking_uri(mlflow_tracking_uri)
station_experiments = mlflow.search_runs(experiment_ids=['2'])
# station_experiments.to_csv('../temporal_stations_results.csv')

In [None]:
# Average of the per-run mean metrics across the fetched runs
station_avg_mae = station_experiments["metrics.avg_MAE"].mean()
station_avg_rmse = station_experiments["metrics.avg_RMSE"].mean()
print(f'MAE: {station_avg_mae}')
print(f'RMSE: {station_avg_rmse}')

* The bigger the step, the higher the error

In [None]:
# Mean signed error (left) and mean absolute error (right) for each value of `step`
fig, ax = plt.subplots(1, 2, figsize=(18,6))
panels = (
    (ax[0], 'error', 'Mean error per step'),
    (ax[1], 'abs_error', 'Mean absolute error per step'),
)
for panel, column, panel_title in panels:
    results.groupby('step')[column].mean().plot(ax=panel)
    panel.set_title(panel_title)