In [None]:
# Load IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so that edits to project code are picked up.
%load_ext autoreload
%autoreload 2

# Create DMatrix

Make a proof of concept where we build the XGBoost DMatrices without Dask.
Everything happens on one host, in memory, using only Pandas and XGBoost.

In [None]:
import numpy as np
import os
import pandas as pd
import pathlib
import plotly.express as px
import tqdm.notebook as tqdm
import xgboost as xgb

In [None]:
def sizeof_fmt(num, suffix="B"):
    """Format a quantity with binary (1024-based) unit prefixes.

    Args:
        num: The quantity to format (e.g. a number of bytes). May be negative.
        suffix: Unit symbol appended after the prefix. Defaults to "B".

    Returns:
        A string with one decimal place, such as "1.5KiB". Magnitudes too
        large for the "Zi" prefix are expressed in "Yi".
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    for prefix in prefixes:
        # Stop at the first prefix under which the magnitude drops below 1024.
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefix}{suffix}"
        value = value / 1024.0
    # Ran out of prefixes: report whatever is left in the largest unit.
    return f"{value:.1f}Yi{suffix}"

In [None]:
# Root directory of the datasets, read from the DATA_DIR environment variable.
# Fail early with an explicit message: os.getenv() returns None when the
# variable is unset, and pathlib.Path(None) only raises an opaque TypeError.
if 'DATA_DIR' not in os.environ:
    raise RuntimeError("The DATA_DIR environment variable must be set to the dataset root directory.")
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
DATASET_PATH = DATA_DIR / 'interpolated/2021-12-20-gdps-metar/'
dataset_path = pathlib.Path(DATASET_PATH)
# Sorted so the files are always loaded in the same order.
dataset_files = sorted(dataset_path.glob('*.parquet'))
# Without this check an empty directory only surfaces later, when pd.concat
# is handed an empty list.
if not dataset_files:
    raise FileNotFoundError(f"No .parquet files found in {dataset_path}")

In [None]:
# Columns read from every parquet file; anything else in the files is skipped.
columns = [
    # Model inputs (everything except the index keys and obs_2t is fed to XGBoost).
    "gdps_2t", "gdps_u_500", "gdps_v_500", "gdps_gh_500", "gdps_10si",
    "step",
    # Keys that become the (date, station) index after loading.
    "date", "station",
    # Used together with gdps_2t to derive the training target.
    "obs_2t",
]

In [None]:
# Read only the selected columns of each parquet file, with a progress bar.
dfs = [
    pd.read_parquet(parquet_file, columns=columns)
    for parquet_file in tqdm.tqdm(dataset_files)
]

In [None]:
# Stack the per-file frames into a single table keyed by (date, station).
df = pd.concat(dfs, copy=False).set_index(['date', 'station'])
df

In [None]:
# Training target: obs_2t minus gdps_2t (used as the label when fitting the model).
df['error'] = df['obs_2t'].sub(df['gdps_2t'])

In [None]:
# Report the in-memory footprint of the full frame in human-readable form.
# NOTE(review): memory_usage() is called with its defaults (shallow, deep=False),
# so object-dtype data may be under-counted — confirm if the figure matters.
sizeof_fmt(df.memory_usage().sum())

In [None]:
# Training split. Label-based slices are inclusive on both ends, so the range
# stops at 2020-12-31: an end bound of '2021-01-01' would put that day in both
# this split and the validation split, which starts on 2021-01-01.
# xs(..., level=0) also drops the date level (drop_level defaults to True),
# which the later df_train.loc[station] lookups rely on.
# NOTE(review): assumes the 'date' level is datetime-like or ISO 'YYYY-MM-DD' — confirm.
df_train = df.xs(slice('2019-01-01', '2020-12-31'), level=0)

In [None]:
# Inspect the training split.
df_train

In [None]:
df_val

In [None]:
df_val = df.xs(slice('2021-01-01', '2022-01-01'), level=0)

In [None]:
# List the distinct index labels present in the training split.
df_train.index.unique()

In [None]:
# Fit one error-correction model per station (only 'ZZV' in this proof of concept).
# NOTE: `model` is overwritten on every iteration, so only the last station's
# model survives the loop; `station` and `model` are reused by later cells.
for station in ['ZZV']:
    # Select the station's rows once instead of repeating the .loc lookup.
    station_train = df_train.loc[station]
    X = xgb.DMatrix(
        # Features: every column except obs_2t and the target derived from it.
        station_train.drop(columns=['obs_2t', 'error']).values,
        label=station_train[['error']].values
    )

    params = dict(
        gamma=1,
        learning_rate=0.01,
        max_depth=3,
        subsample=0.8,
        objective='reg:squarederror',
        eval_metric=['rmse', 'mae'],
    )

    # `n_estimators` is a scikit-learn wrapper parameter; the native xgb.train()
    # takes the number of boosting rounds through `num_boost_round` (default 10).
    # Left in `params`, the model was trained for only 10 rounds.
    model = xgb.train(params, X, num_boost_round=10000)

In [None]:
# Validation features for the station left in `station` by the training loop.
# errors='ignore' lets this cell run on a fresh kernel, where 'prediction' does
# not exist yet, and on a re-run after that column has been added to df_val.
dmatrix_val = xgb.DMatrix(
    df_val.loc[station].drop(columns=['obs_2t', 'error', 'prediction'], errors='ignore').values,
)

In [None]:
# Predicted value of the 'error' target for each validation row of the station.
p = model.predict(dmatrix_val)

In [None]:
# Rebind df_val to an independent copy before the following cells add a column to it.
# NOTE(review): presumably avoids SettingWithCopyWarning, since df_val was
# derived from df — confirm.
df_val = df_val.copy()

In [None]:
# Start every row's prediction as NaN; only the modelled station is filled in afterwards.
df_val['prediction'] = np.nan

In [None]:
# Write the predictions onto the station's rows. `p` was computed from
# df_val.loc[station], so it has one value per selected row, in the same order.
df_val.loc[station, 'prediction'] = p

In [None]:
# Inspect the station's validation rows, now including the 'prediction' column.
df_val.loc[station]

In [None]:
# Working copy of the current station's validation rows for the error analysis.
# NOTE(review): the variable is called `cyul`, but `station` is 'ZZV' at this
# point — the name does not identify the station being analysed.
cyul = df_val.loc[station].copy()

In [None]:
# Squared error of the corrected forecast (gdps_2t plus the predicted error) against obs_2t.
cyul['squared_error'] = np.square(cyul['obs_2t'] - (cyul['gdps_2t'] + cyul['prediction']))

In [None]:
# Baseline: squared error of the uncorrected gdps_2t against obs_2t.
# Despite the column name, the stored values are squared.
cyul['raw_error'] = np.square(cyul['obs_2t'] - cyul['gdps_2t'])

In [None]:
# Per-step RMSE: average the two squared-error columns within each step, then
# take the square root. The columns are selected before aggregating so only
# they are averaged and no unrelated (possibly non-numeric) column reaches .mean().
error_by_step = np.sqrt(cyul.groupby('step')[['squared_error', 'raw_error']].mean())

In [None]:
# Inspect the per-step error table (square root of the mean squared errors).
error_by_step

In [None]:
# Long format for plotting: one row per (step, error series) pair.
error_melt = (
    error_by_step
    .reset_index()
    .melt(id_vars=['step'], value_vars=['squared_error', 'raw_error'])
)

In [None]:
# Inspect the long-format error table that feeds the plot.
error_melt

In [None]:
# Compare the corrected forecast ('squared_error' series) with the uncorrected
# gdps_2t ('raw_error' series). Both hold RMSE values at this point, so the
# axis is labelled accordingly and the figure gets a title to stand on its own.
px.line(
    data_frame=error_melt,
    x='step',
    y='value',
    color='variable',
    title='Validation RMSE by step: corrected vs. raw forecast',
    labels={'value': 'RMSE', 'variable': 'series'},
)