In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to imported source files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

Use the HDF5 file generated in notebook 28 to train an XGBoost model.

In [None]:
import h5py
import numpy as np
import os
import pathlib
import xgboost as xgb

In [None]:
# Root directory of the datasets, taken from the DATA_DIR environment variable.
# os.environ[...] raises a KeyError naming the missing variable when it is not
# set, instead of the opaque TypeError that pathlib.Path(None) would produce.
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# HDF5 file read by the rest of the notebook.
INPUT_FILE = DATA_DIR / '2021-03-17-ppdataset/test.hdf'

In [None]:
# Open the HDF5 file read-only. The handle stays open for the rest of the
# notebook because later cells index into it to pull rows from the file.
dataset = h5py.File(INPUT_FILE, mode='r')

In [None]:
# Handle to the training inputs stored at /train/x; later cells slice it to
# load rows and append them to the one-hot station columns.
dataset_x = dataset['/train/x']

In [None]:
# Show the dimensions of the training inputs.
dataset_x.shape

In [None]:
# Handle to the training targets stored at /train/y; used as the DMatrix label
# when the training matrix is built.
dataset_y = dataset['/train/y']

In [None]:
# Handle to the per-row station indices stored at /train/stations; these are
# used as column indices when one-hot encoding the station of each row.
dataset_stations = dataset['/train/stations']

In [None]:
# One-hot encode the station index of each training row.
# Read the station slice once and size the matrix from what was actually
# returned: with a hard-coded row count, a split holding fewer than 100000
# rows makes the indexed assignment fail with a shape mismatch.
train_stations = dataset_stations[:100000]
n_train_rows = len(train_stations)
# 2359 columns, one per station, matching the stn0..stn2358 feature names.
one_hot = np.zeros((n_train_rows, 2359), dtype=bool)
# Row i gets a single True in the column given by its station index.
one_hot[np.arange(n_train_rows), train_stations] = True

In [None]:
# Count of True entries per row; each row should sum to 1 if the encoding
# above set exactly one station column per sample.
one_hot.sum(axis=1)

In [None]:
# Training feature matrix: one-hot station columns first, then the columns of
# /train/x. The number of rows read from the file follows `one_hot` rather
# than repeating the literal row count, so the two halves cannot drift apart.
# np.concatenate casts both inputs to a common dtype, so the boolean columns
# end up as numeric 0/1 (assuming dataset_x is numeric — TODO confirm).
in_features = np.concatenate((one_hot, dataset_x[:len(one_hot)]), axis=1)

In [None]:
# Show the one-hot matrix dimensions (rows, 2359 station columns).
one_hot.shape

In [None]:
# Show the combined matrix dimensions: the column count is the 2359 one-hot
# columns plus the columns of dataset_x.
in_features.shape

In [None]:
# Column labels for the model input, in the same order as the concatenated
# feature matrix: one 'stn<i>' label per one-hot station column, followed by
# the labels for the remaining input columns.
# NOTE(review): the 26 trailing labels are assumed to follow the column order
# of /train/x — confirm against the notebook that generated the file.
feature_names = [
    *(f'stn{station_idx}' for station_idx in range(2359)),
    'year',
    'hour',
    'step',
    'latitude',
    'longitude',
    'elevation',
    'gdps_prate',
    'gdps_prmsl',
    'gdps_2t',
    'gdps_2d',
    'gdps_2r',
    'gdps_10u',
    'gdps_10v',
    'gdps_10si',
    'gdps_10wdir',
    'gdps_al',
    'gdps_t_850',
    'gdps_t_500',
    'gdps_gh_1000',
    'gdps_gh_850',
    'gdps_gh_500',
    'gdps_u_500',
    'gdps_v_500',
    'gdps_q_850',
    'gdps_q_500',
    'gdps_thick',
]

In [None]:
# Wrap the training features and the matching slice of targets in XGBoost's
# DMatrix container, attaching the column labels so that importances and tree
# plots refer to features by name.
dtrain = xgb.DMatrix(
    data=in_features,
    label=dataset_y[:100000],
    feature_names=feature_names,
)

In [None]:
# Booster configuration passed to xgb.train.
param = {
    'max_depth': 4,                   # depth limit of each tree
    'eta': 1,                         # learning rate; 1 means no shrinkage
    'objective': 'reg:squarederror',  # squared-error regression
    'eval_metric': ['rmse', 'mae'],   # metrics reported by Booster.eval
}

In [None]:
# Fit the booster on the training matrix. No number of boosting rounds is
# given, so the library default applies.
booster = xgb.train(params=param, dtrain=dtrain)

In [None]:
# Score the model on the data it was trained on (in-sample error), for
# comparison with the validation score computed further down.
booster.eval(data=dtrain)

In [None]:
# Load up to the first 100000 rows of the validation inputs into memory.
# `val_x` is reassigned further down with the one-hot station columns prepended.
val_x = dataset['/val/x'][:100000]

In [None]:
# Load the matching slice of validation targets; used as the DMatrix label.
val_y = dataset['/val/y'][:100000]

In [None]:
# One-hot encode the station index of each validation row.
# The row count follows the slice actually read, so a validation split with
# fewer than 100000 rows no longer triggers a shape mismatch in the indexed
# assignment below.
val_stations = dataset['/val/stations'][:100000]
n_val_rows = len(val_stations)
# 2359 columns, one per station, matching the stn0..stn2358 feature names.
val_one_hot = np.zeros((n_val_rows, 2359), dtype=bool)
# Row i gets a single True in the column given by its station index.
val_one_hot[np.arange(n_val_rows), val_stations] = True

In [None]:
# Validation feature matrix: one-hot station columns first, then the columns
# of /val/x, mirroring the layout of the training matrix.
# The raw inputs are re-read from the file instead of being taken from `val_x`
# itself, so re-running this cell does not stack a second set of one-hot
# columns onto an already-augmented `val_x`.
val_x = np.concatenate((val_one_hot, dataset['/val/x'][:len(val_one_hot)]), axis=1)

In [None]:
# Number of column labels; this must equal the column count of `val_x` shown
# in the next cell, since the labels are attached to that matrix.
len(feature_names)

In [None]:
# Show the validation matrix dimensions after the one-hot columns were added.
val_x.shape

In [None]:
# Wrap the validation features and targets in a DMatrix, using the same column
# labels as the training matrix.
dval = xgb.DMatrix(
    data=val_x,
    label=val_y,
    feature_names=feature_names,
)

In [None]:
# Score the model on the validation matrix (out-of-sample error).
booster.eval(data=dval)

In [None]:
# Plot the feature importances of the trained booster, labelled with the names
# attached to the training matrix.
xgb.plot_importance(booster=booster)

In [None]:
# Importance scores keyed by feature name. 'weight' counts how many times each
# feature is used in a split across all trees.
booster.get_score(importance_type='weight')

In [None]:
# Draw one tree of the trained booster; no tree index is given, so the
# library's default selection applies.
xgb.plot_tree(booster=booster)