In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold

from xgboost import XGBRegressor

from funding.preprocessing import build_time_series, split_last_year
from funding.training import extract_features
from funding.validation import evaluate
from funding.io import load_features

from interfaces.db import DB
from utils.text.io import log

## Build data from DB

In [3]:
# Time window for the series; the last year is later split off as the response.
min_year, max_year = 2018, 2021

# Verbosity flag passed to log().
debug = True

# Identifier derived from the time window; used to load the stored feature set.
name = f'simple_{min_year}_{max_year}'

In [4]:
# Instantiate the DB interface and fetch the Crunchbase concept ids.
db = DB()
concept_ids = db.get_crunchbase_concept_ids()

# Create time series with data from database.
# Note: debug=False is passed here regardless of the notebook-level `debug` flag.
log(f'Creating time series for time window {min_year}-{max_year}...', debug)
df_all = build_time_series(min_year, max_year, concept_ids=concept_ids, debug=False)

# Split rows into < max_year (training data, `df`) and = max_year (response variable, `y`).
# The unsplit frame keeps its own name so `df` always means the training portion.
df, y = split_last_year(df_all, max_year)

# Load features
log('Loading model features from disk...', debug)
features = load_features(name)

# Extract model features
log('Extracting model features...', debug)
X = extract_features(df, features)

Creating time series for time window 2018-2021...
Loading model features from disk...
Extracting model features...


Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:02<00:00,  9.41it/s]

Shape of df after feature extraction:  (685, 122)





## Step 1: Set a high `learning_rate` to determine `n_estimators`

Start with general, sensible parameters, using a high learning rate and a large `n_estimators`.

Training stops after `early_stopping_rounds` consecutive iterations with no improvement in the validation score.

That will give an idea of a good value for `n_estimators`.

In [44]:
# Starting parameters for tuning: a large tree budget combined with early
# stopping, so the stopping point suggests a suitable n_estimators.
xgb_params = dict(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    early_stopping_rounds=50,
)

In [49]:
# Baseline run with the starting parameters; debug=True prints the score report.
baseline_scores = evaluate(X, y, xgb_params, debug=True)
baseline_scores

Optimal number of trees, averaged over all cv folds and rounded: 9
CV SCORE [R2 score (1 - (residual sum squares)/(total sum squares))] after 10x3 cross-validation: -0.16105932081775104
TRAIN SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: 0.421005631844809
TEST SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: -0.09325017468324237


{'train': 0.421005631844809,
 'cv': -0.16105932081775104,
 'test': -0.09325017468324237,
 'diff-train-cv': 0.5820649526625601,
 'diff-train-test': 0.5142558065280514,
 'diff-test-cv': 0.06780914613450867}

We observe the optimal number of estimators for this set of parameters is **9**.

In [50]:
# Fix the tree count at the optimum reported by the early-stopping run.
xgb_params.update(n_estimators=9)

## Step 2: Tune `max_depth` and `min_child_weight`

We tune these two parameters as they typically have the highest impact.

In [57]:
# Candidate values for the two tree-structure parameters.
max_depth_grid = [1, 3, 5, 7, 9]
min_child_weight_grid = [1, 3, 5, 7]

# CV score per (max_depth, min_child_weight) combination.
grid_scores = {}

for max_depth in max_depth_grid:
    for min_child_weight in min_child_weight_grid:
        # Evaluate on a copy: writing into xgb_params would leave it stuck at the
        # last grid values (max_depth=9, min_child_weight=7) for later cells.
        candidate_params = {
            **xgb_params,
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
        }
        cv_score = evaluate(X, y, candidate_params, debug=False)['cv']
        grid_scores[(max_depth, min_child_weight)] = cv_score
        print(f'max_depth={max_depth}', f'min_child_weight={min_child_weight}', f'cv_score={cv_score}')

# Pick the best combination programmatically instead of reading it off the log.
best_max_depth, best_min_child_weight = max(grid_scores, key=grid_scores.get)
print(
    f'best: max_depth={best_max_depth}',
    f'min_child_weight={best_min_child_weight}',
    f'cv_score={grid_scores[(best_max_depth, best_min_child_weight)]}',
)

max_depth=1 min_child_weight=1 cv_score=-0.055073602484757206
max_depth=1 min_child_weight=3 cv_score=-0.05521409866921561
max_depth=1 min_child_weight=5 cv_score=-0.051496680131384955
max_depth=1 min_child_weight=7 cv_score=-0.04893860816275363
max_depth=3 min_child_weight=1 cv_score=-0.107592893366294
max_depth=3 min_child_weight=3 cv_score=-0.10286554882292513
max_depth=3 min_child_weight=5 cv_score=-0.098778984834914
max_depth=3 min_child_weight=7 cv_score=-0.08843891880564311
max_depth=5 min_child_weight=1 cv_score=-0.1452005904239301
max_depth=5 min_child_weight=3 cv_score=-0.12546054761202122
max_depth=5 min_child_weight=5 cv_score=-0.10804882408993777
max_depth=5 min_child_weight=7 cv_score=-0.10747981603818926
max_depth=7 min_child_weight=1 cv_score=-0.19008435074954233
max_depth=7 min_child_weight=3 cv_score=-0.13985268469561404
max_depth=7 min_child_weight=5 cv_score=-0.13518870983091671
max_depth=7 min_child_weight=7 cv_score=-0.12125489572216737
max_depth=9 min_child_weigh

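The combination yielding the best results is `max_depth=3`, `min_child_weight=5`.

**TODO:** this does not match the scores printed above, where `max_depth=1`, `min_child_weight=7` has the highest CV score (≈ -0.049, versus ≈ -0.099 for `max_depth=3`, `min_child_weight=5`). Re-run the grid and confirm which combination should be carried forward.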