In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold

from xgboost import XGBRegressor

from funding.preprocessing import build_time_series, split_last_year
from funding.training import extract_features
from funding.validation import evaluate
from funding.io import load_features

from interfaces.db import DB
from utils.text.io import log

## Build data from DB

In [3]:
min_year = 2018
max_year = 2021

debug = True

name = f'simple_{min_year}_{max_year}'

In [4]:
db = DB()
concept_ids = db.get_crunchbase_concept_ids()

# Create time series with data from database
log(f'Creating time series for time window {min_year}-{max_year}...', debug)
df = build_time_series(min_year, max_year, concept_ids=concept_ids, debug=False)

# Split df rows into < max_year (training data) and = max_year (response variable)
df, y = split_last_year(df, max_year)

# Load features
log(f'Loading model features from disk...', debug)
features = load_features(name)

# Extract model features
log(f'Extracting model features...', debug)
X = extract_features(df, features)

Creating time series for time window 2018-2021...
Loading model features from disk...
Extracting model features...


Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:02<00:00,  9.41it/s]

Shape of df after feature extraction:  (685, 122)





## Step 1: Set high`learning_rate` to determine `n_estimators`

Start with general, sensible parameters, with high learning rate and n_estimators.

After `early_stopping_rounds` iterations with no improvement in validation, training will stop.

That will give an idea of a good value for `n_estimators`.

In [44]:
xgb_params = {
    'n_estimators': 1000,
    'max_depth': 6,
    'learning_rate': 0.1,
    'gamma': 0,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'early_stopping_rounds': 50
}

In [49]:
evaluate(X, y, xgb_params, debug=True)

Optimal number of trees, averaged over all cv folds and rounded: 9
CV SCORE [R2 score (1 - (residual sum squares)/(total sum squares))] after 10x3 cross-validation: -0.16105932081775104
TRAIN SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: 0.421005631844809
TEST SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: -0.09325017468324237


{'train': 0.421005631844809,
 'cv': -0.16105932081775104,
 'test': -0.09325017468324237,
 'diff-train-cv': 0.5820649526625601,
 'diff-train-test': 0.5142558065280514,
 'diff-test-cv': 0.06780914613450867}

We observe the optimal number of estimators for this set of parameters is **9**.

In [50]:
xgb_params['n_estimators'] = 9

## Step 2: Tune `max_depth` and `min_child_weight`

We tune these two parameters as they typically have the highest impact.

In [91]:
max_depth_grid = [3, 5, 7, 9]
min_child_weight_grid = [1, 3, 5]

best_score = None
best_combination = None
for max_depth in max_depth_grid:
    xgb_params['max_depth'] = max_depth
    for min_child_weight in min_child_weight_grid:
        xgb_params['min_child_weight'] = min_child_weight
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (max_depth, min_child_weight)

        print(f'max_depth={max_depth}', f'min_child_weight={min_child_weight}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'max_depth={best_combination[0]}', f'min_child_weight={best_combination[1]}', f'cv_score={best_score}')

max_depth=3 min_child_weight=1 cv_score=-0.08219119594139777
max_depth=3 min_child_weight=3 cv_score=-0.08074599557530335
max_depth=3 min_child_weight=5 cv_score=-0.09764373880024536
max_depth=5 min_child_weight=1 cv_score=-0.12203864954541305
max_depth=5 min_child_weight=3 cv_score=-0.11272164637649976
max_depth=5 min_child_weight=5 cv_score=-0.11237204439930831
max_depth=7 min_child_weight=1 cv_score=-0.1326823629285182
max_depth=7 min_child_weight=3 cv_score=-0.12439632809325461
max_depth=7 min_child_weight=5 cv_score=-0.11657140065761344
max_depth=9 min_child_weight=1 cv_score=-0.15707277908311781
max_depth=9 min_child_weight=3 cv_score=-0.1280681044370781
max_depth=9 min_child_weight=5 cv_score=-0.12170612066190362

Best combination:
max_depth=3 min_child_weight=3 cv_score=-0.08074599557530335


The combination yielding the best results is `max_depth=3`, `min_child_weight=5`. We repeat the process with different grids closer to the obtained values.

In [92]:
max_depth_grid = [1, 2, 3, 4]
min_child_weight_grid = [5, 6, 7, 8, 9]

best_score = None
best_combination = None
for max_depth in max_depth_grid:
    xgb_params['max_depth'] = max_depth
    for min_child_weight in min_child_weight_grid:
        xgb_params['min_child_weight'] = min_child_weight
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (max_depth, min_child_weight)

        print(f'max_depth={max_depth}', f'min_child_weight={min_child_weight}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'max_depth={best_combination[0]}', f'min_child_weight={best_combination[1]}', f'cv_score={best_score}')

max_depth=1 min_child_weight=5 cv_score=-0.10081788517237976
max_depth=1 min_child_weight=6 cv_score=-0.09723244190873709
max_depth=1 min_child_weight=7 cv_score=-0.09175427541912443
max_depth=1 min_child_weight=8 cv_score=-0.09488333835800894
max_depth=1 min_child_weight=9 cv_score=-0.09752685052864672
max_depth=2 min_child_weight=5 cv_score=-0.08692708540423962
max_depth=2 min_child_weight=6 cv_score=-0.08074101856412606
max_depth=2 min_child_weight=7 cv_score=-0.0804149931528667
max_depth=2 min_child_weight=8 cv_score=-0.07926998468428562
max_depth=2 min_child_weight=9 cv_score=-0.08596118576353028
max_depth=3 min_child_weight=5 cv_score=-0.09764373880024536
max_depth=3 min_child_weight=6 cv_score=-0.08586318168909876
max_depth=3 min_child_weight=7 cv_score=-0.08573641682769831
max_depth=3 min_child_weight=8 cv_score=-0.08252533659939267
max_depth=3 min_child_weight=9 cv_score=-0.09473510027140279
max_depth=4 min_child_weight=5 cv_score=-0.10828229845131544
max_depth=4 min_child_wei

The combination yielding the best results is `max_depth=1`, `min_child_weight=8`. We keep these values for the moment.

In [64]:
xgb_params['max_depth'] = 1
xgb_params['min_child_weight'] = 8

## Step 3: Tune `gamma`

In [69]:
gamma_grid = [0, 0.5, 1, 1.5, 2, 2.5]

best_score = None
best_param = None
for gamma in gamma_grid:
    xgb_params['gamma'] = gamma
    cv_score = evaluate(X, y, xgb_params, debug=False)['cv']

    if best_score is None or best_score < cv_score:
        best_score = cv_score
        best_param = gamma

    print(f'gamma={gamma}', f'cv_score={cv_score}')

print()
print('Best parameter:')
print(f'gamma={best_param}', f'cv_score={best_score}')

gamma=0 cv_score=-0.04869845177376186
gamma=0.5 cv_score=-0.04869845177376186
gamma=1 cv_score=-0.04869845177376186
gamma=1.5 cv_score=-0.04869845177376186
gamma=2 cv_score=-0.04869845177376186
gamma=2.5 cv_score=-0.04869845177376186

Best parameter:
gamma=0 cv_score=-0.04869845177376186


`gamma` does not seem to affect the cv score. We set it back to 0, the default value.

In [71]:
xgb_params['gamma'] = 0

## Step 4: Recalibrate the number of estimators

In [72]:
evaluate(X, y, xgb_params, debug=True)

Optimal number of trees, averaged over all cv folds and rounded: 9
CV SCORE [R2 score (1 - (residual sum squares)/(total sum squares))] after 10x3 cross-validation: -0.04869845177376186
TRAIN SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: 0.005808709889873809
TEST SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: -0.054544020256831915


{'train': 0.005808709889873809,
 'cv': -0.04869845177376186,
 'test': -0.054544020256831915,
 'diff-train-cv': 0.054507161663635666,
 'diff-train-test': 0.060352730146705724,
 'diff-test-cv': 0.0058455684830700586}

Still the optimal value is 9, nothing to change.

## Step 5: Tune `subsample`, `colsample_bytree`

In [73]:
subsample_grid = [0.6, 0.7, 0.8, 0.9]
colsample_bytree_grid = [0.6, 0.7, 0.8, 0.9]

best_score = None
best_combination = None
for subsample in subsample_grid:
    xgb_params['subsample'] = subsample
    for colsample_bytree in colsample_bytree_grid:
        xgb_params['colsample_bytree'] = colsample_bytree
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (subsample, colsample_bytree)

        print(f'subsample={subsample}', f'colsample_bytree={colsample_bytree}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'subsample={best_combination[0]}', f'colsample_bytree={best_combination[1]}', f'cv_score={best_score}')

subsample=0.6 colsample_bytree=0.6 cv_score=-0.05418126225145467
subsample=0.6 colsample_bytree=0.7 cv_score=-0.05524874709341067
subsample=0.6 colsample_bytree=0.8 cv_score=-0.05547567420876541
subsample=0.6 colsample_bytree=0.9 cv_score=-0.056724076206346494
subsample=0.7 colsample_bytree=0.6 cv_score=-0.05396833870079646
subsample=0.7 colsample_bytree=0.7 cv_score=-0.054015747246429074
subsample=0.7 colsample_bytree=0.8 cv_score=-0.055535598742020735
subsample=0.7 colsample_bytree=0.9 cv_score=-0.05395721838972627
subsample=0.8 colsample_bytree=0.6 cv_score=-0.04849489716382564
subsample=0.8 colsample_bytree=0.7 cv_score=-0.04810394114652001
subsample=0.8 colsample_bytree=0.8 cv_score=-0.04869845177376186
subsample=0.8 colsample_bytree=0.9 cv_score=-0.04811103033285473
subsample=0.9 colsample_bytree=0.6 cv_score=-0.047483232271403684
subsample=0.9 colsample_bytree=0.7 cv_score=-0.04680245879418173
subsample=0.9 colsample_bytree=0.8 cv_score=-0.047514253526024564
subsample=0.9 colsam

Best combination: `subsample=0.9` and `colsample_bytree=0.7`. We repeat the process with a finer grid.

In [74]:
subsample_grid = [0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98]
colsample_bytree_grid = [0.62, 0.64, 0.66, 0.68, 0.7, 0.72, 0.74, 0.76, 0.78]

best_score = None
best_combination = None
for subsample in subsample_grid:
    xgb_params['subsample'] = subsample
    for colsample_bytree in colsample_bytree_grid:
        xgb_params['colsample_bytree'] = colsample_bytree
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (subsample, colsample_bytree)

        print(f'subsample={subsample}', f'colsample_bytree={colsample_bytree}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'subsample={best_combination[0]}', f'colsample_bytree={best_combination[1]}', f'cv_score={best_score}')

subsample=0.82 colsample_bytree=0.62 cv_score=-0.04814284300456329
subsample=0.82 colsample_bytree=0.64 cv_score=-0.04729786197205464
subsample=0.82 colsample_bytree=0.66 cv_score=-0.047870108776347446
subsample=0.82 colsample_bytree=0.68 cv_score=-0.047870108776347446
subsample=0.82 colsample_bytree=0.7 cv_score=-0.04965653698639308
subsample=0.82 colsample_bytree=0.72 cv_score=-0.04965653698639308
subsample=0.82 colsample_bytree=0.74 cv_score=-0.04848344746168098
subsample=0.82 colsample_bytree=0.76 cv_score=-0.04854092395846531
subsample=0.82 colsample_bytree=0.78 cv_score=-0.04901971713871885
subsample=0.84 colsample_bytree=0.62 cv_score=-0.046125924877266654
subsample=0.84 colsample_bytree=0.64 cv_score=-0.045876204739272425
subsample=0.84 colsample_bytree=0.66 cv_score=-0.0464071730143792
subsample=0.84 colsample_bytree=0.68 cv_score=-0.0464071730143792
subsample=0.84 colsample_bytree=0.7 cv_score=-0.047747591255027415
subsample=0.84 colsample_bytree=0.72 cv_score=-0.047747591255

We set the values to `subsample=0.9` and `colsample_bytree=0.66`

In [75]:
xgb_params['subsample'] = 0.9
xgb_params['colsample_bytree'] = 0.66

## Step 6: Tune regularization parameters `reg_alpha` and `reg_lambda`

In [76]:
reg_alpha_grid = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
reg_lambda_grid = [1, 2, 4, 8, 16]

best_score = None
best_combination = None
for reg_alpha in reg_alpha_grid:
    xgb_params['reg_alpha'] = reg_alpha
    for reg_lambda in reg_lambda_grid:
        xgb_params['reg_lambda'] = reg_lambda
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (reg_alpha, reg_lambda)

        print(f'reg_alpha={reg_alpha}', f'reg_lambda={reg_lambda}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'reg_alpha={best_combination[0]}', f'reg_lambda={best_combination[1]}', f'cv_score={best_score}')

reg_alpha=1e-05 reg_lambda=1 cv_score=-0.04578278364243649
reg_alpha=1e-05 reg_lambda=2 cv_score=-0.047945825763791666
reg_alpha=1e-05 reg_lambda=4 cv_score=-0.046920397674903504
reg_alpha=1e-05 reg_lambda=8 cv_score=-0.04841047165461595
reg_alpha=1e-05 reg_lambda=16 cv_score=-0.047778442263431115
reg_alpha=0.0001 reg_lambda=1 cv_score=-0.04578278354502468
reg_alpha=0.0001 reg_lambda=2 cv_score=-0.047945825763791666
reg_alpha=0.0001 reg_lambda=4 cv_score=-0.046920397674903504
reg_alpha=0.0001 reg_lambda=8 cv_score=-0.04841047165461595
reg_alpha=0.0001 reg_lambda=16 cv_score=-0.047778442263431115
reg_alpha=0.001 reg_lambda=1 cv_score=-0.04578278354502468
reg_alpha=0.001 reg_lambda=2 cv_score=-0.047945825763791666
reg_alpha=0.001 reg_lambda=4 cv_score=-0.046920397674903504
reg_alpha=0.001 reg_lambda=8 cv_score=-0.04841047165461595
reg_alpha=0.001 reg_lambda=16 cv_score=-0.047778442263431115
reg_alpha=0.01 reg_lambda=1 cv_score=-0.04578278354502468
reg_alpha=0.01 reg_lambda=2 cv_score=-0.

Best combination is `reg_alpha=1e-4` and `reg_lambda=1`. We repeat the process with a finer grid.

In [78]:
reg_alpha_grid = [1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4]
reg_lambda_grid = [1, 1.1, 1.2, 1.3, 1.4]

best_score = None
best_combination = None
for reg_alpha in reg_alpha_grid:
    xgb_params['reg_alpha'] = reg_alpha
    for reg_lambda in reg_lambda_grid:
        xgb_params['reg_lambda'] = reg_lambda
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (reg_alpha, reg_lambda)

        print(f'reg_alpha={reg_alpha}', f'reg_lambda={reg_lambda}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'reg_alpha={best_combination[0]}', f'reg_lambda={best_combination[1]}', f'cv_score={best_score}')

reg_alpha=1e-05 reg_lambda=1 cv_score=-0.04578278364243649
reg_alpha=1e-05 reg_lambda=1.1 cv_score=-0.04578273104510148
reg_alpha=1e-05 reg_lambda=1.2 cv_score=-0.04548284746598081
reg_alpha=1e-05 reg_lambda=1.3 cv_score=-0.045643120417889954
reg_alpha=1e-05 reg_lambda=1.4 cv_score=-0.046375086918695484
reg_alpha=2e-05 reg_lambda=1 cv_score=-0.04578278364243649
reg_alpha=2e-05 reg_lambda=1.1 cv_score=-0.04578273104510148
reg_alpha=2e-05 reg_lambda=1.2 cv_score=-0.04548284746598081
reg_alpha=2e-05 reg_lambda=1.3 cv_score=-0.045643120417889954
reg_alpha=2e-05 reg_lambda=1.4 cv_score=-0.046375086918695484
reg_alpha=5e-05 reg_lambda=1 cv_score=-0.04578278354502468
reg_alpha=5e-05 reg_lambda=1.1 cv_score=-0.04578273104510148
reg_alpha=5e-05 reg_lambda=1.2 cv_score=-0.04548284746598081
reg_alpha=5e-05 reg_lambda=1.3 cv_score=-0.045643120417889954
reg_alpha=5e-05 reg_lambda=1.4 cv_score=-0.046375086918695484
reg_alpha=0.0001 reg_lambda=1 cv_score=-0.04578278354502468
reg_alpha=0.0001 reg_lamb

Now the optimal combination is `reg_alpha=1e-5` and `reg_lambda=1.2`. We repeat once again since we never tried values for `reg_alpha` under `1e-5`.

In [81]:
reg_alpha_grid = [0, 1e-7, 2e-7, 5e-7, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5]
reg_lambda_grid = [1.14, 1.16, 1.18, 1.2, 1.22, 1.24, 1.26]

best_score = None
best_combination = None
for reg_alpha in reg_alpha_grid:
    xgb_params['reg_alpha'] = reg_alpha
    for reg_lambda in reg_lambda_grid:
        xgb_params['reg_lambda'] = reg_lambda
        cv_score = evaluate(X, y, xgb_params, debug=False)['cv']
        
        if best_score is None or best_score < cv_score:
            best_score = cv_score
            best_combination = (reg_alpha, reg_lambda)

        print(f'reg_alpha={reg_alpha}', f'reg_lambda={reg_lambda}', f'cv_score={cv_score}')

print()
print('Best combination:')
print(f'reg_alpha={best_combination[0]}', f'reg_lambda={best_combination[1]}', f'cv_score={best_score}')

reg_alpha=0 reg_lambda=1.14 cv_score=-0.04575066737345572
reg_alpha=0 reg_lambda=1.16 cv_score=-0.045739285173386254
reg_alpha=0 reg_lambda=1.18 cv_score=-0.04549778908046519
reg_alpha=0 reg_lambda=1.2 cv_score=-0.04548284746598081
reg_alpha=0 reg_lambda=1.22 cv_score=-0.04515917307731445
reg_alpha=0 reg_lambda=1.24 cv_score=-0.04514891002530697
reg_alpha=0 reg_lambda=1.26 cv_score=-0.04567284011247232
reg_alpha=1e-07 reg_lambda=1.14 cv_score=-0.04575066737345572
reg_alpha=1e-07 reg_lambda=1.16 cv_score=-0.045739285173386254
reg_alpha=1e-07 reg_lambda=1.18 cv_score=-0.04549778908046519
reg_alpha=1e-07 reg_lambda=1.2 cv_score=-0.04548284746598081
reg_alpha=1e-07 reg_lambda=1.22 cv_score=-0.04515917307731445
reg_alpha=1e-07 reg_lambda=1.24 cv_score=-0.04514891002530697
reg_alpha=1e-07 reg_lambda=1.26 cv_score=-0.04567284011247232
reg_alpha=2e-07 reg_lambda=1.14 cv_score=-0.04575066737345572
reg_alpha=2e-07 reg_lambda=1.16 cv_score=-0.045739285173386254
reg_alpha=2e-07 reg_lambda=1.18 cv_

We set the optimal values for `reg_alpha=0` and `reg_lambda=1.24`.

In [82]:
xgb_params['reg_alpha'] = 0
xgb_params['reg_lambda'] = 1.24

## Step 7: Recalibrate the number of estimators (again)

In [87]:
evaluate(X, y, xgb_params, debug=True)

Optimal number of trees, averaged over all cv folds and rounded: 9
CV SCORE [R2 score (1 - (residual sum squares)/(total sum squares))] after 10x3 cross-validation: -0.04514891002530697
TRAIN SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: 0.0027635814097973777
TEST SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: -0.05640941874324645


{'train': 0.0027635814097973777,
 'cv': -0.04514891002530697,
 'test': -0.05640941874324645,
 'diff-train-cv': 0.04791249143510435,
 'diff-train-test': 0.05917300015304383,
 'diff-test-cv': 0.011260508717939481}

Still the optimal number of estimators has not changed. However, we see a much better relation between the train, cv and test scores, indicating that the overfitting we had might be mitigated.

## Step 8: Reduce the learning rate

We now reduce the learning rate and set once again a big number of estimators.

In [88]:
xgb_params['learning_rate'] = 0.01
xgb_params['n_estimators'] = 1000

In [89]:
evaluate(X, y, xgb_params, debug=True)

Optimal number of trees, averaged over all cv folds and rounded: 322
CV SCORE [R2 score (1 - (residual sum squares)/(total sum squares))] after 10x3 cross-validation: -0.09520274743278513
TRAIN SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: 0.05598475635551403
TEST SCORE [R2 score (1 - (residual sum squares)/(total sum squares))]: -0.03382896941352853


{'train': 0.05598475635551403,
 'cv': -0.09520274743278513,
 'test': -0.03382896941352853,
 'diff-train-cv': 0.15118750378829915,
 'diff-train-test': 0.08981372576904256,
 'diff-test-cv': 0.0613737780192566}

Lowering the learning rate yields an optimal number of estimators of 322. We update this parameter.

In [90]:
xgb_params['n_estimators'] = 322

In [93]:
xgb_params

{'n_estimators': 322,
 'max_depth': 4,
 'learning_rate': 0.01,
 'gamma': 0,
 'min_child_weight': 9,
 'subsample': 0.9,
 'colsample_bytree': 0.66,
 'reg_alpha': 0,
 'reg_lambda': 1.24,
 'early_stopping_rounds': 50}