In [74]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Path templates for the saved arrays; "{split}" is filled in with the split
# name (e.g. "train" or "val") via str.format when the data is loaded.
encoder_embedding_file = "/".join(("embeddings", "{split}_encoder_embeddings.npy"))
ground_truth_file = "/".join(("embeddings", "{split}_age.npy"))

In [27]:
# Display names for the age classes, in class order: six ten-year brackets
# starting at 10, followed by an open-ended top bracket.
age_brackets = [f"{low}-{low + 9}" for low in range(10, 70, 10)] + ["more than 70"]

In [3]:
# Load the training split: features from the encoder-embedding file,
# targets from the matching ground-truth age file.
X_train, y_train = (
    np.load(template.format(split="train"))
    for template in (encoder_embedding_file, ground_truth_file)
)

In [4]:
# Load the validation split: features from the encoder-embedding file,
# targets from the matching ground-truth age file.
X_val, y_val = (
    np.load(template.format(split="val"))
    for template in (encoder_embedding_file, ground_truth_file)
)

In [None]:
# Wrap each split in an xgboost DMatrix (features + labels) so the raw
# booster can predict on it later.
dtrain, dval = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [65]:
# Scaffolding from https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, metrics='auc'):
    """Optionally tune the number of boosting rounds by CV, then fit ``alg``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the estimator's own
    booster parameters for at most ``alg``'s current ``n_estimators`` rounds,
    and ``n_estimators`` is overwritten with the number of rows in the CV
    result (the rounds kept after early stopping). The estimator is then
    fitted on the full training data.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``,
        ``set_params`` and ``fit`` (e.g. ``XGBClassifier``). Mutated in place.
    X_train, y_train : training features and labels.
    useTrainCV : run the cross-validated round search before fitting.
    cv_folds : number of CV folds passed to ``xgb.cv`` as ``nfold``.
    early_stopping_rounds : patience passed to ``xgb.cv``.
    metrics : evaluation metric(s) passed to ``xgb.cv``. Defaults to
        ``'auc'`` (the previous hard-coded value); for multi-class
        objectives a metric such as ``'mlogloss'`` may be a better fit.

    Returns
    -------
    The same ``alg`` instance, fitted.
    """
    if useTrainCV:
        # The DMatrix is only needed for xgb.cv, so build it only on this path.
        dtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(
            alg.get_xgb_params(),
            dtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics=metrics,
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per retained boosting round.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the full training data
    alg.fit(X_train, y_train)

    # Return the fitted estimator as a whole
    return alg

In [None]:
# Baseline multi-class classifier. modelfit() below overwrites n_estimators
# with the round count retained by cross-validated early stopping.
xgb_init = XGBClassifier(
    # task definition
    objective='multi:softmax',
    num_class=np.unique(y_train).size,
    # boosting schedule
    learning_rate=0.25,
    n_estimators=1000,
    # tree shape
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime
    nthread=4,
    device="cuda",
    seed=42,
)
xgb_init = modelfit(xgb_init, X_train, y_train)

In [85]:
# Number of boosting rounds left on the estimator after modelfit(); when CV is
# used, modelfit sets this to the row count of the xgb.cv result.
xgb_init.n_estimators

99

In [75]:
# evallist = [(dtrain, 'train'), (dval, 'eval')]

In [76]:
# num_round = 50
# bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

In [86]:
# Predict on the training DMatrix using the estimator's underlying booster.
# NOTE(review): the result is fed straight into classification_report as
# labels, which presumably relies on objective='multi:softmax' making the
# booster emit class ids rather than probabilities - confirm.
y_pred_train = xgb_init.get_booster().predict(dtrain)

In [87]:
# Per-bracket precision / recall / F1 on the training split, as plain text.
print(
    classification_report(
        y_train,
        y_pred_train,
        target_names=age_brackets,
        output_dict=False,
    )
)

              precision    recall  f1-score   support

       10-19       0.90      0.78      0.83      9103
       20-29       0.76      0.87      0.81     25598
       30-39       0.75      0.70      0.72     19250
       40-49       0.80      0.72      0.76     10744
       50-59       0.87      0.85      0.86      6228
       60-69       0.95      0.93      0.94      2779
more than 70       1.00      1.00      1.00       842

    accuracy                           0.80     74544
   macro avg       0.86      0.83      0.85     74544
weighted avg       0.80      0.80      0.79     74544



In [72]:
# Predict on the validation DMatrix using the estimator's underlying booster.
# NOTE(review): the result is fed straight into classification_report as
# labels, which presumably relies on objective='multi:softmax' making the
# booster emit class ids rather than probabilities - confirm.
y_pred_val = xgb_init.get_booster().predict(dval)

In [73]:
# Per-bracket precision / recall / F1 on the validation split, as plain text.
print(
    classification_report(
        y_val,
        y_pred_val,
        target_names=age_brackets,
        output_dict=False,
    )
)

              precision    recall  f1-score   support

       10-19       0.80      0.62      0.70       765
       20-29       0.65      0.75      0.70      2137
       30-39       0.49      0.49      0.49      1442
       40-49       0.47      0.45      0.46       853
       50-59       0.51      0.45      0.48       523
       60-69       0.48      0.43      0.45       204
more than 70       0.56      0.30      0.39        76

    accuracy                           0.59      6000
   macro avg       0.57      0.50      0.52      6000
weighted avg       0.59      0.59      0.58      6000



In [78]:
# Identify vicinity of max_depth and child weight on grid
param_test_tier_1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch_1 = GridSearchCV(
    estimator=XGBClassifier(
        # Reuse the learning rate and the CV-tuned round count from xgb_init:
        # a round count is only meaningful for the learning rate it was tuned
        # at, and refitting 1000 rounds for every candidate and fold made the
        # search impractically slow.
        learning_rate=xgb_init.learning_rate,
        n_estimators=xgb_init.n_estimators,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',
        nthread=4,
        device="cuda",
        num_class=np.unique(y_train).size,
        seed=42,
    ),
    param_grid=param_test_tier_1,
    # Plain 'roc_auc' is the binary scorer; the target here is multi-class,
    # so use the one-vs-rest variant.
    scoring='roc_auc_ovr',
    # NOTE(review): 4 parallel workers, each with nthread=4 and device="cuda",
    # will share one GPU - lower n_jobs if memory or contention is a problem.
    n_jobs=4,
    cv=5,
)
gsearch_1.fit(X_train, y_train)
# grid_scores_ no longer exists in scikit-learn; cv_results_ is its replacement.
pd.DataFrame(gsearch_1.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
], gsearch_1.best_params_, gsearch_1.best_score_

KeyboardInterrupt: 