In [71]:
import os 
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch 
import xgboost as xgb 

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import classification_report, confusion_matrix, f1_score, log_loss
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [41]:
def load_data(file_path):
    """Load pytorch tensor data from file

    Args:
        filepath (str): a full filename path.

    Returns:
        X (array): a numpy array of features.
        y (array): a numpy array of labels.
    """
    tensors = torch.load(file_path)
    X = tensors[:, :-1].copy()
    y = tensors[:, -1].copy().astype(int)

    return (X, y)






In [39]:
tensors_input_path = os.path.join('..', 'data', 'processed')
X_train, y_train   = load_data(os.path.join(tensors_input_path, 'mozilla_bug_report_train_data.pt'))
X_test, y_test   = load_data(os.path.join(tensors_input_path, 'mozilla_bug_report_test_data.pt'))

In [72]:
def score(params):
    print("Training with params : ")
    print(params)
    
    num_round = int(params['n_estimators'])
    del params['n_estimators']
   
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    model  = xgb.train(params, dtrain, num_round)
    
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 5))
    
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.choice('max_depth', np.arange(1, 14, dtype=int)),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'num_class' : 5,
             'eval_metric': 'mlogloss',
             'objective': 'multi:softprob',
             'nthread' : 6,
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)

    print(best)

In [73]:
trials = Trials()

optimize(trials)

Training with params :                                
{'colsample_bytree': 0.9500000000000001, 'eta': 0.17500000000000002, 'eval_metric': 'mlogloss', 'gamma': 0.55, 'max_depth': 13, 'min_child_weight': 2.0, 'n_estimators': 477.0, 'nthread': 6, 'num_class': 5, 'objective': 'multi:softprob', 'silent': 1, 'subsample': 0.55}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


	Score 1.9344406914710999                             


Training with params :                                                          
{'colsample_bytree': 1.0, 'eta': 0.4, 'eval_metric': 'mlogloss', 'gamma': 0.6000000000000001, 'max_depth': 11, 'min_child_weight': 6.0, 'n_estimators': 633.0, 'nthread': 6, 'num_class': 5, 'objective': 'multi:softprob', 'silent': 1, 'subsample': 0.8}
Param

In [97]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
model = xgb.train({'colsample_bytree': 0.6000000000000001, 
                   'eta': 0.35000000000000003, 'gamma': 0.75, 'max_depth': 3, 
                   'min_child_weight': 3.0, 'subsample': 0.9},
                   dtrain, 151,
                   objective= 'multi:softmax',
                   num_class= 5, )
y_pred = model.predict(dvalid).astype(int)




In [98]:
y_pred


array([0, 2, 1, 3, 2, 3, 3, 2, 1, 2, 4, 2, 2, 1, 2, 1, 1, 1, 0, 1, 2, 3,
       2, 2, 3])

In [99]:
y_test

array([2, 3, 1, 3, 2, 1, 0, 3, 3, 1, 2, 3, 2, 1, 1, 1, 0, 2, 3, 2, 4, 4,
       3, 2, 2])

In [93]:
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.43      0.50      0.46         6
           2       0.30      0.38      0.33         8
           3       0.20      0.14      0.17         7
           4       0.00      0.00      0.00         2

    accuracy                           0.28        25
   macro avg       0.19      0.20      0.19        25
weighted avg       0.25      0.28      0.26        25

