In [10]:
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import xgboost as xgb

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.metrics import classification_report, confusion_matrix, f1_score, log_loss
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from google.colab import drive
# Mount Google Drive at /drive; the data and model paths used later in this
# notebook are all built under '/drive/My Drive'.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [11]:
def load_data(file_path):
    """Load a feature/label matrix that was saved with ``torch.save``.

    The stored object is indexed as a 2-D matrix whose last column holds the
    label and whose remaining columns hold the features. Both a
    ``torch.Tensor`` and a numpy array are accepted.

    Args:
        file_path (str): a full filename path.

    Returns:
        X (array): a numpy array of features (every column but the last).
        y (array): a numpy integer array of labels (the last column).
    """
    tensors = torch.load(file_path)

    # torch.Tensor has no .copy()/.astype(); convert to numpy first so the
    # slicing below works no matter which of the two types was stored.
    if isinstance(tensors, torch.Tensor):
        data = tensors.detach().cpu().numpy()
    else:
        data = np.asarray(tensors)

    # Copy so the returned arrays do not alias the loaded buffer.
    X = data[:, :-1].copy()
    y = data[:, -1].copy().astype(int)

    return (X, y)

In [12]:
# Folder on the mounted Drive that holds the processed train/test files.
tensors_input_path = os.path.join('/', 'drive', 'My Drive', 'data', 'processed')

train_file = os.path.join(tensors_input_path, 'mozilla_bug_report_train_data.pt')
test_file = os.path.join(tensors_input_path, 'mozilla_bug_report_test_data.pt')

# Each file yields a (features, labels) pair.
X_train, y_train = load_data(train_file)
X_test, y_test = load_data(test_file)

In [13]:
def score(params):
    """Hyperopt objective: train one XGBoost model and return its log loss.

    A booster is trained on the module-level ``X_train``/``y_train`` with the
    sampled ``params`` and scored with multi-class log loss on the
    module-level ``X_test``/``y_test``.

    NOTE(review): the same split (``X_test``/``y_test``) is used here for
    tuning and later for the final report, so the reported metrics may be
    optimistic -- consider a separate validation split.

    Args:
        params (dict): sampled hyper-parameters. Must contain
            ``n_estimators`` (used as the number of boosting rounds); every
            other entry is forwarded to ``xgb.train``. ``num_class`` is read
            to shape the predictions (falls back to 5 when absent).

    Returns:
        dict: ``{'loss': <log loss>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    print("Training with params : ")
    print(params)

    # Work on a copy: removing 'n_estimators' from the caller's dict would
    # make a second call with the same dict fail with KeyError.
    booster_params = dict(params)
    num_round = int(booster_params.pop('n_estimators'))
    num_class = int(booster_params.get('num_class', 5))

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(booster_params, dtrain, num_round)

    # One row per sample, one probability column per class.
    predictions = model.predict(dvalid).reshape((X_test.shape[0], num_class))

    # Pass the label set explicitly so log_loss still lines columns up with
    # classes when some class does not occur in y_test.
    loss = log_loss(y_test, predictions, labels=list(range(num_class)))
    print("\tScore {0}\n\n".format(loss))
    return {'loss': loss, 'status': STATUS_OK}

def optimize(trials, max_evals=10):
    """Search the XGBoost hyper-parameter space with hyperopt's TPE algorithm.

    Args:
        trials (hyperopt.Trials): store that records every evaluation.
        max_evals (int): number of configurations to evaluate. Defaults to
            10, the value that was previously hard-coded.

    Returns:
        dict: the best configuration found, with every entry of the search
        space resolved to its actual value.
    """
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.choice('max_depth', np.arange(1, 14, dtype=int)),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'num_class' : 5,
             'eval_metric': 'mlogloss',
             'objective': 'multi:softprob',
             'nthread' : 6,
             # NOTE(review): newer XGBoost releases use 'verbosity' instead of
             # 'silent' -- confirm against the installed version.
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    # fmin reports hp.choice parameters by option index (so 'max_depth' would
    # be an index into the arange above, not a depth). space_eval maps the
    # result back onto the search space to recover the real values.
    best_params = space_eval(space, best)

    print(best_params)
    return best_params

In [14]:
# Trials keeps the parameters and loss of every configuration evaluated.
trials = Trials()

# Run the hyper-parameter search, recording each evaluation in `trials`.
optimize(trials=trials)

Training with params : 
{'colsample_bytree': 0.55, 'eta': 0.375, 'eval_metric': 'mlogloss', 'gamma': 0.8, 'max_depth': 9, 'min_child_weight': 2.0, 'n_estimators': 925.0, 'nthread': 6, 'num_class': 5, 'objective': 'multi:softprob', 'silent': 1, 'subsample': 0.9500000000000001}
	Score 1.540851269364357


Training with params : 
{'colsample_bytree': 0.6000000000000001, 'eta': 0.17500000000000002, 'eval_metric': 'mlogloss', 'gamma': 0.75, 'max_depth': 5, 'min_child_weight': 5.0, 'n_estimators': 797.0, 'nthread': 6, 'num_class': 5, 'objective': 'multi:softprob', 'silent': 1, 'subsample': 0.8500000000000001}
	Score 1.4581984588950871


Training with params : 
{'colsample_bytree': 0.8, 'eta': 0.47500000000000003, 'eval_metric': 'mlogloss', 'gamma': 0.55, 'max_depth': 6, 'min_child_weight': 4.0, 'n_estimators': 270.0, 'nthread': 6, 'num_class': 5, 'objective': 'multi:softprob', 'silent': 1, 'subsample': 0.8500000000000001}
	Score 1.5711855987086891


Training with params : 
{'colsample_bytree'

In [15]:
# Train the final model with the selected hyper-parameters and predict the
# class of every test sample.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

# The number of boosting rounds is an argument of xgb.train, not a booster
# parameter, so it is kept out of the params dict and named once here.
final_num_round = 581

final_params = {
    'colsample_bytree': 0.65,
    'eta': 0.07500000000000001,
    'gamma': 0.55,
    'max_depth': 6,
    'min_child_weight': 6.0,
    'subsample': 0.8500000000000001,
    # multi:softmax makes predict() return the class id directly.
    'objective': 'multi:softmax',
    'num_class': 5,
}

model = xgb.train(final_params, dtrain, final_num_round)
y_pred = model.predict(dvalid).astype(int)


In [16]:
# Per-class precision / recall / F1 on the test split. zero_division=0 makes
# undefined ratios count as 0 instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.22      0.30        32
           1       0.39      0.33      0.36        54
           2       0.37      0.45      0.40        76
           3       0.40      0.56      0.47        64
           4       0.25      0.08      0.12        24

    accuracy                           0.39       250
   macro avg       0.38      0.33      0.33       250
weighted avg       0.39      0.39      0.37       250



In [17]:
# Persist the trained booster in the same Drive folder as the processed data.
# The folder path is reused rather than typed out a second time.
# NOTE(review): XGBoost's model IO guide recommends Booster.save_model() for
# long-term storage over pickle-based dumps -- consider switching if this
# file must outlive the installed XGBoost version.
model_output_path = os.path.join(tensors_input_path, 'final-model.joblib')
joblib.dump(model, model_output_path)

['/drive/My Drive/data/processed/final-model.joblib']