In [1]:
import yaml
from tqdm import tqdm
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

from utils import set_seed, get_data_loaders, get_idx_for_interested_fpr

In [2]:
# Limit the number of CPU threads PyTorch may use, then seed via the project
# helper (presumably seeds the Python/NumPy/PyTorch RNGs; see utils.set_seed).
torch.set_num_threads(5)
set_seed(42)

# `setting` picks the YAML file ./configs/<setting>.yml.
setting = 'DT_etaphi'
# Open the config inside a `with` block so the file handle is closed as soon as
# parsing finishes instead of lingering until garbage collection.
with Path(f'./configs/{setting}.yml').open('r') as config_file:
    config = yaml.safe_load(config_file)
print(f'[INFO] Running {setting} on cpu')

[INFO] Running DT-etaphi on cpu


In [3]:
# Build the dataset from the `data` section of the config. The cells shown in
# this notebook only consume `dataset` directly, so the loader batch size is
# irrelevant and any value would do.
data_loaders, x_dim, edge_attr_dim, dataset = get_data_loaders(
    setting,
    config['data'],
    batch_size=1,
)

[Splits]
    train: 327108. # pos: 54518, # neg: 272590. Pos:Neg: 0.200
    valid: 70092. # pos: 11682, # neg: 58410. Pos:Neg: 0.200
    test: 180283. # pos: 11683, # neg: 168600. Pos:Neg: 0.069


In [16]:
def get_bins(data, n_bins, type):
    """Discretize 1-D values into ``n_bins`` integer bin ids.

    Args:
        data: 1-D values to discretize.
        n_bins: Number of bins to create.
        type: ``'equal_width'`` cuts the value range into equally wide
            intervals (``pd.cut``); ``'equal_depth'`` cuts at quantiles so the
            bins hold roughly the same number of values (``pd.qcut``).

    Returns:
        One integer bin id per input value (``labels=False``). ``pd.cut`` is
        fed a ``pd.Series`` while ``pd.qcut`` gets ``data`` as-is, so the
        container type of the result can differ between the two modes.

    Raises:
        AssertionError: If ``type`` is not one of the two supported modes.
    """
    assert type in ['equal_width', 'equal_depth']
    if type == 'equal_width':
        bins = pd.cut(pd.Series(data), n_bins, labels=False)
    else:
        bins = pd.qcut(data, n_bins, labels=False)
    return bins


def bining(x, n_bins, bin_type):
    """Map every row of ``x`` to the id of its cell in an ``n_bins x n_bins`` grid.

    Column 0 (treated as eta) and column 1 (treated as phi) are binned
    independently with ``get_bins`` and combined row-major:
    ``cell = eta_bin * n_bins + phi_bin``. When every cell of the grid is
    occupied this is exactly the id obtained by enumerating the sorted
    ``(eta_bin, phi_bin)`` pairs, but it stays well defined when some cells
    are empty, so the id of a cell never depends on which other cells happen
    to contain data.

    Args:
        x: 2-D ``torch.Tensor`` or array-like with at least two columns.
        n_bins: Number of bins per axis.
        bin_type: ``'equal_width'`` or ``'equal_depth'``; forwarded to
            ``get_bins``.

    Returns:
        ``torch.int64`` tensor of shape ``(n_rows, 1)`` whose values lie in
        ``[0, n_bins ** 2)``.

    Raises:
        ValueError: If a value could not be assigned to a bin (e.g. NaN input).
    """
    # Hand pandas plain NumPy columns rather than tensor slices.
    if isinstance(x, torch.Tensor):
        coords = x.detach().cpu().numpy()
    else:
        coords = np.asarray(x)

    eta_bin = np.asarray(get_bins(coords[:, 0], n_bins, bin_type))
    phi_bin = np.asarray(get_bins(coords[:, 1], n_bins, bin_type))

    # pandas marks values it cannot bin as NaN; casting those to int would
    # silently produce garbage ids, so fail loudly instead.
    if pd.isna(eta_bin).any() or pd.isna(phi_bin).any():
        raise ValueError('some values could not be assigned to a bin (NaN in input?)')

    cell_ids = eta_bin.astype(np.int64) * n_bins + phi_bin.astype(np.int64)
    return torch.from_numpy(cell_ids).reshape(-1, 1)

In [17]:
# Bins per axis (the 2-D grid therefore has n_bins ** 2 cells) and the binning
# strategy understood by get_bins.
n_bins, bin_type = 10, 'equal_depth'
# Labels flattened to 1-D so they can be indexed with the split indices below.
y = np.array(dataset.data.y).ravel()


In [18]:
# Generate the bins, then describe every sample by the number of hits that
# fall into each bin; these count vectors are the features for the tree model.
# NOTE: this overwrites dataset.data.x with a single column of bin ids, so the
# cell cannot be re-run without reloading the dataset first (bining reads
# column 1 of its input).
dataset.data.x = bining(dataset.data.x, n_bins, bin_type)
n_cells = n_bins ** 2
x = []
for data in tqdm(dataset):
    assert data.x.shape[1] == 1
    bin_ids, hits_per_bin = data.x.unique(return_counts=True)
    histogram = torch.zeros(n_cells)
    # Bins that received no hit keep their initial count of zero.
    histogram[bin_ids] = hits_per_bin.float()
    x.append(histogram)
x = np.stack(x, axis=0)


100%|██████████| 577483/577483 [00:43<00:00, 13202.32it/s]


In [19]:
# Slice features and labels into the predefined train/valid/test splits.
splits = dataset.idx_split

# Only the training indices are shuffled (on a copy, so the stored split is
# left untouched); this matters if an order-sensitive traditional model is
# tried later.
train_idx = np.array(splits['train'])
np.random.shuffle(train_idx)
train_x, train_y = x[train_idx], y[train_idx]

valid_idx, test_idx = splits['valid'], splits['test']
valid_x, valid_y = x[valid_idx], y[valid_idx]
test_x, test_y = x[test_idx], y[test_idx]

# Trailing expression: display the three feature-matrix shapes as a sanity check.
train_x.shape, valid_x.shape, test_x.shape

((327108, 100), (70092, 100), (180283, 100))

In [20]:
# Random-forest hyper-parameters.
n_estimators = 100
max_depth = 100
# Single-tree alternative kept for reference; `tree` is not imported in this
# notebook, so `from sklearn import tree` would be needed to enable it.
# clf = tree.DecisionTreeClassifier(max_depth=50)
# NOTE(review): no random_state is passed, so the forest relies on NumPy's
# global RNG -- presumably seeded by set_seed(42); confirm.
clf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    n_jobs=8,
)

In [21]:
# Train on the shuffled training split. fit() returns the estimator itself;
# assigning the result also keeps the notebook from echoing its repr.
clf = clf.fit(X=train_x, y=train_y)

In [22]:
def eval_auc(x, clf_labels, clf):
    """Score a fitted classifier on one data split.

    Args:
        x: Feature matrix handed to ``clf.predict_proba``.
        clf_labels: Ground-truth binary labels; flattened to 1-D before use.
        clf: Fitted classifier exposing ``predict_proba``. The second output
            column (class index 1) is used as the score.

    Returns:
        Tuple ``(auroc, recall_a, recall_b)``: the full-curve AUROC followed
        by the ROC-curve recall (TPR) at the two positions that
        ``get_idx_for_interested_fpr`` returns for the target false-positive
        rates 0.001 and 0.001 / 10. Callers in this notebook report them as
        recall@30khz and recall@3khz respectively.
    """
    scores = clf.predict_proba(x)[:, 1]
    labels = clf_labels.reshape(-1)

    auroc = metrics.roc_auc_score(labels, scores)
    fpr, recall, _ = metrics.roc_curve(labels, scores)
    # Positions on the ROC curve for the two false-positive-rate working
    # points; the lookup itself is delegated to the project helper.
    indices = get_idx_for_interested_fpr(fpr, [0.001, 0.001 / 10])
    recall_at_targets = recall[indices]
    return auroc, recall_at_targets[0], recall_at_targets[1]

In [23]:
# Report the same three metrics for every split. The labels are padded to a
# common width so the printed columns line up.
for split_label, split_x, split_y in (
    ('Train:', train_x, train_y),
    ('Valid:', valid_x, valid_y),
    ('Test: ', test_x, test_y),
):
    auroc, recall_30khz, recall_3khz = eval_auc(split_x, split_y, clf)
    print(f'{split_label} auroc: {auroc:.4f}, recall@30khz: {recall_30khz:.4f}, recall@3khz: {recall_3khz:.4f}')

Train: auroc: 0.9997, recall@30khz: 0.9988, recall@3khz: 0.9972
Valid: auroc: 0.9861, recall@30khz: 0.7998, recall@3khz: 0.6392
Test:  auroc: 0.9850, recall@30khz: 0.7929, recall@3khz: 0.5877
