In [1]:
%%bash
# Download the pre-processed dataset archive (skipped if it is already on
# disk), unpack it without overwriting files that were extracted earlier,
# and list what ended up under the dataset directory.
DATASET="eurlex-4k"
ARCHIVE="${DATASET}.tar.gz"
wget --no-verbose --no-clobber "https://archive.org/download/pecos-dataset/xmc-base/${ARCHIVE}"
tar --skip-old-files --gzip --extract --file "${ARCHIVE}"
find xmc-base/"${DATASET}"/*

2023-02-18 22:52:57 URL:https://ia802308.us.archive.org/21/items/pecos-dataset/xmc-base/eurlex-4k.tar.gz [66307781/66307781] -> "eurlex-4k.tar.gz" [1]


xmc-base/eurlex-4k/X.trn.txt
xmc-base/eurlex-4k/X.tst.txt
xmc-base/eurlex-4k/Y.trn.npz
xmc-base/eurlex-4k/Y.trn.txt
xmc-base/eurlex-4k/Y.tst.npz
xmc-base/eurlex-4k/Y.tst.txt
xmc-base/eurlex-4k/output-items.txt
xmc-base/eurlex-4k/tfidf-attnxml
xmc-base/eurlex-4k/tfidf-attnxml/X.trn.npz
xmc-base/eurlex-4k/tfidf-attnxml/X.tst.npz


In [1]:
import logging
import numpy as np
from pecos.utils import smat_util, logging_util

# Logging verbosity: level=1 is WARNING; use 2 for INFO or 3 for DEBUG
# if you'd like to see more logging.
LOGGER = logging.getLogger(__name__)
logging_util.setup_logging_config(level=1)

# Root directory of the extracted dataset.
DATA_DIR = "xmc-base/eurlex-4k"


def _load_split(split):
    """Load one split of the dataset from ``DATA_DIR``.

    Args:
        split: Split tag used in the file names, "trn" or "tst".

    Returns:
        A tuple ``(X_feat, Y, X_txt)`` where ``X_feat`` is the matrix read from
        ``tfidf-attnxml/X.<split>.npz``, ``Y`` is the matrix read from
        ``Y.<split>.npz`` (both loaded as float32), and ``X_txt`` is the list
        of whitespace-stripped lines of ``X.<split>.txt``.
    """
    X_feat = smat_util.load_matrix(f"{DATA_DIR}/tfidf-attnxml/X.{split}.npz", dtype=np.float32)
    Y = smat_util.load_matrix(f"{DATA_DIR}/Y.{split}.npz", dtype=np.float32)

    # Explicit encoding so the text is decoded the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(f"{DATA_DIR}/X.{split}.txt", "r", encoding="utf-8") as fin:
        X_txt = [line.strip() for line in fin]

    return X_feat, Y, X_txt


# load training data
X_feat_trn, Y_trn, X_txt_trn = _load_split("trn")

# load test data
X_feat_tst, Y_tst, X_txt_tst = _load_split("tst")

In [2]:
import json
import requests
from pecos.xmc.xtransformer.model import XTransformer

# Get the XR-Transformer training/prediction params.
# Expects "params.json" in the current working directory, with top-level
# "train_params" and "pred_params" entries.
# The file is opened in a `with` block so the handle is closed deterministically.
with open("params.json", "r", encoding="utf-8") as fin:
    params = json.load(fin)

eurlex4k_train_params = XTransformer.TrainParams.from_dict(params["train_params"])
eurlex4k_pred_params = XTransformer.PredParams.from_dict(params["pred_params"])

# you can view the detailed parameter setting via
print(json.dumps(eurlex4k_train_params.to_dict(), indent=True))
print(json.dumps(eurlex4k_pred_params.to_dict(), indent=True))

{
 "__meta__": {
  "class_fullname": "pecos.xmc.xtransformer.model###XTransformer.TrainParams"
 },
 "preliminary_indexer_params": {
  "__meta__": {
   "class_fullname": "pecos.xmc.base###HierarchicalKMeans.TrainParams"
  },
  "nr_splits": 16,
  "min_codes": 16,
  "max_leaf_size": 16,
  "spherical": true,
  "seed": 0,
  "kmeans_max_iter": 20,
  "threads": -1
 },
 "refined_indexer_params": {
  "__meta__": {
   "class_fullname": "pecos.xmc.base###HierarchicalKMeans.TrainParams"
  },
  "nr_splits": 8,
  "min_codes": null,
  "max_leaf_size": 16,
  "spherical": true,
  "seed": 0,
  "kmeans_max_iter": 20,
  "threads": -1
 },
 "matcher_params_chain": [
  {
   "__meta__": {
    "class_fullname": "pecos.xmc.xtransformer.matcher###TransformerMatcher.TrainParams"
   },
   "model_shortcut": "bert-base-uncased",
   "negative_sampling": "tfn+man",
   "loss_function": "weighted-squared-hinge",
   "bootstrap_method": "weighted-linear",
   "lr_schedule": "linear",
   "threshold": 0.001,
   "hidden_dropo

#### Baseline 1: XR-Linear
Let's train an XR-Linear model on the TF-IDF features using the same hyper-parameters.

In [3]:
from pecos.xmc import Indexer, LabelEmbeddingFactory
from pecos.xmc.xlinear import XLinearModel

# Step 1: build the label hierarchy by indexing PIFA label embeddings
# with the refined-indexer settings.
label_embedding = LabelEmbeddingFactory.create(Y_trn, X_feat_trn, method="pifa")
cluster_chain = Indexer.gen(
    label_embedding,
    train_params=eurlex4k_train_params.refined_indexer_params,
)

# Step 2: fit the XR-Linear model on top of that hierarchy,
# using the ranker train/pred params.
xlm = XLinearModel.train(
    X_feat_trn,
    Y_trn,
    C=cluster_chain,
    train_params=eurlex4k_train_params.ranker_params,
    pred_params=eurlex4k_pred_params.ranker_params,
)

# Step 3: predict on the test features and score against the ground truth.
P_xlm = xlm.predict(X_feat_tst)
metrics = smat_util.Metrics.generate(Y_tst, P_xlm)
print("Evaluation metrics of XR-Linear model", metrics, sep="\n")

Evaluation metrics of XR-Linear model
prec   = 85.05 77.98 71.30 64.77 58.78 53.15 48.02 43.61 39.94 36.84
recall = 17.26 31.33 42.44 50.89 57.28 61.73 64.78 67.08 68.96 70.53


#### Baseline 2: XR-Transformer without fine-tuning

In [5]:
from pecos.xmc.xtransformer.module import MLProblemWithText

# The training problem bundles the raw text, the labels and the numerical features.
prob = MLProblemWithText(X_txt_trn, Y_trn, X_feat=X_feat_trn)

# No fine-tuning for this baseline: the pre-trained bert model from
# huggingface is used directly.
eurlex4k_train_params.do_fine_tune = False

# Train XR-Transformer (without fine-tuning).
# Expect this to be slow on a CPU-only machine.
xrt_pretrained = XTransformer.train(
    prob, train_params=eurlex4k_train_params, pred_params=eurlex4k_pred_params
)

# Predict on the test set, then compute and report the metrics.
P_xrt_pretrained = xrt_pretrained.predict(X_txt_tst, X_feat=X_feat_tst)
metrics = smat_util.Metrics.generate(Y_tst, P_xrt_pretrained)
print("Evaluation metrics of XR-Transformer (not fine-tuned)", metrics, sep="\n")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForXMC: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForXMC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForXMC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Evaluation metrics of XR-Transformer (not fine-tuned)
prec   = 84.89 78.02 71.28 64.85 58.94 53.42 48.24 43.74 39.96 36.77
recall = 17.23 31.34 42.44 50.92 57.36 62.02 65.08 67.26 68.99 70.46


#### Model: XR-Transformer

In [5]:
from pecos.xmc import Indexer, LabelEmbeddingFactory
from pecos.xmc.xtransformer.module import MLProblemWithText

# Label hierarchy: index the PIFA label embeddings with the refined-indexer settings.
label_embedding = LabelEmbeddingFactory.create(Y_trn, X_feat_trn, method="pifa")
cluster_chain = Indexer.gen(
    label_embedding,
    train_params=eurlex4k_train_params.refined_indexer_params,
)

# Training problem for XR-Transformer: raw text, labels and numerical features.
prob = MLProblemWithText(X_txt_trn, Y_trn, X_feat=X_feat_trn)

In [None]:
# Turn fine-tuning back on for the full XR-Transformer model.
eurlex4k_train_params.do_fine_tune = True

# Train XR-Transformer on the problem defined in the previous cell.
xrt_fine_tuned = XTransformer.train(
    prob, train_params=eurlex4k_train_params, pred_params=eurlex4k_pred_params
)

# Predict on the test set, then compute and report the top-10 metrics.
P_xrt_fine_tuned = xrt_fine_tuned.predict(X_txt_tst, X_feat=X_feat_tst)
metrics = smat_util.Metrics.generate(Y_tst, P_xrt_fine_tuned, topk=10)
print("Evaluation metrics of XR-Transformer", metrics, sep="\n")