In [1]:
import evaluate
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments

from doctransformers import DocDataset, DocTrainer

# Tokenizer for the same "bert-base-uncased" checkpoint the model is built from
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Read the stored DocDataset from disk, then preprocess it with that tokenizer
docdata = DocDataset.load_from_disk("example/data")
docdata.preprocess(tokenizer=tokenizer)



Map:   0%|          | 0/160005 [00:00<?, ? examples/s]

In [2]:
# Prepare the metric, model and data collator as you would for a transformers Trainer
acc = evaluate.load("accuracy")
id2label = {1: "POS", 0: "NEG"}
label2id = {"POS": 1, "NEG": 0}

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA instead of raising on a
# hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
).to(device)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `acc` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds one
            row of scores per example; the index of the largest score along
            axis 1 is taken as the predicted class.

    Returns:
        Whatever ``acc.compute`` returns for the predicted classes and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return acc.compute(predictions=predicted_classes, references=references)

training_args = TrainingArguments(
    # Output location
    output_dir="example/model",
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Init doctrainer
# Random forest that classifies the documents. random_state is fixed so that
# fitting the forest on the same inputs yields the same model on every run;
# without it the reported head metrics change from run to run.
clf = RandomForestClassifier(n_jobs=8, verbose=1, random_state=42)

trainer = DocTrainer(
    model=model,
    doc_classifier=clf,
    data_collator=data_collator,
    args=training_args,
    tokenizer=tokenizer,
    doc_dataset=docdata,
    compute_metrics=compute_metrics,
)

Selecting chunks:   0%|          | 0/160005 [00:00<?, ? examples/s]

Selecting chunks:   0%|          | 0/160005 [00:00<?, ? examples/s]

In [4]:
# Train the BERT model to embed chunks
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmojio[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/5047 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4665, 'learning_rate': 1.8018624925698435e-05, 'epoch': 0.1}
{'loss': 0.4073, 'learning_rate': 1.603724985139687e-05, 'epoch': 0.2}
{'loss': 0.3672, 'learning_rate': 1.4055874777095306e-05, 'epoch': 0.3}
{'loss': 0.3583, 'learning_rate': 1.2074499702793741e-05, 'epoch': 0.4}
{'loss': 0.3355, 'learning_rate': 1.0093124628492174e-05, 'epoch': 0.5}
{'loss': 0.3399, 'learning_rate': 8.11174955419061e-06, 'epoch': 0.59}
{'loss': 0.3252, 'learning_rate': 6.130374479889043e-06, 'epoch': 0.69}
{'loss': 0.3166, 'learning_rate': 4.148999405587478e-06, 'epoch': 0.79}
{'loss': 0.3138, 'learning_rate': 2.1676243312859127e-06, 'epoch': 0.89}
{'loss': 0.3055, 'learning_rate': 1.8624925698434714e-07, 'epoch': 0.99}


  0%|          | 0/4955 [00:00<?, ?it/s]

{'eval_loss': 0.31056132912635803, 'eval_accuracy': 0.867458935735372, 'eval_runtime': 369.35, 'eval_samples_per_second': 214.609, 'eval_steps_per_second': 13.415, 'epoch': 1.0}
{'train_runtime': 1544.2648, 'train_samples_per_second': 52.283, 'train_steps_per_second': 3.268, 'train_loss': 0.35293580197522684, 'epoch': 1.0}


TrainOutput(global_step=5047, training_loss=0.35293580197522684, metrics={'train_runtime': 1544.2648, 'train_samples_per_second': 52.283, 'train_steps_per_second': 3.268, 'train_loss': 0.35293580197522684, 'epoch': 1.0})

In [5]:
# Train document classifier. Per the logged output of this cell, this embeds the
# chunks and documents, fits the classifier, and then tests it on the eval data.
trainer.train_head() # Accuracy 0.9502 in the recorded run

Embedding chunks.:   0%|          | 0/160005 [00:00<?, ? examples/s]

Embedding docs.:   0%|          | 0/25000 [00:00<?, ? examples/s]

Embedding docs.:   0%|          | 0/25000 [00:00<?, ? examples/s]

Fitting classifier


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   16.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   44.7s finished


Testing on eval data


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


mcc,accuracy,f1-score
f64,f64,f64
0.900401,0.9502,0.950162
