In [1]:
!python -V

Python 3.10.11


In [8]:
import os
import itertools
import mlflow
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers.integrations import MLflowCallback

In [3]:
print("torch:", torch.__version__)
print("Is GPU available:", torch.cuda.is_available())

torch: 2.0.0
Is GPU available: True


In [7]:
mlflow.set_tracking_uri("sqlite:///../mlflow_data/mlflow.db")
mlflow.set_experiment("distilbert-un-ner")

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2023/05/26 14:32:45 INFO mlflow.tracking.fluent: Experiment with name 'distilbert-un-ner' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1', creation_time=1685100765733, experiment_id='1', last_update_time=1685100765733, lifecycle_stage='active', name='distilbert-un-ner', tags={}>

In [24]:
# Enable or disable logging artifacts in MLflow
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "1"

In [10]:
def get_all_tokens_and_ner_tags(directory):
    return pd.concat([get_tokens_and_ner_tags(os.path.join(directory, filename)) for filename in os.listdir(directory)]).reset_index().drop('index', axis=1)
    

def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list] 
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
  
def get_un_token_dataset(train_directory, test_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    test_df = get_all_tokens_and_ner_tags(test_directory)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return (train_dataset, test_dataset)

In [13]:
TRAIN_DATA_DIR = '../data/train'
TEST_DATA_DIR = '../data/test'

In [14]:
label_list = ['O','B-MISC','I-MISC','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']
label_encoding_dict = {'I-PRG': 2,'I-I-MISC': 2, 'I-OR': 6, 'O': 0, 'I-': 0, 'VMISC': 0, 'B-PER': 3, 'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-MISC': 1, 'I-MISC': 2}

task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 8
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_dataset, test_dataset = get_un_token_dataset(train_directory=TRAIN_DATA_DIR, test_directory=TEST_DATA_DIR)

loading configuration file config.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.29.2",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilber

In [15]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

                                                                 

In [26]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.add_callback(MLflowCallback())

trainer.train()

mlflow.end_run()

loading configuration file config.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.

{'eval_loss': 0.05386212840676308, 'eval_precision': 0.8149293286219081, 'eval_recall': 0.8553546592489569, 'eval_f1': 0.8346527934856368, 'eval_accuracy': 0.9833432376691019, 'eval_runtime': 6.5539, 'eval_samples_per_second': 316.452, 'eval_steps_per_second': 39.671, 'epoch': 1.0}


 55%|█████▍    | 500/916 [00:48<00:33, 12.24it/s]Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json


{'loss': 0.0855, 'learning_rate': 4.5414847161572056e-05, 'epoch': 1.09}


Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
Logging checkpoint artifacts in checkpoint-500. This may take time.
100%|██████████| 916/916 [01:30<00:00, 10.85it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2074
  Batch size = 8

100%|██████████| 916/916 [01:38<00:00, 10.85it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 916/916 [01:38<00:00,  9.25it/s]
Saving model checkpoint to ../models/un-ner.model
Configuration saved in ../models/un-ner.model/config.json


{'eval_loss': 0.053391553461551666, 'eval_precision': 0.8447883477469277, 'eval_recall': 0.8604543347241539, 'eval_f1': 0.8525493798805696, 'eval_accuracy': 0.9846366273988884, 'eval_runtime': 7.9577, 'eval_samples_per_second': 260.629, 'eval_steps_per_second': 32.673, 'epoch': 2.0}
{'train_runtime': 99.0062, 'train_samples_per_second': 73.874, 'train_steps_per_second': 9.252, 'train_loss': 0.058791021072187796, 'epoch': 2.0}


Model weights saved in ../models/un-ner.model/pytorch_model.bin
tokenizer config file saved in ../models/un-ner.model/tokenizer_config.json
Special tokens file saved in ../models/un-ner.model/special_tokens_map.json
