In [1]:
!python -V

Python 3.10.11


In [8]:
import os
import itertools
import mlflow
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers.integrations import MLflowCallback

In [3]:
print("torch:", torch.__version__)
print("Is GPU available:", torch.cuda.is_available())

torch: 2.0.0
Is GPU available: True


In [28]:
MLFLOW_TRACKING_URI = "sqlite:///../mlflow_data/mlflow.db"
EXPERIMENT_NAME = "distilbert-un-ner"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

client = mlflow.tracking.MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [45]:
# Enable or disable logging artifacts in MLflow
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "0"
os.environ["MLFLOW_NESTED_RUN"] = "1"

In [10]:
def get_all_tokens_and_ner_tags(directory):
    return pd.concat([get_tokens_and_ner_tags(os.path.join(directory, filename)) for filename in os.listdir(directory)]).reset_index().drop('index', axis=1)
    

def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list] 
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
  
def get_un_token_dataset(train_directory, test_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    test_df = get_all_tokens_and_ner_tags(test_directory)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return (train_dataset, test_dataset)

In [13]:
TRAIN_DATA_DIR = '../data/train'
TEST_DATA_DIR = '../data/test'

In [14]:
label_list = ['O','B-MISC','I-MISC','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']
label_encoding_dict = {'I-PRG': 2,'I-I-MISC': 2, 'I-OR': 6, 'O': 0, 'I-': 0, 'VMISC': 0, 'B-PER': 3, 'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-MISC': 1, 'I-MISC': 2}

task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 8
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_dataset, test_dataset = get_un_token_dataset(train_directory=TRAIN_DATA_DIR, test_directory=TEST_DATA_DIR)

loading configuration file config.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.29.2",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilber

In [15]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

                                                                 

In [57]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}


with mlflow.start_run(nested=True):

    mlflow.log_param("train_data", TRAIN_DATA_DIR)
    mlflow.log_param("test_data", TEST_DATA_DIR)
    
    trainer = Trainer(
        model,
        args,
        train_dataset=train_tokenized_dataset,
        eval_dataset=test_tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    metrics = trainer.evaluate()

    mlflow.log_metrics(metrics)

    components = {"model": trainer.model, "tokenizer": tokenizer}
    model_info = mlflow.transformers.log_model(transformers_model=components, artifact_path="model", task="ner")

    mlflow.end_run()

loading configuration file config.json from cache at /home/gotsul/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.

{'eval_loss': 0.04881860315799713, 'eval_precision': 0.794750430292599, 'eval_recall': 0.85628187297172, 'eval_f1': 0.8243695603659897, 'eval_accuracy': 0.9836054112629776, 'eval_runtime': 6.5265, 'eval_samples_per_second': 317.783, 'eval_steps_per_second': 39.838, 'epoch': 1.0}
{'train_runtime': 43.871, 'train_samples_per_second': 83.358, 'train_steps_per_second': 10.44, 'train_loss': 0.0806705358247049, 'epoch': 1.0}


100%|██████████| 260/260 [00:06<00:00, 41.11it/s]
  model_info = mlflow.transformers.log_model(transformers_model=components, artifact_path="model", task="ner")
  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Configuration saved in /tmp/tmpqy9r8m7r/model/pipeline/config.json
Model weights saved in /tmp/tmpqy9r8m7r/model/pipeline/pytorch_model.bin
tokenizer config file saved in /tmp/tmpqy9r8m7r/model/pipeline/tokenizer_config.json
Special tokens file saved in /tmp/tmpqy9r8m7r/model/pipeline/special_tokens_map.json
tokenizer config file saved in /tmp/tmpqy9r8m7r/model/components/tokenizer/tokenizer_config.json
Special tokens file saved in /tmp/tmpqy9r8m7r/model/components/tokenizer/special_tokens_map.json


In [58]:
current_experiment = dict(mlflow.get_experiment_by_name(EXPERIMENT_NAME))
current_experiment

{'artifact_location': '/home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1',
 'creation_time': 1685100765733,
 'experiment_id': '1',
 'last_update_time': 1685100765733,
 'lifecycle_stage': 'active',
 'name': 'distilbert-un-ner',
 'tags': {}}

In [59]:
experiment_id = current_experiment['experiment_id']

In [73]:
coppied_run_id = "887a52d1a3ae42aea5e8384fbc7c83e3"  # coppied from the mlflow ui
runs_df = mlflow.search_runs([experiment_id], order_by=["metrics.eval_f1 DESC"])

In [74]:
model_uri = f"runs:/{coppied_run_id}/model"

In [75]:
register_model_name = "un-ner-model-nested"
mlflow.register_model(model_uri=model_uri, name=register_model_name)

Successfully registered model 'un-ner-model-nested'.
2023/05/26 20:10:11 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: un-ner-model-nested, version 1
Created version '1' of model 'un-ner-model-nested'.


<ModelVersion: aliases=[], creation_timestamp=1685121011103, current_stage='None', description=None, last_updated_timestamp=1685121011103, name='un-ner-model-nested', run_id='887a52d1a3ae42aea5e8384fbc7c83e3', run_link=None, source='/home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1/887a52d1a3ae42aea5e8384fbc7c83e3/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [76]:
latest_versions = client.get_latest_versions(name=register_model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: None


In [77]:
version = latest_versions[0].version
new_stage = "Staging"

client.transition_model_version_stage(
    name=register_model_name,
    version=version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685121011103, current_stage='Staging', description=None, last_updated_timestamp=1685121055452, name='un-ner-model-nested', run_id='887a52d1a3ae42aea5e8384fbc7c83e3', run_link=None, source='/home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1/887a52d1a3ae42aea5e8384fbc7c83e3/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [89]:
loaded_model_with_mlflow = mlflow.pyfunc.load_model(model_uri=f"models:/{register_model_name}/{new_stage}")

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
loading configuration file /home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1/887a52d1a3ae42aea5e8384fbc7c83e3/artifacts/model/pipeline/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "

In [84]:
loaded_model_with_hf = mlflow.transformers.load_model(model_uri=f"models:/{register_model_name}/{new_stage}")

  loaded_model_with_hf = mlflow.transformers.load_model(model_uri=f"models:/{register_model_name}/{new_stage}")
2023/05/26 20:13:47 INFO mlflow.transformers: 'models:/un-ner-model-nested/Staging' resolved as '/home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1/887a52d1a3ae42aea5e8384fbc7c83e3/artifacts/model'
loading configuration file /home/gotsul/python-projects/simple-mlops/1-training/notebooks/mlruns/1/887a52d1a3ae42aea5e8384fbc7c83e3/artifacts/model/pipeline/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "L

In [82]:
loaded_model.predict(pd.DataFrame({"tokens": ["I am a citizen of the United States of America"]}))

'LABEL_0,LABEL_0,LABEL_0,LABEL_0,LABEL_0,LABEL_0,LABEL_8,LABEL_8,LABEL_8,LABEL_8'