In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import logging
import shutil
from omegaconf import OmegaConf
from hydra import initialize, compose
from hydra.core.config_store import ConfigStore
import mlflow
from src.experiments.sner import sner
from pathlib import Path

import os

logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
    datefmt="%I:%M:%S",
)
logger = logging.getLogger("training")

In [9]:
sentence_length = 106

run_experiments = os.getenv("UVL_BERT_RUN_EXPERIMENTS", "True")
pin_commits = os.getenv("UVL_BERT_PIN_COMMITS", "True") == "FALSE"

print(f"{run_experiments=}")
print(f"{pin_commits=}")

run_experiments='True'
pin_commits=False


In [10]:
from tooling.config import Experiment, Transformation

base_experiment_config = Experiment(
    name="Base Config", iterations=1, force=False, dataset="all"
)

levels_transformation_config = Transformation(
    description="Levels",
    type="Reduced",
    task="Domain_Level",
    domain_data="Domain_Level",
    activity="Domain_Level",
    stakeholder="Domain_Level",
    system_function="Interaction_Level",
    interaction="Interaction_Level",
    interaction_data="Interaction_Level",
    workspace="Interaction_Level",
    software="System_Level",
    internal_action="System_Level",
    internal_data="System_Level",
)

label_transformation_config = Transformation(
    description="None",
    type="Full",
    task="Task",
    domain_data="Domain_Data",
    activity="Activity",
    stakeholder="Stakeholder",
    system_function="System_Function",
    interaction="Interaction",
    interaction_data="Interaction_Data",
    workspace="Workspace",
    software="System_Level",
    internal_action="System_Level",
    internal_data="System_Level",
)

In [11]:
from tooling.transformation import get_hint_transformation
import pickle

hint_transformation = get_hint_transformation(
    transformation_cfg=OmegaConf.structured(levels_transformation_config)
)
hint_label2id = hint_transformation["label2id"]
pickle.dump(
    hint_label2id, open("./src/service/models/hint_label2id.pickle", "wb")
)
hint_id2label = {y: x for x, y in hint_label2id.items()}
pickle.dump(
    hint_id2label, open("./src/service/models/hint_id2label.pickle", "wb")
)

transformation = get_hint_transformation(
    transformation_cfg=OmegaConf.structured(label_transformation_config)
)
label2id = transformation["label2id"]
pickle.dump(label2id, open("./src/service/models/label2id.pickle", "wb"))
id2label = {y: x for x, y in label2id.items()}
pickle.dump(id2label, open("./src/service/models/id2label.pickle", "wb"))

01:05:39 INFO:Hint Label2Id: hint_label2id={'0': 0, 'Domain_Level': 1, 'Interaction_Level': 2, 'System_Level': 3}
01:05:39 INFO:Hint Label2Id: hint_label2id={'0': 0, 'Task': 1, 'Domain_Data': 2, 'Activity': 3, 'Stakeholder': 4, 'System_Function': 5, 'Interaction': 6, 'Interaction_Data': 7, 'Workspace': 8, 'System_Level': 9}


## Train BiLSTM First Stage Model


In [6]:
from tooling.observability import get_run_id
from tooling.config import BiLSTMConfig, BiLSTM

from copy import deepcopy

bilstm_experiment_config = deepcopy(base_experiment_config)
bilstm_experiment_config.name = "Production"

bilstm_config = BiLSTM()

bilstm_cfg: BiLSTMConfig = OmegaConf.structured(
    BiLSTMConfig(
        bilstm=bilstm_config,
        experiment=bilstm_experiment_config,
        transformation=levels_transformation_config,
    )
)

# bilstm_cfg.experiment.force = True

if run_experiments:
    from experiments.bilstm import bilstm

    bilstm(bilstm_cfg)


bilstm_run_id = get_run_id(bilstm_cfg, pin_commit=pin_commits)

print(bilstm_run_id)
print(mlflow.get_artifact_uri())

bilstm_run = mlflow.get_run(bilstm_run_id)
mlflow.artifacts.download_artifacts(
    f"{bilstm_run.info.artifact_uri}/0_model",
    dst_path=Path("./src/service/models/"),
)
try:
    shutil.rmtree(Path("./src/service/models/bilstm"))
except FileNotFoundError:
    pass
Path("./src/service/models/0_model").rename("./src/service/models/bilstm")

07:50:03 INFO:
bilstm:
  type: BiLSTM
  sentence_length: null
  batch_size: 32
  number_epochs: 4
  verbose: 1
  weighted_classes: false
  learning_rate: 0.0001
experiment:
  name: Production
  description: ''
  random_state: 125
  folds: 5
  iterations: 1
  average: macro
  dataset: all
  lower_case: false
  force: false
transformation:
  description: Levels
  type: Reduced
  task: Domain_Level
  goals: null
  domain_data: Domain_Level
  activity: Domain_Level
  stakeholder: Domain_Level
  system_function: Interaction_Level
  interaction: Interaction_Level
  interaction_data: Interaction_Level
  workspace: Interaction_Level
  software: System_Level
  internal_action: System_Level
  internal_data: System_Level
  system_level: null

07:50:04 INFO:New experiment. Running
07:50:04 INFO:Entering mlflow context
07:50:05 INFO:Created a temporary directory at /var/folders/82/x_tg3xgx1px781gb8v4bny1r0000gn/T/tmpcxvce8fn
07:50:05 INFO:Writing /var/folders/82/x_tg3xgx1px781gb8v4bny1r0000gn/T/tmp

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


07:52:15 INFO:Assets written to: /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/temp/bilstm/languid-boar-815/0_model/assets
2023-08-28 19:52:29.860495: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:52:30.013368: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:52:30.013399: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-08-28 19:52:30.221249: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:52:30.232625: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




07:52:31 INFO:Logged iteration result res.precision=0.31939181551676077 res.recall=0.2587330357436993
07:52:31 INFO:Finished iteration=0
07:52:31 INFO:Breaking early after iteration=0 of 5 folds
07:52:32 INFO:Logged experiment result res.mean_precision=0.31939181551676077 res.mean_recall=0.2587330357436993
07:52:33 INFO:Left mlflow context


9cd9c07579f64068bd2cd829ee5301a7
mlflow-artifacts:/38/d4819210a6a447b7918df5a906ba76be/artifacts


PosixPath('src/service/models/bilstm')

## Train SNER First Stage Model


In [7]:
from tooling.observability import get_run_id
from tooling.config import SNERConfig, SNER

from copy import deepcopy

sner_experiment_config = deepcopy(base_experiment_config)
sner_experiment_config.name = "Production"

sner_config = SNER()

sner_cfg = OmegaConf.structured(
    SNERConfig(
        sner=sner_config,
        experiment=sner_experiment_config,
        transformation=levels_transformation_config,
    )
)

if run_experiments:
    from experiments.sner import sner

    sner(OmegaConf.create(sner_cfg))

sner_run_id = get_run_id(sner_cfg, pin_commit=pin_commits)
print(mlflow.get_artifact_uri())
print(sner_run_id)

sner_run = mlflow.get_run(sner_run_id)
mlflow.artifacts.download_artifacts(
    f"{sner_run.info.artifact_uri}/0_model.ser.gz",
    dst_path=Path("./src/service/models/"),
)
try:
    Path("./src/service/models/sner.ser.gz").unlink()
except FileNotFoundError:
    pass
Path("./src/service/models/0_model.ser.gz").rename(
    "./src/service/models/sner.ser.gz"
)

07:52:36 INFO:
sner:
  type: SNER
experiment:
  name: Production
  description: ''
  random_state: 125
  folds: 5
  iterations: 1
  average: macro
  dataset: all
  lower_case: false
  force: false
transformation:
  description: Levels
  type: Reduced
  task: Domain_Level
  goals: null
  domain_data: Domain_Level
  activity: Domain_Level
  stakeholder: Domain_Level
  system_function: Interaction_Level
  interaction: Interaction_Level
  interaction_data: Interaction_Level
  workspace: Interaction_Level
  software: System_Level
  internal_action: System_Level
  internal_data: System_Level
  system_level: null

07:52:37 INFO:New experiment. Running
07:52:37 INFO:Entering mlflow context
07:52:37 INFO:Importing dataset: anno from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/forum/anno_test.json
07:52:37 INFO:Importing dataset: anno from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/forum/anno_train.json
07:52:38 INFO:Importing dataset: prolific from

mlflow-artifacts:/38/d4819210a6a447b7918df5a906ba76be/artifacts
1575ee955f574d47990265aa51247133


PosixPath('src/service/models/sner.ser.gz')

## Train BERT First Stage Model


In [16]:
from tooling.observability import get_run_id
from tooling.config import BERTConfig, BERT

from copy import deepcopy

bert_1_experiment_config = deepcopy(base_experiment_config)
bert_1_experiment_config.name = "Production"
bert_1_experiment_config.force = True

bert_1_config = BERT(max_len=123)


bert_1_cfg = OmegaConf.structured(
    BERTConfig(
        bert=bert_1_config,
        experiment=bert_1_experiment_config,
        transformation=levels_transformation_config,
    )
)

if run_experiments:
    from experiments.bert import bert

    bert(OmegaConf.create(bert_1_cfg))

bert_1_run_id = get_run_id(bert_1_cfg, pin_commit=pin_commits)

print(bert_1_run_id)

bert_1_run = mlflow.get_run(bert_1_run_id)
mlflow.artifacts.download_artifacts(
    f"{bert_1_run.info.artifact_uri}/0_model",
    dst_path=Path("./src/service/models/"),
)
try:
    shutil.rmtree(Path("./src/service/models/bert_1"))
except FileNotFoundError:
    pass
Path("./src/service/models/0_model").rename("./src/service/models/bert_1")

10:40:22 INFO:
bert:
  model: bert-base-uncased
  type: BERT
  max_len: 123
  train_batch_size: 32
  validation_batch_size: 32
  number_epochs: 5
  learning_rate_bert: 2.0e-05
  learning_rate_classifier: 0.01
  weight_decay: 0.01
  weighted_classes: false
experiment:
  name: Production
  description: ''
  random_state: 125
  folds: 5
  iterations: 1
  average: macro
  dataset: all
  lower_case: false
  force: true
transformation:
  description: Levels
  type: Reduced
  task: Domain_Level
  goals: null
  domain_data: Domain_Level
  activity: Domain_Level
  stakeholder: Domain_Level
  system_function: Interaction_Level
  interaction: Interaction_Level
  interaction_data: Interaction_Level
  workspace: Interaction_Level
  software: System_Level
  internal_action: System_Level
  internal_data: System_Level
  system_level: null

10:40:22 INFO:New experiment. Running
10:40:22 INFO:Entering mlflow context


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


10:40:22 INFO:Using device: mps
10:40:22 INFO:Importing dataset: anno from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/forum/anno_test.json
10:40:23 INFO:Importing dataset: anno from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/forum/anno_train.json
10:40:24 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_1_33.json
10:40:24 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_34_66.json
10:40:25 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_67_100.json
10:40:26 INFO:Importing dataset: komoot from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/app/Komoot_AppReview.json
10:40:31 INFO:Dataset Labels: transformed_dataset['labels']=['Domain_Level', 'Interaction_Level', '0', 'Syst

Map:   0%|          | 0/2488 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

10:41:46 INFO:Logged iteration result res.precision=0.7448315652306485 res.recall=0.7329953881360559


{'eval_loss': 0.34498119354248047, 'eval_step': 0, 'eval_precision': 0.7448315652306485, 'eval_recall': 0.7329953881360559, 'eval_label_count': 4, 'eval_runtime': 5.2449, 'eval_samples_per_second': 118.592, 'eval_steps_per_second': 3.813, 'epoch': 1.0}


10:42:57 INFO:Logged iteration result res.precision=0.7534383658104413 res.recall=0.7500822024701829


{'eval_loss': 0.3281768560409546, 'eval_step': 2, 'eval_precision': 0.7534383658104413, 'eval_recall': 0.7500822024701829, 'eval_label_count': 4, 'eval_runtime': 5.0288, 'eval_samples_per_second': 123.688, 'eval_steps_per_second': 3.977, 'epoch': 2.0}


10:44:08 INFO:Logged iteration result res.precision=0.7596953062648087 res.recall=0.7564701926066737


{'eval_loss': 0.3337039351463318, 'eval_step': 4, 'eval_precision': 0.7596953062648087, 'eval_recall': 0.7564701926066737, 'eval_label_count': 4, 'eval_runtime': 4.9906, 'eval_samples_per_second': 124.634, 'eval_steps_per_second': 4.008, 'epoch': 3.0}


10:45:19 INFO:Logged iteration result res.precision=0.7647277950675371 res.recall=0.7527890121521682


{'eval_loss': 0.3423565924167633, 'eval_step': 6, 'eval_precision': 0.7647277950675371, 'eval_recall': 0.7527890121521682, 'eval_label_count': 4, 'eval_runtime': 5.0374, 'eval_samples_per_second': 123.476, 'eval_steps_per_second': 3.97, 'epoch': 4.0}


10:46:29 INFO:Logged iteration result res.precision=0.7680200436698275 res.recall=0.7558824750001193


{'eval_loss': 0.3467903137207031, 'eval_step': 8, 'eval_precision': 0.7680200436698275, 'eval_recall': 0.7558824750001193, 'eval_label_count': 4, 'eval_runtime': 4.9791, 'eval_samples_per_second': 124.922, 'eval_steps_per_second': 4.017, 'epoch': 5.0}
{'train_runtime': 355.7713, 'train_samples_per_second': 34.966, 'train_steps_per_second': 1.096, 'train_loss': 0.28190397604917866, 'epoch': 5.0}


10:46:38 INFO:Logged iteration result res.precision=0.7534383658104413 res.recall=0.7500822024701829
10:46:39 INFO:Logged iteration result res.precision=0.7534383658104413 res.recall=0.7500822024701829
10:46:39 INFO:Finished iteration=0
10:46:39 INFO:Logging model artifact (might take a while)
10:48:14 INFO:Breaking early after iteration=0 of 5 folds
10:48:15 INFO:Logged experiment result res.mean_precision=0.7534383658104413 res.mean_recall=0.7500822024701829
10:48:15 INFO:Left mlflow context


2ced2fa62a254611abca1d3af07e013e


PosixPath('src/service/models/bert_1')

## Train BERT Second Stage Model for BERT First Stage


In [None]:
run_experiments

'True'

In [None]:
from tooling.observability import get_run_id
from tooling.config import DualModelStagedBERTConfig, StagedBERT

from copy import deepcopy

bert_2_bert_experiment_config = deepcopy(base_experiment_config)
bert_2_bert_experiment_config.name = "Production"

bert_2_bert_config = StagedBERT(max_len=123)

bert_2_bert_cfg: DualModelStagedBERTConfig = OmegaConf.structured(
    DualModelStagedBERTConfig(
        bert=bert_2_bert_config,
        experiment=bert_2_bert_experiment_config,
        transformation=label_transformation_config,
        first_model_bert=bert_1_cfg,
    )
)

if run_experiments:
    from experiments.dual_model_staged_bert import dual_stage_bert

    dual_stage_bert(OmegaConf.create(bert_2_bert_cfg))

bert_2_bert_run_id = get_run_id(bert_2_bert_cfg, pin_commit=pin_commits)

print(bert_2_bert_run_id)

bert_2_bert_run = mlflow.get_run(bert_2_bert_run_id)
mlflow.artifacts.download_artifacts(
    f"{bert_2_bert_run.info.artifact_uri}/0_model",
    dst_path=Path("./src/service/models/"),
)
try:
    shutil.rmtree(Path("./src/service/models/bert_2_bert"))
except FileNotFoundError:
    pass
Path("./src/service/models/0_model").rename("./src/service/models/bert_2_bert")

07:11:46 INFO:
first_model_bert:
  bert:
    model: bert-base-uncased
    type: BERT
    max_len: 110
    train_batch_size: 32
    validation_batch_size: 32
    number_epochs: 5
    learning_rate_bert: 2.0e-05
    learning_rate_classifier: 0.01
    weight_decay: 0.01
    weighted_classes: false
  experiment:
    name: Production
    description: ''
    random_state: 125
    folds: 5
    iterations: 1
    average: macro
    dataset: prolific
    lower_case: false
    force: false
  transformation:
    description: Levels
    type: Reduced
    task: Domain_Level
    goals: null
    domain_data: Domain_Level
    activity: Domain_Level
    stakeholder: Domain_Level
    system_function: Interaction_Level
    interaction: Interaction_Level
    interaction_data: Interaction_Level
    workspace: Interaction_Level
    software: System_Level
    internal_action: System_Level
    internal_data: System_Level
    system_level: null
first_model_bilstm: null
first_model_sner: null
bert:
  model: bert

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:11:46 INFO:Using device: mps
07:11:47 INFO:Found existing run with run_id: ec3a52b91a4c4ebe95763583e2f818f5 matching the configuration
07:11:47 INFO:Downloading run model from mlflow-artifacts:/38/ec3a52b91a4c4ebe95763583e2f818f5/artifacts/0_model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:12:04 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_1_33.json
07:12:04 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_34_66.json
07:12:04 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_67_100.json
07:12:07 INFO:Dataset Labels: transformed_dataset['labels']=['Interaction', 'Activity', 'System_Level', 'Task', '0', 'Stakeholder', 'Domain_Data', 'Workspace', 'Interaction_Data', 'System_Function']
07:12:07 INFO:Class weights: {'0': 1.0, 'Task': 1.0, 'Domain_Data': 1.0, 'Activity': 1.0, 'Stakeholder': 1.0, 'System_Function': 1.0, 'Interaction': 1.0, 'Interaction_Data': 1.0, 'Workspace': 1.0, 'System_Level': 1.0}
07:12:07 INFO:Configured maximal token sequence length: max_len = 110
07:12:08 INFO:Created fold datasets for fold: 0, store

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

07:12:08 INFO:Loading Model
07:12:09 INFO:Using device: mps
07:12:09 INFO:Creating hint column


Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

07:12:43 INFO:Loading Model
07:12:43 INFO:Using device: mps
07:12:43 INFO:Creating hint column


Map:   0%|          | 0/269 [00:00<?, ? examples/s]

07:13:21 INFO:Logged iteration result res.precision=0.6675078277673936 res.recall=0.6316880036205067


{'eval_loss': 0.38334932923316956, 'eval_step': 0, 'eval_precision': 0.6675078277673936, 'eval_recall': 0.6316880036205067, 'eval_label_count': 10, 'eval_runtime': 2.2749, 'eval_samples_per_second': 118.248, 'eval_steps_per_second': 3.956, 'epoch': 1.0}


07:13:51 INFO:Logged iteration result res.precision=0.6950434026279945 res.recall=0.6460017381423724


{'eval_loss': 0.3633381426334381, 'eval_step': 2, 'eval_precision': 0.6950434026279945, 'eval_recall': 0.6460017381423724, 'eval_label_count': 10, 'eval_runtime': 2.2431, 'eval_samples_per_second': 119.921, 'eval_steps_per_second': 4.012, 'epoch': 2.0}


07:14:20 INFO:Logged iteration result res.precision=0.7245306308415692 res.recall=0.6812385477840243


{'eval_loss': 0.35096409916877747, 'eval_step': 4, 'eval_precision': 0.7245306308415692, 'eval_recall': 0.6812385477840243, 'eval_label_count': 10, 'eval_runtime': 2.1976, 'eval_samples_per_second': 122.408, 'eval_steps_per_second': 4.095, 'epoch': 3.0}


07:14:50 INFO:Logged iteration result res.precision=0.7011409677525176 res.recall=0.6533480467054927


{'eval_loss': 0.3634767532348633, 'eval_step': 6, 'eval_precision': 0.7011409677525176, 'eval_recall': 0.6533480467054927, 'eval_label_count': 10, 'eval_runtime': 2.1951, 'eval_samples_per_second': 122.547, 'eval_steps_per_second': 4.1, 'epoch': 4.0}


07:15:19 INFO:Logged iteration result res.precision=0.6980975323342593 res.recall=0.6570924951464788


{'eval_loss': 0.36366090178489685, 'eval_step': 8, 'eval_precision': 0.6980975323342593, 'eval_recall': 0.6570924951464788, 'eval_label_count': 10, 'eval_runtime': 2.2006, 'eval_samples_per_second': 122.24, 'eval_steps_per_second': 4.09, 'epoch': 5.0}
{'train_runtime': 149.136, 'train_samples_per_second': 36.041, 'train_steps_per_second': 1.14, 'train_loss': 0.3030788197236903, 'epoch': 5.0}


07:15:25 INFO:Logged iteration result res.precision=0.7245306308415692 res.recall=0.6812385477840243
07:15:25 INFO:Logged iteration result res.precision=0.7245306308415692 res.recall=0.6812385477840243
07:15:25 INFO:Finished iteration=0
07:15:25 INFO:Logging model artifact (might take a while)
07:18:24 INFO:Breaking early after iteration=0 of 5 folds
07:18:25 INFO:Logged experiment result res.mean_precision=0.7245306308415692 res.mean_recall=0.6812385477840243
07:18:25 INFO:Left mlflow context


394f57d85bed4aaba003b0cee28c68f7


PosixPath('src/service/models/bert_2_bert')

## Train BERT Second Stage Model for SNER First Stage


In [None]:
from tooling.observability import get_run_id
from tooling.config import DualModelStagedBERTConfig, StagedBERT

from copy import deepcopy

bert_2_sner_experiment_config = deepcopy(base_experiment_config)
bert_2_sner_experiment_config.name = "Production"

bert_2_sner_config = StagedBERT(max_len=123)

bert_2_sner_cfg = OmegaConf.structured(
    DualModelStagedBERTConfig(
        bert=bert_2_sner_config,
        experiment=bert_2_sner_experiment_config,
        transformation=label_transformation_config,
        first_model_sner=sner_cfg,
    )
)

if run_experiments:
    from experiments.dual_model_staged_bert import dual_stage_bert

    dual_stage_bert(OmegaConf.create(bert_2_sner_cfg))

bert_2_sner_run_id = get_run_id(bert_2_sner_cfg, pin_commit=pin_commits)

print(bert_2_sner_run_id)

bert_2_sner_run = mlflow.get_run(bert_2_sner_run_id)
mlflow.artifacts.download_artifacts(
    f"{bert_2_sner_run.info.artifact_uri}/0_model",
    dst_path=Path("./src/service/models/"),
)
try:
    shutil.rmtree(Path("./src/service/models/bert_2_sner"))
except FileNotFoundError:
    pass

Path("./src/service/models/0_model").rename("./src/service/models/bert_2_sner")

07:18:47 INFO:
first_model_bert: null
first_model_bilstm: null
first_model_sner:
  sner:
    type: SNER
  experiment:
    name: Production
    description: ''
    random_state: 125
    folds: 5
    iterations: 1
    average: macro
    dataset: prolific
    lower_case: false
    force: false
  transformation:
    description: Levels
    type: Reduced
    task: Domain_Level
    goals: null
    domain_data: Domain_Level
    activity: Domain_Level
    stakeholder: Domain_Level
    system_function: Interaction_Level
    interaction: Interaction_Level
    interaction_data: Interaction_Level
    workspace: Interaction_Level
    software: System_Level
    internal_action: System_Level
    internal_data: System_Level
    system_level: null
bert:
  model: bert-base-uncased
  type: BERT
  max_len: 110
  train_batch_size: 32
  validation_batch_size: 32
  number_epochs: 5
  learning_rate_bert: 2.0e-05
  learning_rate_classifier: 0.01
  weight_decay: 0.01
  weighted_classes: false
  layers: []
exper

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:18:48 INFO:Using device: mps
07:18:48 INFO:Found existing run with run_id: 0ac2a21d9e2c4cd3a984cfd6d5616f25 matching the configuration
07:18:48 INFO:Downloading run model from mlflow-artifacts:/38/0ac2a21d9e2c4cd3a984cfd6d5616f25/artifacts/0_model.ser.gz


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:18:49 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_1_33.json
07:18:49 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_34_66.json
07:18:49 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_67_100.json
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c7b57270-4af9-4247-8e9b-e4b5a9e96db5)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json
07:19:01 INFO:Dataset Labels: transformed_dataset['labels']=['Interaction', 'Activity', 'System_Level', 'Task', '0', 'Stakeholder', 'Domain_Data', 'Workspace', 'Interaction_Data', 'System_Function']
07:19:01 INFO:Class weights: {'0': 1.0, 'Task': 1.0, 'Domain_Data': 1.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/269 [00:00<?, ? examples/s]

07:19:38 INFO:Logged iteration result res.precision=0.6754658112454573 res.recall=0.6221767408632006


{'eval_loss': 0.38078516721725464, 'eval_step': 0, 'eval_precision': 0.6754658112454573, 'eval_recall': 0.6221767408632006, 'eval_label_count': 10, 'eval_runtime': 2.3576, 'eval_samples_per_second': 114.101, 'eval_steps_per_second': 3.818, 'epoch': 1.0}


07:20:08 INFO:Logged iteration result res.precision=0.7060137118065192 res.recall=0.6299854292953236


{'eval_loss': 0.35421010851860046, 'eval_step': 2, 'eval_precision': 0.7060137118065192, 'eval_recall': 0.6299854292953236, 'eval_label_count': 10, 'eval_runtime': 2.1802, 'eval_samples_per_second': 123.383, 'eval_steps_per_second': 4.128, 'epoch': 2.0}


07:20:38 INFO:Logged iteration result res.precision=0.7212585410998176 res.recall=0.6578305924888609


{'eval_loss': 0.34980329871177673, 'eval_step': 4, 'eval_precision': 0.7212585410998176, 'eval_recall': 0.6578305924888609, 'eval_label_count': 10, 'eval_runtime': 2.1743, 'eval_samples_per_second': 123.719, 'eval_steps_per_second': 4.139, 'epoch': 3.0}


07:21:07 INFO:Logged iteration result res.precision=0.6785963397508594 res.recall=0.6251045901344251


{'eval_loss': 0.3639702796936035, 'eval_step': 6, 'eval_precision': 0.6785963397508594, 'eval_recall': 0.6251045901344251, 'eval_label_count': 10, 'eval_runtime': 2.0724, 'eval_samples_per_second': 129.799, 'eval_steps_per_second': 4.343, 'epoch': 4.0}


07:21:37 INFO:Logged iteration result res.precision=0.6869321738971563 res.recall=0.6311597665520065


{'eval_loss': 0.36528241634368896, 'eval_step': 8, 'eval_precision': 0.6869321738971563, 'eval_recall': 0.6311597665520065, 'eval_label_count': 10, 'eval_runtime': 2.2077, 'eval_samples_per_second': 121.844, 'eval_steps_per_second': 4.077, 'epoch': 5.0}
{'train_runtime': 151.1739, 'train_samples_per_second': 35.555, 'train_steps_per_second': 1.125, 'train_loss': 0.24500407050637638, 'epoch': 5.0}


07:21:42 INFO:Logged iteration result res.precision=0.7212585410998176 res.recall=0.6578305924888609
07:21:43 INFO:Logged iteration result res.precision=0.7212585410998176 res.recall=0.6578305924888609
07:21:43 INFO:Finished iteration=0
07:21:43 INFO:Logging model artifact (might take a while)
07:23:18 INFO:Breaking early after iteration=0 of 5 folds
07:23:19 INFO:Logged experiment result res.mean_precision=0.7212585410998176 res.mean_recall=0.6578305924888609
07:23:19 INFO:Left mlflow context


1219d6e29446499890406e8151ef7b63


PosixPath('src/service/models/bert_2_sner')

## Train BERT Second Stage Model for BILSTM First Stage


In [None]:
from tooling.observability import get_run_id
from tooling.config import DualModelStagedBERTConfig, StagedBERT

from copy import deepcopy

bert_2_bilstm_experiment_config = deepcopy(base_experiment_config)
bert_2_bilstm_experiment_config.name = "Production"

bert_2_bilstm_config = StagedBERT(max_len=sentence_length)

bert_2_bilstm_cfg = OmegaConf.structured(
    DualModelStagedBERTConfig(
        bert=bert_2_bilstm_config,
        experiment=bert_2_bilstm_experiment_config,
        transformation=label_transformation_config,
        first_model_bilstm=bilstm_cfg,
    )
)

if run_experiments:
    from experiments.dual_model_staged_bert import dual_stage_bert

    dual_stage_bert(OmegaConf.create(bert_2_bilstm_cfg))

bert_2_bilstm_run_id = get_run_id(bert_2_bilstm_cfg, pin_commit=pin_commits)

print(bert_2_bilstm_run_id)

bert_2_bilstm_run = mlflow.get_run(bert_2_bilstm_run_id)
mlflow.artifacts.download_artifacts(
    f"{bert_2_bilstm_run.info.artifact_uri}/0_model",
    dst_path=Path("./src/service/models/"),
)
try:
    shutil.rmtree(Path("./src/service/models/bert_2_bilstm"))
except FileNotFoundError:
    pass
Path("./src/service/models/0_model").rename(
    "./src/service/models/bert_2_bilstm"
)

07:26:53 INFO:
first_model_bert: null
first_model_bilstm:
  bilstm:
    type: BiLSTM
    sentence_length: 110
    batch_size: 32
    number_epochs: 4
    verbose: 1
    weighted_classes: false
    learning_rate: 0.0001
  experiment:
    name: Production
    description: ''
    random_state: 125
    folds: 5
    iterations: 1
    average: macro
    dataset: prolific
    lower_case: false
    force: false
  transformation:
    description: Levels
    type: Reduced
    task: Domain_Level
    goals: null
    domain_data: Domain_Level
    activity: Domain_Level
    stakeholder: Domain_Level
    system_function: Interaction_Level
    interaction: Interaction_Level
    interaction_data: Interaction_Level
    workspace: Interaction_Level
    software: System_Level
    internal_action: System_Level
    internal_data: System_Level
    system_level: null
first_model_sner: null
bert:
  model: bert-base-uncased
  type: BERT
  max_len: 110
  train_batch_size: 32
  validation_batch_size: 32
  number_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:26:53 INFO:Using device: mps
07:26:53 INFO:Found existing run with run_id: 46aa06deee964d17905524b1ad721245 matching the configuration
07:26:53 INFO:Downloading run model from mlflow-artifacts:/38/46aa06deee964d17905524b1ad721245/artifacts/0_model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


07:26:55 INFO:loading projection weights from /Users/bockstaller/gensim-data/glove-twitter-100/glove-twitter-100.gz
07:27:30 INFO:KeyedVectors lifecycle event {'msg': 'loaded (1193514, 100) matrix of type float32 from /Users/bockstaller/gensim-data/glove-twitter-100/glove-twitter-100.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-08-28T19:27:30.978685', 'gensim': '4.3.1', 'python': '3.11.4 (main, Jun 20 2023, 19:14:10) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.3-arm64-arm-64bit', 'event': 'load_word2vec_format'}
07:27:31 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_1_33.json
07:27:31 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_34_66.json
07:27:31 INFO:Importing dataset: prolific from /Users/bockstaller/code/uvl-tore-classifier-bert/src/data/datasets/prolific/TORE_Coded_Answers_67

 5/34 [===>..........................] - ETA: 0s

2023-08-28 19:27:52.973992: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:27:52.986381: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

2023-08-28 19:28:01.113841: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:28:01.234551: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:28:01.248651: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-28 19:28:01.305962: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-08-28 19:28:01.317846: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Map:   0%|          | 0/269 [00:00<?, ? examples/s]

07:28:32 INFO:Logged iteration result res.precision=0.6680461931378503 res.recall=0.6199965235756286


{'eval_loss': 0.39321935176849365, 'eval_step': 0, 'eval_precision': 0.6680461931378503, 'eval_recall': 0.6199965235756286, 'eval_label_count': 10, 'eval_runtime': 2.1874, 'eval_samples_per_second': 122.974, 'eval_steps_per_second': 4.114, 'epoch': 1.0}


07:29:01 INFO:Logged iteration result res.precision=0.694559879687572 res.recall=0.6371689388907558


{'eval_loss': 0.3638952672481537, 'eval_step': 2, 'eval_precision': 0.694559879687572, 'eval_recall': 0.6371689388907558, 'eval_label_count': 10, 'eval_runtime': 2.1, 'eval_samples_per_second': 128.096, 'eval_steps_per_second': 4.286, 'epoch': 2.0}


07:29:30 INFO:Logged iteration result res.precision=0.7149316221462662 res.recall=0.6787447630552844


{'eval_loss': 0.3512847125530243, 'eval_step': 4, 'eval_precision': 0.7149316221462662, 'eval_recall': 0.6787447630552844, 'eval_label_count': 10, 'eval_runtime': 2.1298, 'eval_samples_per_second': 126.301, 'eval_steps_per_second': 4.226, 'epoch': 3.0}


07:30:00 INFO:Logged iteration result res.precision=0.701081362071734 res.recall=0.652372461383967


{'eval_loss': 0.3632904887199402, 'eval_step': 6, 'eval_precision': 0.701081362071734, 'eval_recall': 0.652372461383967, 'eval_label_count': 10, 'eval_runtime': 2.1378, 'eval_samples_per_second': 125.828, 'eval_steps_per_second': 4.21, 'epoch': 4.0}


07:30:32 INFO:Logged iteration result res.precision=0.7025782526868951 res.recall=0.6541316762715507


{'eval_loss': 0.36204540729522705, 'eval_step': 8, 'eval_precision': 0.7025782526868951, 'eval_recall': 0.6541316762715507, 'eval_label_count': 10, 'eval_runtime': 2.3672, 'eval_samples_per_second': 113.637, 'eval_steps_per_second': 3.802, 'epoch': 5.0}
{'train_runtime': 152.1069, 'train_samples_per_second': 35.337, 'train_steps_per_second': 1.118, 'train_loss': 0.31860966401941637, 'epoch': 5.0}


07:30:39 INFO:Logged iteration result res.precision=0.7149316221462662 res.recall=0.6787447630552844
07:30:40 INFO:Logged iteration result res.precision=0.7149316221462662 res.recall=0.6787447630552844
07:30:40 INFO:Finished iteration=0
07:30:40 INFO:Logging model artifact (might take a while)
07:32:23 INFO:Breaking early after iteration=0 of 5 folds
07:32:24 INFO:Logged experiment result res.mean_precision=0.7149316221462662 res.mean_recall=0.6787447630552844
07:32:25 INFO:Left mlflow context


82b0e7e4c2b041ddaddcb0261283ceef


PosixPath('src/service/models/bert_2_bilstm')

## Train BERT E2E Model


In [None]:
from tooling.observability import get_run_id
from tooling.config import BERTConfig, BERT

from copy import deepcopy

bert_experiment_config = deepcopy(base_experiment_config)
bert_experiment_config.name = "Production"

bert_config = BERT(max_len=sentence_length)

bert_cfg = OmegaConf.structured(
    BERTConfig(
        bert=bert_config,
        experiment=bert_experiment_config,
        transformation=label_transformation_config,
    )
)

if run_experiments:
    from experiments.bert import bert

    bert(OmegaConf.create(bert_cfg))

bert_run_id = get_run_id(bert_cfg, pin_commit=pin_commits)

print(bert_run_id)

run = mlflow.get_run(bert_run_id)
mlflow.artifacts.download_artifacts(
    f"{run.info.artifact_uri}/0_model", dst_path=Path("./src/service/models/")
)
try:
    shutil.rmtree(Path("./src/service/models/bert"))
except FileNotFoundError:
    pass
Path("./src/service/models/0_model").rename("./src/service/models/bert")

07:32:42 INFO:
bert:
  model: bert-base-uncased
  type: BERT
  max_len: 110
  train_batch_size: 32
  validation_batch_size: 32
  number_epochs: 5
  learning_rate_bert: 2.0e-05
  learning_rate_classifier: 0.01
  weight_decay: 0.01
  weighted_classes: false
experiment:
  name: Production
  description: ''
  random_state: 125
  folds: 5
  iterations: 1
  average: macro
  dataset: prolific
  lower_case: false
  force: false
transformation:
  description: None
  type: Full
  task: Task
  goals: null
  domain_data: Domain_Data
  activity: Activity
  stakeholder: Stakeholder
  system_function: System_Function
  interaction: Interaction
  interaction_data: Interaction_Data
  workspace: Workspace
  software: System_Level
  internal_action: System_Level
  internal_data: System_Level
  system_level: null



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
394f57d85bed4aaba003b0cee28c68f7


PosixPath('src/service/models/bert')