# Smoke tests

In [1]:
# Load the autoreload extension (and reload it when this cell is re-run).
%load_ext autoreload
%reload_ext autoreload

# Mode 2: edited modules are picked up live, so changes to the imported
# project code do not require a kernel restart.
%autoreload 2

import mlflow as mlf
import sys
import os
sys.path.append('../')

from model.utils import load_model, load_dataset
from model.train import train_model

In [3]:
from transformers import AutoTokenizer

# transformers reports a huge placeholder (documented as int(1e30)) for
# `model_max_length` when the checkpoint does not declare a limit. Anything
# above this threshold is treated as "not configured", not as a real length.
UNSET_MAX_LENGTH_THRESHOLD = int(1e20)

tokenizer = AutoTokenizer.from_pretrained('pysentimiento/robertuito-base-uncased')
max_length = tokenizer.model_max_length
print(f"Max length configurado para RoBERTa: {max_length}")

# Flag the placeholder explicitly so it is not mistaken for a usable limit:
# padding="max_length" / truncation=True need an explicit `max_length` when
# the tokenizer itself does not carry one.
if max_length > UNSET_MAX_LENGTH_THRESHOLD:
    print(
        "Aviso: el tokenizer no define model_max_length; "
        "hay que pasar max_length de forma explícita al tokenizar."
    )

Max length configurado para RoBERTa: 1000000000000000019884624838656


## Smoke tests for the RoBERTuito model

In [None]:
# Absolute path to the pruned tweets CSV stored under ../model.
relative_csv = os.path.join('..', 'model', 'tweets_parsed_pruned.csv')
ds_path = os.path.abspath(relative_csv)

ds = load_dataset(ds_path, force=False)

# Sanity check: print only the first record found under each key of the
# loaded dataset (keys are presumably split names — verify in model.utils).
for split_name in ds.keys():
    rows = iter(ds[split_name])
    try:
        first_row = next(rows)
    except StopIteration:
        # Nothing stored under this key: print nothing, as before.
        continue
    print(split_name, first_row)

# Load the base checkpoint without extra training arguments.
model, tokenizer = load_model(base_model='pysentimiento/robertuito-base-uncased')

## Smoke test for RoBERTuito+BiLSTM

In [None]:
# Smoke test for the BiLSTM variant: load the data, build the model from
# `train_arg`, tokenize the dataset and batch the training split.
ds_path = os.path.abspath(os.path.join('..', 'model', 'tweets_parsed_pruned.csv'))

ds = load_dataset(ds_path, force=False)
# Print one record per key as a quick sanity check of what was loaded.
for key in ds.keys():
    for v in ds[key]:
        print(key, v)
        break

train_arg = {
    "epochs": 6,
    "batch_size": 92,
    "accumulation_steps": 1,
    "warmup_ratio": 0.1,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "blstm": True,
    "lstm_hidden_dim": 128,
    "lstm_num_layers": 2,
}

model, tokenizer = load_model(base_model='pysentimiento/robertuito-base-uncased', train_arg=train_arg)

# padding="max_length" and truncation=True both fall back to
# `tokenizer.model_max_length` when no `max_length` is given. For this
# checkpoint that attribute can be the transformers "unset" placeholder
# (~1e30), in which case padding/truncation would not be applied. Resolve an
# explicit length instead: keep the tokenizer's own limit when it has one.
FALLBACK_MAX_LENGTH = 128  # assumed RoBERTuito sequence length — TODO confirm against the model card
UNSET_MAX_LENGTH_THRESHOLD = int(1e20)
max_seq_length = getattr(tokenizer, "model_max_length", None)
if max_seq_length is None or max_seq_length > UNSET_MAX_LENGTH_THRESHOLD:
    max_seq_length = FALLBACK_MAX_LENGTH

# The BiLSTM variant gets every sequence padded to the same fixed length;
# otherwise pad only up to the longest sequence in each mapped batch.
padding = "max_length" if train_arg.get("blstm", False) else True

dataset = ds.map(
    lambda x: tokenizer(
        x["text"],
        padding=padding,
        truncation=True,
        max_length=max_seq_length,
    ),
    batched=True,
)

# Reuse the configured batch size instead of repeating the literal 92.
train_dataloader = dataset["train"].batch(train_arg["batch_size"])

In [None]:
# Run the project's training entry point with limit=8 — presumably a cap that
# keeps this smoke run short (TODO confirm what `limit` bounds in model.train).
train_model(limit=8)

# Analysis runs

In [None]:

import os
import sys
from pathlib import Path
sys.path.append('../model')

import mlflow as mlf
from transformers import (
    AutoModelForSequenceClassification
)

# Local MLflow file store that sits next to the model code.
runs_path = os.path.abspath(os.path.join('..', 'model', 'mlruns'))

# Build a well-formed file URI (file:///...) from the absolute path. The
# previous hand-built 'file:/ ' + path contained a stray space and is not
# portable across operating systems; as_uri() also escapes special characters.
mlf.set_tracking_uri(Path(runs_path).as_uri())

# Columns kept for the comparison table.
columns = [
    'run_id', 'status', 'params.lstm_hidden_dim', 'params.lstm_num_layers',
    'metrics.train_runtime', 'metrics.eval_macro_f1', 'metrics.train_loss',
    'metrics.eval_macro_recall', 'metrics.eval_macro_precision',
    'artifact_uri',
]

# Finished runs only, best macro-F1 first. reindex() keeps the same column
# selection but fills a column with NaN when no run logged that param/metric,
# instead of failing with a KeyError.
runs = mlf.search_runs(
    filter_string='status="FINISHED"',
    order_by=['metrics.eval_macro_f1 DESC'],
).reindex(columns=columns)

runs

## Import the best RoBERTuito model

In [11]:
from urllib.parse import urlparse
from urllib.request import url2pathname

# Pick the run by its hard-coded id; .iloc[0] raises IndexError if the id is
# not present in `runs`.
run_torch = runs[runs['run_id'] == '0dfbb860725d4bcaa12549371f948a64'].iloc[0]

# from_pretrained expects a filesystem path, not a file: URI. Convert the URI
# properly instead of stripping the 'file:///' prefix, which drops the leading
# '/' of absolute POSIX paths and leaves percent-encoded characters untouched.
model_uri = f'{run_torch["artifact_uri"]}/model'
if model_uri.startswith('file:'):
    model_dir = url2pathname(urlparse(model_uri).path)
else:
    # Already a plain path (or a non-file location): pass it through unchanged.
    model_dir = model_uri

roubertuito = AutoModelForSequenceClassification.from_pretrained(model_dir)
roubertuito

## Import the RoBERTuito+BiLSTM model

In [None]:
# Select the run with the hard-coded id below and keep its first (only) row.
is_blstm_run = runs['run_id'] == 'd7710812eef0481fbf6f78716417717b'
run_torch = runs[is_blstm_run].iloc[0]

# The model is loaded from the "model" sub-path of the run's artifact location
# through MLflow's PyTorch flavor.
blstm_model_uri = '/'.join([run_torch['artifact_uri'], 'model'])
robertuito_blstm = mlf.pytorch.load_model(blstm_model_uri)
robertuito_blstm