In [None]:
from pathlib import Path

import mlflow
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AdamW

Esto es un ensayo.

In [None]:
# Run configuration: every tunable value for this experiment lives here so it
# can be logged to MLflow as a single set of parameters.
params = dict(
    model_name="distilbert-base-uncased",
    learning_rate=5e-5,
    batch_size=16,
    num_epochs=2,
    dataset_name="ag_news",
    task_name="sequence_classification",
    log_steps=100,
    max_seq_length=128,
    output_dir="../models/distilbert_ag-news",
)

Before configuring the tracking URI, run the following command in a bash terminal:

```bash
mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 127.0.0.1 --port 5000
```

In [None]:
# Point the MLflow client at the tracking server started from the terminal.
mlflow.set_tracking_uri("http://localhost:5000")

client = mlflow.MlflowClient()

experiment_name = params["task_name"]
# Look the experiment up first only to report whether it already exists.
experiment = client.get_experiment_by_name(experiment_name)

# set_experiment() activates the experiment and creates it when it is missing,
# so a single call covers both cases.
mlflow.set_experiment(experiment_name)

if experiment is None:
    print(f"Experimento '{experiment_name}' creado")
else:
    # The closing quote after the experiment name was missing in this message.
    print(f"Usando el experimento existente: '{experiment_name}'")

Usando el experimento existente: 'sequence_classification


In [None]:
with mlflow.start_run(run_name=f"{params['model_name']}-{params['dataset_name']}") as run:
    # Fix the sampling seed so the subsets logged below can be reproduced; an
    # optional "seed" entry in params overrides the default.
    seed = params.get("seed", 42)
    mlflow.log_params({**params, "seed": seed})

    # Load the dataset and the tokenizer that matches the model checkpoint.
    dataset = load_dataset(params['dataset_name'])
    tokenizer = DistilBertTokenizerFast.from_pretrained(params['model_name'])

    def tokenize(batch):
        """Tokenize batch['text'], padding and truncating to params['max_seq_length'].

        Padding to a fixed length plus truncation gives every example the same
        number of tokens.
        """
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=params['max_seq_length'],
        )

    # Work on small shuffled subsets (7,000 train / 1,000 test examples).
    train_dataset = dataset['train'].shuffle(seed=seed).select(range(7_000)).map(tokenize, batched=True)
    test_dataset = dataset['test'].shuffle(seed=seed).select(range(1_000)).map(tokenize, batched=True)

    # Make sure the output directory exists before writing the parquet files.
    data_dir = Path('../data')
    data_dir.mkdir(parents=True, exist_ok=True)
    train_path = str(data_dir / 'train.parquet')
    test_path = str(data_dir / 'test.parquet')

    train_dataset.to_parquet(train_path)
    test_dataset.to_parquet(test_path)

    # Attach the exact subsets used by this run to the MLflow run.
    mlflow.log_artifact(train_path, artifact_path='datasets')
    mlflow.log_artifact(test_path, artifact_path='datasets')

    # Set format for PyTorch and create data loaders
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    # The test loader keeps a fixed order: shuffling is only useful for training.
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

    # Get the class label names
    labels = dataset['train'].features['label'].names



Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Keep in mind that `input_ids` and `attention_mask` are the columns that Hugging Face models expect as input.


In [None]:
# Tokenize one sentence to inspect the fields the tokenizer returns.
# DistilBertTokenizerFast is already imported in the first cell, and the
# checkpoint name comes from params so it stays in sync with the run config.
tokenizer = DistilBertTokenizerFast.from_pretrained(params["model_name"])
tokens = tokenizer("This is a sample text.", padding="max_length", truncation=True, max_length=10)

print(tokens)

Now, to help speed up training a bit, we will also create some data loaders for our datasets.

In [None]:
# Show the dataset overview, then the train split's column names and feature
# schema. Each '\n' argument, joined with sep='\n', reproduces the blank lines
# that separate the three sections.
print(
    dataset,
    '\n',
    dataset['train'].column_names,
    '\n',
    dataset['train'].features,
    sep='\n',
)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


['text', 'label']


{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}
