In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dataset_curso.csv')
df['label'] = df['2_way_label']
df = df[['clean_title', 'label']]

In [3]:
from datasets import Dataset

text_dataset = Dataset.from_pandas(df) \
    .train_test_split(test_size=0.2)

text_dataset

DatasetDict({
    train: Dataset({
        features: ['clean_title', 'label'],
        num_rows: 45120
    })
    test: Dataset({
        features: ['clean_title', 'label'],
        num_rows: 11280
    })
})

In [4]:
from transformers import AutoTokenizer

repo_id = "distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(repo_id)

In [5]:
def tokenize_fn(example):
  return tokenizer(example["clean_title"], truncation=True, return_tensors='pt', padding=True)

In [6]:
text_dataset = text_dataset.map(tokenize_fn, batched=True)
text_dataset = text_dataset.map(lambda x: x, batched=True, remove_columns=['clean_title'])

Map:   0%|          | 0/45120 [00:00<?, ? examples/s]

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

Map:   0%|          | 0/45120 [00:00<?, ? examples/s]

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

In [7]:
print(text_dataset['train'][0])

{'label': 0, 'input_ids': [0, 4783, 47510, 3254, 2242, 40254, 329, 8256, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [10]:
import numpy as np

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return clf_metrics.compute(predictions=predictions, references=labels)

In [11]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(
    repo_id,num_labels=2,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import torch
from transformers import TrainingArguments



training_args = TrainingArguments(
    output_dir = "./text_model",
    evaluation_strategy="steps",
    num_train_epochs=1,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    no_cuda=False,
)

In [13]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=text_dataset["train"],
    eval_dataset=text_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



In [14]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

  0%|          | 0/5640 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.5546, 'learning_rate': 4.556737588652483e-05, 'epoch': 0.09}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.44599902629852295, 'eval_accuracy': 0.7896276595744681, 'eval_f1': 0.7611474584801208, 'eval_precision': 0.6897117840204305, 'eval_recall': 0.849090500785987, 'eval_runtime': 453.4207, 'eval_samples_per_second': 24.878, 'eval_steps_per_second': 3.11, 'epoch': 0.09}
{'loss': 0.482, 'learning_rate': 4.1134751773049644e-05, 'epoch': 0.18}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.4508322477340698, 'eval_accuracy': 0.8097517730496454, 'eval_f1': 0.7706774951912801, 'eval_precision': 0.7351681957186544, 'eval_recall': 0.8097911520323378, 'eval_runtime': 132.4981, 'eval_samples_per_second': 85.133, 'eval_steps_per_second': 10.642, 'epoch': 0.18}
{'loss': 0.4565, 'learning_rate': 3.670212765957447e-05, 'epoch': 0.27}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.5181368589401245, 'eval_accuracy': 0.8109042553191489, 'eval_f1': 0.7830773924539816, 'eval_precision': 0.7156133828996283, 'eval_recall': 0.864585672580283, 'eval_runtime': 140.8534, 'eval_samples_per_second': 80.083, 'eval_steps_per_second': 10.01, 'epoch': 0.27}
{'loss': 0.4452, 'learning_rate': 3.226950354609929e-05, 'epoch': 0.35}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.42694878578186035, 'eval_accuracy': 0.825709219858156, 'eval_f1': 0.7908955541374176, 'eval_precision': 0.7512628813901798, 'eval_recall': 0.8349427352346732, 'eval_runtime': 132.1524, 'eval_samples_per_second': 85.356, 'eval_steps_per_second': 10.669, 'epoch': 0.35}
{'loss': 0.4364, 'learning_rate': 2.7836879432624114e-05, 'epoch': 0.44}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.38679587841033936, 'eval_accuracy': 0.8297872340425532, 'eval_f1': 0.7975537747785745, 'eval_precision': 0.7517392168554959, 'eval_recall': 0.8493150684931506, 'eval_runtime': 131.5439, 'eval_samples_per_second': 85.751, 'eval_steps_per_second': 10.719, 'epoch': 0.44}
{'loss': 0.4037, 'learning_rate': 2.340425531914894e-05, 'epoch': 0.53}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.4313322603702545, 'eval_accuracy': 0.8370567375886525, 'eval_f1': 0.7908511606736459, 'eval_precision': 0.8016147635524798, 'eval_recall': 0.7803727823938917, 'eval_runtime': 130.9854, 'eval_samples_per_second': 86.116, 'eval_steps_per_second': 10.765, 'epoch': 0.53}
{'loss': 0.3965, 'learning_rate': 1.897163120567376e-05, 'epoch': 0.62}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.3825870454311371, 'eval_accuracy': 0.8401595744680851, 'eval_f1': 0.8006192635187438, 'eval_precision': 0.7886710239651417, 'eval_recall': 0.8129350999326297, 'eval_runtime': 134.6425, 'eval_samples_per_second': 83.777, 'eval_steps_per_second': 10.472, 'epoch': 0.62}
{'loss': 0.3863, 'learning_rate': 1.4539007092198581e-05, 'epoch': 0.71}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.4211887717247009, 'eval_accuracy': 0.8445921985815603, 'eval_f1': 0.7981577432354633, 'eval_precision': 0.8189981096408318, 'eval_recall': 0.7783516730294183, 'eval_runtime': 134.6867, 'eval_samples_per_second': 83.75, 'eval_steps_per_second': 10.469, 'epoch': 0.71}
{'loss': 0.3942, 'learning_rate': 1.0106382978723404e-05, 'epoch': 0.8}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.40174466371536255, 'eval_accuracy': 0.8361702127659575, 'eval_f1': 0.8064516129032256, 'eval_precision': 0.7556427870461236, 'eval_recall': 0.864585672580283, 'eval_runtime': 133.8556, 'eval_samples_per_second': 84.27, 'eval_steps_per_second': 10.534, 'epoch': 0.8}
{'loss': 0.3828, 'learning_rate': 5.673758865248227e-06, 'epoch': 0.89}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.4085642695426941, 'eval_accuracy': 0.8453900709219858, 'eval_f1': 0.8028487451955686, 'eval_precision': 0.8083314363760528, 'eval_recall': 0.7974399281383338, 'eval_runtime': 145.2535, 'eval_samples_per_second': 77.657, 'eval_steps_per_second': 9.707, 'epoch': 0.89}
{'loss': 0.3759, 'learning_rate': 1.2411347517730497e-06, 'epoch': 0.98}


  0%|          | 0/1410 [00:00<?, ?it/s]

{'eval_loss': 0.3954736292362213, 'eval_accuracy': 0.8468085106382979, 'eval_f1': 0.8025142857142856, 'eval_precision': 0.8170816848964394, 'eval_recall': 0.7884572198517853, 'eval_runtime': 130.5239, 'eval_samples_per_second': 86.421, 'eval_steps_per_second': 10.803, 'epoch': 0.98}
{'train_runtime': 4725.7519, 'train_samples_per_second': 9.548, 'train_steps_per_second': 1.193, 'train_loss': 0.42828578475519274, 'epoch': 1.0}
***** train metrics *****
  epoch                    =        1.0
  train_loss               =     0.4283
  train_runtime            = 1:18:45.75
  train_samples_per_second =      9.548
  train_steps_per_second   =      1.193


In [15]:
metrics = trainer.evaluate(text_dataset["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

  0%|          | 0/1410 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.8402
  eval_f1                 =     0.8006
  eval_loss               =     0.3826
  eval_precision          =     0.7887
  eval_recall             =     0.8129
  eval_runtime            = 0:02:13.84
  eval_samples_per_second =     84.275
  eval_steps_per_second   =     10.534
