In [1]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv('data/train.csv')
df_dataset = Dataset.from_pandas(df)
df_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 5909
})

In [2]:
df_dataset = df_dataset.train_test_split(test_size=0.3)
df_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4136
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1773
    })
})

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilroberta-base')
classification_model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilroberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=256)

tokenized_datasets = df_dataset.map(tokenize_function, batched=True).remove_columns(['text'])
tokenized_datasets

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1773 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 4136
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1773
    })
})

In [5]:
from sklearn.metrics import confusion_matrix
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    f1_score = f1.compute(predictions=predictions, references=labels)
    acc_score = accuracy.compute(predictions=predictions, references=labels)
    
    conf_matrix = confusion_matrix(labels, predictions)
    conf_matrix_dict = {
        "tp": conf_matrix[1, 1],  
        "tn": conf_matrix[0, 0],  
        "fp": conf_matrix[0, 1],  
        "fn": conf_matrix[1, 0], 
    }

    return {
        "f1": f1_score["f1"],
        "accuracy": acc_score["accuracy"],
        **conf_matrix_dict,
    }

training_args = TrainingArguments(
    output_dir="/trainer_output",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Trainer
trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()




Epoch,Training Loss,Validation Loss,F1,Accuracy,Tp,Tn,Fp,Fn
1,No log,0.229877,0.853974,0.866328,693,843,53,184
2,0.260000,0.218588,0.86242,0.878173,677,880,16,200
3,0.260000,0.212466,0.887981,0.891709,761,820,76,116


TrainOutput(global_step=777, training_loss=0.24429018371660582, metrics={'train_runtime': 66.4223, 'train_samples_per_second': 186.805, 'train_steps_per_second': 11.698, 'total_flos': 171096133985952.0, 'train_loss': 0.24429018371660582, 'epoch': 3.0})

In [7]:
file_path = "data/train-test.txt" 
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

sentences = [sentence.strip() for sentence in text_data.split('.') if sentence.strip()]

data_dict = {"text": sentences}

dataset = Dataset.from_dict(data_dict)

dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16685
    })
    test: Dataset({
        features: ['text'],
        num_rows: 7152
    })
})

In [8]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns("text")
tokenized_dataset

Map:   0%|          | 0/16685 [00:00<?, ? examples/s]

Map:   0%|          | 0/7152 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16685
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7152
    })
})

In [9]:
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True)
lm_dataset

Map:   0%|          | 0/16685 [00:00<?, ? examples/s]

Map:   0%|          | 0/7152 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 66740
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 28608
    })
})

In [10]:
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [11]:
mlm_model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
training_args = TrainingArguments(
    output_dir="/my_awesome_eli5_mlm_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

trainer = Trainer(
    model=mlm_model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.242,0.211971
2,0.2131,0.187317
3,0.2031,0.181116


TrainOutput(global_step=6258, training_loss=0.2430416928879862, metrics={'train_runtime': 3158.6807, 'train_samples_per_second': 63.387, 'train_steps_per_second': 1.981, 'total_flos': 6638384528962560.0, 'train_loss': 0.2430416928879862, 'epoch': 3.0})

In [16]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 1.20


In [24]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=256)

tokenized_datasets = df_dataset.map(tokenize_function, batched=True).remove_columns(['text'])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1773 [00:00<?, ? examples/s]

In [None]:
trainer.save_model('/mlm_model')

In [6]:
classification_model_mlm = AutoModelForSequenceClassification.from_pretrained("/mlm_model", num_labels=2)

training_args = TrainingArguments(
    output_dir="/trainer_output",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Trainer
trainer = Trainer(
    model=classification_model_mlm,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy,Tp,Tn,Fp,Fn
1,No log,0.24064,0.860241,0.869148,714,827,69,163
2,0.261000,0.212781,0.858999,0.877609,661,895,1,216
3,0.261000,0.177982,0.897153,0.900169,772,824,72,105


TrainOutput(global_step=777, training_loss=0.24146132794432965, metrics={'train_runtime': 67.0467, 'train_samples_per_second': 185.065, 'train_steps_per_second': 11.589, 'total_flos': 171096133985952.0, 'train_loss': 0.24146132794432965, 'epoch': 3.0})

In [7]:
trainer.save_model('/classification_model_mlm')

При использовании подхода unsupervised masked language modeling имеем незначительное увеличение метрик (примерно на 0.01)