In [None]:
# Fetch the ParaSCI repository; the next cell reads its Data/ParaSCI-ACL files from ./ParaSCI.
!git clone https://github.com/dqxiu/ParaSCI.git

In [1]:
# Root of the ParaSCI-ACL split files inside the cloned repository.
_PARASCI_ACL_DIR = './ParaSCI/Data/ParaSCI-ACL'


def _read_lines(path):
    """Return the lines of the text file at ``path`` without their trailing newline.

    The file is decoded as UTF-8 explicitly so the result does not depend on the
    platform's default encoding, and the line terminator is removed so it does
    not end up inside every example.
    """
    with open(path, encoding='utf-8') as file:
        # Only the terminator is stripped; other whitespace is left untouched.
        return [line.rstrip('\n') for line in file]


# Each split is a pair of parallel files: line i of *.src pairs with line i of *.tgt.
train_source = _read_lines(f'{_PARASCI_ACL_DIR}/train/train.src')
train_target = _read_lines(f'{_PARASCI_ACL_DIR}/train/train.tgt')

test_source = _read_lines(f'{_PARASCI_ACL_DIR}/test/test.src')
test_target = _read_lines(f'{_PARASCI_ACL_DIR}/test/test.tgt')

val_source = _read_lines(f'{_PARASCI_ACL_DIR}/val/val.src')
val_target = _read_lines(f'{_PARASCI_ACL_DIR}/val/val.tgt')
    

In [2]:
from datasets import Dataset, DatasetDict, load_metric
import numpy as np

def _build_split(source_lines, target_lines):
    """Build one ``Dataset`` split from parallel source/target sentence lists.

    Args:
        source_lines: input sentences, one per example.
        target_lines: paraphrases, aligned by position with ``source_lines``.

    Returns:
        A ``Dataset`` with an integer ``id`` column and a ``paraphrase`` column
        whose items are ``{'input': ..., 'output': ...}`` dicts.

    Raises:
        ValueError: if the two lists differ in length. ``zip`` would otherwise
            silently drop the unpaired tail of the longer list.
    """
    if len(source_lines) != len(target_lines):
        raise ValueError(
            f"source/target length mismatch: {len(source_lines)} != {len(target_lines)}"
        )
    return Dataset.from_dict({
        'id': np.arange(len(source_lines)),
        'paraphrase': [
            {'input': source, 'output': target}
            for source, target in zip(source_lines, target_lines)
        ],
    })


train_dataset = _build_split(train_source, train_target)
test_dataset = _build_split(test_source, test_target)
val_dataset = _build_split(val_source, val_target)

In [3]:
# Bundle the three splits under the keys the later cells index by.
raw_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'val': val_dataset,
})

In [4]:
# Display the split names, feature columns, and row counts of the assembled DatasetDict.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'paraphrase'],
        num_rows: 28883
    })
    test: Dataset({
        features: ['id', 'paraphrase'],
        num_rows: 2345
    })
    val: Dataset({
        features: ['id', 'paraphrase'],
        num_rows: 2753
    })
})

In [5]:
from transformers import pipeline

# Base checkpoint; reused below for the tokenizer and for the model that gets fine-tuned.
model_checkpoint = "t5-small"
# Quick sanity check of the un-tuned checkpoint through the "translation" pipeline
# (the recorded output of this cell is a German sentence).
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")



[{'translation_text': 'Standard für erweiterte Threads'}]

In [6]:
from transformers import AutoTokenizer

# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`;
# the stray "tf" value was also misleading next to the PyTorch-style model and trainer
# used in the rest of this notebook, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Token-count caps passed as `max_length` (with truncation=True) when
# preprocess_function tokenizes the inputs and the targets respectively.
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of paraphrase pairs into model inputs plus labels.

    Args:
        examples: a batch mapping whose ``"paraphrase"`` entry is a sequence of
            dicts, each holding an ``"input"`` and an ``"output"`` string.

    Returns:
        The tokenizer's encoding of the inputs, with an extra ``"labels"`` entry
        set to the ``"input_ids"`` of the tokenized outputs.
    """
    source_texts = []
    target_texts = []
    for pair in examples["paraphrase"]:
        source_texts.append(pair["input"])
        target_texts.append(pair["output"])

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            target_texts, max_length=max_target_length, truncation=True
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [8]:
# The raw columns are dropped so that only what preprocess_function returns remains.
original_columns = raw_dataset["train"].column_names
tokenized_datasets = raw_dataset.map(
    preprocess_function, batched=True, remove_columns=original_columns
)

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [9]:
from transformers import AutoModelForSeq2SeqLM

# Load the sequence-to-sequence model for the checkpoint named in `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [10]:
!pip install sacrebleu



In [11]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer below; it is given both the tokenizer and the model.
# NOTE(review): presumably pads each batch dynamically and pads labels with -100 — confirm
# against the installed transformers version.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
from datasets import load_metric

# Metric object used by compute_metrics; it needs the `sacrebleu` package.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases — confirm the
# installed version still provides it.
metric = load_metric("sacrebleu")


In [13]:
import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: a ``(preds, labels)`` pair. ``preds`` may itself be a tuple,
            in which case only its first element is used. Both are assumed to be
            2-D arrays of token ids as supplied by the trainer — TODO confirm.

    Returns:
        ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of the
        metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a token id, so it cannot be decoded.
    # It is replaced in the predictions as well as the labels, because
    # predictions gathered across batches can carry the same marker.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace, and wrap each reference in
    # its own list because the metric is given a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [14]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run, grouped by concern.
args = Seq2SeqTrainingArguments(
    output_dir="t5-finetuned-parasci",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluation: no scheduled evaluation; trainer.evaluate() is called explicitly.
    evaluation_strategy="no",
    predict_with_generate=True,
    # Checkpointing and upload
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=True,
)

In [15]:
from transformers import Seq2SeqTrainer

# Wire the model, configuration, data, and metric callback into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)

/Users/domenicrosati/src/paralm/t5-finetuned-parasci is already a clone of https://huggingface.co/domenicrosati/t5-finetuned-parasci. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Baseline: score the checkpoint on the trainer's eval dataset (the "val" split) before
# fine-tuning. NOTE(review): `max_length` presumably caps generation length — confirm.
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 2753
  Batch size = 64


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Score the eval dataset again after training, using the same call as the earlier evaluation.
trainer.evaluate(max_length=max_target_length)

In [None]:
# Upload the trained model with a tag and a commit message.
# NOTE(review): the tag is "translation" although the training data is paraphrase pairs —
# confirm this is the intended tag.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "domenicrosati/t5-finetuned-parasci"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")