In [1]:
!pip install transformers datasets
!pip install rouge.score nltk py7zr
!pip install accelerate
!pip install pytorch-accelerated
!pip install transformers==4.30

Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transfo

In [6]:
import torch
import nltk
import numpy as np
import transformers

from datasets import load_dataset, load_metric, load_from_disk

nltk.download('punkt')
torch.mps.set_per_process_memory_fraction(0.0) # Train on Macbook

[nltk_data] Downloading package punkt to /Users/davidlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
data = load_dataset('samsum')
data.save_to_disk('/Users/davidlee/PycharmProjects/KoodooProject/koodoovoice/dataset_temp/samsum')
data = load_from_disk("/Users/davidlee/PycharmProjects/KoodooProject/koodoovoice/dataset_temp/samsum")

Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]

In [9]:
metric = load_metric('rouge')
model_checkpoints = 'kabita-choudhary/finetuned-bart-for-conversation-summary'

  metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [70]:
max_input = 512
max_target = 128
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [71]:
def preprocess_data(data_to_process):
  inputs = [dialogue for dialogue in data_to_process['dialogue']]
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['summary'], max_length=max_target, padding='max_length', truncation=True)

  model_inputs['labels'] = targets['input_ids']
  return model_inputs

In [72]:
tokenize_data = data.map(preprocess_data, batched = True, remove_columns=['id', 'dialogue', 'summary'])

Map:   0%|          | 0/819 [00:00<?, ? examples/s]



In [73]:
train_sample = tokenize_data['train'].shuffle(seed=123).select(range(2000))
validation_sample = tokenize_data['validation'].shuffle(seed=123).select(range(800))
test_sample = tokenize_data['test'].shuffle(seed=123).select(range(800))

In [74]:
tokenize_data['train'] = train_sample
tokenize_data['validation'] = validation_sample
tokenize_data['test'] = test_sample

In [82]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [83]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [84]:
def compute_rouge(pred):
  predictions, labels = pred
  #decode the predictions
  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #decode labels
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #compute results
  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  #get %
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}

In [85]:
args = transformers.Seq2SeqTrainingArguments(
    'conversation-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size= 1,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    eval_accumulation_steps=1,
    fp16=False,
    use_mps_device=True,
    )

In [86]:
trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

In [None]:
trainer.train()