# Abstractive summaries - Train DistilBART on TWEETSUMM dataset

In [1]:
from huggingface_hub import login
import pandas as pd
import numpy as np
import os, time, datetime

from datasets import Dataset, DatasetDict

from transformers import DataCollatorForSeq2Seq, AutoTokenizer, set_seed
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip freeze

absl-py==2.1.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
asttokens==2.4.1
attrs==24.2.0
certifi==2024.8.30
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
comm==0.2.2
contourpy==1.3.0
cycler==0.12.1
datasets==2.21.0
debugpy==1.8.5
decorator==5.1.1
dill==0.3.8
evaluate==0.4.2
executing==2.0.1
filelock==3.15.4
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
huggingface-hub==0.24.6
idna==3.8
ipykernel==6.29.5
ipython==8.26.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
jupyter_client==8.6.2
jupyter_core==5.7.2
kiwisolver==1.4.5
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.3
nltk==3.9.1
numpy==1.26.4
packaging==24.1
pandas==2.2.2
parso==0.8.4
pillow==10.4.0
platformdirs==4.2.2
polars==1.2.1
prompt_toolkit==3.0.47
psutil==6.0.0
pure_eval==0.2.3
pyarrow==17.0.0
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
pytz==2024.1
pywin32==306
PyYAML

In [3]:
ds_dir = ""
try:
    HF_TOKEN =  os.environ['HF_TOKEN']
except:
    HF_TOKEN = ""

if 'google.colab' in str(get_ipython()):
    print("Running on Colab")
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    HF_TOKEN = userdata.get('HF_TOKEN')
elif os.environ.get('KAGGLE_KERNEL_RUN_TYPE') != None:
    ds_dir="/kaggle/input/bertdata2207/"
    from kaggle_secrets import UserSecretsClient
    print("Running on Kaggle")
    ds_dir = "/kaggle/input/tweet-data-2106-1512/"
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY


In [4]:
set_seed(17)
os.environ["WANDB_PROJECT"] = "aiml-thesis-train"

In [5]:
# wandb.init(settings=wandb.Settings(start_method="thread"))

In [6]:
# login(token=HF_TOKEN)

## Load data

In [7]:
ds_dir = '.\\data\\'
print(ds_dir)

.\data\


In [8]:
checkpoint_bart = "sshleifer/distilbart-xsum-12-6"

In [9]:
train_df_temp = pd.read_csv(ds_dir + "dials_abs_2607_1312_train_spc.csv", names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
train_df_temp.convert_dtypes()
train_df_temp.drop(columns=['conv_id'], inplace=True)
train_df_temp.reset_index(drop=True, inplace=True)

val_df_temp = pd.read_csv(ds_dir + "dials_abs_2607_1312_valid_spc.csv", names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
val_df_temp.convert_dtypes()
val_df_temp.drop(columns=['conv_id'], inplace=True)
val_df_temp.reset_index(drop=True, inplace=True)

test_df_temp = pd.read_csv(ds_dir + "dials_abs_2607_1312_test_spc.csv", names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
test_df_temp.convert_dtypes()
test_df_temp.reset_index(drop=True, inplace=True)

print(train_df_temp.dtypes)
print(train_df_temp.head())

dialogue    string[python]
summary     string[python]
dtype: object
                                            dialogue  \
0  Customer: So neither my iPhone nor my Apple Wa...   
1  Customer: @115850 hi team! i m planning to get...   
2  Customer: @AskAmex Where do I write to address...   
3  Customer: @AmazonHelp @115821 Wow, expected 4 ...   
4  Customer: @GWRHelp I'd rather you spent some t...   

                                             summary  
0  Customer enquired about his Iphone and Apple w...  
1  Customer is eager to know about the replacemen...  
2  Signed up for an AmexCard with Delta but it di...  
3  The customer have a problem. The agent is very...  
4  Customer cannot purchase a train ticket on the...  


In [10]:
tweetsumm_abs = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df_temp),
        'validation': Dataset.from_pandas(val_df_temp),
        'test': Dataset.from_pandas(test_df_temp)
    }
)

In [11]:
bart_tokenizer = AutoTokenizer.from_pretrained(checkpoint_bart)
bart_tokenizer.max_source_length = 512
bart_tokenizer.max_target_length = 80
tokenizer = bart_tokenizer
print(tokenizer)

BartTokenizerFast(name_or_path='sshleifer/distilbart-xsum-12-6', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}




In [12]:
# Source: https://huggingface.co/docs/transformers/en/tasks/summarization

def preprocess_function(examples):
    prefix = "summarize: "
    inputs = [str(prefix) + str(dial) for dial in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True) # same params as tweetsumm paper
    labels = tokenizer(text_target=examples["summary"], max_length=80, truncation=True)
    # print(inputs, model_inputs['input_ids'])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
tokenized_tweetsumm_abs = tweetsumm_abs.map(preprocess_function, batched=True)
print(tokenized_tweetsumm_abs["train"][0])

Map: 100%|██████████| 867/867 [00:06<00:00, 144.32 examples/s]
Map: 100%|██████████| 110/110 [00:00<00:00, 1443.04 examples/s]
Map: 100%|██████████| 109/109 [00:00<00:00, 1338.42 examples/s]

{'dialogue': 'Customer: So neither my iPhone nor my Apple Watch are recording my steps/activity, and Health doesn’t recognise either source anymore for some reason. Any ideas? https://t.co/m9DPQbkftD\r\nCustomer: @AppleSupport please read the above.\r\nAgent: @135060 Let’s investigate this together. To start, can you tell us the software versions your iPhone and Apple Watch are running currently?\r\nCustomer: @AppleSupport My iPhone is on 11.1.2, and my watch is on 4.1.\r\nAgent: @135060 Thank you. Have you tried restarting both devices since this started happening?\r\nCustomer: @AppleSupport I’ve restarted both, also un-paired then re-paired the watch.\r\nAgent: @135060 Got it. When did you first notice that the two devices were not talking to each other. Do the two devices communicate through other apps such as Messages?\r\nCustomer: @AppleSupport Yes, everything seems fine, it’s just Health and activity.\r\nAgent: @135060 Let’s move to DM and look into this a bit more. When reaching




In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint_bart)

## Setup Training Evaluation

In [15]:
import evaluate, nltk
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gracz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gracz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gracz\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Using the latest cached version of the module from C:\Users\gracz\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bertscore\cf4907b18f8f741f202232c0f8009a3bd49ff98802c245abcb6ea51a37a8c05b (last modified on Mon Jul  8 15:40:51 2024) since it couldn't be found locally at evaluate-metric--bertscore, or remotely on the Hugging Face Hub.


ModuleNotFoundError: No module named 'bert_score'

In [None]:
def compute_metrics_abs(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    bert_scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bert_scores.pop('hashcode')
    result = {
      **{f"rouge/{k}": round(v, 4) for k,v in rouge_scores.items()},
      **{f"bertscore/bertscore-{k}": round(np.mean(v), 4) for k,v in bert_scores.items()},
      'meteor': round(meteor.compute(predictions=decoded_preds, references=decoded_labels)['meteor'], 4),
    }
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result


## Train

In [None]:
# Debug
!dir

 Volume in drive C has no label.
 Volume Serial Number is 2610-4815

 Directory of c:\Users\gracz\OneDrive - University of Limerick\University\Masters\Sem2\Thesis\notebooks

30/08/2024  20:52    <DIR>          .
24/07/2024  13:39    <DIR>          ..
27/08/2024  14:48                69 .gitignore
18/08/2024  13:24               107 .gitmodules
09/08/2024  15:45    <DIR>          .venv
27/08/2024  14:08            59,447 bert-input-cont-train-2508-1444.ipynb
27/08/2024  14:42            63,525 bert-input-cont-train-2708-1442.ipynb
16/08/2024  06:29           497,944 bert-plain-1308-1838.ipynb
16/08/2024  10:26            34,328 bert-plain-1608-1126.ipynb
18/08/2024  11:26           234,269 bert-plain-1808-1226.ipynb
28/08/2024  16:01           193,003 bertres_analyse_2808_1042.ipynb
16/08/2024  10:24            22,377 bert_plain_1608_1123.py
16/08/2024  10:26             3,567 bert_plain_script.py
09/08/2024  15:25           139,991 chatgpt-sum-0708-1154.ipynb
22/08/2024  16:44         

In [None]:
os.chdir('temp')
from debugtokens import check_tokenization_and_length, check_special_tokens_and_padding, check_dataset_and_collator
SPLITS= ('train', 'test', 'valid')
issues = []
for split in SPLITS:
    issues += check_tokenization_and_length(tokenizer, tweetsumm_abs['train']['dialogue'], tweetsumm_abs['train']['summary'])
    issues += check_special_tokens_and_padding(tokenizer, tweetsumm_abs[split])
    issues += check_dataset_and_collator(tweetsumm_abs[split], data_collator)

NameError: name 'os' is not defined

In [None]:
LEARN_RATES = (3e-5, 3e-4, 3e-6)

In [17]:
for exp_idx in range(0,4):
    
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_bart)
    current_time = datetime.datetime.now().strftime("%d%m-%H%M")
    print(current_time)
    run_name_model = f"distilbart-abs-{current_time}-lr-{LEARN_RATES[exp_idx]}"
    wandb.run.name = run_name_model
    wandb.run.save()

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"trained-distilbart-abs-{current_time[0:4]}",
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=10,
        learning_rate=LEARN_RATES[exp_idx],
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=6,
        num_train_epochs=6,
        predict_with_generate=True,
        fp16=True,
        generation_max_length=80,
        # generation_config=gen_config,
        push_to_hub=False,
        report_to="wandb",
        run_name=run_name_model
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_tweetsumm_abs["train"],
        eval_dataset=tokenized_tweetsumm_abs["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_abs,
    )

    training_start = time.time()
    trainer.train()
    training_end = time.time()
    print("Time it took for training:", str(datetime.timedelta(seconds=(training_end-training_start))))
    trainer.push_to_hub(run_name_model)



3008-2022


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,2.2292,2.114038,0.4426,0.2067,0.3721,0.3728,0.9007,0.8878,0.894,0.381,34.681818
2,1.7132,2.017842,0.4595,0.215,0.395,0.3959,0.9043,0.8913,0.8976,0.3934,33.3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'length_penalty': 0.5, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'length_penalty': 0.5, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TypeError: sequence item 35: expected str instance, NoneType found

In [None]:
wandb.finish()