# Abstractive summaries - Train DistilBART on TWEETSUMM dataset

In [1]:
from huggingface_hub import login
import pandas as pd
import numpy as np
import os, time, datetime

from datasets import Dataset, DatasetDict

from transformers import DataCollatorForSeq2Seq, AutoTokenizer, set_seed
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import wandb

In [2]:
!pip freeze > requirements_bart_a100.txt

In [3]:
ds_dir = os.path.join(os.getcwd(), 'data')
try:
    HF_TOKEN =  os.environ['HF_TOKEN']
except:
    HF_TOKEN = ""

if 'google.colab' in str(get_ipython()):
    print("Running on Colab")
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    HF_TOKEN = userdata.get('HF_TOKEN')
elif os.environ.get('KAGGLE_KERNEL_RUN_TYPE') != None:
    ds_dir = '/kaggle/input/bertdata2207/'
    # ds_dir="/kaggle/input/bertdata2207/"
    from kaggle_secrets import UserSecretsClient
    print("Running on Kaggle")
    # ds_dir = "/kaggle/input/tweet-data-2106-1512/"
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY


In [4]:
set_seed(17)
os.environ["WANDB_PROJECT"] = "aiml-thesis-train-test"

In [5]:
wandb.init(settings=wandb.Settings(start_method="thread"))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdawidk5[0m ([33mdawidk5ul[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/dawidk/.cache/huggingface/token
Login successful


## Load data

In [7]:
print(ds_dir)

/home/dawidk/bartabs/aiml-thesis/data


In [8]:
checkpoint_bart = "sshleifer/distilbart-xsum-12-6"

In [9]:
train_df_temp = pd.read_csv(os.path.join(ds_dir,"dials_abs_2607_1312_train_spc.csv"), names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
train_df_temp.convert_dtypes()
train_df_temp.drop(columns=['conv_id'], inplace=True)
train_df_temp.reset_index(drop=True, inplace=True)

val_df_temp = pd.read_csv(os.path.join(ds_dir,"dials_abs_2607_1312_valid_spc.csv"), names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
val_df_temp.convert_dtypes()
val_df_temp.drop(columns=['conv_id'], inplace=True)
val_df_temp.reset_index(drop=True, inplace=True)

test_df_temp = pd.read_csv(os.path.join(ds_dir,"dials_abs_2607_1312_test_spc.csv"), names=['conv_id','dialogue','summary'], encoding='utf-8', dtype={'conv_id':'string', 'dialogue':'string', 'summary': 'string'})
test_df_temp.convert_dtypes()
test_df_temp.reset_index(drop=True, inplace=True)

print(train_df_temp.dtypes)
print(train_df_temp.head())

PD_DATASETS = {'train': train_df_temp, 'validation': val_df_temp, 'test': test_df_temp}

dialogue    string[python]
summary     string[python]
dtype: object
                                            dialogue  \
0  Customer: So neither my iPhone nor my Apple Wa...   
1  Customer: @115850 hi team! i m planning to get...   
2  Customer: @AskAmex Where do I write to address...   
3  Customer: @AmazonHelp @115821 Wow, expected 4 ...   
4  Customer: @GWRHelp I'd rather you spent some t...   

                                             summary  
0  Customer enquired about his Iphone and Apple w...  
1  Customer is eager to know about the replacemen...  
2  Signed up for an AmexCard with Delta but it di...  
3  The customer have a problem. The agent is very...  
4  Customer cannot purchase a train ticket on the...  


In [10]:
tweetsumm_abs = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df_temp),
        'validation': Dataset.from_pandas(val_df_temp),
        'test': Dataset.from_pandas(test_df_temp)
    }
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint_bart)
print(tokenizer)

BartTokenizerFast(name_or_path='sshleifer/distilbart-xsum-12-6', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}




In [12]:
# Source: https://huggingface.co/docs/transformers/en/tasks/summarization

def preprocess_function(examples):
    prefix = "summarize: "
    inputs = [str(prefix) + str(dial) for dial in examples["dialogue"]]
    with tokenizer.as_target_tokenizer():
        model_inputs = tokenizer(inputs, max_length=512, truncation=True) # same params as tweetsumm paper
        labels = tokenizer(text_target=examples["summary"], max_length=80, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    print(model_inputs.keys())
    return model_inputs

In [13]:
tokenized_tweetsumm_abs = tweetsumm_abs.map(preprocess_function, batched=True, remove_columns=['dialogue','summary'])
print(tokenized_tweetsumm_abs["train"][1])

Map:   0%|          | 0/867 [00:00<?, ? examples/s]



dict_keys(['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/110 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/109 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])
{'input_ids': [0, 18581, 3916, 2072, 35, 19458, 35, 787, 1225, 4432, 1096, 20280, 165, 328, 939, 475, 1884, 7, 120, 1257, 1754, 510, 20529, 27785, 24, 924, 15, 5, 998, 24, 34, 158, 360, 5010, 21784, 6, 64, 1717, 3922, 162, 99, 16, 24, 17487, 50118, 45443, 35, 787, 2481, 3897, 2036, 166, 348, 10, 158, 7033, 5010, 714, 114, 5, 6880, 47, 829, 16, 5009, 50, 31559, 4, 37249, 10237, 50118, 44799, 35, 787, 25146, 28780, 5148, 27785, 125, 99, 114, 939, 399, 17, 27, 90, 101, 5, 1152, 8, 236, 7, 671, 24, 50118, 45443, 35, 787, 2481, 3897, 2036, 166, 1979, 75, 28, 441, 7, 3264, 5, 23312, 2886, 4, 286, 55, 335, 15, 1830, 2886, 714, 4, 17161, 352, 3753, 15, 5, 3104, 1373, 259, 35, 1205, 640, 90, 4, 876, 73, 571, 40969, 9380, 530, 4154, 510, 975, 4, 3166, 19954, 877, 110, 2969, 4, 50118, 44799, 35, 787, 25146, 28780, 5148, 2446, 27785, 125, 209, 32, 5567, 15797, 98, 473, 24, 1266, 276, 714, 3253, 13, 209, 25, 157, 50118, 45443, 35, 787, 2481, 3897

## Setup Training Evaluation

In [14]:
!pip install evaluate pyrouge rouge_score bert_score meteor

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [15]:
import evaluate, nltk, csv
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /home/dawidk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dawidk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dawidk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/dawidk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
def compute_metrics_abs(eval_pred):
    predictions, labels = eval_pred
    
    with tokenizer.as_target_tokenizer():
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    bert_scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bert_scores.pop('hashcode')
    result = {
      **{f"rouge/{k}": round(v, 4) for k,v in rouge_scores.items()},
      **{f"bertscore/bertscore-{k}": round(np.mean(v), 4) for k,v in bert_scores.items()},
      'meteor': round(meteor.compute(predictions=decoded_preds, references=decoded_labels)['meteor'], 4),
    }
   
    result["gen_len"] = np.mean(prediction_lens)
    return result


## Train and Evaluate

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_bart)

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [19]:
my_batch = data_collator(tokenized_tweetsumm_abs['train'])
assert len(my_batch) == 4 # default setting for the model

In [20]:
EXPERIMENT_PARAMS = []
BASE_PARAMS = {'lr':3e-5, 'batch_size':4, 'epochs': 6}
EXPERIMENT_PARAMS.append(BASE_PARAMS)

In [21]:
LEARN_RATES = (3e-5, 3e-4, 3e-6)
BATCH_SIZES = (4, 2, 8)
EPOCHS = (6,10)

for lr in LEARN_RATES:
    for batch_size in BATCH_SIZES:
        for epoch in EPOCHS:
            if lr == BASE_PARAMS['lr'] and batch_size == BASE_PARAMS['batch_size'] and epoch == BASE_PARAMS['epochs']:
                continue
            experiment = {'lr':lr, 'batch_size':batch_size, }
            EXPERIMENT_PARAMS.append(experiment)

In [22]:
def run_post_training(split, test_details, test_df_temp: pd.DataFrame, tokenizer, experiment, run_name_model):
    preds = tokenizer.batch_decode(test_details.predictions, skip_special_tokens=True)
    test_df_temp['response'] = preds
    exp_res = None
    csv_items = {**experiment, **(test_details.metrics)}
    if not exp_res:
        exp_res = {k: list() for k in csv_items.keys()}
    else:
        for k, v in csv_items.items():
            exp_res[k].append(v)

    test_metrics_df = pd.DataFrame(exp_res)
    print(test_metrics_df.head())
    test_df_temp.convert_dtypes()
    test_metrics_df.convert_dtypes()
    print(test_df_temp.dtypes)
    print(test_metrics_df.dtypes)
    print(test_df_temp.head())
    print(test_metrics_df.head())
    wandb.log({run_name_model: test_details.metrics})
    preds_name = f"{split}_preds_{run_name_model.replace('-','_')}_bart.csv"
    metrics_name =  f"{split}_metrics_{run_name_model.replace('-','_')}_bart.csv"
    test_df_temp.to_csv(os.path.join(os.getcwd(), 'results', preds_name), index=False, header=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    test_metrics_df.to_csv(os.path.join(os.getcwd(), 'results', metrics_name), index=False, header=True, encoding='utf-8', quoting=csv.QUOTE_ALL)


In [23]:
def get_current_time():
    return datetime.datetime.now().strftime("%d%m-%H%M")

In [24]:
exp_res = None
for exp in EXPERIMENT_PARAMS:
    current_time = get_current_time()
    run_name_model = f"bart-abs-{current_time}-lr-{exp['lr']}-bs-{exp['batch_size']}-ep-{exp['epochs']}"
    print("Starting", run_name_model, "training")
    wandb.run.name = run_name_model
    wandb.run.save()

    training_args = Seq2SeqTrainingArguments(
        output_dir=os.path.join('..', f"trained-distilbart-abs-{current_time}"),
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=10,
        learning_rate=exp['lr'],
        per_device_train_batch_size=exp['batch_size'],
        per_device_eval_batch_size=exp['batch_size'],
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=10,
        num_train_epochs=1,
        predict_with_generate=True,
        fp16=True,
        generation_max_length=80,
        push_to_hub=True,
        report_to="wandb",
        run_name=run_name_model
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_tweetsumm_abs["train"],
        eval_dataset=tokenized_tweetsumm_abs["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_abs,
    )

    training_start = time.time()
    trainer.train()
    training_end = time.time()
    print("Finished. Time it took for training:", str(datetime.timedelta(seconds=(training_end-training_start))))
    for split in ('train', 'validation', 'test'):
        test_details = trainer.predict(tokenized_tweetsumm_abs[split], metric_key_prefix=split)
        run_post_training(split, test_details, PD_DATASETS[split], tokenizer, exp, run_name_model)
    trainer.push_to_hub(run_name_model)
    break



Starting bart-abs-0509-2104-lr-3e-05-bs-4-ep-6 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,2.2322,2.133909,0.4611,0.2165,0.3925,0.3938,0.9041,0.8896,0.8967,0.3978,32.527273


Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'length_penalty': 0.5, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'length_penalty': 0.5, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
No files have been modified since last commit. Skipping to prevent empty commit.


Finished. Time it took for training: 0:02:48.133706


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):




OverflowError: out of range integral type conversion attempted

In [None]:
print("Finished all training and evaluation")
wandb.finish()

In [None]:
run_name_git = f"bart-abs-{get_current_time()}"

In [None]:
%env BART_TRAINING_NAME={run_name_git}

In [None]:
!git add .
!sleep 2
!git commit -m "Upload results $BART_TRAINING_NAME"
!sleep 2
!git push origin main
!sleep 2

In [None]:
print("Results uploaded")