# Abstractive summaries - Train Distilt5 on TWEETSUMM dataset

In [1]:
from huggingface_hub import login
import pandas as pd
import numpy as np
import os, time, datetime, shutil

from datasets import Dataset, DatasetDict

from transformers import DataCollatorForSeq2Seq, AutoTokenizer, set_seed
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

import wandb

In [2]:
!pip freeze > requirements_t5.txt

  pid, fd = os.forkpty()


In [3]:
def get_current_time(underscore=False):
    return datetime.datetime.now().strftime("%d%m-%H%M" if not underscore else "%d%m_%H%M")

In [4]:
run_name = f"t5-abs-{get_current_time()}"
models_dir = os.path.join(os.getcwd(), 'models')
os.makedirs(models_dir, exist_ok=True)
results_dir = os.path.join(os.getcwd(), 'results', 't5')
os.makedirs(results_dir, exist_ok=True)
ds_dir = os.path.join(os.getcwd(), 'data')
print(run_name)

t5-abs-2309-1054


In [5]:
try:
    HF_TOKEN =  os.environ['HF_TOKEN']
except:
    HF_TOKEN = ""

if 'google.colab' in str(get_ipython()):
    print("Running on Colab")
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    HF_TOKEN = userdata.get('HF_TOKEN')
elif os.environ.get('KAGGLE_KERNEL_RUN_TYPE') != None:
    ds_dir = '/kaggle/input/bertdata2207/'
    from kaggle_secrets import UserSecretsClient
    print("Running on Kaggle")
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY
    os.makedirs(os.path.join(os.getcwd(), "results"), exist_ok=True)
    os.makedirs(os.path.join(os.getcwd(), 'results', 't5'), exist_ok=True)


Running on Kaggle


In [6]:
set_seed(17)

In [7]:
os.environ["WANDB_PROJECT"] = f"aiml-thesis-train-{run_name}"
os.environ["WANDB_WATCH"] = "false"
wandb.init(settings=wandb.Settings(start_method="thread"), id=run_name)

[34m[1mwandb[0m: Currently logged in as: [33mdawidk5[0m ([33mdawidk5ul[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.18.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240923_105412-t5-abs-2309-1054[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mt5-abs-2309-1054[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/dawidk5ul/aiml-thesis-train-t5-abs-2309-1054[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/dawidk5ul/aiml-thesis-train-t5-abs-2309-1054/runs/t5-abs-2309-1054[0m


In [8]:
login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load data

In [9]:
print(ds_dir)

/kaggle/input/bertdata2207/


In [10]:
checkpoint_t5 = "google-t5/t5-base"

In [11]:
def csv_to_pandas(file_name, ds_dir, drop_conv_id=True):
    df = pd.read_csv(os.path.join(ds_dir, file_name), names=['conv_id', 'dialogue', 'summary'], encoding='utf-8', dtype={'conv_id': 'string', 'dialogue': 'string', 'summary': 'string'})
    df = df.convert_dtypes()
    if drop_conv_id:
        df.drop(columns=['conv_id'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [12]:
train_df_temp = csv_to_pandas("dials_abs_2607_1312_train_spc.csv", ds_dir)
val_df_temp = csv_to_pandas("dials_abs_2607_1312_valid_spc.csv", ds_dir)
test_df = csv_to_pandas("dials_abs_2607_1312_test_spc.csv", ds_dir, drop_conv_id=False)

print(train_df_temp.dtypes)
print(train_df_temp.head(), len(train_df_temp))

dialogue    string[python]
summary     string[python]
dtype: object
                                            dialogue  \
0  Customer: So neither my iPhone nor my Apple Wa...   
1  Customer: @115850 hi team! i m planning to get...   
2  Customer: @AskAmex Where do I write to address...   
3  Customer: @AmazonHelp @115821 Wow, expected 4 ...   
4  Customer: @GWRHelp I'd rather you spent some t...   

                                             summary  
0  Customer enquired about his Iphone and Apple w...  
1  Customer is eager to know about the replacemen...  
2  Signed up for an AmexCard with Delta but it di...  
3  The customer have a problem. The agent is very...  
4  Customer cannot purchase a train ticket on the...   867


In [13]:
tweetsumm_abs = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df_temp),
        'validation': Dataset.from_pandas(val_df_temp),
        'test': Dataset.from_pandas(test_df)
    }
)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint_t5)
print(tokenizer)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

T5TokenizerFast(name_or_path='google-t5/t5-base', vocab_size=32100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extr



In [15]:
# Source: https://huggingface.co/docs/transformers/en/tasks/summarization

def preprocess_function(examples):
    prefix = "summarize: "
    inputs = [str(prefix) + str(dial) for dial in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True) # same params as tweetsumm paper
    labels = tokenizer(text_target=examples["summary"], max_length=80, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_tweetsumm_abs = tweetsumm_abs.map(preprocess_function, batched=True, remove_columns=['dialogue','summary'])
print(tokenized_tweetsumm_abs["train"][1])

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

{'input_ids': [21603, 10, 7327, 10, 3320, 15660, 17246, 7102, 372, 55, 3, 23, 3, 51, 1459, 12, 129, 2184, 1761, 16665, 7, 3, 55, 34, 1267, 30, 8, 475, 34, 65, 335, 477, 3709, 6755, 6, 54, 3, 76, 3209, 140, 125, 19, 34, 3, 58, 8628, 10, 3320, 2688, 4906, 2884, 101, 31, 162, 3, 9, 335, 1135, 7, 3709, 1291, 3, 99, 8, 2118, 25, 1204, 19, 6780, 42, 24701, 5, 3, 2, 9122, 7327, 10, 3320, 8123, 9, 8892, 29582, 8872, 3, 55, 299, 125, 3, 99, 3, 23, 737, 22, 17, 114, 8, 556, 11, 241, 12, 1205, 34, 8628, 10, 3320, 2688, 4906, 2884, 101, 3290, 31, 17, 36, 3, 179, 12, 1845, 8, 3, 60, 2528, 7, 15, 5146, 5, 242, 72, 251, 30, 1156, 5146, 1291, 5, 6557, 120, 1214, 30, 8, 1309, 2471, 270, 10, 4893, 1303, 17, 5, 509, 87, 122, 476, 476, 10665, 18519, 9082, 476, 5, 2276, 7886, 342, 39, 1705, 5, 7327, 10, 3320, 8123, 9, 8892, 29582, 8872, 2049, 3, 55, 299, 175, 33, 3, 2741, 6399, 7, 78, 405, 34, 1243, 337, 1291, 1581, 21, 175, 38, 168, 8628, 10, 3320, 2688, 4906, 2884, 2163, 6, 22545, 7, 87, 3, 2741, 6399, 7

## Setup Training Evaluation

In [17]:
!pip install -U nltk

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.5/1.5 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.9.1


In [18]:
!pip install evaluate pyrouge rouge_score bert_score meteor

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting pyrouge
  Downloading pyrouge-0.1.3.tar.gz (60 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.5/60.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting meteor
  Downloading meteor-2.0.16-py3-none-any.whl.metadata (8.3 kB)
Collecting bgzip<0.6.0,>=0.5.0 (from meteor)
  Downloading bgzip-0.5.0.tar.gz (100 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m100.2/100.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:0

In [19]:
import evaluate, nltk, csv
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

nltk.download('punkt_tab')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:
def compute_metrics_abs(eval_pred):
    predictions, labels = eval_pred
    # Extra line added to address an overflow: https://github.com/huggingface/transformers/issues/22634
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    bert_scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bert_scores.pop('hashcode')
    result = {
      **{f"rouge/{k}": round(v, 4) for k,v in rouge_scores.items()},
      **{f"bertscore/bertscore-{k}": round(np.mean(v), 4) for k,v in bert_scores.items()},
      'meteor': round(meteor.compute(predictions=decoded_preds, references=decoded_labels)['meteor'], 4),
    }
   
    result["gen_len"] = np.mean(prediction_lens)
    return result


## Train and Evaluate

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_t5)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [23]:
EXPERIMENT_PARAMS = []
BASE_PARAMS = {'lr':1e-4, 'batch_size':10, 'epochs': 20}
EXPERIMENT_PARAMS.append(BASE_PARAMS)

In [24]:
LEARN_RATES = (1e-3, 1e-4, 1e-5)
BATCH_SIZES = (2,5,10)
EPOCHS = (20,)

for lr in LEARN_RATES:
    for batch_size in BATCH_SIZES:
        for epoch in EPOCHS:
            if lr == BASE_PARAMS['lr'] and batch_size == BASE_PARAMS['batch_size'] and epoch == BASE_PARAMS['epochs']:
                continue
            experiment = {'lr':lr, 'batch_size':batch_size, 'epochs': epoch}
            EXPERIMENT_PARAMS.append(experiment)

In [25]:
def run_post_training(split, test_details, test_df_temp: pd.DataFrame, tokenizer, experiment, run_name_model, epoch, results_dir):
    # First line added due to label error, see 
    predictions = np.where(test_details.predictions != -100, test_details.predictions, tokenizer.pad_token_id)
    preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    test_df_temp['response'] = preds
    exp_res = {**experiment, **(test_details.metrics)}
    test_metrics_df = pd.DataFrame([exp_res])
    test_df_temp = test_df_temp.convert_dtypes()
    test_metrics_df = test_metrics_df.convert_dtypes()
    wandb.log({run_name_model: test_details.metrics})
    preds_name = f"{split}_preds_{run_name_model.replace('-','_')}_s{epoch}_t5.csv"
    metrics_name =  f"{split}_metrics_{run_name_model.replace('-','_')}_s{epoch}_t5.csv"
    test_df_temp.to_csv(os.path.join(results_dir, preds_name), index=False, header=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    test_metrics_df.to_csv(os.path.join(results_dir, metrics_name), index=False, header=True, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [26]:
class ExtraCallback(TrainerCallback):        
    def on_train_end(self, args, state, control, **kwargs):
        # Save and upload CSVs
        super().on_train_end(args, state, control, **kwargs)
        df = pd.DataFrame(state.log_history)
        df = df.convert_dtypes()
        df = df.groupby(['epoch'], as_index=False).sum()
        df.to_csv(os.path.join(results_dir, "log_" + args.run_name.replace('-','_') + ".csv"), header=True, index=False)

In [27]:
for count, exp in enumerate(EXPERIMENT_PARAMS):
    run_name_model = f"{run_name}-lr-{exp['lr']}-bs-{exp['batch_size']}-maxep-{exp['epochs']}"
    print("=== Starting experiment", count, f"on {get_current_time()}:", run_name_model, "training")
    wandb.run.name = run_name_model
    wandb.run.save()

    training_args = Seq2SeqTrainingArguments(
        output_dir=os.path.join(models_dir, run_name_model),
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_only_model=True,
        learning_rate=exp['lr'],
        per_device_train_batch_size=exp['batch_size'],
        per_device_eval_batch_size=exp['batch_size'],
        weight_decay=0.0,
        lr_scheduler_type='linear',
        warmup_ratio=0.1,
        gradient_accumulation_steps=2,
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_rouge/rougeL",
        greater_is_better=True,
        num_train_epochs=exp['epochs'],
        predict_with_generate=True,
        fp16=True,
        generation_max_length=80,
        push_to_hub=False,
        report_to="wandb",
        run_name=run_name_model,
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_tweetsumm_abs["train"], # .select(range(0,50)),
        eval_dataset=tokenized_tweetsumm_abs["validation"], # .select(range(0,10)),
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_abs,
    )
    trainer.add_callback(ExtraCallback)
    training_start = time.time()
    trainer.train()
    training_end = time.time()
    print(f"Finished experiment {count}: {run_name_model} - time it took for training:", str(datetime.timedelta(seconds=(training_end-training_start))))
    test_details = trainer.predict(tokenized_tweetsumm_abs['test'], metric_key_prefix='test')
    run_post_training('test', test_details, test_df, tokenizer, exp, run_name_model, trainer.state.best_model_checkpoint.split('-')[-1], results_dir)
    trainer.push_to_hub()
    shutil.rmtree(models_dir)
    os.makedirs(models_dir)



=== Starting experiment 0 on 2309-1056: t5-abs-2309-1054-lr-0.0001-bs-10-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
0,2.79,2.071168,0.4222,0.1746,0.3527,0.3532,0.8934,0.886,0.8895,0.3645,37.081818
2,1.7565,1.74082,0.4648,0.2233,0.3974,0.3992,0.8987,0.8919,0.8952,0.416,36.381818
4,1.4732,1.717622,0.4639,0.2163,0.3949,0.3968,0.8978,0.8942,0.8958,0.4189,38.072727
6,1.2905,1.745634,0.4584,0.2067,0.3851,0.3868,0.8975,0.8934,0.8953,0.4114,37.090909
8,1.1479,1.779385,0.4721,0.2233,0.4014,0.4037,0.8957,0.8964,0.8959,0.4378,41.254545
10,1.0456,1.843421,0.4585,0.208,0.3894,0.3913,0.8964,0.8936,0.8948,0.4136,37.836364
12,0.9618,1.883425,0.4702,0.2214,0.3996,0.4023,0.8949,0.8978,0.8962,0.441,42.6
14,0.9053,1.908234,0.4687,0.2231,0.4016,0.4043,0.8967,0.898,0.8972,0.4341,41.718182
16,0.866,1.927603,0.4615,0.2129,0.3936,0.3955,0.8945,0.8971,0.8957,0.4273,42.009091
18,0.8412,1.946738,0.4691,0.2222,0.4018,0.4035,0.8959,0.898,0.8968,0.4327,41.690909


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 0: t5-abs-2309-1054-lr-0.0001-bs-10-maxep-20 - time it took for training: 0:30:31.087447


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

=== Starting experiment 1 on 2309-1127: t5-abs-2309-1054-lr-0.001-bs-2-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,1.0524,1.979769,0.4548,0.2064,0.3831,0.3848,0.8919,0.8942,0.8929,0.4177,41.263636
2,1.4167,1.99031,0.4327,0.1886,0.3657,0.3672,0.8947,0.8843,0.8893,0.3692,33.854545
3,1.3767,2.004131,0.4527,0.2062,0.3851,0.3862,0.8945,0.8927,0.8934,0.4073,38.072727
4,1.0766,2.026762,0.4611,0.2088,0.3905,0.3915,0.8985,0.8923,0.8952,0.4052,36.418182
5,1.0071,2.358353,0.4183,0.1611,0.3443,0.3455,0.8901,0.8858,0.8877,0.3591,36.990909
6,1.1108,2.573919,0.4169,0.1599,0.3463,0.3463,0.8895,0.8835,0.8861,0.3516,35.263636
7,1.2043,2.788421,0.3995,0.1628,0.3339,0.3347,0.8717,0.8827,0.876,0.341,38.736364
8,1.24,2.788421,0.3995,0.1628,0.3339,0.3347,0.8717,0.8827,0.876,0.341,38.736364
9,1.2303,2.788421,0.3995,0.1628,0.3339,0.3347,0.8717,0.8827,0.876,0.341,38.736364
10,1.218,2.788421,0.3995,0.1628,0.3339,0.3347,0.8717,0.8827,0.876,0.341,38.736364


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 1: t5-abs-2309-1054-lr-0.001-bs-2-maxep-20 - time it took for training: 0:53:29.954177


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

=== Starting experiment 2 on 2309-1222: t5-abs-2309-1054-lr-0.001-bs-5-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,0.6833,2.254992,0.4516,0.2098,0.3797,0.3798,0.8907,0.8965,0.8934,0.4298,44.781818
2,0.5046,2.408205,0.4581,0.2105,0.3884,0.3894,0.8959,0.8952,0.8954,0.4107,40.272727
3,0.4865,2.305182,0.4316,0.173,0.3543,0.3558,0.8888,0.8913,0.8899,0.3841,43.172727
4,0.5507,2.519729,0.4394,0.1772,0.3647,0.3662,0.8906,0.8916,0.8909,0.3934,42.072727
5,0.5415,2.531945,0.4398,0.1786,0.3644,0.3657,0.8909,0.8917,0.8911,0.394,41.936364
6,0.536,2.531953,0.4395,0.1786,0.3645,0.3658,0.8909,0.8917,0.8911,0.3935,41.918182
7,0.5393,2.531953,0.4395,0.1786,0.3645,0.3658,0.8909,0.8917,0.8911,0.3935,41.918182
8,0.5442,2.531953,0.4395,0.1786,0.3645,0.3658,0.8909,0.8917,0.8911,0.3935,41.918182
9,0.5562,2.531953,0.4395,0.1786,0.3645,0.3658,0.8909,0.8917,0.8911,0.3935,41.918182
10,0.5472,2.531953,0.4395,0.1786,0.3645,0.3658,0.8909,0.8917,0.8911,0.3935,41.918182


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 2: t5-abs-2309-1054-lr-0.001-bs-5-maxep-20 - time it took for training: 0:37:41.743113


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

=== Starting experiment 3 on 2309-1301: t5-abs-2309-1054-lr-0.001-bs-10-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
0,0.3887,2.66724,0.4542,0.2002,0.3796,0.3796,0.891,0.8957,0.8932,0.4204,43.581818
2,0.2408,2.863127,0.4364,0.1874,0.3679,0.3694,0.8935,0.8896,0.8914,0.3822,38.045455
4,0.1864,2.942357,0.4515,0.2049,0.3805,0.3809,0.8914,0.8972,0.8942,0.4153,45.090909
6,0.1076,2.946899,0.4608,0.2096,0.3901,0.3898,0.8934,0.8964,0.8947,0.4271,42.836364
8,0.0775,2.704494,0.4555,0.2091,0.3843,0.3846,0.8928,0.8946,0.8936,0.4172,41.927273
10,0.0618,2.724976,0.4716,0.2218,0.3976,0.398,0.894,0.8977,0.8957,0.4389,42.818182
12,0.0503,2.773637,0.4556,0.2084,0.3826,0.3843,0.8942,0.8937,0.8938,0.4077,40.245455
14,0.0457,2.819016,0.4657,0.2136,0.3933,0.3943,0.8954,0.8955,0.8953,0.4204,40.381818
16,0.0431,2.840088,0.4639,0.2136,0.3893,0.39,0.8947,0.8962,0.8953,0.4237,41.472727
18,0.0419,2.860684,0.462,0.2116,0.3868,0.3881,0.8937,0.8961,0.8948,0.4239,41.945455


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 3: t5-abs-2309-1054-lr-0.001-bs-10-maxep-20 - time it took for training: 0:30:33.408033


training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

=== Starting experiment 4 on 2309-1333: t5-abs-2309-1054-lr-0.0001-bs-2-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,0.049,3.072577,0.4642,0.2147,0.395,0.3945,0.8959,0.897,0.8963,0.4246,41.036364
2,0.025,3.527822,0.4809,0.2331,0.4126,0.4135,0.8997,0.9002,0.8998,0.44,40.254545
3,0.0177,3.77093,0.4672,0.2131,0.3912,0.3918,0.8974,0.8961,0.8966,0.4224,40.0
4,0.014,3.832662,0.4738,0.2244,0.4005,0.4009,0.8966,0.8985,0.8974,0.4361,42.136364
5,0.0278,3.886467,0.4679,0.2181,0.3942,0.3949,0.8968,0.8983,0.8974,0.4296,41.590909
6,0.0246,3.869728,0.4642,0.2147,0.3904,0.3915,0.8959,0.8976,0.8966,0.421,41.681818
7,0.0204,3.973746,0.4646,0.2159,0.395,0.3953,0.8964,0.8967,0.8964,0.421,40.727273
8,0.0179,4.036695,0.461,0.2102,0.3896,0.3904,0.8969,0.8946,0.8956,0.4122,38.972727
9,0.0158,4.038353,0.4695,0.2117,0.391,0.3921,0.8975,0.8978,0.8976,0.4269,40.445455
10,0.0159,4.044585,0.4672,0.2166,0.3945,0.3951,0.8966,0.8982,0.8972,0.4296,41.309091


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 4: t5-abs-2309-1054-lr-0.0001-bs-2-maxep-20 - time it took for training: 0:50:46.061866


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

=== Starting experiment 5 on 2309-1425: t5-abs-2309-1054-lr-0.0001-bs-5-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,0.0239,3.530711,0.4777,0.229,0.409,0.4106,0.8977,0.8993,0.8984,0.4382,41.054545
2,0.0141,3.666675,0.4765,0.2246,0.4059,0.4075,0.9001,0.8985,0.8991,0.429,39.236364
3,0.027,3.715829,0.4704,0.219,0.3992,0.3991,0.8956,0.8967,0.896,0.4319,40.845455
4,0.0247,3.73199,0.4663,0.2173,0.3945,0.3947,0.8959,0.8973,0.8965,0.4271,41.6
5,0.0225,3.803136,0.4767,0.2219,0.4017,0.4025,0.8975,0.8977,0.8975,0.4341,40.1
6,0.0196,3.851592,0.4703,0.2223,0.3989,0.3996,0.8958,0.8977,0.8967,0.4337,41.4
7,0.0168,3.902792,0.4747,0.227,0.4023,0.4029,0.8968,0.8987,0.8976,0.4378,41.3
8,0.0165,3.911555,0.4676,0.2224,0.3955,0.397,0.8965,0.8974,0.8968,0.4305,41.472727
9,0.0153,3.926814,0.4737,0.2288,0.4016,0.4025,0.8965,0.8984,0.8973,0.4411,41.454545
10,0.0149,3.951328,0.48,0.2329,0.4095,0.4101,0.8989,0.8997,0.8992,0.4438,41.027273


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 5: t5-abs-2309-1054-lr-0.0001-bs-5-maxep-20 - time it took for training: 0:36:12.937811


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

=== Starting experiment 6 on 2309-1503: t5-abs-2309-1054-lr-1e-05-bs-2-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,0.0048,4.019135,0.4796,0.2348,0.4105,0.4113,0.8989,0.8999,0.8993,0.445,41.163636
2,0.0019,4.049019,0.4749,0.2307,0.406,0.4074,0.8979,0.8986,0.8981,0.4412,40.836364
3,0.0062,4.064366,0.4795,0.2336,0.4078,0.4094,0.898,0.9,0.8988,0.4468,41.9
4,0.0062,4.066042,0.4789,0.2299,0.4056,0.4062,0.8986,0.899,0.8986,0.4406,41.190909
5,0.0114,4.076062,0.4755,0.2298,0.4046,0.405,0.899,0.8991,0.8989,0.4421,40.818182
6,0.0106,4.085409,0.4732,0.2267,0.401,0.4021,0.8982,0.8992,0.8986,0.4401,41.127273
7,0.0112,4.09934,0.4706,0.2273,0.4008,0.402,0.8965,0.8987,0.8975,0.4396,41.718182
8,0.0108,4.094891,0.4696,0.2269,0.3982,0.399,0.8971,0.8987,0.8978,0.442,41.872727
9,0.0109,4.094615,0.4742,0.2304,0.4035,0.4037,0.8982,0.8992,0.8986,0.4447,41.336364
10,0.0103,4.101743,0.4769,0.2333,0.4064,0.4068,0.8988,0.8996,0.8991,0.4469,41.118182


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 6: t5-abs-2309-1054-lr-1e-05-bs-2-maxep-20 - time it took for training: 0:51:36.286662


training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

=== Starting experiment 7 on 2309-1556: t5-abs-2309-1054-lr-1e-05-bs-5-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
1,0.0043,3.967039,0.4794,0.2341,0.4098,0.4105,0.8988,0.9001,0.8993,0.4454,41.309091
2,0.0021,3.984583,0.482,0.2397,0.4136,0.4144,0.8988,0.8999,0.8993,0.4495,41.218182
3,0.0026,4.009699,0.4788,0.2365,0.4095,0.4104,0.8982,0.8995,0.8987,0.4461,41.327273
4,0.0028,4.033222,0.4773,0.2371,0.4078,0.4086,0.8974,0.8989,0.898,0.4476,41.690909
5,0.0027,4.049181,0.4799,0.2368,0.4087,0.4095,0.8981,0.8997,0.8988,0.4493,41.681818
6,0.0023,4.065958,0.4766,0.2319,0.405,0.4055,0.8971,0.899,0.8979,0.4466,41.827273
7,0.0023,4.081906,0.4777,0.2334,0.4066,0.407,0.8978,0.8988,0.8982,0.4457,41.527273
8,0.0023,4.091249,0.4799,0.2336,0.4085,0.4092,0.8979,0.8994,0.8985,0.4496,41.636364
9,0.0021,4.10352,0.4774,0.2328,0.4067,0.4075,0.8979,0.899,0.8983,0.4456,41.590909
10,0.0025,4.117744,0.4769,0.2321,0.4058,0.4064,0.898,0.8989,0.8983,0.4438,41.172727


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 7: t5-abs-2309-1054-lr-1e-05-bs-5-maxep-20 - time it took for training: 0:36:29.638408


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

=== Starting experiment 8 on 2309-1634: t5-abs-2309-1054-lr-1e-05-bs-10-maxep-20 training


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Rouge/rouge1,Rouge/rouge2,Rouge/rougel,Rouge/rougelsum,Bertscore/bertscore-precision,Bertscore/bertscore-recall,Bertscore/bertscore-f1,Meteor,Gen Len
0,0.0089,3.987122,0.4806,0.2393,0.4125,0.4129,0.8987,0.8999,0.8991,0.4493,41.7
2,0.0046,3.997303,0.4806,0.2358,0.4101,0.4109,0.8984,0.8993,0.8988,0.448,41.2
4,0.0051,4.006194,0.4817,0.2381,0.4116,0.4125,0.8996,0.8992,0.8993,0.4456,40.545455
6,0.0046,4.012106,0.4795,0.2331,0.4083,0.409,0.8991,0.8986,0.8987,0.4393,40.145455
8,0.0043,4.012431,0.4778,0.2344,0.4076,0.4083,0.899,0.8988,0.8988,0.4402,40.536364
10,0.0113,4.014878,0.4794,0.2361,0.4088,0.4096,0.8985,0.8992,0.8987,0.4436,41.209091
12,0.0116,4.008317,0.4811,0.2378,0.411,0.4119,0.8992,0.8997,0.8993,0.4472,41.363636
14,0.0109,4.005631,0.4796,0.2362,0.409,0.4096,0.8987,0.9,0.8992,0.4476,41.763636
16,0.0117,4.003932,0.4789,0.234,0.4076,0.4084,0.8992,0.8997,0.8993,0.4455,41.245455
18,0.0111,4.004401,0.4776,0.2339,0.4069,0.408,0.8986,0.8998,0.8991,0.4456,41.6


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Finished experiment 8: t5-abs-2309-1054-lr-1e-05-bs-10-maxep-20 - time it took for training: 0:30:39.714868


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

In [28]:
# Using wandb documentation: https://docs.wandb.ai/guides/artifacts
def log_csv_wandb(results_path, model_name):
    artifact = wandb.Artifact(name=model_name, type="predictions")
    for root, dirs, files in os.walk(results_path):
        for file in files:
            artifact.add_file(local_path=os.path.join(root, file), name=file)
    wandb.log_artifact(artifact)

In [29]:
log_csv_wandb(results_dir, run_name)

In [30]:
print("Finished all training and evaluation for", run_name)
wandb.finish()

Finished all training and evaluation for t5-abs-2309-1054


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:        eval/bertscore/bertscore-f1 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñÅ‚ñà‚ñà‚ñà‚ñÅ‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
[34m[1mwandb[0m: eval/bertscore/bertscore-precision ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñÅ‚ñà‚ñà‚ñà‚ñÅ‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
[34m[1mwandb[0m:    eval/bertscore/bertscore-recall ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñÅ‚ñà‚ñà‚ñà‚ñÅ‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
[34m[1mwandb[0m:                       eval/gen_len ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÅ‚ñà‚ñà‚ñà‚ñÅ‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
[34m[1mwandb[0m:                          eval/loss ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÑ‚ñÑ ‚ñÉ‚ñÉ‚ñÉ  ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñá‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

In [31]:
print("Results uploaded")

Results uploaded
