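**[REDACTED — a Hugging Face access token was pasted here in plain text. Revoke that token and authenticate through `notebook_login()` or an environment secret instead of storing it in the notebook.]**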

In [None]:
import json, re
from huggingface_hub import notebook_login
# import pyarrow.feather as fthht
import pandas as pd
import numpy as np
import os, time, datetime

try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset

try:
  import accelerate
except:
  !pip install -U 'accelerate==0.27.2'
  import accelerate

# try:
#   import transformers
# except:
!pip install -U transformers[torch]
import transformers

from transformers import AutoTokenizer, DataCollatorForSeq2Seq, pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

print(transformers.__version__, accelerate.__version__)


In [None]:
# When the IPython shell's string form mentions 'google.colab', mount Google
# Drive at /content/drive; in any other environment this cell does nothing.
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')

In [None]:
# Fix the random seed (transformers.set_seed) so runs are repeatable.
set_seed(17)

# Weights & Biases configuration is passed through environment variables, so
# these must be set before wandb / the trainer is initialised.
os.environ["WANDB_PROJECT"] = "aiml-thesis-train"
# NOTE(review): "checkpoint" presumably makes the HF wandb integration upload
# saved checkpoints as artifacts — confirm against the transformers docs.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

!pip install wandb
import wandb
# Start the run using wandb's "thread" start method.
wandb.init(settings=wandb.Settings(start_method="thread"))
# wandb.login()
# !wandb login

In [None]:
# Authenticate against the Hugging Face Hub; the training arguments further
# down set push_to_hub=True, which needs valid credentials.
# (notebook_login is also imported in the first cell; this re-import is harmless.)
from huggingface_hub import notebook_login
notebook_login()

## Load data

In [None]:
# Base directory for the dataset files. Empty means "relative to the current
# working directory"; on Kaggle the files live under the dataset input mount.
ds_dir = ""
# Use .get(): indexing os.environ directly raises KeyError whenever the
# variable is absent, i.e. on every non-Kaggle machine.
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
    print("In Kaggle")
    ds_dir = "/kaggle/input/tweet-data-2106-1512/"

In [None]:
# Load the training split and discard the bookkeeping columns that are not
# needed downstream.
train_df_temp = (
    pd.read_feather(ds_dir + "data/train_dial_abs_noex_noco_2006.feather")
    .drop(columns=['index', 'company'])
)

In [None]:
# Load the validation split and discard the same bookkeeping columns as for
# the training split.
val_df_temp = (
    pd.read_feather(ds_dir + "data/val_dial_abs_noex_noco_2006.feather")
    .drop(columns=['index', 'company'])
)

In [None]:
# Display the 'summary' column of the training frame as an array for a quick look.
train_df_temp['summary'].values

In [None]:
from datasets import Dataset, DatasetDict

# Wrap both pandas frames as Hugging Face Datasets and group them under the
# 'train' / 'validation' split names that the trainer cell indexes later.
tweetsum_train_val_abs = DatasetDict(
 {
 'train': Dataset.from_pandas(train_df_temp),
 'validation': Dataset.from_pandas(val_df_temp)
 }
)

In [None]:
# Spot-check a single training record (index 10).
tweetsum_train_val_abs['train'][10]

In [None]:
# Source: https://huggingface.co/docs/transformers/en/tasks/summarization

def preprocess_function(examples):
  """Tokenize a batch of dialogues and their reference summaries.

  Each entry of ``examples["dialogue"]`` is prefixed with ``"summarize: "``
  and tokenized with truncation at 512 tokens; ``examples["summary"]`` is
  tokenized as the target text with truncation at 80 tokens. Relies on the
  module-level ``tokenizer`` being set before the call.

  Returns the tokenizer output for the inputs with the target token ids
  stored under ``"labels"``.
  """
  prompts = []
  for dial in examples["dialogue"]:
    prompts.append("summarize: " + dial)

  # same params as tweetsumm paper
  encoded = tokenizer(prompts, max_length=512, truncation=True)
  target_encoding = tokenizer(
    text_target=examples["summary"],
    max_length=80,
    truncation=True,
  )
  encoded["labels"] = target_encoding["input_ids"]
  return encoded

In [None]:
# Hub checkpoint id reused by the tokenizer, data collator and model cells below.
checkpoint_bart = "sshleifer/distilbart-xsum-12-6"

In [None]:
# NOTE(review): t5_tokenizer is not referenced by any later cell visible here;
# only bart_tokenizer is used for preprocessing — confirm before removing.
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
bart_tokenizer = AutoTokenizer.from_pretrained(checkpoint_bart)

In [None]:
# preprocess_function and compute_metrics_abs read the module-level name
# `tokenizer`, so it has to be bound before the map call runs.
tokenizer = bart_tokenizer
# batched=True: preprocess_function receives columns as lists, which is what
# its list comprehension over examples["dialogue"] expects.
tokenized_tweetsumm_abs = tweetsum_train_val_abs.map(preprocess_function, batched=True)

In [None]:
# Collator used by the trainer to assemble seq2seq batches.
# NOTE(review): `model` receives the checkpoint *name* (a str), not a model
# instance; the model object is only created in a later cell. The official
# summarization guide does the same, but confirm whether the collator should
# get the instance so it can prepare decoder_input_ids from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint_bart)

In [None]:
# Pretty-print one tokenized training record (index 5) to check the mapped fields.
print(json.dumps(tokenized_tweetsumm_abs['train'][5], indent=2))

## Evaluate

In [None]:
!pip install evaluate nltk rouge_score bert_score

In [None]:
!pip install -U nltk

In [None]:
import evaluate

# Metric objects used by compute_metrics_abs; they are looked up there as
# module-level names, so this cell must run before training starts.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

In [None]:
# import numpy as np


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     # result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
#     result = {
#       'rouge': rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True),
#       'bertscore': bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en"),
#       'meteor': meteor.compute(predictions=decoded_preds, references=decoded_labels),
#     }
#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
#     result["gen_len"] = np.mean(prediction_lens)
#     print(json.dumps(result, indent=2))
#     return {k: round(v, 4) if type(v) != list else v for k, v in result.items()}

In [None]:
def compute_metrics_abs(eval_pred):
  """Score generated summaries against the references.

  Relies on the module-level ``tokenizer``, ``rouge``, ``meteor`` and
  ``bertscore`` objects.

  Args:
    eval_pred: ``(predictions, labels)`` pair of token-id arrays as passed by
      the trainer. Positions equal to -100 are treated as padding.

  Returns:
    dict with keys
      ``rouge``     - output of ``rouge.compute`` (stemming enabled),
      ``bertscore`` - mean ``precision`` / ``recall`` / ``f1`` over the batch,
      ``meteor``    - output of ``meteor.compute``,
      ``gen_len``   - mean number of non-padding tokens per prediction.
    The same dict is also printed as indented JSON.
  """
  predictions, labels = eval_pred
  # NOTE(review): some models return a tuple of outputs here; the generated
  # ids are assumed to be the first element — confirm for other architectures.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  pad_id = tokenizer.pad_token_id
  # -100 is a masking sentinel, not a vocabulary id; map it to the pad token
  # on both sides so batch_decode does not fail on it.
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # BERTScore yields one value per example for each key (plus a hashcode
  # string), so average each numeric list separately.
  bertscores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
  result = {
    'rouge': rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True),
    'bertscore': {
      name: float(np.mean(bertscores[name]))
      for name in ('precision', 'recall', 'f1')
    },
    'meteor': meteor.compute(predictions=decoded_preds, references=decoded_labels),
  }
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
  result["gen_len"] = float(np.mean(prediction_lens))
  print(json.dumps(result, indent=2))
  return result

## Train

In [None]:
# Load the pretrained seq2seq weights for the checkpoint that the tokenizer was built from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_bart)

In [None]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [None]:
# Fine-tune the model, evaluating and checkpointing once per epoch, and
# report the wall-clock duration of the whole cell.
training_start = time.time()
training_args = Seq2SeqTrainingArguments(
    output_dir="trained-distilbart-ext-2106",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    num_train_epochs=6,
    # Generate full sequences during evaluation so compute_metrics_abs
    # receives token ids it can decode.
    predict_with_generate=True,
    fp16=True,
    # Seq2SeqTrainingArguments has no max_source_length / max_target_length
    # fields (passing them raises TypeError). The 512-token source limit is
    # already enforced in preprocess_function; the 80-token target limit is
    # applied to evaluation-time generation here.
    generation_max_length=80,
    push_to_hub=True,
    report_to="wandb",
    run_name="distilbart-abs-2106_1747",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweetsumm_abs["train"],
    eval_dataset=tokenized_tweetsumm_abs["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_abs,
)

trainer.train()
training_end = time.time()
print("Time it took for training:", str(datetime.timedelta(seconds=(training_end-training_start))))

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): training_args already sets push_to_hub=True, so checkpoints may
# have been pushed during training; this call is presumably the final upload — confirm.
trainer.push_to_hub()