## ***GPT-2-Medium - Text Generation Task***

In [None]:
!pip install transformers datasets jiwer -qq
!pip install evaluate sacrebleu rouge_score -qq
!pip install --upgrade accelerate -qq
!pip install wandb -Uqq

In [None]:
# Mount Google Drive (Colab) so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer,TrainingArguments
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import sacrebleu
import csv

### ***Configuration***

In [5]:
class cfc:
    """Central settings for the GPT-2-Medium fine-tuning and evaluation run."""

    # --- data -----------------------------------------------------------
    test_file_path = "/data/datafiles/test_data.json"

    # --- model ----------------------------------------------------------
    checkpoint = "gpt2-medium"
    # Naming convention {model_name}-finetuned-{source_lang}-to-{target_lang}
    model_name = "GPT-2-Medium-latex-to-text-hypertuned"
    model_dir = f"/content/drive/MyDrive/models/{model_name}"

    # --- hyperparameters ------------------------------------------------
    # Earlier (untuned) values, kept for reference:
    #   lr_rate = 4e-5, batch_size = 32, epochs = 10, weight_decay = 0.01
    # Tuned values currently in use:
    lr_rate = 3.82e-5
    batch_size = 64
    epochs = 10
    weight_decay = 0

    # --- experiment tracking --------------------------------------------
    wandb_project = "NLG"
    run_name = model_name

## ***Data preprocessing***

In [6]:
# Raw train/validation/test CSV files on Drive; these are the inputs passed to prepare_data() below.
train_path = "/content/drive/MyDrive/data/my_corpus/train_data.csv"
valid_path = "/content/drive/MyDrive/data/my_corpus/valid_data.csv"
test_path  = "/content/drive/MyDrive/data/my_corpus/test_data.csv"

In [7]:
# Add the prompt for the GPT-2 model ("generate spoken text of") to every formula.

def prepare_data(input_file, output_file, prompt):
    """
    Prefix every LaTeX formula of a CSV file with a prompt and write a new CSV.

    Each non-empty input row must hold the LaTeX formula in column 1 and the
    spoken text in column 2 (column 0 is ignored). Each output row has two
    columns: ``"<prompt> <formula>"`` and the spoken text. Blank lines in the
    input are skipped.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (overwritten if it exists).
        prompt: Text placed in front of each formula, separated from it by
            exactly one space added by this function.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    # newline="" lets the csv module do its own line-ending handling (as the
    # csv documentation requires); the explicit encoding keeps non-ASCII
    # content independent of the platform default.
    with open(input_file, "r", newline="", encoding="utf-8") as f_in, \
            open(output_file, "w", newline="", encoding="utf-8") as f_out:
        reader = csv.reader(f_in)
        writer = csv.writer(f_out)

        for row in reader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[1].
            if not row:
                continue

            latex_formula, spoken_text = row[1], row[2]
            writer.writerow([f"{prompt} {latex_formula}", spoken_text])

In [8]:
# prepare_data() inserts one space between the prompt and the formula, so the
# prompt is given WITHOUT a trailing space here. The formatted files then
# contain "generate spoken text of: <formula>" -- the same single-space layout
# that generate_GPT_predictions builds at inference time
# ("generate spoken text of: " + formula).
prompt = "generate spoken text of:"

prepare_data(train_path, "/content/drive/MyDrive/data/my_corpus/gpt2_formatted_train_data.csv", prompt)
prepare_data(valid_path, "/content/drive/MyDrive/data/my_corpus/gpt2_formatted_valid_data.csv", prompt)
prepare_data(test_path, "/content/drive/MyDrive/data/my_corpus/gpt2_formatted_test_data.csv", prompt)

In [None]:
from transformers import AutoModelWithLMHead

# Load the tokenizer and the LM-head model for the checkpoint named in cfc.checkpoint.
# NOTE(review): AutoModelWithLMHead is reported as deprecated in newer transformers
# releases (AutoModelForCausalLM is the suggested replacement) -- confirm against the
# installed version.
tokenizer = AutoTokenizer.from_pretrained(cfc.checkpoint)
model = AutoModelWithLMHead.from_pretrained(cfc.checkpoint)

In [10]:
# GPT2 only has bos/eos tokens but no decoder_start/pad tokens,
# so the eos token is reused as pad token and the ids are copied into the model config.
# NOTE(review): these config values are set on `model`; the Trainer cells below build
# their own instance through model_init(), so they do not reach the trained model --
# confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

#### ***Building the TextDataset***

In [11]:
# Prompt-formatted CSV files that are fed to TextDataset.
# NOTE(review): prepare_data() above writes the formatted files to
# /content/drive/MyDrive/data/my_corpus/, while these paths point to /data/datafiles/ --
# confirm both locations hold the same files.
train_path_gpt2 = "/data/datafiles/gpt2_formatted_train_data.csv"
valid_path_gpt2 = "/data/datafiles/gpt2_formatted_valid_data.csv"
test_path_gpt2 = "/data/datafiles/gpt2_formatted_test_data.csv"

In [12]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path_gpt,valid_path_gpt,test_path_gpt,tokenizer,block_size=128):
    """Build the train/validation/test datasets and the LM data collator.

    Note: this definition rebinds the name ``load_dataset`` that is imported
    from ``datasets`` at the top of the file.

    Args:
        train_path_gpt: Path of the prompt-formatted training file.
        valid_path_gpt: Path of the prompt-formatted validation file.
        test_path_gpt: Path of the prompt-formatted test file.
        tokenizer: Tokenizer handed to every ``TextDataset`` and to the collator.
        block_size: ``block_size`` passed to every ``TextDataset`` (default 128).

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset, data_collator)``.
    """
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path_gpt, block_size=block_size)
    valid_dataset = TextDataset(tokenizer=tokenizer, file_path=valid_path_gpt, block_size=block_size)
    # The test split is read from its own file (it was previously built from
    # the validation path, which made it a copy of the validation set).
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path_gpt, block_size=block_size)

    # mlm=False: collator configured for non-masked language modelling.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset,valid_dataset,test_dataset,data_collator

In [None]:
# Build the three datasets and the data collator from the prompt-formatted files.
train_dataset,valid_dataset,test_dataset,data_collator = load_dataset(
    train_path_gpt2,
    valid_path_gpt2,
    test_path_gpt2,
    tokenizer
    )

### ***Metrics for evaluation***

In [None]:
from transformers import EvalPrediction
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

# Metric objects from the `evaluate` package; compute_metrics() below calls their .compute().
bleu = evaluate.load("bleu")
ter = evaluate.load("ter")
rouge = evaluate.load("rouge")


def compute_metrics(pred):
    """Compute BLEU, TER and ROUGE between greedy predictions and the labels.

    Args:
        pred: Evaluation output exposing ``predictions`` (scores whose argmax
            over the last axis gives token ids) and ``label_ids`` (reference
            token ids, with -100 at ignored positions). Assumes both are
            batch-first arrays with the sequence on axis 1 -- TODO confirm.

    Returns:
        dict with the keys ``BLEU``, ``TER``, ``TER-ACC`` (``1 - TER / 100``),
        ``ROUGE-1``, ``ROUGE-2`` and ``ROUGE-L``.
    """
    # In a causal LM the scores at position i predict token i + 1 (the shift
    # happens inside the model's loss), so drop the last prediction and the
    # first label to compare aligned positions.
    pred_ids = pred.predictions.argmax(-1)[:, :-1]
    label_ids = pred.label_ids[:, 1:]

    # Replace the ignore index by eos so the ids can be decoded. np.where
    # builds a new array, leaving pred.label_ids untouched for the caller.
    label_ids = np.where(label_ids == -100, tokenizer.eos_token_id, label_ids)

    preds_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    bleu_res = bleu.compute(predictions=preds_str, references=labels_str)
    ter_res = ter.compute(predictions=preds_str, references=labels_str)
    rouge_res = rouge.compute(predictions=preds_str, references=labels_str)
    ter_acc = (1 - (ter_res["score"] / 100))

    metrics = {
        "BLEU": bleu_res["bleu"],
        "TER": ter_res["score"],
        "TER-ACC": ter_acc,
        "ROUGE-1": rouge_res["rouge1"],
        "ROUGE-2": rouge_res["rouge2"],
        "ROUGE-L": rouge_res["rougeL"],
    }
    return metrics

# ***Fine-Tuning***

### ***Configure Weights and Biases for Hyperparameter Tuning***

In [None]:
import wandb
# Authenticate with Weights & Biases and open the run that tracks the fine-tuning;
# project and run name come from the cfc settings.
wandb.login()

wandb.init(
    project=cfc.wandb_project,
    name = cfc.run_name,

    # Free-form run metadata stored with the run.
    config={"architecture": "GPT-2-Medium-Hyper","dataset": "Formula2Text-4k"}
    )

In [16]:
from transformers import Trainer, TrainingArguments

# Training configuration for the fine-tuning run; hyperparameters come from cfc.
training_args = TrainingArguments(
    cfc.model_dir,  # first positional argument: the output directory
    report_to = "wandb",
    # Evaluate, checkpoint and log on the same 200-step schedule.
    evaluation_strategy = "steps",
    eval_steps = 200,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=200,
    logging_steps=200,
    per_device_train_batch_size=cfc.batch_size,
    per_device_eval_batch_size=cfc.batch_size,
    fp16=True,
    num_train_epochs=cfc.epochs,
    learning_rate=cfc.lr_rate,
    weight_decay=cfc.weight_decay,
    save_total_limit=1,
    load_best_model_at_end=True,
    )

In [None]:
def model_init():
    """Return a freshly loaded LM-head model for ``cfc.checkpoint``."""
    return AutoModelWithLMHead.from_pretrained(cfc.checkpoint)

# The Trainer obtains its model through model_init (not the global `model`),
# trains on train_dataset and evaluates on valid_dataset with compute_metrics.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
# Presumably writes to cfc.model_dir (the output directory given to training_args);
# the fine-tuned model is later loaded from that directory -- confirm.
trainer.save_model()

In [None]:
# Close the W&B run opened for the fine-tuning.
wandb.finish()

# ***Hyperparameter Tuning***

In [None]:
import wandb
# Authenticate with Weights & Biases before creating the sweep.
wandb.login()

In [None]:
%env WANDB_PROJECT=NLG_Sweeps
%env WANDB_LOG_MODEL=true

In [None]:
import pprint

# Objective of the sweep: maximise the metric called "BLEU".
metric = dict(name="BLEU", goal="maximize")

# Search space of the hyperparameters
parameters_dict = dict(
    epochs=dict(values=[2, 5, 7, 10, 15]),
    batch_size=dict(values=[4, 8, 16, 32, 64]),
    learning_rate=dict(distribution="log_uniform_values", min=1e-5, max=1e-3),
    weight_decay=dict(values=[0.0, 0.01, 0.1, 0.2]),
)

# Random search over the space above.
sweep_config = dict(method="random", metric=metric, parameters=parameters_dict)
pprint.pprint(sweep_config)

In [None]:
# Register the sweep in the "NLG_Sweeps" project; the returned id is passed to wandb.agent below.
sweep_id = wandb.sweep(sweep_config, project="NLG_Sweeps")

### ***Training Function for hyperparameter tuning***

In [None]:
def train(config=None):
  """Run one sweep trial: train with the sampled hyperparameters and log BLEU.

  Args:
      config: Passed to ``wandb.init(config=...)``; inside the run the values
          are read back from ``wandb.config`` (``batch_size``, ``epochs``,
          ``learning_rate``, ``weight_decay``).
  """
  with wandb.init(config=config):
    config = wandb.config

    training_args = TrainingArguments(
        output_dir = "/content/drive/MyDrive/sweeps/GPT-2-sweeps",
        report_to = "wandb",
        run_name=cfc.model_name,
        evaluation_strategy = "steps",
        eval_steps = 200,
        logging_strategy="steps",
        save_strategy="steps",
        save_steps=600,
        logging_steps=200,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size,
        fp16=False,
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        save_total_limit=1,
        load_best_model_at_end=True,
        )

    def model_init():
        # Fresh model per trial so trials do not share weights.
        return AutoModelWithLMHead.from_pretrained(cfc.checkpoint)


    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # The sweep is configured to maximise a metric named "BLEU", so log the
    # validation BLEU score itself under exactly that key (not the sweep's
    # metric-definition dict). Trainer.evaluate() prefixes the keys returned
    # by compute_metrics with "eval_".
    eval_metrics = trainer.evaluate()
    wandb.log({"BLEU": eval_metrics["eval_BLEU"]})
    trainer.save_model()

In [None]:
# Start a sweep agent that calls train() for this sweep; count=5 is the number of runs requested.
wandb.agent(sweep_id, train, count=5)

In [None]:
# Close any W&B run that is still open after the sweep.
wandb.finish()

# ***Evaluation on Testset***

In [19]:
from google.colab import files

In [21]:
!cp /utils/cf_custom_functions.py /content

In [23]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelWithLMHead
from transformers.data.datasets.language_modeling import TextDataset
import pandas as pd
import torch
import evaluate
import cf_custom_functions as cf


## ***Pre-trained model evaluation***

In [None]:
# Baseline: tokenizer and model loaded directly from the checkpoint, without fine-tuning.
pt_tokenizer = AutoTokenizer.from_pretrained(cfc.checkpoint)
pt_model = AutoModelWithLMHead.from_pretrained(cfc.checkpoint)

In [28]:
# Load the test set via the project helper; presumably a DataFrame with a "formula"
# column, which generate_GPT_predictions reads -- confirm against cf_custom_functions.
df_test = cf.load_test_data(cfc.test_file_path)

In [30]:
def generate_GPT_predictions(test_data:pd.DataFrame, model: object, tokenizer: object,prompt:str, max_new_tokens: int = 50)-> pd.DataFrame:
    """Generate one prediction per row of the test data.

    For every row the model input is ``prompt + row["formula"]``; only the
    text generated after that input is stored.

    Arguments:
        test_data {pd.DataFrame} -- Test data with a "formula" column
        model {object} -- Model providing ``generate``
        tokenizer {object} -- Tokenizer matching the model
        prompt {str} -- Text put in front of each formula,
            e.g. "generate spoken text of: "

    Keyword Arguments:
        max_new_tokens {int} -- Maximum number of tokens generated per formula,
            not counting the prompt tokens (default: {50})

    Returns:
        pd.DataFrame -- Copy of the test data with an added "prediction" column
    """
    df = test_data.copy()
    y_preds = []

    for formula in df["formula"]:
        model_input = prompt + formula
        encoded_input = tokenizer(model_input, return_tensors="pt")
        prompt_length = encoded_input["input_ids"].shape[1]

        # max_new_tokens bounds only the continuation; a total-length limit
        # would leave long formulas with little or no room to generate.
        output = model.generate(**encoded_input, max_new_tokens=max_new_tokens)

        # Decode only the token ids after the prompt. Removing the prompt by
        # string replacement fails whenever decoding does not reproduce the
        # input text exactly.
        gen_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        y_preds.append(gen_text)

    df["prediction"] = y_preds

    return df

In [None]:
# Use the baseline model/tokenizer loaded for this section (pt_model / pt_tokenizer)
# rather than the globals `model` / `tokenizer` from the training part of the notebook,
# so the pre-trained evaluation does not depend on the training cells having been run.
df_preds_pt = generate_GPT_predictions(df_test, pt_model, pt_tokenizer,"generate spoken text of: ")

In [None]:
# Post-process the raw predictions with the project helper; presumably adds the
# "clean_prediction" column that is evaluated below -- confirm against cf_custom_functions.
df_preds_clean = cf.post_processing_multi_predictions(df_preds_pt)

In [None]:
# Compute the evaluation metrics for the baseline on the "clean_prediction" column and print them.
dict_metrics_pt = cf.compute_evaluation_metrics(df_preds_clean,"clean_prediction")
print(dict_metrics_pt)

In [None]:
# Store the baseline metrics under the name "<model_name>-Pre-trained" in the shared metrics JSON file.
cf.save_evaluation_metrics(f"{cfc.model_name}-Pre-trained", dict_metrics_pt, "../metrics/NLG_metrics_new.json")

### ***Generate Test Predictions Fine-tuned model***

In [None]:
# The tokenizer comes from the base checkpoint, the weights from the fine-tuned model directory.
ft_tokenizer = AutoTokenizer.from_pretrained(cfc.checkpoint)
ft_model = AutoModelWithLMHead.from_pretrained(cfc.model_dir)

In [27]:
# Reuse eos as the pad token and copy the special-token ids into the fine-tuned model's config.
ft_tokenizer.pad_token = ft_tokenizer.eos_token
ft_model.config.eos_token_id = ft_tokenizer.eos_token_id
ft_model.config.pad_token_id = ft_tokenizer.pad_token_id
ft_model.config.decoder_start_token_id = ft_tokenizer.bos_token_id

In [None]:
# Generate predictions for the test set with the fine-tuned model, using the same prompt as the baseline.
df_preds_ft = generate_GPT_predictions(df_test, ft_model, ft_tokenizer,"generate spoken text of: ")

In [32]:
# Post-process the fine-tuned predictions with the project helper; presumably adds the
# "clean_prediction" column that is evaluated below -- confirm against cf_custom_functions.
df_preds_clean_ft = cf.post_processing_multi_predictions(df_preds_ft)

In [None]:
# Compute the evaluation metrics for the fine-tuned model on the "clean_prediction" column and print them.
dict_metrics_ft = cf.compute_evaluation_metrics(df_preds_clean_ft,"clean_prediction")
print(dict_metrics_ft)

In [None]:
# Store the fine-tuned metrics under the name "<model_name>-Fine-tuned" in the shared metrics JSON file.
cf.save_evaluation_metrics(f"{cfc.model_name}-Fine-tuned", dict_metrics_ft, "../metrics/NLG_metrics_new.json")