In [1]:
from transformers import AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

train = pd.read_csv("/home/vda/Prophetnet/train.csv")
val = pd.read_csv("/home/vda/Prophetnet/val.csv")

In [4]:
import torch
import time
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer, AdamW

# Load pre-trained model and tokenizer
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")
tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")

# Define your fine-tuning data
train_input_str = train.Body.tolist()
train_target_str = train.Question.tolist()
val_input_str = val.Body.tolist()
val_target_str = val.Question.tolist()

# Tokenize and encode the training data
train_input_ids = tokenizer(train_input_str, padding=True, truncation=True, return_tensors="pt", max_length=128)
train_target_ids = tokenizer(train_target_str, padding=True, truncation=True, return_tensors="pt", max_length=128)
val_input_ids = tokenizer(val_input_str, padding=True, truncation=True, return_tensors="pt", max_length=128)
val_target_ids = tokenizer(val_target_str, padding=True, truncation=True, return_tensors="pt", max_length=128)


# Prepare data loaders
train_dataset = torch.utils.data.TensorDataset(train_input_ids["input_ids"], train_target_ids["input_ids"])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataset = torch.utils.data.TensorDataset(val_input_ids["input_ids"], val_target_ids["input_ids"])
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=True)


Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 11.6MB/s]
Downloading model.safetensors: 100%|██████████| 1.57G/1.57G [02:44<00:00, 9.52MB/s]
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 2.35MB/s]
Downloading (…)prophetnet.tokenizer: 100%|██████████| 232k/232k [00:00<00:00, 40.3MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 440kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 141/141 [00:00<00:00, 661kB/s]


In [11]:
num_epochs = 30 # total 30 epochs
warmup_steps = 1e2
learning_rate = 5e-5
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [12]:
optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)



In [13]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * num_epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [14]:
import time
best_loss = 99999999
# Fine-tuning loop
for epoch in range(num_epochs):
    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()
    for batch in train_loader:
        model.zero_grad()  
        input_ids, labels = batch

        # Forward pass
        loss = model(input_ids, labels=labels).loss

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate average loss for the epoch
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_train_loss}")
    print("Training time: ", time.time() - t0)

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in val_loader:
        
        input_ids, labels = batch
        
        with torch.no_grad():        
            loss = model(input_ids, labels=labels).loss
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(val_loader)
    if best_loss > avg_val_loss:
        best_loss = avg_val_loss
        best_model = model
        best_epoch = epoch

    validation_time = time.time() - t0   

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

print(best_loss, best_epoch)

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_prophetnet")

print("Fine-tuning complete!")


Training...
Epoch 1/30, Avg. Loss: 0.40420332888762156
Training time:  97.40243101119995

Running Validation...
  Validation Loss: 0.49
  Validation took: 7.221677303314209

Training...
Epoch 2/30, Avg. Loss: 0.8883681527773539
Training time:  93.68661952018738

Running Validation...
  Validation Loss: 0.92
  Validation took: 7.129942893981934

Training...
Epoch 3/30, Avg. Loss: 0.6809177947044373
Training time:  93.56731104850769

Running Validation...
  Validation Loss: 0.69
  Validation took: 7.10741400718689

Training...
Epoch 4/30, Avg. Loss: 0.3372596748669942
Training time:  93.68126583099365

Running Validation...
  Validation Loss: 0.46
  Validation took: 7.1124351024627686

Training...
Epoch 5/30, Avg. Loss: 0.22322243869304656
Training time:  93.62625408172607

Running Validation...
  Validation Loss: 0.43
  Validation took: 7.105046272277832

Training...
Epoch 6/30, Avg. Loss: 0.17031128967801729
Training time:  93.60147762298584

Running Validation...
  Validation Loss: 0

In [19]:
best_model.save_pretrained("./best_fine_tuned_prophetnet")

print("Fine-tuning complete!")

Fine-tuning complete!


In [15]:
import pandas as pd

test = pd.read_csv("test.csv")

In [None]:
predictions_gen = []

for i in range(len(test.Body.tolist())):
  input_str = test.Body.tolist()[i]
  input_ids = tokenizer(input_str, return_tensors="pt").input_ids
  # Generate predictions
  with torch.no_grad():
      output_ids = model.generate(input_ids)
  # Convert the output_ids to text
  predicted_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  print(predicted_str)
  predictions_gen.append(predicted_str)

pd.DataFrame(predictions_gen).to_csv("prediction_svamp.csv")

In [17]:
pd.DataFrame(predictions_gen).to_csv("prediction_svamp.csv")

In [None]:
predictions_gen = []

for i in range(len(test.Body.tolist())):
  input_str = test.Body.tolist()[i]
  input_ids = tokenizer(input_str, return_tensors="pt").input_ids
  # Generate predictions
  with torch.no_grad():
      output_ids = best_model.generate(input_ids)
  # Convert the output_ids to text
  predicted_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  print(predicted_str)
  predictions_gen.append(predicted_str)

pd.DataFrame(predictions_gen).to_csv("best_prediction_svamp.csv")

EVALUATION

In [None]:
generated_outputs = pd.read_csv("best_prediction_svamp.csv")["0"].tolist()
contexts = pd.read_csv("test.csv").Body.tolist()

for i in range(len(generated_outputs)):
  print(contexts[i])
  print(generated_outputs[i])
  print()

In [7]:
import nltk 
nltk.download('punkt')
from nltk.tokenize import word_tokenize

seqs = []
for line in generated_outputs: 
  seqs.append(word_tokenize(line))

[nltk_data] Downloading package punkt to /home/vda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from collections import Counter

from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
import numpy as np

def distinct(seqs):
    """ Calculate intra/inter distinct 1/2. """
    batch_size = len(seqs)
    intra_dist1, intra_dist2 = [], []
    unigrams_all, bigrams_all = Counter(), Counter()
    for seq in seqs:
        unigrams = Counter(seq)
        bigrams = Counter(zip(seq, seq[1:]))
        intra_dist1.append((len(unigrams)+1e-12) / (len(seq)+1e-5))
        intra_dist2.append((len(bigrams)+1e-12) / (max(0, len(seq)-1)+1e-5))

        unigrams_all.update(unigrams)
        bigrams_all.update(bigrams)

    inter_dist1 = (len(unigrams_all)+1e-12) / (sum(unigrams_all.values())+1e-5)
    inter_dist2 = (len(bigrams_all)+1e-12) / (sum(bigrams_all.values())+1e-5)
    intra_dist1 = np.average(intra_dist1)
    intra_dist2 = np.average(intra_dist2)
    return intra_dist1, intra_dist2, inter_dist1, inter_dist2

In [9]:
_, _, dis1, dis2 = distinct(seqs)

print("dis1: ", dis1)
print("dis2: ", dis2)

dis1:  0.1634808844895333
dis2:  0.39597315214780177


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

nltk.download('punkt')
nltk.download('wordnet')

bert_score = 0

cnt = 0
assert len(generated_outputs) == len(contexts)
rouge_scr = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

for idx in tqdm(range(len(generated_outputs))):
  generated_output = generated_outputs[idx]
  context = contexts[idx]

  _,_, bertscoreF1 = score([generated_output], [context], lang='en', verbose=True)
  bert_score += float(bertscoreF1.mean())

  cnt += 1

print("")
print("BERTScore Relevancy: ", bert_score/cnt)

In [14]:
print("BERTScore Relevancy: ", bert_score/cnt)

BERTScore Relevancy:  0.8800921764969826
