##### Importing Packages

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm
import numpy as np
import random
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Selecting cpu or gpu based on device

device = "cpu"

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device.")

In [None]:
# Parameters

MAX_LENGTH = 1024  

TEMPRATURE = 0.7  

REPETITION_PENALTY = 1.0

TOP_K = 50  

TOP_P = 0.92 

DO_SAMPLE = True  

NUM_RETUTNR_SEQUENCES = 1 

TXT_DATA = 'Data/Final_merged_data.txt'

MODEL_PATH = "Models/New_E6"

In [2]:
# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH).to('cuda')

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [3]:
# Number of parameters in Millions

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Convert number of parameters to millions
num_params_millions = num_params / 1_000_000

print("Number of parameters in the model (in millions):", num_params_millions)


Number of parameters in the model (in millions): 254.814208


#### Evaluation

In [5]:
# Seperating conversation for evaluation

def parse_demo_txt(file_path, samples):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Take 100 random lines from the file
    lines_sampled = random.sample(lines, samples)
    all_lines = []
    for line in lines_sampled:
        segments = line.split('gpt:')
        parsed_data = []
        for segment in segments[:-1]:  # Exclude the last segment as it will not have a corresponding gpt part
            human_part = segment.split('human:')[-1].strip()
            gpt_part = segments[segments.index(segment) + 1].split('human:')[0].strip()
            parsed_data.append((human_part, gpt_part))
        all_lines.append(parsed_data)
    
    return all_lines



In [6]:

def generate_response(model, tokenizer, conversation_history, new_message, max_length=1024):

    if len(conversation_history) > 0:
        input_text = conversation_history + ' human: ' + new_message + " gpt:"
    else:
        input_text = 'human: ' + new_message + ' gpt:'

    input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        # num_beams=5,
        temperature=TEMPRATURE,
        repetition_penalty=REPETITION_PENALTY,
        top_k=TOP_K,
        top_p=TOP_P,
        do_sample=DO_SAMPLE,
        num_return_sequences=NUM_RETUTNR_SEQUENCES,
        pad_token_id=tokenizer.pad_token_id,  # Set pad token ID to EOS token ID
        attention_mask=input_ids.new_ones(input_ids.shape),  # Provide attention mask
        early_stopping=True, # Stop generation when a pad token is generated,
    )
        
    ### Decode and print generated text sequences
    for output_sequence in output_sequences:
        output_sequence = [token.item() for token in output_sequence if token is not None]

        first_index = output_sequence.index(50257)
        output_sequence = output_sequence[:first_index]
        response = tokenizer.decode(output_sequence)

    last_index = input_text.rfind("gpt:")
    response = response[last_index+5:]

    last_index = response.find(" human")
    response = response[:last_index]
    updated_history = input_text + ' ' + response
    
    return response, updated_history  # Keep only the most recent part fitting the max_length

In [7]:
# Initialize models and tools
rouge = Rouge()
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

rouge_scores = []
bleu_scores = []
distilbert_scores = []

def evaluation(txt_path, num_samples, model, tokenizer):

    parsed_data = parse_demo_txt(txt_path, num_samples)

    for data in parsed_data:
        
        conversation_history = ""

        for human_part, gpt_part in data:
            # Simulate GPT generation as reusing gpt_part
            generated_gpt_response, conversation_history = generate_response(model, tokenizer, conversation_history, human_part, max_length=1024)
            reference_gpt_response = gpt_part  

            # ROUGE
            scores = rouge.get_scores(generated_gpt_response, reference_gpt_response, avg=True)
            rouge_scores.append(scores['rouge-1']['f'])
            
            # BLEU
            reference_tokens = [reference_gpt_response.split()]
            generated_tokens = generated_gpt_response.split()
            bleu_score = sentence_bleu(reference_tokens, generated_tokens, smoothing_function=SmoothingFunction().method1)
            bleu_scores.append(bleu_score)
            
            # DistilBERT
            gen_embed = sentence_model.encode([generated_gpt_response])
            ref_embed = sentence_model.encode([reference_gpt_response])
            cosine_sim = util.pytorch_cos_sim(gen_embed, ref_embed)
            distilbert_scores.append(cosine_sim.item())

    # Calculate averages
    avg_rouge_score = np.mean(rouge_scores)
    avg_bleu_score = np.mean(bleu_scores)
    avg_distilbert_score = np.mean(distilbert_scores)

    print(f"Average ROUGE-1 F1: {avg_rouge_score}")
    print(f"\nAverage BLEU: {avg_bleu_score}")
    print(f"\nAverage DistilBERT Cosine Similarity: {avg_distilbert_score}")


In [8]:
model = model.to('cuda')

evaluation(txt_path=TXT_DATA, num_samples=10, model=model, tokenizer=tokenizer)



Average ROUGE-1 F1: 0.3692968740147636
Average BLEU: 0.09851205911397291
Average DistilBERT Cosine Similarity: 0.805561407919853
