In [None]:
!pip install transformers datasets torch tqdm

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import math
from tqdm import tqdm

In [2]:
model_name = "nqzfaizal77ai/qlgpt-m-en-1bc-exp"

In [None]:
model_name = "nqzfaizal77ai/qlpt-m-en-1bc-exp2"

In [2]:
model_name = "EleutherAI/gpt-neo-125m"

In [15]:
model_name = "facebook/opt-350m"

In [23]:
model_name = "openai-community/gpt2-medium"

In [6]:
model_name = "HuggingFaceTB/SmolLM2-360M"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
dataset = load_dataset("mikasenghaas/wikitext-2", split="test")

Perplexity <br>
Note : Lower Better <br>
 Perplexity is a measure of how well a probability distribution or probability model predicts a sample. In the context of language models, it quantifies how surprised the model is by new data. A lower perplexity score indicates that the model is less "perplexed" by the data,

In [None]:
def evaluate(model, dataset, tokenizer, max_length=1024):
    device = next(model.parameters()).device
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(dataset):
        input_ids = tokenizer.encode(example['text'], return_tensors='pt').to(device)
        if input_ids.size(1) < 2:
            continue  # skip very short sequences

        # Truncate to max length
        for i in range(0, input_ids.size(1), max_length):
            input_chunk = input_ids[:, i:i+max_length]
            if input_chunk.size(1) < 2:
                continue

            with torch.no_grad():
                outputs = model(input_chunk, labels=input_chunk)
                loss = outputs.loss
                total_loss += loss.item() * input_chunk.size(1)
                total_tokens += input_chunk.size(1)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity

In [None]:
perplexity = evaluate(model, dataset, tokenizer)
print(f"Perplexity: {perplexity:.2f}")

Self-Bleu <br>
Note : Lower Better <br>
Lower Self-BLEU = Higher Diversity: If the score is low, it means that when one generated sentence is compared to another, their similarity (as measured by BLEU) is low. This indicates that the sentences are distinct, reflecting greater diversity in the model's output.

In [None]:
!pip install evaluate

In [27]:
from evaluate import load
import torch

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Generate multiple samples from a prompt
def generate_samples(prompt, num_samples=50, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        max_length=max_length,
        num_return_sequences=num_samples
    )
    return [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]

# Self-BLEU with BLEU scorer
def compute_self_bleu(sentences):
    bleu = load("bleu")
    scores = []
    for i in range(len(sentences)):
        references = sentences[:i] + sentences[i+1:]
        result = bleu.compute(predictions=[sentences[i]], references=[references])
        scores.append(result["bleu"])
    return sum(scores) / len(scores)

# Run
prompt = "Hacking drones swarmed the satellite"
samples = generate_samples(prompt)
self_bleu_score = compute_self_bleu(samples)
print(f"Self-BLEU: {self_bleu_score:.4f} (Lower = More Diverse)")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Self-BLEU: 0.1784 (Lower = More Diverse)


Sklearn Topic Consistency <br>
Note : Higher is Better <br>
What a higher score means: A higher topic_similarity score (closer to 1) indicates that the generated texts (generations) are, on average, more semantically similar to the original prompt. This implies that the model generating these texts has successfully stayed on topic and produced content that aligns with the initial subject matter. <br>
What a lower score means: A lower topic_similarity score (closer to 0, or even negative if using different similarity metrics, though cosine similarity for TF-IDF is usually non-negative) indicates that the generated texts are less semantically similar to the prompt. This suggests that the model might have drifted off-topic, generated irrelevant content, or misinterpreted the prompt's intent.

In [None]:
!pip install scikit-learn

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def topic_similarity(prompt, generations):
    vectorizer = TfidfVectorizer(stop_words="english")
    vectors = vectorizer.fit_transform([prompt] + generations)
    similarities = cosine_similarity(vectors[0:1], vectors[1:])[0]
    return similarities.mean()

# Example usage
similarity_score = topic_similarity(prompt, samples)
print(f"Topic Similarity (TF-IDF Cosine): {similarity_score:.4f}")

Topic Similarity (TF-IDF Cosine): 0.1001


Sentence Tranformers Topic Consistency <br>
Note : Higher Better
- What a higher score means: A higher `bert_topic_consistency` score (closer to 1.0) signifies that the generated texts (`generations`) are, on average, more semantically aligned and coherent with the original `prompt`. This indicates that the language model producing the generations has successfully understood the prompt's underlying topic and generated relevant content that stays "on message."
    - Cosine Similarity Range: Cosine similarity ranges from -1 to 1.
        - 1: Perfect similarity (vectors point in the exact same direction).
        - 0: No similarity (vectors are orthogonal).
        - -1: Perfect dissimilarity (vectors point in opposite directions).
    - In the context of sentence embeddings, scores typically range from 0 to 1 for meaningful similarities, as embeddings are often designed to capture positive relatedness.
- What a lower score means: A lower `bert_topic_consistency` score (closer to 0) suggests that the generated texts are less semantically related to the `prompt`. This could mean the model drifted off-topic, produced irrelevant information, or fundamentally misunderstood the intent of the prompt.


In [None]:
!pip install -U sentence-transformers

In [30]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def bert_topic_consistency(prompt, generations):
    prompt_emb = model.encode(prompt, convert_to_tensor=True)
    gen_embs = model.encode(generations, convert_to_tensor=True)
    scores = util.cos_sim(prompt_emb, gen_embs)
    return scores.mean().item()

bert_score = bert_topic_consistency(prompt, samples)
print(f"BERT Topic Consistency: {bert_score:.4f}")

BERT Topic Consistency: 0.6798


Comet Coherence Scoring <br>
Note : Higher is better for COMET scores <br>
- COMET is a quality estimation metric: COMET is designed to assess the quality of machine-generated text (in this case, the `generations` from your model) against human-written `references`. It predicts how good a translation or text generation is.
- Scores indicate similarity and quality: A higher COMET score indicates that the generated text is more similar to the reference text and is considered to be of higher quality. Conversely, a lower score suggests less similarity and lower quality.
- Ranges: While the exact range can vary slightly depending on the specific COMET model, scores typically fall within a range where higher values (e.g., closer to 1 or 100, depending on scaling) represent better quality. The `wmt22-cometkiwi-da` model typically outputs scores that can range from negative values up to 1, where higher values are better.

In [None]:
!pip install unbabel-comet

In [None]:
!pip install huggingface-hub

In [None]:
from huggingface_hub import login
login("hf_token")

In [None]:
from comet import download_model, load_from_checkpoint
import torch

# Load COMET model
try:
    comet_model_path = download_model("Unbabel/wmt22-cometkiwi-da")
    comet_model = load_from_checkpoint(comet_model_path)
    print("COMET model loaded successfully!")
except Exception as e:
    print(f"Failed to load COMET model: {e}")
    raise

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate_text(prompt, max_length=50, temperature=0.7):
    """Generate text from the model with more controlled parameters"""
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **input_ids,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        num_return_sequences=1
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Evaluation function using COMET
def evaluate_with_comet(prompts, references):
    """
    Evaluate model generations against references using COMET

    Args:
        prompts: List of prompt strings
        references: List of reference strings (one per prompt)

    Returns:
        Dictionary with COMET scores and analysis
    """
    if not isinstance(prompts, list):
        prompts = [prompts]
    if not isinstance(references, list):
        references = [references]

    # Generate outputs
    generations = [generate_text(prompt) for prompt in prompts]

    # Prepare COMET input format
    comet_data = [{"src": p, "mt": g, "ref": r}
                 for p, g, r in zip(prompts, generations, references)]

    # Get COMET scores
    try:
        comet_output = comet_model.predict(
            comet_data,
            batch_size=8,
            gpus=1 if torch.cuda.is_available() else 0
        )

        # Extract scores from the output (newer COMET versions return a dict)
        if isinstance(comet_output, dict):
            comet_scores = comet_output['scores']
        else:
            comet_scores = comet_output

        return {
            "scores": comet_scores,
            "generations": generations,
            "average_score": sum(comet_scores) / len(comet_scores)
        }
    except Exception as e:
        print(f"Error in COMET evaluation: {e}")
        return None

test_cases = [

    # === Politics ===
    {
        "prompt": "The government passed a new policy regarding",
        "reference": "The government passed a new policy regarding national security to improve border control and intelligence coordination."
    },
    {
        "prompt": "Parliament debated a bill on",
        "reference": "Parliament debated a bill on climate change aimed at reducing carbon emissions by 40% over the next decade."
    },
    {
        "prompt": "A constitutional amendment was proposed to",
        "reference": "A constitutional amendment was proposed to lower the voting age to 16 and increase civic engagement among youth."
    },

    # === Military ===
    {
        "prompt": "The army launched an operation in the",
        "reference": "The army launched an operation in the northern valley to eliminate remaining insurgent strongholds."
    },
    {
        "prompt": "Fighter jets conducted airstrikes over",
        "reference": "Fighter jets conducted airstrikes over the mountainous region, targeting weapons caches and militant camps."
    },
    {
        "prompt": "Soldiers were deployed to",
        "reference": "Soldiers were deployed to the conflict zone to restore peace and ensure the safety of civilians."
    },

    # === Healthcare ===
    {
        "prompt": "Doctors discovered a new treatment for",
        "reference": "Doctors discovered a new treatment for Alzheimer's that may significantly slow cognitive decline in early stages."
    },
    {
        "prompt": "Hospitals reported a rise in cases of",
        "reference": "Hospitals reported a rise in cases of dengue fever due to increased mosquito activity after the rainy season."
    },
    {
        "prompt": "A new vaccine was developed to fight",
        "reference": "A new vaccine was developed to fight the latest strain of influenza, offering broader protection for high-risk groups."
    },

    # === Islamic ===
    {
        "prompt": "Muslims around the world celebrated",
        "reference": "Muslims around the world celebrated Eid al-Fitr with prayers, feasts, and acts of charity after a month of fasting."
    },
    {
        "prompt": "The mosque hosted a sermon about",
        "reference": "The mosque hosted a sermon about forgiveness and unity during Friday prayers attended by hundreds."
    },
    {
        "prompt": "Ramadan began with",
        "reference": "Ramadan began with the sighting of the crescent moon and the call to the first night prayer."
    },

    # === Economy ===
    {
        "prompt": "The stock market showed signs of",
        "reference": "The stock market showed signs of recovery after the release of strong quarterly earnings reports."
    },
    {
        "prompt": "The central bank raised interest rates to",
        "reference": "The central bank raised interest rates to combat inflation and stabilize the national currency."
    },
    {
        "prompt": "Unemployment rates dropped due to",
        "reference": "Unemployment rates dropped due to increased hiring in the technology and service sectors."
    }
]

num_test_cases = len(test_cases)
categories = {
    "Politics": range(0, min(3, num_test_cases)),
    "Military": range(3, min(6, num_test_cases)),
    "Healthcare": range(6, min(9, num_test_cases)),
    "Islamic": range(9, min(12, num_test_cases)),
    "Economy": range(12, min(15, num_test_cases))
}

# Filter out empty categories
categories = {k: v for k, v in categories.items() if len(v) > 0}

print(f"\nEvaluating {num_test_cases} test cases across {len(categories)} categories")

# Run evaluation
results = evaluate_with_comet(
    prompts=[tc["prompt"] for tc in test_cases],
    references=[tc["reference"] for tc in test_cases]
)

# Print results
if results:
    print("\n=== COMET Evaluation Results ===")
    print(f"Overall Average Score: {results['average_score']:.4f}")

    # Calculate and print category averages if we have categories
    if categories:
        print("\nCategory Averages:")
        for category_name, indices in categories.items():
            try:
                category_scores = [results['scores'][i] for i in indices]
                avg_score = sum(category_scores) / len(category_scores)
                print(f"{category_name}: {avg_score:.4f}")
            except IndexError:
                print(f"Warning: Couldn't calculate score for {category_name} - index out of range")

    # Print detailed results
    print("\nDetailed Results:")
    for i, (prompt, gen, ref, score) in enumerate(zip(
        [tc["prompt"] for tc in test_cases],
        results["generations"],
        [tc["reference"] for tc in test_cases],
        results["scores"]
    )):
        # Find which category this test case belongs to
        category = next((cat for cat, indices in categories.items() if i in indices), "Unknown")

        print(f"\nTest Case {i+1} ({category}):")
        print(f"Prompt:    {prompt}")
        print(f"Generated: {gen}")
        print(f"Reference: {ref}")
        print(f"COMET Score: {score:.4f}")
else:
    print("Evaluation failed.")