In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import gensim
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
# --- Setup: Download necessary NLTK data ---
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download('punkt')


Downloading NLTK stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rynoc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rynoc\AppData\Roaming\nltk_data...


Downloading NLTK punkt tokenizer...


[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
# --- Constants ---
# Using a smaller sample for faster demonstration. Set to -1 to use the full dataset.
NUM_SAMPLES = 2000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")



Using device: cpu


In [4]:
# --- Data Loading and Preprocessing ---

def load_real_data(num_samples):
    """Loads the CNN/DailyMail dataset from Hugging Face."""
    print("\nLoading real CNN/DailyMail dataset...")
    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")
    if num_samples != -1:
        dataset = dataset.shuffle(seed=42).select(range(num_samples))
    df = pd.DataFrame(dataset)
    return df['article'].tolist()

def create_dummy_synthetic_data(real_data):
    """
    Creates a dummy synthetic DataFrame.
    Replace this function with code to load your actual gemma_df.
    """
    print("\nCreating a dummy synthetic dataset for demonstration...")
    # This dummy data is just slightly modified real data for the script to run.
    # In your use case, you will load your actual gemma_df.
    gemma_articles = [s.replace("CNN", "GMN") for s in real_data[:len(real_data)//2]]
    gemma_df = pd.DataFrame(gemma_articles, columns=['generated_article'])
    return gemma_df['generated_article'].tolist()

def preprocess_text(text_list):
    """
    Tokenizes, removes stopwords, punctuation, and converts to lowercase.
    Returns a list of lists of tokens.
    """
    print("Preprocessing text data...")
    stop_words = set(stopwords.words('english'))
    preprocessed_texts = []
    for text in text_list:
        if not isinstance(text, str):
            continue
        # Tokenize and convert to lower case
        tokens = word_tokenize(text.lower())
        # Filter out stopwords and punctuation
        filtered_tokens = [
            word for word in tokens
            if word.isalpha() and word not in stop_words and word not in string.punctuation
        ]
        preprocessed_texts.append(filtered_tokens)
    return preprocessed_texts



In [5]:
# --- 1. Fidelity Evaluation ---

def evaluate_topic_coherence(processed_texts):
    """
    Trains an LDA model and calculates topic coherence using Gensim's C_v.
    A higher score indicates more human-interpretable topics.
    """
    print("\n--- Evaluating Topic Coherence ---")
    if not processed_texts:
        print("Not enough data to evaluate topic coherence.")
        return 0.0

    dictionary = Dictionary(processed_texts)
    # Filter out extreme words (appears in <5 docs or >50% of docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    if not corpus:
        print("Corpus is empty after filtering. Cannot evaluate topic coherence.")
        return 0.0

    print("Training LDA model...")
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42)
    
    print("Calculating Coherence Score (C_v)...")
    coherence_model = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score (C_v): {coherence_score:.4f}")
    return coherence_score

def evaluate_statistical_properties(real_texts, synthetic_texts):
    """
    Compares basic statistical properties of the two text datasets.
    - Average length
    - Vocabulary Richness (Type-Token Ratio)
    """
    print("\n--- Evaluating Statistical Properties ---")
    
    # Real Data Stats
    real_lengths = [len(doc.split()) for doc in real_texts]
    real_tokens = [tok for doc in real_texts for tok in doc.lower().split()]
    real_ttr = len(set(real_tokens)) / len(real_tokens) if real_tokens else 0

    # Synthetic Data Stats
    synth_lengths = [len(doc.split()) for doc in synthetic_texts]
    synth_tokens = [tok for doc in synthetic_texts for tok in doc.lower().split()]
    synth_ttr = len(set(synth_tokens)) / len(synth_tokens) if synth_tokens else 0

    print("Average document length (words):")
    print(f"  Real data: {np.mean(real_lengths):.2f} (Std: {np.std(real_lengths):.2f})")
    print(f"  Synthetic data: {np.mean(synth_lengths):.2f} (Std: {np.std(synth_lengths):.2f})")
    print("\nVocabulary Richness (Type-Token Ratio):")
    print(f"  Real data: {real_ttr:.4f}")
    print(f"  Synthetic data: {synth_ttr:.4f}")


def evaluate_perplexity(text_list, model, tokenizer):
    """
    Calculates the average perplexity of a list of texts using a given model.
    Lower perplexity is better, indicating the model finds the text less "surprising".
    """
    total_neg_log_likelihood = 0
    total_tokens = 0

    for i, text in enumerate(text_list):
        if i % 100 == 0:
            print(f"  Calculating perplexity for document {i}/{len(text_list)}...")
        
        if not text or not isinstance(text, str):
            continue

        encodings = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True)
        input_ids = encodings.input_ids.to(DEVICE)
        
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            neg_log_likelihood = outputs.loss * input_ids.size(1)

        total_neg_log_likelihood += neg_log_likelihood.item()
        total_tokens += input_ids.size(1)

    if total_tokens == 0:
        return float('inf')
        
    avg_neg_log_likelihood = total_neg_log_likelihood / total_tokens
    perplexity = torch.exp(torch.tensor(avg_neg_log_likelihood))
    return perplexity.item()

def run_perplexity_evaluation(real_texts, synthetic_texts):
    """
    Wrapper to load model and run perplexity evaluation on both datasets.
    """
    print("\n--- Evaluating Perplexity (Cross-Entropy) ---")
    print("Loading pre-trained GPT-2 model...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2').to(DEVICE)
    model.eval()

    print("\nCalculating perplexity for REAL data...")
    ppl_real = evaluate_perplexity(real_texts, model, tokenizer)
    print(f"Average Perplexity on REAL data: {ppl_real:.4f}")

    print("\nCalculating perplexity for SYNTHETIC data...")
    ppl_synth = evaluate_perplexity(synthetic_texts, model, tokenizer)
    print(f"Average Perplexity on SYNTHETIC data: {ppl_synth:.4f}")



In [6]:
# --- 2. Utility Evaluation ---

def evaluate_downstream_task(real_texts, synthetic_texts):
    """
    Trains a classifier to distinguish real from synthetic data.
    Accuracy close to 50% means the synthetic data is high quality.
    """
    print("\n--- Evaluating Utility: Real vs. Fake Classification ---")
    
    # Create labels: 0 for real, 1 for synthetic
    data = real_texts + synthetic_texts
    labels = [0] * len(real_texts) + [1] * len(synthetic_texts)

    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.3, random_state=42, stratify=labels
    )
    
    print("Vectorizing text with TF-IDF...")
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    print("Training Logistic Regression classifier...")
    classifier = LogisticRegression(random_state=42)
    classifier.fit(X_train_tfidf, y_train)
    
    print("Evaluating classifier performance...")
    y_pred = classifier.predict(X_test_tfidf)
    
    print("Classification Report (0=Real, 1=Synthetic):")
    print(classification_report(y_test, y_pred))
    print("Interpretation: An accuracy score close to 0.5 indicates that the model")
    print("struggles to distinguish synthetic data from real data, which is a sign of high quality.")



In [7]:
# --- Main Execution ---
if _name_ == "_main_":
    # --- Load Data ---
    real_articles = load_real_data(num_samples=NUM_SAMPLES)
    
    # IMPORTANT: Replace this with your actual data loading logic
    # For example:
    # gemma_df = pd.read_csv('path/to/your/gemma_data.csv')
    # synthetic_articles = gemma_df['generated_article'].tolist()
    synthetic_articles = create_dummy_synthetic_data(real_articles)
    
    # --- Preprocess Data for Topic Modeling ---
    processed_real = preprocess_text(real_articles)
    processed_synthetic = preprocess_text(synthetic_articles)

    # --- Run Fidelity Evaluations ---
    print("\n" + "="*20 + " FIDELITY EVALUATION " + "="*20)
    evaluate_topic_coherence(processed_real)
    evaluate_topic_coherence(processed_synthetic)
    evaluate_statistical_properties(real_articles, synthetic_articles)
    run_perplexity_evaluation(real_articles, synthetic_articles)

    # --- Run Utility Evaluation ---
    print("\n" + "="*20 + " UTILITY EVALUATION " + "="*20)
    evaluate_downstream_task(real_articles, synthetic_articles)
    
    print("\nEvaluation complete.")

NameError: name '_name_' is not defined