In [1]:
# Importing PyTorch, a deep learning framework
import torch

# Importing necessary components from the Hugging Face Transformers library
from transformers import (
    AutoModelForCausalLM,  # Model class for causal language modeling
    AutoTokenizer,         # Tokenizer class for auto-loading tokenizers
    GPT2Tokenizer,         # Tokenizer specific to GPT-2
    GPT2LMHeadModel,       # GPT-2 model class with a language modeling head
    BloomTokenizerFast,    # Tokenizer specific to Bloom model
    BloomForCausalLM       # Bloom model class for causal language modeling
)

# Importing tools for text vectorization and similarity measurement
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorizer for text feature extraction
from sklearn.metrics.pairwise import cosine_similarity  # Function to compute cosine similarity between vectors

# Importing text statistics and analysis tools
from textstat import flesch_reading_ease  # Function to calculate the Flesch Reading Ease score
from textblob import TextBlob  # Library for text processing and sentiment analysis

In [2]:
# Function to generate text from a model
def generate_text(model_name, prompt):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=500,
        num_return_sequences=1,
        repetition_penalty=1.2,
        temperature=0.7,
        top_k=50,
        top_p=0.9
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

#### `evaluate_fluency_with_perplexity` Function

This function evaluates the fluency of a given text using the perplexity metric. Perplexity is a measure of how well a language model predicts a sample and is often used to assess model performance.

In [3]:
def evaluate_fluency_with_perplexity(model_name, text):
    if 'gpt2' in model_name.lower():
        # Load the GPT-2 tokenizer and model
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        model = GPT2LMHeadModel.from_pretrained(model_name)
    elif 'bloom' in model_name.lower():
        # Load the Bloom tokenizer and model
        tokenizer = BloomTokenizerFast.from_pretrained(model_name)
        model = BloomForCausalLM.from_pretrained(model_name)
    else:
        raise ValueError(f"Model {model_name} not supported for perplexity evaluation.")

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = torch.exp(loss).item()

    return perplexity

### `analyze_tone` Function

This function analyzes the tone of a given text using sentiment analysis provided by the TextBlob library. The tone is classified into three categories: Positive, Negative, or Neutral based on the sentiment polarity.

In [4]:
# Analyze tone using TextBlob
def analyze_tone(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity < 0:
        return "Negative"
    else:
        return "Neutral"

### `evaluate_relevance`

Evaluates the relevance of a generated text with respect to a given prompt using TF-IDF and cosine similarity.

In [5]:
# Evaluate relevance
def evaluate_relevance(prompt, generated_text):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([prompt, generated_text])
    similarity = cosine_similarity(vectors[0:1], vectors[1:2])
    return similarity[0][0]

### `calculate_lexical_diversity`

Calculates the lexical diversity of a text, which is the ratio of unique words to the total number of words.

In [6]:
# Calculate lexical diversity
def calculate_lexical_diversity(text):
    tokens = text.split()
    types = set(tokens)
    return len(types) / len(tokens) if tokens else 0

### `evaluate_length`

Checks if the length of the text (in terms of word count) falls within a specified range.

In [7]:
# Check text length
def evaluate_length(text, min_length=50, max_length=500):
    length = len(text.split())
    return min_length <= length <= max_length

In [8]:
# Check originality (Placeholder function)
def check_originality(text):
    return "Originality placeholder"

In [9]:
# Fact-checking (Placeholder function)
def check_facts(text):
    return "Fact-checking placeholder"

In [10]:
# Detect bias (Placeholder function)
def detect_bias(text):
    return "Bias detection placeholder"

### `evaluate_topic_coverage`

Evaluates how well the text covers the provided keywords.

In [11]:
# Evaluate topic coverage
def evaluate_topic_coverage(text, keywords):
    covered_keywords = [kw for kw in keywords if kw in text]
    return len(covered_keywords) / len(keywords)

### `evaluate_readability`

Evaluates the readability of the text using the Flesch Reading Ease score.

In [12]:
# Evaluate readability
def evaluate_readability(text):
    return flesch_reading_ease(text)

In [13]:
def display_summary(summary_text):
    # Count the number of words in the text
    word_count = len(summary_text.split())

    # Display the text line by line
    for line in summary_text.split('. '):
        print(line.strip())

    print(f"\nNumber of words in the Text: {word_count}")

def evaluate_models(models, prompt, topic_keywords, model_names):
    results = {}
    for model_name in models:
        generated_text = generate_text(model_name, prompt)
        
        # Format the generated text
        formatted_text = "\n".join(generated_text.split("\n"))
        
        # Perform evaluations
        fluency_score = evaluate_fluency_with_perplexity(model_name, formatted_text)
        relevance_score = evaluate_relevance(prompt, formatted_text)
        diversity_score = calculate_lexical_diversity(formatted_text)
        length_ok = evaluate_length(formatted_text)
        originality_check = check_originality(formatted_text)
        fact_check = check_facts(formatted_text)
        bias_check = detect_bias(formatted_text)
        topic_coverage = evaluate_topic_coverage(formatted_text, topic_keywords)
        readability_score = evaluate_readability(formatted_text)
        tone_analysis = analyze_tone(formatted_text)
        
        # Store results
        results[model_name] = {
            "text": formatted_text,
            "fluency": fluency_score,
            "relevance": relevance_score,
            "diversity": diversity_score,
            "length_ok": length_ok,
            "originality": originality_check,
            "fact_check": fact_check,
            "bias": bias_check,
            "topic_coverage": topic_coverage,
            "readability": readability_score,
            "tone": tone_analysis
        }
        
        # Print the results
        print(f"Model: {model_name}\n")
        # Display the summary
        display_summary(formatted_text)
        print(f"\nFluency Score: {fluency_score}")
        print(f"Relevance Score: {relevance_score}")
        print(f"Diversity Score: {diversity_score}")
        print(f"Length OK: {length_ok}")
        print(f"Originality Check: {originality_check}")
        print(f"Fact Check: {fact_check}")
        print(f"Bias Check: {bias_check}")
        print(f"Topic Coverage: {topic_coverage}")
        print(f"Readability Score: {readability_score}")
        print(f"Tone Analysis: {tone_analysis}")
        print("\n" + "-" * 50 + "\n")
        
    return results

In [14]:
# Define your prompt and topic keywords we want to evaluate
model_names = ['gpt2', 'bigscience/bloom-560m']
models = {name: name for name in model_names} 
prompt = "Discuss the role of education in achieving Sustainable Development Goal 4 (Quality Education)."
topic_keywords = ["Quality Education", "Education", "SDG4", "Skill Development"]

# Evaluation
results = evaluate_models(models, prompt, topic_keywords, model_names)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: gpt2

Discuss the role of education in achieving Sustainable Development Goal 4 (Quality Education).
The following is a list of key issues that need to be addressed before we can achieve our goal
The first issue, which needs addressing right now and will become more important as time goes on, involves ensuring schools are equipped with appropriate training for students who have not yet completed their degree requirements or if they do so would require further study at an accredited university such Asperger's Institute where there may also be additional resources available including courses from other universities but this should only take place once you've been certified by your school authorities within three years after graduation into high quality teaching positions under guidance provided through local government departments like Schools Councils etc.
This means it must include all aspects of learning skills needed during these four-year periods - reading comprehension; writ

Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Model: bigscience/bloom-560m

Discuss the role of education in achieving Sustainable Development Goal 4 (Quality Education).

Number of words in the Text: 13

Fluency Score: 31.918127059936523
Relevance Score: 1.0
Diversity Score: 1.0
Length OK: False
Originality Check: Originality placeholder
Fact Check: Fact-checking placeholder
Bias Check: Bias detection placeholder
Topic Coverage: 0.5
Readability Score: 7.52
Tone Analysis: Neutral

--------------------------------------------------



In [15]:
prompt = "How can renewable energy contribute to the success of Sustainable Development Goal 7 (Affordable and Clean Energy)?"
topic_keywords = ["Affordable Energy", "Clean", "Renewable", "Energy", "Electricity","Solar", "Affordable", "Emission", "Gas"]
results = evaluate_models(models, prompt, topic_keywords, model_names)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: gpt2

How can renewable energy contribute to the success of Sustainable Development Goal 7 (Affordable and Clean Energy)?
The following is a list of some key points that are important for sustainable development
The goal states: "We must ensure all people have access to clean, affordable electricity." This means we need more power from renewables in order not only meet our needs but also provide an alternative source or fuel which will be cheaper than fossil fuels such as coal-fired plants"
In other words it's about getting enough solar panels on your roof so you don't burn too much carbon dioxide into them! It should help us get rid out of greenhouse gases by reducing emissions through cleaner sources like wind turbines – this would reduce CO2 levels at home while increasing efficiency with less pollution coming back down onto society via increased use of natural gas instead…

Number of words in the Text: 147

Fluency Score: 19.63784408569336
Relevance Score: 0.3895494956085688