In [1]:
# GPT-2 Evaluation Notebook
!pip install transformers torch matplotlib seaborn tqdm scikit-learn pandas nltk rouge sentence-transformers

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import pandas as pd
from tqdm import tqdm
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
random.seed(42)
torch.manual_seed(42)

# Load dataset
with open('stats_problems_dataset.json', 'r') as f:
    dataset = json.load(f)

# Stratified sampling for 300-400 problems
df = pd.DataFrame(dataset)
sampled_dataset = df.groupby('difficulty', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 400 // df['difficulty'].nunique()), random_state=42)
).to_dict('records')

# Load sentence embeddings model for semantic comparison
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Evaluation metrics
def evaluate_answer(generated, correct):
    generated_embedding = embedder.encode([generated])
    correct_embedding = embedder.encode([correct])
    similarity = cosine_similarity(generated_embedding, correct_embedding)[0][0]
    return {"is_correct": similarity > 0.7, "similarity": similarity}

# Evaluate model
def evaluate_model(model, tokenizer, dataset, device):
    results = []
    for problem in tqdm(dataset):
        prompt = f"Please solve the following statistics problem. Provide a clear, step-by-step solution, including all necessary calculations:\n\nProblem: {problem['problem']}\n\nSolution:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)

        try:
            start_time = time.time()
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=512, num_return_sequences=1)
            generated_solution = tokenizer.decode(outputs[0], skip_special_tokens=True)
            end_time = time.time()

            evaluation_results = evaluate_answer(generated_solution, problem['solution']['conclusion'])
            results.append({
                "model_name": "gpt2",
                "topic": problem["topic"],
                "difficulty": problem["difficulty"],
                "generated_solution": generated_solution,
                "correct_solution": problem['solution']['conclusion'],
                "time_taken": end_time - start_time,
                **evaluation_results
            })
        except Exception as e:
            print(f"Error processing problem: {e}")

    return results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

# Evaluate model
results = evaluate_model(model, tokenizer, sampled_dataset, device)

# Save results to JSON
with open('gpt2_results.json', 'w') as f:
    json.dump(results, f)

print("Evaluation complete. Results saved to gpt2_results.json")
torch.cuda.empty_cache()


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rouge, sentence-transformers
Successfully installed rouge-1.0.1 sentence-transformers-3.0.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/320 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/320 [00:06<32:54,  6.19s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 2/320 [00:10<26:03,  4.92s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 3/320 [00:14<25:10,  4.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Error processing problem: 'conclusion'


  1%|▏         | 4/320 [00:19<24:17,  4.61s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 5/320 [00:23<23:10,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 6/320 [00:27<23:41,  4.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 7/320 [00:32<23:17,  4.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▎         | 8/320 [00:36<22:33,  4.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 9/320 [00:42<25:13,  4.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 10/320 [00:46<23:52,  4.62s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 11/320 [00:50<22:54,  4.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 12/320 [00:55<23:40,  4.61s/i

Error processing problem: 'conclusion'


 23%|██▎       | 73/320 [05:25<18:22,  4.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 23%|██▎       | 74/320 [05:29<17:50,  4.35s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 23%|██▎       | 75/320 [05:33<17:34,  4.30s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 24%|██▍       | 76/320 [05:38<18:25,  4.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 24%|██▍       | 77/320 [05:42<17:51,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 24%|██▍       | 78/320 [05:46<17:27,  4.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▍       | 79/320 [05:51<18:06,  4.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 80/320 [05:55<17:31,  4.38s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 81/320 [05:59<17:17,  4

Error processing problem: 'float' object is not subscriptable


 90%|█████████ | 288/320 [21:17<02:24,  4.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 90%|█████████ | 289/320 [21:21<02:16,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 91%|█████████ | 290/320 [21:25<02:11,  4.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 91%|█████████ | 291/320 [21:30<02:10,  4.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 91%|█████████▏| 292/320 [21:34<02:02,  4.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 293/320 [21:39<01:58,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 294/320 [21:43<01:55,  4.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 295/320 [21:47<01:48,  4.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▎| 296/320 [21:52<

TypeError: Object of type bool_ is not JSON serializable

In [2]:
import numpy as np
import json

def normalize_types(results):
    """Convert numpy types to Python native types."""
    for result in results:
        for key, value in result.items():
            if isinstance(value, np.generic):  # For numpy types
                result[key] = value.item()  # Convert to Python native type
    return results

# Normalize and save results without re-running the evaluation
normalized_results = normalize_types(results)

# Save results to JSON
with open('gpt2_results.json', 'w') as f:
    json.dump(normalized_results, f)

print("Results saved to gpt2_results.json successfully.")


Results saved to gpt2_results.json successfully.


In [2]:
!pip install rouge-score sentence-transformers nltk

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5fa6adcc6d62cae56fd4e5ec39eec3523f785ff72b9c2c11aa16daf4c60cc19f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, sentence-transformers
Successfully installed rouge-score-0.1.2 sentence-transformers-3.0.1


In [3]:
import json
import nltk
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util

# Load the JSON file
with open('gpt2_results (2).json', 'r') as file:
    data = json.load(file)

# Initialize models and tools
nltk.download('punkt')
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to calculate BLEU, ROUGE, and Embedding Similarity
def calculate_metrics(generated_solution, correct_solution):
    # BLEU score
    reference = [nltk.word_tokenize(correct_solution)]
    hypothesis = nltk.word_tokenize(generated_solution)
    bleu_score = sentence_bleu(reference, hypothesis)

    # ROUGE score
    rouge_scores = rouge_scorer.score(correct_solution, generated_solution)
    rouge_1 = rouge_scores['rouge1'].fmeasure
    rouge_2 = rouge_scores['rouge2'].fmeasure
    rouge_l = rouge_scores['rougeL'].fmeasure

    # Embedding similarity
    embedding_gen = embedding_model.encode(generated_solution, convert_to_tensor=True)
    embedding_corr = embedding_model.encode(correct_solution, convert_to_tensor=True)
    embedding_similarity = util.pytorch_cos_sim(embedding_gen, embedding_corr).item()

    return {
        'bleu_score': bleu_score,
        'rouge_1': rouge_1,
        'rouge_2': rouge_2,
        'rouge_l': rouge_l,
        'embedding_similarity': embedding_similarity
    }

# Process the dataset and calculate the metrics
for item in data:
    generated_solution = item['generated_solution']
    correct_solution = item['correct_solution']

    metrics = calculate_metrics(generated_solution, correct_solution)

    # Add the calculated metrics to each entry
    item['bleu_score'] = metrics['bleu_score']
    item['rouge_1'] = metrics['rouge_1']
    item['rouge_2'] = metrics['rouge_2']
    item['rouge_l'] = metrics['rouge_l']
    item['embedding_similarity'] = metrics['embedding_similarity']

# Save the updated JSON file
with open('gpt2_results_with_metrics.json', 'w') as outfile:
    json.dump(data, outfile, indent=4)

print("Metrics calculation completed and saved in gpt2_results_with_metrics.json")

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Metrics calculation completed and saved in gpt2_results_with_metrics.json


In [5]:
import json
import pandas as pd

# Load the JSON file
with open('gpt2_results_with_metrics.json', 'r') as file:
    data = json.load(file)

# Convert the JSON data into a DataFrame
df = pd.DataFrame(data)

# Calculate average metrics
avg_similarity = df['similarity'].mean()
avg_bleu = df['bleu_score'].mean()
avg_rouge1 = df['rouge_1'].mean()
avg_rouge2 = df['rouge_2'].mean()
avg_rouge_l = df['rouge_l'].mean()
avg_embedding_similarity = df['embedding_similarity'].mean()

# Print average metrics
print(f"Average Similarity: {avg_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1 Score: {avg_rouge1}")
print(f"Average ROUGE-2 Score: {avg_rouge2}")
print(f"Average ROUGE-L Score: {avg_rouge_l}")
print(f"Average Embedding Similarity: {avg_embedding_similarity}")

# Filter rows where the generated solution is correct
correct_solutions = df[df['is_correct'] == True]

# Display the filtered DataFrame
print(correct_solutions)

Average Similarity: 0.5971079451334589
Average BLEU Score: 0.008784097987714897
Average ROUGE-1 Score: 0.056645352999221556
Average ROUGE-2 Score: 0.0321646641989287
Average ROUGE-L Score: 0.05075769724884003
Average Embedding Similarity: 0.5971079442403294
    model_name                   topic    difficulty  \
2         gpt2                   ANOVA      advanced   
10        gpt2             Probability      advanced   
14        gpt2    Time Series Analysis      advanced   
16        gpt2    Time Series Analysis      advanced   
23        gpt2     Confidence Interval      advanced   
..         ...                     ...           ...   
297       gpt2             Probability  intermediate   
302       gpt2             Probability  intermediate   
309       gpt2  Descriptive Statistics  intermediate   
312       gpt2             Probability  intermediate   
316       gpt2      Hypothesis Testing  intermediate   

                                    generated_solution  \
2    Please