###Installing Libraries

In [1]:
pip install pandas numpy nltk rouge-score sentence-transformers transformers torch bert-score matplotlib

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12

In [4]:
import nltk;
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

###Creating Taxonomy List

In [6]:
import pandas as pd
from collections import Counter
import os
import networkx as nx

# Load dataset to compute skill frequencies
df_freq = pd.read_csv('/content/drive/MyDrive/EduVLM/gsm8k_wrong_answers_with_missing_prerequisites_enhanced.csv')

# Skill taxonomy and synonyms
skill_taxonomy = {
    'addition': [], 'subtraction': ['addition'], 'multiplication': ['addition'],
    'division': ['multiplication', 'addition'], 'fraction': ['division', 'multiplication', 'addition'],
    'percentage': ['multiplication', 'division', 'fraction'], 'percentages': ['multiplication', 'division', 'fraction'],
    'algebraic thinking': ['multiplication', 'addition', 'subtraction'], 'ratios': ['fraction', 'division', 'multiplication'],
    'geometry': ['measurement', 'addition', 'angles'], 'measurement': ['addition', 'number sense'],
    'algebra': ['algebraic thinking', 'multiplication', 'subtraction'], 'proportion': ['fraction', 'division', 'ratios'],
    'rate': ['fraction', 'division', 'ratios'], 'area': ['multiplication', 'measurement'],
    'volume': ['multiplication', 'measurement', 'area'], 'proportional': ['ratios', 'fraction', 'proportion'],
    'angles': ['geometry', 'measurement'], 'number sense': ['addition', 'counting'], 'counting': [], 'unknown': []
}

math_synonyms = {
    'addition': ['add', 'sum', 'plus', 'total', 'combine', 'addition operation', 'summation'],
    'subtraction': ['subtract', 'minus', 'difference', 'take away', 'reduce', 'subtraction operation', 'deduct'],
    'multiplication': ['multiply', 'times', 'product', 'repeated addition', 'multiplication operation', 'scale'],
    'division': ['divide', 'quotient', 'share', 'split', 'division operation', 'partitive'],
    'fraction': ['ratio', 'proportion', 'frac', 'part', 'division', 'fractional', 'proper fraction'],
    'percentage': ['percent', 'rate', 'pct', 'percentages', '%', 'percentile', 'per cent'],
    'percentages': ['percent', 'rate', 'pct', 'percentage', '%', 'percentile', 'per cent'],
    'algebraic thinking': ['algebra', 'equations', 'variables', 'unknowns', 'algebraic', 'linear equations'],
    'ratios': ['proportion', 'rate', 'ratio', 'fraction', 'scale', 'proportional', 'relative'],
    'geometry': ['shapes', 'measurement', 'area', 'volume', 'spatial', 'geometric', 'angles'],
    'measurement': ['measure', 'length', 'area', 'volume', 'units', 'dimension', 'metric'],
    'algebra': ['equations', 'variables', 'algebraic thinking', 'algebraic expressions', 'polynomials'],
    'proportion': ['ratio', 'fraction', 'rate', 'proportional', 'scaling'],
    'rate': ['proportion', 'ratio', 'speed', 'frequency', 'rate of change'],
    'area': ['surface area', 'square units', 'multiplication', 'measurement'],
    'volume': ['cubic units', 'multiplication', 'measurement', 'capacity'],
    'proportional': ['ratios', 'fraction', 'proportion', 'scaling'],
    'angles': ['geometry', 'measurement'],
    'number sense': ['counting', 'numeracy', 'number operations'],
    'computational error': ['computational errors', 'calculation mistakes', 'wrong operations']
}

# Normalize skill function
def normalize_skill(skill, synonyms):
    skill = str(skill).lower().strip()
    for canonical, syn_list in synonyms.items():
        if skill == canonical or skill in [s.lower() for s in syn_list]:
            return canonical
    if any(x in skill for x in ['$', '1/', 'computational', 'incorrect', 'mistake', 'error']):
        return 'computational error'
    return 'unknown'

# Extract skills for IC
def extract_skills(row):
    skills = str(row.get('missing_prerequisites', '')).lower().split()
    return [skill for skill in skills if skill in skill_taxonomy]

# Compute Information Content (IC)
skill_counts = Counter()
for index, row in df_freq.iterrows():
    skill_counts.update(extract_skills(row))

total_skills = sum(skill_counts.values())
if total_skills > 0:
    ic = {skill: -np.log(skill_counts[skill] / total_skills) if skill_counts[skill] > 0 else 0 for skill in skill_taxonomy}
else:
    ic = {skill: 0 for skill in skill_taxonomy}

# Build taxonomic graph
G = nx.DiGraph()
for skill, prerequisites in skill_taxonomy.items():
    G.add_node(skill, ic=ic[skill])
    for prereq in prerequisites:
        G.add_edge(prereq, skill)

# Function to find Least Common Ancestor (LCA)
def find_lca(graph, skill1, skill2):
    if skill1 not in graph or skill2 not in graph:
        return None
    ancestors1 = nx.ancestors(graph, skill1) | {skill1}
    ancestors2 = nx.ancestors(graph, skill2) | {skill2}
    common_ancestors = ancestors1 & ancestors2
    if not common_ancestors:
        return None
    return max(common_ancestors, key=lambda x: graph.nodes[x]['ic'])

# File paths
model_results = [
    '/content/drive/MyDrive/EduVLM/internvl3_8b_results.csv',
    '/content/drive/MyDrive/EduVLM/gemma3_12b_results_v2.csv',
    '/content/drive/MyDrive/EduVLM/qwen_2.5_vl_7b_results_v2.csv',
    '/content/drive/MyDrive/EduVLM/smolvlm2_500m_results_v2.csv'
]

###Pass@K Metrics

In [7]:
# Pass@1
def compute_pass_at_k(df, k=1):
    if k == 1:
        if 'is_correct' not in df.columns:
            raise ValueError("Column 'is_correct' not found")
        if not df['is_correct'].isin([0, 1, 0.0, 1.0]).all():
            print(f"Warning: Non-binary values in 'is_correct' for {file_path}: {df['is_correct'].unique()}")
        return df['is_correct'].mean() * 100  # Output as percentage
    else:
        raise ValueError("Only Pass@1 is supported in this version.")

# Evaluate Pass@1 for results
results_pass_at_1 = []
for file_path in model_results:
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            model_name = os.path.basename(file_path)
            pass_at_1 = compute_pass_at_k(df, k=1)
            results_pass_at_1.append({'Model': model_name, 'Pass@1 (%)': pass_at_1})
            print(f"Processed {model_name}: Pass@1 = {pass_at_1:.1f}%")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

Processed internvl3_8b_results.csv: Pass@1 = 19.0%
Processed gemma3_12b_results_v2.csv: Pass@1 = 35.5%
Processed qwen_2.5_vl_7b_results_v2.csv: Pass@1 = 38.5%
Processed smolvlm2_500m_results_v2.csv: Pass@1 = 13.5%


###METEOR Score Metric

In [19]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import pandas as pd
import os





# Normalize skills
def normalize_skill(skill, synonyms):
    skill = str(skill).lower().strip()
    for canonical, syn_list in synonyms.items():
        if skill == canonical or skill in [s.lower() for s in syn_list]:
            return canonical
    return 'unknown'

# Compute METEOR score for a single prediction
def compute_meteor_score(ground_truth, predicted, synonyms):
    try:
        gt = normalize_skill(ground_truth, synonyms)
        pred = normalize_skill(predicted, synonyms)
        gt_tokens = word_tokenize(gt)
        pred_tokens = word_tokenize(pred)
        references = [gt_tokens] + [word_tokenize(syn) for syn in synonyms.get(gt, []) if syn]
        if not references or not pred_tokens:
            return 0.0
        score = float(meteor_score(references, pred_tokens))  # Explicit float conversion
        return score
    except Exception as e:
        print(f"METEOR error for gt='{ground_truth}', pred='{predicted}': {e}")
        return 0.0

# Evaluation function for METEOR Score
def evaluate_meteor_score(df, model_name):
    try:
        if 'ground_truth' not in df.columns or 'predicted' not in df.columns:
            raise ValueError("DataFrame must contain 'ground_truth' and 'predicted' columns")

        df = df.dropna(subset=['ground_truth', 'predicted'])  # Drop rows with missing values
        meteor_scores = []

        for _, row in df.iterrows():
            ground_truth = str(row['ground_truth']).lower()
            predicted = str(row['predicted']).lower()
            meteor = compute_meteor_score(ground_truth, predicted, math_synonyms)
            meteor_scores.append(meteor)

        avg_meteor = np.mean(meteor_scores) if meteor_scores else 0.0
        return {'Model': model_name, 'Average METEOR Score': avg_meteor}
    except Exception as e:
        print(f"Error computing METEOR score for {model_name}: {e}")
        return {'Model': model_name, 'Average METEOR Score': np.nan}


# Evaluate METEOR Score for all models
results_meteor_score = []
for file_path in model_results:
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        model_name = os.path.basename(file_path)
        print(f"\nEvaluation for {model_name}:")
        metrics = evaluate_meteor_score(df, model_name)
        results_meteor_score.append(metrics)
        for metric, value in metrics.items():
            if isinstance(value, (int, float)):  # Apply .3f only to numbers
                print(f"{metric}: {value:.3f}")
            else:  # Print strings as-is
                print(f"{metric}: {value}")
    else:
        print(f"\nFile not found: {file_path}")


Evaluation for internvl3_8b_results.csv:
Model: internvl3_8b_results.csv
Average METEOR Score: 0.345

Evaluation for gemma3_12b_results_v2.csv:
Model: gemma3_12b_results_v2.csv
Average METEOR Score: 0.255

Evaluation for qwen_2.5_vl_7b_results_v2.csv:
Model: qwen_2.5_vl_7b_results_v2.csv
Average METEOR Score: 0.260

Evaluation for smolvlm2_500m_results_v2.csv:
Model: smolvlm2_500m_results_v2.csv
Average METEOR Score: 0.227


###BERT Score Metric

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
# BERT Score
def compute_bert_score(df):
    try:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        model = AutoModel.from_pretrained('bert-base-uncased')
        model.eval()

        def get_bert_embedding(text):
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

        ground_truth = df['ground_truth'].apply(str)
        predicted = df['predicted'].apply(str)
        scores = []
        for gt, pred in zip(ground_truth, predicted):
            gt_emb = get_bert_embedding(gt)
            pred_emb = get_bert_embedding(pred)
            cosine_sim = np.dot(gt_emb, pred_emb) / (np.linalg.norm(gt_emb) * np.linalg.norm(pred_emb) + 1e-8)
            scores.append(cosine_sim)
        return np.mean(scores)
    except Exception as e:
        print(f"Error computing BERT score: {str(e)}")
        return np.nan

# Evaluate BERT Score for results
results_bert_score = []
for file_path in model_results:
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            model_name = os.path.basename(file_path)
            bert_score = compute_bert_score(df)
            results_bert_score.append({'Model': model_name, 'BERT Score': bert_score})
            print(f"Processed {model_name}: BERT Score = {bert_score:.3f}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

Processed internvl3_8b_results.csv: BERT Score = 0.815
Processed gemma3_12b_results_v2.csv: BERT Score = 0.904
Processed qwen_2.5_vl_7b_results_v2.csv: BERT Score = 0.902
Processed smolvlm2_500m_results_v2.csv: BERT Score = 0.812


###BFS-based Taxonomic Distance Metric

In [11]:
# BFS-based Taxonomic Distance
def compute_taxonomic_distance(df, taxonomy):
    def get_distance(skill1, skill2, taxonomy):
        if skill1 == skill2:
            return 0
        if skill1 not in taxonomy or skill2 not in taxonomy:
            return 3  # Default distance for unknown skills
        visited = set()
        queue = [(skill1, 0)]
        while queue:
            skill, dist = queue.pop(0)
            if skill == skill2:
                return dist
            if skill not in visited:
                visited.add(skill)
                for neighbor in taxonomy.get(skill, []):
                    queue.append((neighbor, dist + 1))
        return 3  # Max distance if no path found

    ground_truth = df['ground_truth'].apply(lambda x: normalize_skill(str(x), math_synonyms))
    predicted = df['predicted'].apply(lambda x: normalize_skill(str(x), math_synonyms))
    distances = [get_distance(gt, pred, taxonomy) for gt, pred in zip(ground_truth, predicted)]
    return np.mean(distances)

# Evaluate BFS-based Taxonomic Distance for results
results_taxonomic_distance = []
for file_path in model_results:
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            model_name = os.path.basename(file_path)
            taxonomic_distance = compute_taxonomic_distance(df, skill_taxonomy)
            results_taxonomic_distance.append({'Model': model_name, 'Average Taxonomic Distance': taxonomic_distance})
            print(f"Processed {model_name}: ATD = {taxonomic_distance:.3f}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

Processed internvl3_8b_results.csv: ATD = 0.930
Processed gemma3_12b_results_v2.csv: ATD = 1.500
Processed qwen_2.5_vl_7b_results_v2.csv: ATD = 1.490
Processed smolvlm2_500m_results_v2.csv: ATD = 2.325


###Resnik-based Weighted Taxonomic Distance Metric

In [12]:
# Resnik-based Weighted Taxonomic Distance
def compute_weighted_taxonomic_distance(df, graph):
    def get_distance(skill1, skill2, graph):
        if skill1 == skill2:
            return 0.0
        lca = find_lca(graph, skill1, skill2)
        if lca is None:
            return 1.0  # Max distance if no LCA (normalized to 1)
        ic_lca = graph.nodes[lca]['ic']
        similarity = ic_lca
        max_ic = max(ic.values()) if ic.values() else 1.0
        distance = 1.0 - (similarity / max_ic) if max_ic > 0 else 1.0
        return distance

    ground_truth = df['ground_truth'].apply(lambda x: normalize_skill(str(x), math_synonyms))
    predicted = df['predicted'].apply(lambda x: normalize_skill(str(x), math_synonyms))
    distances = [get_distance(gt, pred, graph) for gt, pred in zip(ground_truth, predicted) if gt in graph and pred in graph]
    return np.mean(distances) if distances else 1.0

# Evaluate Resnik-based Weighted Taxonomic Distance for results
results_weighted_taxonomic_distance = []
for file_path in model_results:
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            model_name = os.path.basename(file_path)
            weighted_atd = compute_weighted_taxonomic_distance(df, G)
            results_weighted_taxonomic_distance.append({'Model': model_name, 'Weighted Taxonomic Distance': weighted_atd})
            print(f"Processed {model_name}: Weighted Taxonomic Distance = {weighted_atd:.3f}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

Processed internvl3_8b_results.csv: Weighted Taxonomic Distance = 0.310
Processed gemma3_12b_results_v2.csv: Weighted Taxonomic Distance = 0.508
Processed qwen_2.5_vl_7b_results_v2.csv: Weighted Taxonomic Distance = 0.497
Processed smolvlm2_500m_results_v2.csv: Weighted Taxonomic Distance = 0.606
