## Read in LLM Grades and Ground Truth

In [None]:
import os
import pandas as pd

folder_path = "llm-scores"

# Read every Excel workbook in the folder into its own DataFrame.
#  * sorted(): os.listdir returns names in arbitrary order; a fixed order makes
#    the concatenated frame reproducible from run to run.
#  * "~$" prefix: lock files Excel leaves next to an open workbook also end in
#    ".xlsx" but cannot be parsed by read_excel, so they are skipped.
dataframes = []
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue
    file_path = os.path.join(folder_path, filename)
    dataframes.append(pd.read_excel(file_path))

# pd.concat would raise a ValueError on an empty list anyway; say why instead.
if not dataframes:
    raise ValueError(f"No .xlsx files found in folder {folder_path!r}")

# Concatenate all the DataFrames
df = pd.concat(dataframes, ignore_index=True)

df2 = df[['Transaction_Id', 'GPT_4_turbo_predict', 'GPT_4_turbo_explain', 'GPT_4o_explain', 'GPT_4o_predict']].copy()

# One grade per model and row: take the "predict" value where it is present,
# otherwise fall back to the "explain" value.
df2['GPT_4_turbo'] = df2['GPT_4_turbo_predict'].combine_first(df2['GPT_4_turbo_explain'])
df2['GPT_4o'] = df2['GPT_4o_predict'].combine_first(df2['GPT_4o_explain'])

# Drop the original predict and explain columns (optional)
df3 = df2.drop(columns=['GPT_4_turbo_predict', 'GPT_4_turbo_explain', 'GPT_4o_predict', 'GPT_4o_explain'])

# Transaction_Id -> grade lookups (a dict keeps a single value per id).
gpt_4_turbo_dict = df3.set_index('Transaction_Id')['GPT_4_turbo'].to_dict()
gpt_4o_dict = df3.set_index('Transaction_Id')['GPT_4o'].to_dict()

In [None]:
import pandas as pd
import time
from tqdm import tqdm

pd.set_option('display.max_columns', None)


def _has_value(cell):
    """True when *cell* is not a missing value according to pd.isna."""
    return not pd.isna(cell)


def _problem_type(problem_name):
    """Return 'predicted' if the name contains 'What' or 'what would you say', else 'explained'."""
    if 'What' in problem_name or 'what would you say' in problem_name:
        return 'predicted'
    return 'explained'


df = pd.read_csv('All_Data_Advocacy_Lessons_LAK25 - All Data - Revised.csv', low_memory=False)

# Keep final attempts only, then only the rows that have a human score.
last_attempt_mask = df['Is Last Attempt'] == 1
df = df[last_attempt_mask].copy()
scored_mask = df['Open_response_score_human_truth'].map(_has_value)
df = df[scored_mask].copy()

# Derive the response type from the problem name.
df['Type'] = df['Problem Name'].map(_problem_type)

# Columns carried forward into the modelling frame.
kept_columns = [
    'Transaction_Id',
    'Level_Level2_corrected',
    'Type',
    'Level (Lesson)',
    'Input',
    'Open_response_score_human_truth',
]
df = df[kept_columns].copy()

data = df.copy()

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm


from sklearn.metrics import f1_score, roc_auc_score, cohen_kappa_score

# Group key "<Type> - <Lesson>" for every row, plus the distinct keys in order of first appearance.
type_and_lesson = data['Type'] + " - " + data['Level (Lesson)']
data['combination'] = type_and_lesson
unique_combinations = data['combination'].unique()

## Descriptives

In [None]:
# Number of rows in `data` for each distinct 'Level (Lesson)' value.
data['Level (Lesson)'].value_counts()

In [None]:
# Independent copy of the rows belonging to the two lessons listed here.
iraise_lessons = ['Helping Students Manage Inequity', 'Avoiding Unconscious Assumptions']
in_iraise_lessons = data['Level (Lesson)'].isin(iraise_lessons)
data_iraise = data[in_iraise_lessons].copy()

In [None]:
# Row count for each (lesson, response type, human score) group in data_iraise.
data_iraise.groupby(['Level (Lesson)', 'Type', 'Open_response_score_human_truth']).size()

## BERT

In [None]:
# (Re)build the "<Type> - <Lesson>" group key so this cell can run on its own,
# then collect the distinct keys in order of first appearance.
combination_key = data['Type'] + " - " + data['Level (Lesson)']
data['combination'] = combination_key
unique_combinations = data['combination'].unique()

# Define a custom dataset
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``;
            its result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item *idx*."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) removes the size-1 leading dimension of the tokenizer output
        item = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

from sklearn.model_selection import train_test_split

def _mean_ignoring_none(values):
    """Average the entries of *values* that are not None; NaN when none remain."""
    present = [v for v in values if v is not None]
    return np.mean(present) if present else np.nan


def _fit_bert_and_predict(train_texts, train_labels, val_texts, val_labels,
                          tokenizer, max_len, batch_size, epochs):
    """Fine-tune a fresh 'bert-base-uncased' 2-class model and predict the held-out texts.

    Returns:
        Tuple ``(true_labels, predicted_labels, class1_probabilities)`` of
        lists, one entry per validation example, in validation order.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_loader = DataLoader(
        CustomDataset(train_texts, train_labels, tokenizer, max_len),
        batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(
        CustomDataset(val_texts, val_labels, tokenizer, max_len),
        batch_size=batch_size, shuffle=False)

    # A new model per call so that no weights leak from one fold into the next.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=2)
    model = model.to(device)

    # NOTE(review): transformers' AdamW is reported as deprecated upstream in
    # favour of torch.optim.AdamW; kept as-is so results stay comparable.
    optimizer = AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['label'].to(device),
            )
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader)}")

    # Evaluate
    model.eval()
    all_preds, all_probs, all_labels = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            all_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())  # Probability of class 1
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_labels.extend(batch['label'].numpy())

    return all_labels, all_preds, all_probs


def train_and_evaluate(data, tokenizer, max_len, batch_size, epochs, max_splits=5):
    """Cross-validate a BERT classifier separately for every value of ``data['combination']``.

    For each combination the rows are split with ``StratifiedKFold``; a fresh
    model is fine-tuned on each training part and scored on the held-out part.

    Args:
        data: DataFrame with the columns 'combination', 'Input' (text) and
            'Open_response_score_human_truth' (cast to int; the model is
            built with two output classes, so 0/1 labels are assumed).
        tokenizer: Tokenizer handed to ``CustomDataset``.
        max_len: Token length passed to the tokenizer.
        batch_size: Batch size for both data loaders.
        epochs: Training epochs per fold.
        max_splits: Upper bound on the number of folds; fewer are used when a
            combination has fewer rows than this.

    Returns:
        ``{'fold_results': [...], 'avg_results': [...]}``. Fold entries carry
        'combination', 'fold', 'accuracy', 'f1', 'auc', 'kappa' and 'report';
        'auc' is None for a fold whose validation labels are all one class.
        Average entries carry the per-combination means (AUC averaged over the
        folds where it is defined, NaN if there is none).
    """
    results = []
    aggregated_results = []

    # Take the groups from the frame that was passed in, not from a module-level
    # variable, so the function also works on a subset or a different frame.
    for comb in tqdm(data['combination'].unique()):
        print(f"Processing combination: {comb}")
        comb_data = data[data['combination'] == comb]
        texts = comb_data['Input'].tolist()
        labels = comb_data['Open_response_score_human_truth'].astype(int).tolist()

        # Skip combinations with fewer than 2 examples
        if len(labels) < 2:
            print(f"Skipping combination {comb} (too few examples).")
            continue

        # Adjust `n_splits` based on data size
        n_splits = min(max_splits, len(labels))
        skf = StratifiedKFold(n_splits=n_splits)

        fold_metrics = []  # Per-fold results of this combination

        for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
            print(f"Fold {fold+1}/{n_splits}")

            all_labels, all_preds, all_probs = _fit_bert_and_predict(
                [texts[i] for i in train_idx],
                [labels[i] for i in train_idx],
                [texts[i] for i in val_idx],
                [labels[i] for i in val_idx],
                tokenizer, max_len, batch_size, epochs,
            )

            # Calculate metrics
            acc = accuracy_score(all_labels, all_preds)
            f1 = f1_score(all_labels, all_preds)
            # ROC AUC needs both classes in the validation labels.
            auc = roc_auc_score(all_labels, all_probs) if len(set(all_labels)) > 1 else None
            kappa = cohen_kappa_score(all_labels, all_preds)
            report = classification_report(all_labels, all_preds, output_dict=True)

            print(f"Fold {fold+1} Metrics - Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc}, Kappa: {kappa:.4f}")

            fold_result = {
                'combination': comb,
                'fold': fold+1,
                'accuracy': acc,
                'f1': f1,
                'auc': auc,
                'kappa': kappa,
                'report': report
            }
            fold_metrics.append(fold_result)
            results.append(fold_result)

        # Calculate average metrics across folds for this combination
        if fold_metrics:
            avg_metrics = {
                'combination': comb,
                'accuracy': np.mean([fm['accuracy'] for fm in fold_metrics]),
                'f1': np.mean([fm['f1'] for fm in fold_metrics]),
                'auc': _mean_ignoring_none([fm['auc'] for fm in fold_metrics]),
                'kappa': np.mean([fm['kappa'] for fm in fold_metrics])
            }
            print(f"Average Metrics for combination {comb}: {avg_metrics}")
            aggregated_results.append(avg_metrics)

    return {'fold_results': results, 'avg_results': aggregated_results}

# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 256
batch_size = 16
epochs = 10
n_cv_splits = 5

# Run training and evaluation (the fold count comes from n_cv_splits, so that
# changing the parameter above actually takes effect)
results = train_and_evaluate(data, tokenizer, max_len, batch_size, epochs, max_splits=n_cv_splits)

## Comparative BERT vs. LLM Eval

In [None]:
# Define a custom dataset
class CustomDataset(Dataset):
    """Pairs each text with its label and tokenizes the text when the item is requested.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``;
            its result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` for item *idx*."""
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        enc = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) removes the size-1 leading dimension of the tokenizer output
        input_ids = enc['input_ids'].squeeze(0)
        attention_mask = enc['attention_mask'].squeeze(0)
        label = torch.tensor(raw_label, dtype=torch.long)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

from sklearn.model_selection import train_test_split

def evaluate_predictions(true_labels, pred_labels, model_name):
    """Score fixed predictions against reference labels and print a one-line summary.

    Args:
        true_labels: Reference labels.
        pred_labels: Predicted labels, aligned with ``true_labels``.
        model_name: Prefix used in the printed summary line.

    Returns:
        Dict with 'accuracy', 'f1' (weighted average), 'auc' and 'kappa'.
        'auc' is always the placeholder 0.0: it is not computed here.
    """
    scores = {
        'accuracy': accuracy_score(true_labels, pred_labels),
        'f1': f1_score(true_labels, pred_labels, average='weighted'),
        'auc': 0.0000,  # undefined for llm
        'kappa': cohen_kappa_score(true_labels, pred_labels),
    }
    print(f"{model_name} - Accuracy: {scores['accuracy']:.4f}, F1: {scores['f1']:.4f}, AUC: {scores['auc']}, Kappa: {scores['kappa']:.4f}")
    return scores
    
def _mean_ignoring_none(values):
    """Average the entries of *values* that are not None; NaN when none remain."""
    present = [v for v in values if v is not None]
    return np.mean(present) if present else np.nan


def _fit_bert_and_predict(train_texts, train_labels, val_texts, val_labels,
                          tokenizer, max_len, batch_size, epochs):
    """Fine-tune a fresh 'bert-base-uncased' 2-class model and predict the held-out texts.

    Returns:
        Tuple ``(true_labels, predicted_labels, class1_probabilities)`` of
        lists, one entry per validation example, in validation order.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_loader = DataLoader(
        CustomDataset(train_texts, train_labels, tokenizer, max_len),
        batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(
        CustomDataset(val_texts, val_labels, tokenizer, max_len),
        batch_size=batch_size, shuffle=False)

    # A new model per call so that no weights leak from one fold into the next.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=2)
    model = model.to(device)

    # NOTE(review): transformers' AdamW is reported as deprecated upstream in
    # favour of torch.optim.AdamW; kept as-is so results stay comparable.
    optimizer = AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['label'].to(device),
            )
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader)}")

    # Evaluate
    model.eval()
    all_preds, all_probs, all_labels = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            all_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())  # Probability of class 1
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_labels.extend(batch['label'].numpy())

    return all_labels, all_preds, all_probs


def train_and_evaluate_with_dicts(data, tokenizer, max_len, batch_size, epochs, max_splits=5):
    """Cross-validate BERT per combination and score the stored LLM grades on the same folds.

    Rows whose 'Transaction_Id' has no grade in the module-level
    ``gpt_4_turbo_dict`` or ``gpt_4o_dict`` are dropped first, so all three
    graders are scored on identical validation examples.

    NOTE(review): the LLM grades are scored by ``evaluate_predictions``
    (weighted F1, AUC fixed at 0.0) while BERT uses ``f1_score`` with its
    default averaging and a real ROC AUC, so the F1/AUC columns are not
    directly comparable between BERT and the LLMs - confirm this is intended.

    Args:
        data: DataFrame with the columns 'combination', 'Transaction_Id',
            'Input' (text) and 'Open_response_score_human_truth' (cast to int).
        tokenizer: Tokenizer handed to ``CustomDataset``.
        max_len: Token length passed to the tokenizer.
        batch_size: Batch size for both data loaders.
        epochs: Training epochs per fold.
        max_splits: Upper bound on the number of folds.

    Returns:
        A list mixing two kinds of entries, in processing order: one dict per
        BERT fold ('combination', 'fold', 'accuracy', 'f1', 'auc', 'kappa') and,
        after the folds of each combination, one tuple
        ``(combination, bert_avg, gpt4_turbo_avg, gpt4o_avg)`` of averaged
        metric dicts.
    """
    metric_keys = ('accuracy', 'f1', 'auc', 'kappa')
    results = []

    # Take the groups from the frame that was passed in, not from a module-level
    # variable, so the function also works on a subset or a different frame.
    for comb in tqdm(data['combination'].unique()):
        print(f"Processing combination: {comb}")
        comb_data = data[data['combination'] == comb]

        # Skip combinations with fewer than 2 labels
        if len(comb_data) < 2:
            print(f"Skipping combination {comb} (too few labels).")
            continue

        # Look up the stored LLM grades once and report how many are missing
        turbo_scores = comb_data['Transaction_Id'].map(gpt_4_turbo_dict)
        gpt4o_scores = comb_data['Transaction_Id'].map(gpt_4o_dict)
        missing_turbo = int(turbo_scores.isnull().sum())
        missing_gpt4o = int(gpt4o_scores.isnull().sum())
        print(f"{comb} - Missing GPT_4_turbo: {missing_turbo}, Missing GPT_4o: {missing_gpt4o}")

        # Exclude rows with missing dictionary values
        has_both = ~turbo_scores.isnull() & ~gpt4o_scores.isnull()
        comb_data = comb_data[has_both]

        # Re-check the size after filtering: a single remaining row would make
        # StratifiedKFold fail because it needs at least two splits.
        if len(comb_data) < 2:
            print(f"Skipping combination {comb} after filtering missing dictionary values.")
            continue

        texts = comb_data['Input'].tolist()
        labels = comb_data['Open_response_score_human_truth'].astype(int).tolist()
        turbo_preds = turbo_scores[has_both].tolist()
        gpt4o_preds = gpt4o_scores[has_both].tolist()

        # Adjust `n_splits` based on data size
        n_splits = min(max_splits, len(labels))
        skf = StratifiedKFold(n_splits=n_splits)

        bert_fold_metrics = []
        turbo_fold_metrics = []
        gpt4o_fold_metrics = []
        total_validation_size = 0  # Track total validation size across all folds

        for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
            print(f"Fold {fold+1}/{n_splits}")

            val_size = len(val_idx)
            total_validation_size += val_size
            print(f"Validation set size for Fold {fold+1}: {val_size}")

            train_texts = [texts[i] for i in train_idx]
            val_texts = [texts[i] for i in val_idx]
            train_labels = [labels[i] for i in train_idx]
            val_labels = [labels[i] for i in val_idx]

            # Score the stored LLM grades on this fold's validation rows
            turbo_fold_metrics.append(evaluate_predictions(
                val_labels, [turbo_preds[i] for i in val_idx], f"GPT_4_turbo Fold {fold+1}"))
            gpt4o_fold_metrics.append(evaluate_predictions(
                val_labels, [gpt4o_preds[i] for i in val_idx], f"GPT_4o Fold {fold+1}"))

            all_labels, all_preds, all_probs = _fit_bert_and_predict(
                train_texts, train_labels, val_texts, val_labels,
                tokenizer, max_len, batch_size, epochs,
            )

            # Calculate metrics
            acc = accuracy_score(all_labels, all_preds)
            f1 = f1_score(all_labels, all_preds)
            # ROC AUC needs both classes in the validation labels.
            auc = roc_auc_score(all_labels, all_probs) if len(set(all_labels)) > 1 else None
            kappa = cohen_kappa_score(all_labels, all_preds)

            print(f"Fold {fold+1} Metrics - Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc}, Kappa: {kappa:.4f}")

            bert_metrics = {'accuracy': acc, 'f1': f1, 'auc': auc, 'kappa': kappa}
            bert_fold_metrics.append(bert_metrics)
            results.append({'combination': comb, 'fold': fold+1, **bert_metrics})

        # Average across folds. AUC can be None for a fold whose validation
        # labels are all one class, so every mean skips None entries.
        bert_avg = {key: _mean_ignoring_none([m[key] for m in bert_fold_metrics]) for key in metric_keys}
        gpt4_turbo_avg = {key: _mean_ignoring_none([m[key] for m in turbo_fold_metrics]) for key in metric_keys}
        gpt4o_avg = {key: _mean_ignoring_none([m[key] for m in gpt4o_fold_metrics]) for key in metric_keys}

        avg_metrics = {'combination': comb, **bert_avg}
        print(f"Average Metrics for combination {comb}: {avg_metrics}")

        print(f"\nFinal Metrics for {comb}:")
        print(f"Total Validation Set Size: {total_validation_size}")
        print(f"BERT Average Metrics: {bert_avg}")
        print(f"GPT-4 Turbo Average Metrics: {gpt4_turbo_avg}")
        print(f"GPT-4o Average Metrics: {gpt4o_avg}")

        results.append((comb, bert_avg, gpt4_turbo_avg, gpt4o_avg))

    return results

# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 256
batch_size = 16
epochs = 5
n_cv_splits = 5

# Run training and evaluation (the fold count comes from n_cv_splits, so that
# changing the parameter above actually takes effect)
results = train_and_evaluate_with_dicts(data, tokenizer, max_len, batch_size, epochs, max_splits=n_cv_splits)