In [None]:
import json
import os
import pickle
import time

import pandas as pd
from tqdm import tqdm

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Read in your data with columns representing an ID, input, and grading (human-coded data);
# see README for data reference. 'your-data.csv' is a placeholder path: point it at your own
# file (and swap the reader if your data is not stored as CSV).
df = pd.read_csv('your-data.csv')

# Keep the ID / input / human score columns together with 'Type' and 'Level (Lesson)',
# which the distillation cells filter on when building df_groundtruth and df_transfer.
df = df[[
    'Transaction_Id',
    'Input',
    'Open_response_score_human_truth',
    'Type',
    'Level (Lesson)',
]].copy()

## Starting with Building Cultural Competence Predicted (Largest Human-Coded Sample in Mixed Condition)

In [None]:
# System message for generating "positive" (culturally sensitive) tutor responses.
# Returned by get_prompt("positive") and sent as the system role in generate_responses.
POS_PROMPT_SYS = """
You are a tutor responding to a middle school student in a way that makes them feel noticed, valued, and cared for, 
while also showing a genuine effort to understand and appreciate the student's cultural background.
"""

# User message paired with POS_PROMPT_SYS: asks for 10 responses, one per line and without
# enumeration, and supplies sample responses. generate_responses splits the reply on newlines.
POS_PROMPT_USER = """
Please draft 10 positive, culturally sensitive responses that help build a connection with the student. 
Only give the responses and do not generate anything else. Do not enumerate the responses, just 
separate them one per line.
 
Below are examples and explanations for what constitutes culturally sensitive responses:

-Culturally sensitive responses help the student feel noticed and cared for by making an effort to understand and 
value a student's cultural background. Sample responses include: 

"Thank her for correcting your pronunciation and ask more about her culture" and 
"Did you know that the early Aztecs in Mexico discovered Pythagoras theorem beforehand to create sun temples?"; and 
"it is okay to be forgetful sometimes why don't you tell me a bit about yourself after we do our work for the day."

Response Start ---
"""

# System message for generating "negative" (neutral / culturally unaware) tutor responses.
# Returned by get_prompt("negative").
NEG_PROMPT_SYS = """
You are a tutor responding to a middle school student in a way that does not make them feel noticed, valued, or cared for, 
and does not show any effort to understand or appreciate the student's cultural background.
"""

# User message paired with NEG_PROMPT_SYS; same output format as POS_PROMPT_USER
# (10 responses, one per line, no enumeration).
NEG_PROMPT_USER = """
Please draft 10 neutral or culturally unaware responses that fail to build a meaningful connection with the student. 
Only give the responses and do not generate anything else. Do not enumerate the responses, just 
separate them one per line.

Below are examples and explanations for what constitutes culturally unaware responses:

-Culturally unaware responses do not show an effort to understand or value a student's cultural background. Sample responses include:

"Thank you for correcting me. Please let me know if I make a mistake again." and 
"Hi, Marcelo. You seem quiet today. Let's focus on the lesson, and we can chat afterward if there's time."

Response Start ---
"""

In [None]:
import openai
import pandas as pd

def read_api_token(file_path):
    """Return the first line of ``file_path`` with surrounding whitespace removed.

    Only the first line is read; anything after it in the file is ignored.
    """
    with open(file_path, 'r') as token_file:
        first_line = token_file.readline()
    return first_line.strip()

def get_prompt(response_type):
    """Return the ``(system_prompt, user_prompt)`` pair for a response category.

    ``response_type`` is matched case-insensitively against 'positive' and
    'negative'; any other value raises ``ValueError``.
    """
    kind = response_type.lower()
    if kind == "positive":
        return POS_PROMPT_SYS, POS_PROMPT_USER
    if kind == "negative":
        return NEG_PROMPT_SYS, NEG_PROMPT_USER
    raise ValueError("Invalid response type. Choose either 'positive' or 'negative'.")

def generate_responses(response_type, temperature=0.3, token_path='token-cb.txt'):
    """Request one batch of synthetic tutor responses from gpt-4o.

    Parameters
    ----------
    response_type : str
        'positive' or 'negative' (case-insensitive); selects the prompt pair
        through ``get_prompt``.
    temperature : float, default 0.3
        Sampling temperature forwarded to the chat completion request.
    token_path : str, default 'token-cb.txt'
        File whose first line holds the OpenAI API key.

    Returns
    -------
    list of str
        The non-blank lines of the completion, each stripped of surrounding
        whitespace. An empty list is returned when the API request fails, so
        that an error message is never stored as if it were a generated
        response.

    Raises
    ------
    ValueError
        If ``response_type`` is neither 'positive' nor 'negative'.
    """
    client = openai.OpenAI(
        api_key=read_api_token(token_path),
    )
    prompt_sys, prompt_user = get_prompt(response_type)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt_sys},
                {"role": "user", "content": prompt_user},
            ],
            max_tokens=500,
            temperature=temperature
        )
        # content may be None (e.g. a refusal); treat that as "no responses".
        content = response.choices[0].message.content or ""
    except Exception as e:
        # Report the failure but return nothing: callers append every returned
        # item to the dataset under the requested category label.
        print(f"An error occurred: {str(e)}")
        return []

    # The prompt asks for one response per line; drop blank separator lines.
    lines = (line.strip() for line in content.strip().split("\n"))
    return [line for line in lines if line]

## Generation with a specific temperature

In [None]:
data = []
TEMPERATURE = 0.7
N_ROUNDS = 50  # number of generation rounds; each round makes one positive and one negative request

for _ in tqdm(range(N_ROUNDS)):
    time.sleep(1)  # brief pause before each round (presumably to stay under API rate limits)
    # Positive batch first, then negative, both at the same temperature.
    for category in ("positive", "negative"):
        for response in generate_responses(category, temperature=TEMPERATURE):
            data.append({"text": response, "category": category, "temperature": TEMPERATURE})

# Create a DataFrame
df_tmp = pd.DataFrame(data)
df_tmp.to_csv('your-output.csv', index=False)

## Validation through GPT consistency

In [None]:
# System message for the grading step: defines the 1 / 0 scoring rubric with sample responses.
# Sent as the system role in generate_responses_validation.
PROMPT_SYSTEM = """
You are a tutor evaluator. Please score the following tutor response to a tutor training scenario involving a middle school student as follows:
-if the tutor’s response helps the student feel noticed and cared for by making an effort to understand and value a student's cultural background, score with a 1. Sample responses scoring a 1 include: "Thank her for correcting your pronunciation and ask more about her culture" and "Did you know that the early Aztecs in Mexico discovered Pythagoras theorem beforehand to create sun temples?"; and "it is okay to be forgetful sometimes why don't you tell me a bit about yourself after we do our work for the day."
-if the tutor’s response does not show a building of cultural competence, shown by a lack of care for understanding a student's background, score with a 0. Sample responses scoring a 0 include: "Thank you for correcting me. Please correct me again in the future if it comes up"; "Hi Marcelo, I noticed that you are a bit quiet today. Can we have a conversation? I want to spend some time introducing myself."
"""

# The user message is built as PROMPT_USER_START + <response text> + PROMPT_USER_END.
PROMPT_USER_START = """
Response Start ---
"""
# Closing delimiter plus the requested output format: a JSON object with the keys
# "Rationale" and "Score"; parse_gpt_json reads the "Score" key.
PROMPT_USER_END = """
--- Response End. Given the earlier response, please return a JSON string following the format, {\"Rationale\": \"your reasoning here\", \"Score\":0/1}.
"""

In [None]:
# Load the generated responses, discarding any row that has a missing value.
df_synth = pd.read_csv('your-output.csv').dropna()

# Encode the intended category as a float label: 'positive' -> 1.0, anything else -> 0.0.
df_synth['category'] = df_synth['category'].eq('positive').astype(float)

# Training view of the synthetic data with generic column names: X (text) and y (label).
df_synthtrain = (
    df_synth[['text', 'category']]
    .rename(columns={'text': 'X', 'category': 'y'})
    .copy()
)

In [None]:
def generate_responses_validation(text, token_path='cb-token.txt'):
    """Ask gpt-4o to grade one tutor response against the PROMPT_SYSTEM rubric.

    Parameters
    ----------
    text : object
        The response to grade; it is converted with ``str`` and wrapped between
        ``PROMPT_USER_START`` and ``PROMPT_USER_END``.
    token_path : str, default 'cb-token.txt'
        File whose first line holds the OpenAI API key.
        NOTE(review): generate_responses reads 'token-cb.txt' instead; one of
        the two file names is probably a typo. Confirm which file exists.

    Returns
    -------
    str
        The raw completion text (expected to be a JSON string with "Rationale"
        and "Score"), or ``"An error occurred: ..."`` if the request fails.
    """
    client = openai.OpenAI(
        api_key=read_api_token(token_path),
    )
    prompt_user = PROMPT_USER_START + str(text) + PROMPT_USER_END
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": PROMPT_SYSTEM},
                {"role": "user", "content": prompt_user},
            ],
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        # Returned, not raised, so one failed request does not abort a long
        # grading loop; the caller gets a string that is not valid JSON.
        return f"An error occurred: {str(e)}"

In [None]:
# Grading: collect one raw evaluator reply per synthetic response, in df_synth row order,
# so the parsed grades can be attached to df_synth as a column afterwards.
responses = []
for generated_text in tqdm(df_synth['text']):
    time.sleep(0.2)  # short pause between requests
    responses.append(generate_responses_validation(generated_text))

In [None]:
import ast
def parse_gpt_json(s: str):
    """Extract the ``Score`` value from a grading reply.

    Markdown code fences (```` ```json ```` / ```` ``` ````) are removed first.
    The remaining text is parsed as JSON; if that fails it is parsed as a
    Python literal, which accepts single-quoted dictionaries.

    Parameters
    ----------
    s : str
        Raw reply text from the grading model.

    Returns
    -------
    The value stored under ``'Score'`` (``None`` if the key is absent), or the
    empty string ``''`` when the reply cannot be parsed into a dictionary.
    """
    try:
        cleaned = s.replace('```json', '').replace('```', '').strip()
        try:
            parsed = json.loads(cleaned)
        except ValueError:
            # Not strict JSON (e.g. single quotes): fall back to a Python literal.
            parsed = ast.literal_eval(cleaned)
        return parsed.get('Score')
    except Exception as e:
        print(f'Error parsing JSON: {e}, returning an empty string.')
        return ''

# Attach one parsed grade per row; `responses` is the list filled by the grading loop.
df_synth['gpt_grade'] = [parse_gpt_json(r) for r in responses]

### Evaluation of consistency

In [None]:
from sklearn.metrics import cohen_kappa_score, accuracy_score, precision_score, recall_score, f1_score

# Keep only rows where both the intended label and the GPT grade are 0 or 1;
# grades that could not be parsed are neither and are excluded here.
binary_df = df_synth[(df_synth['category'].isin([0, 1])) & (df_synth['gpt_grade'].isin([0, 1]))]

# Cast both sides to int: 'gpt_grade' can have object dtype (it may mix '' with numbers),
# which scikit-learn does not accept as a binary target.
y_intended = binary_df['category'].astype(int)
y_graded = binary_df['gpt_grade'].astype(int)

# Calculate Cohen's kappa
cohen_kappa = cohen_kappa_score(y_intended, y_graded)
print("Cohen's kappa:", cohen_kappa)

# Calculate accuracy
accuracy = accuracy_score(y_intended, y_graded)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_intended, y_graded)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_intended, y_graded)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_intended, y_graded)
print("F1 score:", f1)

## Distillation

### In this example, a separate transfer set is defined (optional)

In [None]:
def _lesson_subset(frame, response_type, lesson='Building Cultural Competence'):
    """Return the complete (X, y) rows of ``frame`` for one 'Type' value within one lesson."""
    selected = (frame['Type'] == response_type) & (frame['Level (Lesson)'] == lesson)
    return (
        frame.loc[selected, ['Input', 'Open_response_score_human_truth']]
        .rename(columns={'Input': 'X', 'Open_response_score_human_truth': 'y'})
        .dropna()
    )


# Human-coded 'predicted' responses: source of the training split and part of the evaluation set.
df_groundtruth = _lesson_subset(df, 'predicted')

# Human-coded 'explained' responses: the separate transfer set.
df_transfer = _lesson_subset(df, 'explained')

In [None]:
# Stores evaluation results, which are later the foundation for plots. The training loop
# appends one (label, auc, ci_lower, ci_upper) tuple per epoch. Kept in global scope
# because the baseline is re-referenced across experiments.
EVAL_LIST = []

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample

# One third of the human-coded ground truth becomes the training set; the remaining
# two thirds (U_df) are held out for evaluation.
train_df, U_df = train_test_split(df_groundtruth, test_size=2/3, random_state=42)

# The evaluation set additionally contains the whole transfer ('explained') set.
U_df = pd.concat([U_df, df_transfer], axis=0).copy()

n_train_rows, n_eval_rows = train_df.shape[0], U_df.shape[0]
print(f'Number of human training set observations: {n_train_rows}')
print(f'Number of evaluation set observations: {n_eval_rows}')

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the classifier.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create a custom PyTorch Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a text on access and pairs it with its label.

    Parameters
    ----------
    texts : sequence of str
        Input texts, indexable by position.
    labels : sequence of numbers
        One class label per text; each is converted to an integer class index.
    tokenizer : callable
        Called as ``tokenizer(text, padding='max_length', truncation=True,
        max_length=..., return_tensors='pt')`` and expected to return a mapping
        with 'input_ids' and 'attention_mask' tensors whose first dimension has size 1.
    max_length : int, default 128
        Length every sequence is padded / truncated to.

    Raises
    ------
    ValueError
        If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops only the leading size-1 dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # int() makes the float -> class-index conversion explicit (labels may
            # arrive as 0.0 / 1.0) and fails loudly on NaN.
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# Wrap the human training split and the evaluation split as tokenized datasets.
train_dataset = TextDataset(
    texts=train_df['X'].tolist(),
    labels=train_df['y'].tolist(),
    tokenizer=tokenizer,
)
test_dataset = TextDataset(
    texts=U_df['X'].tolist(),
    labels=U_df['y'].tolist(),
    tokenizer=tokenizer,
)

# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Two-class BERT classifier, placed on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

from sklearn.metrics import roc_auc_score
import torch

# Function to evaluate the model and calculate AUC
def evaluate(model, dataloader, n_bootstrap=1000, random_state=42):
    """Compute the ROC AUC of ``model`` on ``dataloader`` with a bootstrap 95% CI.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier called as ``model(input_ids, attention_mask=...)`` whose output
        exposes ``.logits``; the softmax probability of class index 1 is the score.
    dataloader : iterable
        Yields batches with 'input_ids', 'attention_mask' and 'label' tensors.
    n_bootstrap : int, default 1000
        Number of bootstrap resamples used for the confidence interval.
    random_state : int or None, default 42
        Seed for the bootstrap resampling so the interval is reproducible;
        pass ``None`` for unseeded resampling.

    Returns
    -------
    tuple of float
        ``(auc, lower, upper)`` where lower / upper are the 2.5th and 97.5th
        percentiles of the bootstrapped AUCs (``nan`` if no resample was valid).

    Raises
    ------
    ValueError
        From ``roc_auc_score`` if the AUC of the full sample cannot be computed.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    y_true = []
    y_pred_probs = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

            # Labels are only collected, never fed to the model, so they need no device transfer.
            y_true.extend(batch['label'].cpu().numpy())
            y_pred_probs.extend(probs)

    # Convert once, outside the bootstrap loop.
    y_true = np.asarray(y_true)
    y_pred_probs = np.asarray(y_pred_probs)

    # Calculate AUC
    auc_estimate = roc_auc_score(y_true, y_pred_probs)

    # Bootstrap to calculate 95% CI
    rng = np.random.RandomState(random_state)
    n_obs = len(y_true)
    bootstrapped_aucs = []
    for _ in range(n_bootstrap):
        indices = rng.randint(0, n_obs, size=n_obs)  # sample n_obs rows with replacement
        try:
            bootstrapped_aucs.append(roc_auc_score(y_true[indices], y_pred_probs[indices]))
        except ValueError:
            continue  # AUC cannot be computed for this resample; skip it

    if not bootstrapped_aucs:
        # No valid resample: report an undefined interval instead of crashing in np.percentile.
        return auc_estimate, float('nan'), float('nan')

    lower_bound = np.percentile(bootstrapped_aucs, 2.5)  # 95% CI
    upper_bound = np.percentile(bootstrapped_aucs, 97.5)  # 95% CI

    return auc_estimate, lower_bound, upper_bound

# Early-stopping state
best_val_auc = 0  # lowest possible AUC, so the first epoch always counts as an improvement
patience = 2  # epochs without improvement that are tolerated before stopping
counter = 0  # consecutive epochs without improvement so far

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fine-tune BERT on the human training split; 50 is only an upper bound on the epoch count.
for epoch in range(50):
    print(f"Epoch {epoch + 1}")

    # --- training pass ---
    model.train()
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device),
        )
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Training loss: {train_loss / len(train_loader)}")

    # --- validation pass (AUC on test_loader) ---
    val_auc, lower, upper = evaluate(model, test_loader)
    print(f"Validation AUC: {val_auc}, [{lower}, {upper}]")
    EVAL_LIST.append((f'Epoch_{epoch}', val_auc, lower, upper))

    # --- early stopping ---
    if val_auc > best_val_auc:
        # New best: reset the counter and checkpoint the weights for later experiments.
        best_val_auc = val_auc
        counter = 0
        torch.save(model.state_dict(), 'your-model.pt')
        continue

    counter += 1
    if counter >= patience:
        print("Early stopping triggered")
        break

### After no improvement on human data is observed, synthetic data is added in increments

In [None]:
def _load_synth_training_frame(synth_f, keep_consistent_only):
    """Read the synthetic-response CSV and return it as a frame with columns X (text) and y (label)."""
    df_synth = pd.read_csv(synth_f).dropna()
    # 'positive' -> 1.0, every other category -> 0.0
    df_synth['category'] = (df_synth['category'] == 'positive').astype(float)

    if keep_consistent_only:
        if 'gpt_grade' not in df_synth.columns:
            raise KeyError(
                f"'{synth_f}' has no 'gpt_grade' column; save the GPT grades to the CSV "
                "before calling synth_eval with remove_consistent=True."
            )
        df_synth = df_synth[df_synth['category'] == df_synth['gpt_grade']].copy()
        print(f'retaining {df_synth.shape[0]} observations')

    return df_synth.rename(columns={'text': 'X', 'category': 'y'})[['X', 'y']].copy()


def _plot_auc_history(eval_list, baseline_auc, temperature, outf):
    """Plot every (label, auc, lower, upper) entry with CI error bars and save the figure as PDF."""
    # Epoch labels are stored 0-indexed ('Epoch_0'); show them 1-indexed.
    labels = [
        f"Epoch_{int(ref.replace('Epoch_', '')) + 1}" if 'Epoch' in ref else ref
        for ref, _, _, _ in eval_list
    ]
    means = [item[1] for item in eval_list]

    # Convert bounds to error ranges. Clamp at 0 because matplotlib rejects negative
    # error lengths, which would occur if an estimate fell outside its percentile interval.
    errors = [
        (max(mean - lower, 0), max(upper - mean, 0))
        for _, mean, lower, upper in eval_list
    ]

    plt.figure(figsize=(8, 5))
    plt.errorbar(
        labels, means, yerr=np.array(errors).T, fmt='o', capsize=5, label='AUC with 95% CI'
    )
    plt.xlabel('Epochs and Augmentations', fontsize=11)
    plt.ylabel('AUC Score', fontsize=10)
    plt.axhline(y=baseline_auc, color='red', linestyle='--', label='Baseline AUC')
    plt.title(f'Temperature: {temperature}', fontsize=16)
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)
    plt.grid(visible=True, linestyle='--', alpha=0.5)
    plt.legend(fontsize=10)
    plt.tight_layout()
    plt.savefig(outf, format="pdf", dpi=300)  # Save plot as PDF
    plt.show()


def synth_eval(synth_f, THE_TEMPERATURE, remove_consistent=False, outf='plot.pdf',
               increment=25, max_synth=250):
    """Continue training the human-data baseline on synthetic data and plot the AUC trajectory.

    The baseline weights are loaded from 'your-model.pt'. Synthetic rows are shuffled
    (seed 42) and fed to the model in chunks of ``increment``; after every chunk the
    model is re-evaluated on ``test_loader``. The per-epoch baseline history and the
    augmentation results are plotted together and saved to ``outf``.

    Parameters
    ----------
    synth_f : str
        CSV with at least the columns 'text' and 'category'.
    THE_TEMPERATURE : float
        Generation temperature of the synthetic data; used only in the plot title.
    remove_consistent : bool, default False
        Despite its name, ``True`` KEEPS only the rows whose intended category equals
        the 'gpt_grade' column (i.e. it removes the inconsistent rows).
    outf : str, default 'plot.pdf'
        Output path of the PDF figure.
    increment : int, default 25
        Number of synthetic rows added per training step.
    max_synth : int, default 250
        Upper bound on the number of synthetic rows used.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``remove_consistent`` is True and the CSV has no 'gpt_grade' column.
    """
    df_synthtrain = _load_synth_training_frame(synth_f, remove_consistent)

    # Baseline per-epoch history: prefer the pickled copy, otherwise use the in-memory
    # EVAL_LIST filled by the training loop (presumably what the pickle contains).
    eval_list_path = 'eval_list-final.p'
    if os.path.exists(eval_list_path):
        # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
        with open(eval_list_path, 'rb') as f:
            eval_list = pickle.load(f)
    else:
        eval_list = list(EVAL_LIST)

    # Rebuild the architecture, then load the baseline model trained on human-coded data.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load('your-model.pt', map_location=device))
    model = model.to(device)

    # Set up the optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # evaluate returns (auc, lower, upper); only the point estimate is the baseline.
    baseline_auc, baseline_lower, baseline_upper = evaluate(model, test_loader)
    print(f"Baseline AUC: {baseline_auc}, [{baseline_lower}, {baseline_upper}]")

    # Shuffle df_synthtrain to randomize the order
    df_synthtrain_shuffled = df_synthtrain.sample(frac=1, random_state=42)
    synth_texts = df_synthtrain_shuffled['X'].tolist()
    synth_labels = df_synthtrain_shuffled['y'].tolist()

    # Incrementally introduce data from df_synthtrain for training
    for i in tqdm(list(range(0, max_synth, increment))):
        current_texts = synth_texts[i:i + increment]
        current_labels = synth_labels[i:i + increment]
        if not current_texts:
            break  # fewer synthetic rows than max_synth: nothing left to add

        new_dataset = TextDataset(current_texts, current_labels, tokenizer)
        new_loader = DataLoader(new_dataset, batch_size=16, shuffle=True)

        # One pass over the newly added chunk
        model.train()
        for batch in new_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

        # Re-evaluate the model on the same test set (U_df) using AUC
        n_added = i + len(current_texts)
        auc, lower, upper = evaluate(model, test_loader)
        eval_list.append((f'+ {n_added}', auc, lower, upper))
        print(f"AUC after adding {n_added} data points from df_synthtrain: {auc}")

    # Mean AUC over all augmentation steps (entries labelled '+ <n>').
    avg = np.mean([auc for ref, auc, _, _ in eval_list if '+' in ref])
    print(f'Augmentation average: {avg} which is {avg - baseline_auc} above baseline')

    _plot_auc_history(eval_list, baseline_auc, THE_TEMPERATURE, outf)
    return

In [None]:
# Run the augmentation experiment on the synthetic responses generated at temperature 0.7.
synth_eval(synth_f='your-output.csv', THE_TEMPERATURE=0.7)