# Prototype
genAI - an autonomous system that takes a Common Core learning standard and a topic of student interest and generates customized free-response questions that test the student’s knowledge

In [94]:
# ----- Initialization -----
import os
import random
from typing import Optional

import openai
import pandas as pd

# Constants and Configuration
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be written into this file; the placeholder below is only a fallback that can
# still be replaced by hand with an OpenAI secret API key.
API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_API_KEY')
MODEL_NAME = "gpt-3.5-turbo"

# Set up the OpenAI API key
openai.api_key = API_KEY

# ----- Functions -----
def load_ela_ccss(file_path='ela-ccss.csv'):
    """
    Load the English Language Arts (ELA) Common Core State Standards (CCSS) into a dataframe,
    keeping only writing standards for grades 4 and above.
    Database source: https://gist.github.com/philngo/2735248c98c3e0cd7814

    Parameters:
    - file_path (str): Path to the CSV file containing the standards. Default is 'ela-ccss.csv'.

    Returns:
    - DataFrame: The rows whose 'category_id' is 'W' and whose grade number is at least 4,
      with an extra float column 'grade_num' holding the leading number of 'grade_id'.
    """
    standards = pd.read_csv(file_path)

    # Copy the filtered rows so the new column is written to an independent
    # frame rather than to a slice of the raw data (avoids SettingWithCopyWarning).
    writing = standards[standards['category_id'] == 'W'].copy()

    # Take the leading digits of grade_id. astype(str) keeps .str usable even if
    # pandas parsed the column as numeric; expand=False yields a Series. Values
    # without leading digits become NaN and are dropped by the comparison below.
    writing['grade_num'] = (
        writing['grade_id'].astype(str).str.extract(r'^(\d+)', expand=False).astype(float)
    )

    return writing[writing['grade_num'] >= 4]  # Only grades 4 and above

def chat_with_openai(prompt, max_tokens=150, temperature=0.2, conversation_history=None):
    """
    Send one user turn to the OpenAI chat endpoint and return the model's reply.

    Parameters:
    - prompt (str): Text of the user message to send.
    - max_tokens (int): Upper bound on the length of the reply, in tokens. Default is 150.
    - temperature (float): Sampling temperature passed to the API. Default is 0.2.
    - conversation_history (list): Earlier messages as role/content dicts. When it is
      None or empty, a new history seeded with the default system message is created.
      A non-empty list is appended to in place.

    Returns:
    - tuple: (stripped reply text, conversation history including the new user and assistant turns).
    """
    default_system_message = """
    You are a helpful assistant specializing in assessing abilities as per The Common Core State Standards (CCSS)
    for English Language Arts (ELA). Keep track of the conversation to provide relevant outputs.
    Do not write explanations. Do not reveal what is being assessed.
    """.strip()

    # No usable history: start a fresh conversation with the system message
    if not conversation_history:
        conversation_history = [{"role": "system", "content": default_system_message}]

    # Record the user's turn before calling the API so it is part of the request
    user_turn = {"role": "user", "content": prompt}
    conversation_history.append(user_turn)

    completion = openai.ChatCompletion.create(
        model=MODEL_NAME,
        messages=conversation_history,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # The history keeps the raw reply; the caller receives the stripped text
    reply = completion["choices"][0]["message"]["content"]
    assistant_turn = {"role": "assistant", "content": reply}
    conversation_history.append(assistant_turn)

    return reply.strip(), conversation_history

def get_standard(code, ela_ccss=None):
    """
    Get the description and corresponding grade of a specific Common Core State Standard (CCSS) code for ELA.

    Parameters:
    - code (str): The CCSS code (e.g. 'CCSS.ELA-LITERACY.W.4.9').
    - ela_ccss (DataFrame, optional): The ELA CCSS dataframe. If not provided, default will be loaded.

    Returns:
    - tuple: A tuple containing the description and grade of the CCSS code
      (taken from the first matching row).

    Raises:
    - IndexError: If no row of the dataframe has the given code as its 'id'.
    """
    if ela_ccss is None:
        ela_ccss = load_ela_ccss()

    # Filter once and reuse the result for both columns
    matches = ela_ccss[ela_ccss['id'] == code]
    if matches.empty:
        # Same exception type an out-of-range .iloc[0] would raise, but with
        # a message that names the offending code.
        raise IndexError(f"No standard with id {code!r} found in the ELA CCSS data")

    description = matches['description'].iloc[0]
    grade = matches['grade_id'].iloc[0]
    return description, grade

def generate_question(learning_standard, interest_topic, conversation_history=None, ela_ccss=None):
    """
    Ask the model for a free-response question that targets a learning standard
    and is themed on the student's topic of interest.

    Parameters:
    - learning_standard (str): The CCSS code to assess (e.g. 'CCSS.ELA-LITERACY.W.4.9').
    - interest_topic (str): The topic the question should be built around.
    - conversation_history (list): Earlier messages to continue from. Default is None.
    - ela_ccss (DataFrame, optional): The ELA CCSS dataframe, forwarded to get_standard.

    Returns:
    - tuple: (generated question text, updated conversation history).
    """
    # Resolve the code into the wording and grade that the prompt quotes
    standard_text, grade_level = get_standard(learning_standard, ela_ccss=ela_ccss)

    request = f"""
    Create a free-response question for a student of grade {grade_level} that prompts an answerer that '{standard_text}'.
    Additionally, provide an introduction and context required to answer the question,
    and make it based on the topic of '{interest_topic}'. 
    In cases where the learner is expected to interpret a text, provide the text as well.
    Do not require the learner to have any prior knowledge or access to external resources, unless
    the learning standard requires them to do so.
    You need to provide the learner with all the information required to answer the question.
    The introduction and context should be around 200 words.
    """

    # 600 tokens leaves room for the ~200-word context plus the question itself
    return chat_with_openai(request, 600, conversation_history=conversation_history)

def generate_rubric(learning_standard, conversation_history, ela_ccss=None):
    """
    Ask the model for a scoring rubric for the question already present in the conversation.

    Parameters:
    - learning_standard (str): The CCSS code being assessed (e.g. 'CCSS.ELA-LITERACY.W.4.9').
    - conversation_history (list): The conversation so far; the prompt refers back to
      the question and context written earlier in it. Required.
    - ela_ccss (DataFrame, optional): The ELA CCSS dataframe, forwarded to get_standard.

    Returns:
    - tuple: (generated rubric text, updated conversation history).
    """
    standard_text, grade_level = get_standard(learning_standard, ela_ccss=ela_ccss)

    # Layout the model is told to follow: one row per criterion, four levels
    rubric_template = """
    Criteria        Level 4        Level 3       Level 2       Level 1
    ---------------------------------------------------------
    [Criterion]    [Description]  [Description] [Description] [Description]
    Points         4              3             2             1
    ---------------------------------------------------------
    """

    request = f"""
    Based on the content and themes of the question and context you wrote, 
    devise a rubric that evaluates if the answer meets the learning standard of
    '{standard_text}' for a student of grade {grade_level}.
    Your output should follow this format:
    {rubric_template}"""

    return chat_with_openai(request, 700, conversation_history=conversation_history)

def evaluate_answer(answer, rubric, question):
    """
    Score an answer against a rubric and produce feedback addressed to the answerer.

    Parameters:
    - answer (str): The answer to evaluate.
    - rubric (str): The rubric to grade against.
    - question (str): The question the answer responds to.

    Returns:
    - tuple: (evaluation text, conversation history of this grading exchange).
    """
    # Grading runs in its own conversation with a generic system message
    grader_history = [{"role": "system", "content": "You are a helpful assistant. Keep track of the conversation to provide relevant outputs. Do not write explanations."}]

    grading_request = f"""Rubric: \n{rubric}\n
    Question: \n'{question}'\n
    Answer: '{answer}'\n
    Given the answer to the question and using the rubric above. First provide the overall score of the answer.
    Then, break down the feedback criterion-wise, and conclude with suggestions for improvement.
    Do not mention the rubric or the specific criterions and write as if you were speaking directly to the answerer."""

    return chat_with_openai(grading_request, 500, conversation_history=grader_history)


# Simulation

In [6]:

def simulate_response_to_question(question, rubric, level):
    """
    Have the model write a student-style answer to a question at a requested rubric level.

    Parameters:
    - question (str): The assessment question.
    - rubric (str): The rubric for the question.
    - level (int): The proficiency level to imitate (e.g., 1, 2, 3, 4).

    Returns:
    - tuple: (simulated student answer, conversation history of this exchange).
    """
    # The simulated student runs in its own conversation with a generic system message
    student_history = [{"role": "system", "content": "You are a helpful assistant. Keep track of the conversation to provide relevant outputs. Do not write explanations."}]

    writing_request = f"""Rubric: \n'{rubric}'

    Analyze the provided rubric and write a free-response answer of level '{level}' to the following question:
    '{question}'
    """

    return chat_with_openai(writing_request, 250, conversation_history=student_history)

def run_simulation(topics):
    """
    Run a simulation of the entire workflow:
    - Generate a question based on a randomly selected learning standard.
    - Generate a rubric for the question.
    - Simulate a student's response at a random proficiency level.
    - Evaluate the student's response.

    Parameters:
    - topics (list): A list of topics to use for the question.

    Prints the learning standard, question, rubric, student's response, and evaluation.

    Returns:
    - tuple: (learning standard code, topic of interest, generated question,
      generated rubric, simulated answer level, evaluation).
    """
    # Load the ELA CCSS once and pass it to every helper below, so the CSV is
    # not re-read on each lookup.
    ela_ccss = load_ela_ccss()

    # Randomly select a learning standard.
    random_standard = random.choice(ela_ccss['id'].tolist())

    # Define a topic of interest.
    interest_topic = random.choice(topics)
    ccss_description, grade = get_standard(random_standard, ela_ccss=ela_ccss)

    print(f"Learning Standard: {random_standard} - '{ccss_description}' (Grade {grade})")
    print(f"Topic of Interest: {interest_topic}\n")

    # Generate a question based on the standard and topic.
    generated_question, conversation_history = generate_question(
        random_standard, interest_topic, ela_ccss=ela_ccss
    )

    print("Generated Question:", generated_question)

    # Generate a rubric in the same conversation, so the model can refer back
    # to the question it just wrote.
    generated_rubric, _ = generate_rubric(random_standard, conversation_history, ela_ccss=ela_ccss)

    print("Generated Rubric:", generated_rubric)

    # Simulate a student's response at a random level.
    answer_level = random.choice([1, 2, 3, 4])
    student_answer, _ = simulate_response_to_question(generated_question, generated_rubric, answer_level)

    print(f"\nStudent's Answer (level {answer_level}): {student_answer}")

    # Evaluate the answer.
    evaluation, _ = evaluate_answer(student_answer, generated_rubric, generated_question)

    print("\nEvaluation:", evaluation)

    return random_standard, interest_topic, generated_question, generated_rubric, answer_level, evaluation


In [98]:
topics = ['baseball', 'pop music', 'climate change', 'ancient civilizations', 'space exploration', 'wildlife conservation', 'sports']

# run multiple simulations to compare results
NUM_SIMULATIONS = 20
# Column order must match the tuples appended to `results` below
RESULT_COLUMNS = ['model', 'standard', 'topic', 'question', 'rubric', 'answer level', 'evaluation']
results = []

for i in range(NUM_SIMULATIONS):
    print(f"Simulation {i+1}")
    random_standard, interest_topic, generated_question, generated_rubric, answer_level, evaluation = run_simulation(topics)
    results.append((MODEL_NAME, random_standard, interest_topic, generated_question, generated_rubric, answer_level, evaluation))

# build the dataframe once, after all simulations have finished
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# save results to csv
results_df.to_csv('questions.csv', index=False)

# Quality Control


In [84]:
def get_grade(s: str) -> Optional[str]:
    """
    Extract the grade component from a dot-separated standard code.

    Parameters:
    - s (str): The standard code (e.g. 'CCSS.ELA-LITERACY.W.4.9').

    Returns:
    - str or None: The fourth dot-separated field of the code ('4' in the
      example above), or None when the code has fewer than four fields.
    """
    parts = s.split(".")
    return parts[3] if len(parts) >= 4 else None

def quality_control(learning_standard, question, rubric):
    """
    Ask the model to rate a generated question and its rubric.

    Both ratings are requested in one conversation: the question is rated first,
    then the rubric, so the rubric prompt can rely on the standard named earlier.

    Parameters:
    - learning_standard (str): The CCSS standard code; the grade is derived from it via get_grade.
    - question (str): The question generated by the Model.
    - rubric (str): The Rubric generated by the Model.

    Returns:
    - dict: A dictionary with keys 'question_quality' and 'rubric_quality', each holding
    the model's reply text (the system message asks for a rating between 1 and 10).
    """
    grade = get_grade(learning_standard)

    rater_instructions = """You are a friendly assistant that rates the quality of a text based on given instructions.
    You have to decide whether it fits the requirements mentioned in the prompt.
    Your rating should be between 1 and 10 and nothing else.
    Do not write explanations. Do not reveal what is being assessed."""
    rater_history = [{"role": "system", "content": rater_instructions}]

    # First rating: the question
    question_prompt = f"""Assess the following question based on relevance to grade {grade}, 
    adherence to learning standard '{learning_standard}', and challenge level.
    Question: 
    '{question}'"""
    question_rating, rater_history = chat_with_openai(question_prompt, 500, conversation_history=rater_history)

    # Second rating: the rubric, continuing the same conversation
    rubric_prompt = f"""Assess the following rubric based on alignment to the learning standard and ability to thoroughly evaluate a student's mastery of the standard.
    Rubric:
    '{rubric}'"""
    rubric_rating, rater_history = chat_with_openai(rubric_prompt, 500, conversation_history=rater_history)

    return {'question_quality': question_rating, 'rubric_quality': rubric_rating}

In [101]:
# run quality control on the generated results and add results as new columns
qc_results = []
for i, row in results_df.iterrows():
    # Keep the result in a local: `i` is the dataframe's index label, which is
    # only a valid position in `qc_results` when the index is the default 0..n-1.
    row_qc = quality_control(row['standard'], row['question'], row['rubric'])
    qc_results.append(row_qc)
    # add results to dataframe
    results_df.loc[i, 'question_quality'] = row_qc['question_quality']
    results_df.loc[i, 'rubric_quality'] = row_qc['rubric_quality']

Unnamed: 0,model,standard,topic,question,rubric,answer level,evaluation,question_quality,rubric_quality
0,gpt-3.5-turbo,CCSS.ELA-LITERACY.W.7.1.d,climate change,Introduction and Context:\n\nClimate change is...,Criteria Level 4 Level 3 L...,3,Overall Score: Level 4\n\nFeedback:\n- Clarity...,8.5,9.5
1,gpt-3.5-turbo,CCSS.ELA-LITERACY.W.4.10,climate change,Introduction and Context:\n\nClimate change is...,Criteria Level 4 Level 3 L...,1,Overall Score: Level 1\n\nFeedback:\n- Organiz...,9.0,8.0
2,gpt-3.5-turbo,CCSS.ELA-LITERACY.W.5.1.d,wildlife conservation,Introduction and Context:\n\nWildlife conserva...,Criteria Level 4 Level 3 L...,3,Overall Score: Level 4 (4 points)\n\nFeedback:...,7.0,8.0
3,gpt-3.5-turbo,CCSS.ELA-LITERACY.W.8.3,pop music,Introduction and Context:\n\nPop music is a ge...,Criteria Level 4 Level 3 L...,1,Overall Score: Level 1\n\nFeedback:\n- The nar...,9.5,8.5
4,gpt-3.5-turbo,CCSS.ELA-LITERACY.W.5.2.b,space exploration,Introduction and Context:\n\nSpace exploration...,Criteria Level 4 Level 3 L...,2,Overall Score: Level 4\n\nFeedback:\n- Accurac...,8.0,9.0


In [None]:
# Evaluation quality is computed rather than model-rated: the score the
# evaluator assigned is compared with the answer level that was requested
# when the answer was simulated.

# Pull the overall level out of the evaluation text (NaN when the pattern is absent)
results_df['eval_score'] = results_df['evaluation'].str.extract(r"Overall Score: Level (\d+)", expand=False)

# Coerce every column used numerically; values that cannot be parsed become NaN
for numeric_col in ('answer level', 'question_quality', 'rubric_quality', 'eval_score'):
    results_df[numeric_col] = pd.to_numeric(results_df[numeric_col], errors='coerce')

# 10 when the two levels agree, shrinking as the gap grows relative to the larger level
level_gap = (results_df['eval_score'] - results_df['answer level']).abs()
larger_level = results_df[['eval_score', 'answer level']].max(axis=1)
results_df['evaluation_quality'] = 10 * (1 - level_gap / larger_level)

In [103]:
# Save the results dataframe to 'qc-results.csv'; index=False leaves the row index out of the file
results_df.to_csv('qc-results.csv', index=False)