Importing the libraries

In [4]:
import os
import re
import json
import time
import random
import pandas as pd
import numpy as np
from scipy import stats
import plotly.graph_objects as go
from groq import Groq

In [7]:
# Groq API client used by the chat-completion requests in the test loops below.
# NOTE(review): `groq_api` is not defined in the visible cells; confirm it is set before this cell runs.
client = Groq(api_key=groq_api)

In [14]:
# Non-null value count for each column of the persona table.
df_persona.count()

level1             200
level2             200
total_expertise    200
expertise          200
dtype: int64

In [15]:
# Per-column non-null counts split by expertise label (shows how many personas fall in each group).
df_persona.groupby(['expertise']).count()

Unnamed: 0_level_0,level1,level2,total_expertise
expertise,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high-expertise,90,90,90
low-expertise,110,110,110


### Expertise Reversal Effect

1.  divide the students in 2 groups
  *   high-expertise students (upper-median group)
  *   low-expertise students (lower-median group)
2.   randomly assign half of the students to the practice condition and the other half to the worked-example condition (4 experimental groups)
  *   low-expertise/practice
  *   low-expertise/worked example
  *   high-expertise/practice
  *   high-expertise/worked example

In [16]:
def expertise_reversal(students, num, seed=None):
  """Build a balanced persona sample for the expertise-reversal experiment.

  Keeps `num` low-expertise and `num` high-expertise students and alternates
  the practice / worked-example condition within each expertise level.

  Args:
    students: DataFrame (or anything `pd.DataFrame` accepts) with an
      'expertise' column holding "low-expertise" / "high-expertise".
    num: number of students to keep per expertise level. Use an even number
      to get four equally sized experimental groups.
    seed: optional seed for the two shuffles. The default (None) keeps the
      shuffles unseeded, as before.

  Returns:
    A new, shuffled DataFrame of 2 * num rows with a fresh 0..n-1 index and an
    added 'expertise_reversal' column. The input frame is not modified.

  Raises:
    ValueError: if either expertise level has fewer than `num` students.
  """
  low_condition = ["low-expertise/practice", "low-expertise/worked example"]
  high_condition = ["high-expertise/practice", "high-expertise/worked example"]

  # Convert students to DataFrame and shuffle
  students = pd.DataFrame(students)
  students = students.sample(frac=1, random_state=seed).reset_index(drop=True)

  # Split students by expertise level
  low_expertise_group = students[students['expertise'] == "low-expertise"]
  high_expertise_group = students[students['expertise'] == "high-expertise"]

  # Both levels must be able to supply `num` students
  if len(low_expertise_group) < num or len(high_expertise_group) < num:
    raise ValueError("Not enough students in one of the groups to match the specified size.")

  # Sample `num` students from each group (fixed random_state, as before)
  low_expertise_group = low_expertise_group.sample(n=num, random_state=1).reset_index(drop=True)
  high_expertise_group = high_expertise_group.sample(n=num, random_state=1).reset_index(drop=True)

  # Alternate practice / worked example down each group in one column assignment
  low_expertise_group['expertise_reversal'] = [low_condition[i % 2] for i in range(num)]
  high_expertise_group['expertise_reversal'] = [high_condition[i % 2] for i in range(num)]

  # Combine and shuffle the final balanced dataset
  students_group = pd.concat([low_expertise_group, high_expertise_group])
  return students_group.sample(frac=1, random_state=seed).reset_index(drop=True)

In [17]:
# update list of personas
# NOTE: this overwrites df_persona with the balanced sample (20 per expertise level, 40 rows),
# so the original persona table is no longer reachable under this name once the cell has run.
df_persona = expertise_reversal(df_persona, 20)
df_persona.head()

Unnamed: 0,level1,level2,total_expertise,expertise,expertise_reversal
0,5,4,9,high-expertise,high-expertise/practice
1,2,4,6,low-expertise,low-expertise/practice
2,1,1,2,low-expertise,low-expertise/worked example
3,5,2,7,high-expertise,high-expertise/worked example
4,5,4,9,high-expertise,high-expertise/practice


In [18]:
# Sanity check: after sampling, both expertise levels should have the same number of personas.
df_persona.groupby(['expertise']).count()

Unnamed: 0_level_0,level1,level2,total_expertise,expertise_reversal
expertise,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high-expertise,20,20,20,20
low-expertise,20,20,20,20


### Variability Effect

Randomly divide these student personas into 4 experimental groups of equal size:
1. low-variability/practice
2. low-variability/worked example.
3. high-variability/practice.
4. high-variability/worked example.

In [108]:
def personas_variability(students, seed=None):
  """Assign each persona to one of the four variability conditions.

  Conditions are handed out by cycling through the four labels in row order,
  so the groups are equal in size whenever len(students) is a multiple of 4.

  Args:
    students: DataFrame of personas. A 'variability' column is written into
      this frame IN PLACE (existing values are overwritten); the caller in
      this notebook relies on that side effect.
    seed: optional seed for the final shuffle. The default (None) keeps the
      shuffle unseeded, as before.

  Returns:
    A shuffled copy of `students` (including the 'variability' column) with a
    fresh 0..n-1 index.
  """
  variability = ["low-variability/practice", "low-variability/worked example", "high-variability/practice", "high-variability/worked example"]

  # Cycle through the four variability conditions in row order
  students['variability'] = [variability[i % 4] for i in range(len(students))]

  # shuffle the final balanced dataset
  students = students.sample(frac=1, random_state=seed).reset_index(drop=True)
  return students

In [111]:
#personas = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_output.csv", delimiter=',')
# Load a persona table from CSV (presumably the saved output of an earlier llama3 run; verify).
# NOTE(review): absolute local path; this cell only runs on the author's machine as written.
personas = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_40_output.csv", delimiter=',')

In [112]:
# Assign variability conditions. The call writes the 'variability' column into `personas` in place;
# the shuffled copy it returns is only displayed here, not stored.
personas_variability(personas)


Unnamed: 0,level1,level2,total_expertise,expertise,expertise_reversal,variability,pretest_score,var_post_test,ere_post_test
0,4,1,5,low-expertise,low-expertise/worked example,low-variability/worked example,20.0,39.56,40.0
1,2,3,5,low-expertise,low-expertise/practice,low-variability/practice,42.25,39.65,36.25
2,5,3,8,high-expertise,high-expertise/worked example,high-variability/worked example,61.25,38.31,56.88
3,5,4,9,high-expertise,high-expertise/practice,high-variability/worked example,80.0,49.06,61.25
4,3,5,8,high-expertise,high-expertise/worked example,low-variability/worked example,74.38,48.86,58.75
5,4,1,5,low-expertise,low-expertise/practice,high-variability/practice,17.0,36.31,25.0
6,4,1,5,low-expertise,low-expertise/practice,high-variability/worked example,20.0,43.04,20.0
7,5,2,7,high-expertise,high-expertise/practice,low-variability/practice,45.0,37.18,31.88
8,5,3,8,high-expertise,high-expertise/practice,low-variability/practice,66.25,39.27,60.0
9,1,5,6,low-expertise,low-expertise/practice,low-variability/practice,22.86,36.87,37.5


In [116]:
# Group-balance check: non-null counts per column for each variability condition.
counts = personas.groupby('variability').count()
print(counts)

                                 level1  level2  total_expertise  expertise  \
variability                                                                   
high-variability/practice            10      10               10         10   
high-variability/worked example      10      10               10         10   
low-variability/practice             10      10               10         10   
low-variability/worked example       10      10               10         10   

                                 expertise_reversal  pretest_score  \
variability                                                          
high-variability/practice                        10             10   
high-variability/worked example                  10             10   
low-variability/practice                         10             10   
low-variability/worked example                   10             10   

                                 var_post_test  ere_post_test  
variability                             

## The algebra problems dataset
The algebra problems do not have step-by-step solutions. The authors manually created solutions for a few randomly selected problems for the instructions. Algebra dataset source: https://github.com/joyheyueya/declarative-math-word-problem/blob/main/algebra222.csv

In [118]:
# Load the algebra word problems from GitHub; the test problems are drawn from its 'question' column.
url = "https://raw.githubusercontent.com/joyheyueya/declarative-math-word-problem/refs/heads/main/algebra222.csv"
df_math_problems = pd.read_csv(url)
#df_math_problems.head()
#problem_ramdom = random.choice(df_math_problems['question'])

### Pre and Post test problems

In [119]:
def generate_math_problems(num):
  """Draw random math problems from the math problem dataframe.

  Problems are sampled without replacement from the 'question' column of the
  module-level `df_math_problems`.

  Args:
    num: how many problems to draw (the usage examples draw 8 and 6).

  Returns:
    A list of `num` problem statements.
  """
  question_pool = list(df_math_problems['question'])
  return random.sample(question_pool, num)

# usage examples
# No random seed is set in the visible cells, so each run draws a different set of problems.
problems_ex1 = generate_math_problems(8)
print(problems_ex1)

problems_ex2 = generate_math_problems(6)
print(problems_ex2)

['Suzy pays $111 for an order of 4 black printer cartridges and 7 color cartridges. When supplies start running low, she goes back to the same store and pays $126 for an order of 3 black and 10 color cartridges. Find the price of one black cartridge.', 'One morning Aristotle starts walking from Athens to Megara, and at the same time Socrates starts on a journey from Megara to Athens. Being younger, Aristotle is walking 1.8 times faster than Socrates. After a 5 hour journey, they meet somewhere in the middle of the way. Find the speed of Aristotle, in km per hour, if the total distance between Athens and Megara is 42 km.', 'Lab 1 can process blood samples 3 times faster than Lab 2. Working together, the two labs can process 84 samples over 7 days. Find the rate of work for Lab 1, in samples per day.', 'The size of a compressed file is 1.74 MiB, while the size of the original uncompressed file is 5.5 times greater. What is the size of the uncompressed file, in MiB?', 'Hernando’s salary w

In [14]:
# Fixed list of 8 test problems. Assigning it here replaces any randomly drawn problems_ex1,
# so the test loops that iterate over problems_ex1 score every persona on the same items.
problems_ex1 = ['A bread recipe calls for a mix of bleached flour, whole wheat flour, and rye flour. The volume of whole wheat should be two times greater than the volume of bleached, and the volume of rye should be 1 cup less than the volume of bleached. Find the volume in cups for bleached flour if the recipe is asking for 15 cups of flour in total.', 'Dolores bought a crib on sale for $350. The sale price was 40% of the original price. What was the original price of the crib?', 'Hilda has $210 worth of $10 and $12 stock shares. The numbers of $10 shares is five more than twice the number of $12 shares. How many $10 shares does she have?', 'Three times the sum of a number and nine is 12. Find the number.', 'Eve typed up a 4050 word document over two and a half hours. Find Eve’s typing rate in words per minute.', 'A cashier has 30 bills, all of which are $10 or $20 bills. The total value of the money is $460. How many of the  $10 bill does the cashier have?', 'Cindy and Richard leave their dorm in Charleston at the same time. Cindy rides her bicycle north at a speed of 18 miles per hour. Richard rides his bicycle south at a speed of 14 miles per hour. How many hours will it take them to be 96 miles apart?', 'A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake?']
problems_ex1

['A bread recipe calls for a mix of bleached flour, whole wheat flour, and rye flour. The volume of whole wheat should be two times greater than the volume of bleached, and the volume of rye should be 1 cup less than the volume of bleached. Find the volume in cups for bleached flour if the recipe is asking for 15 cups of flour in total.',
 'Dolores bought a crib on sale for $350. The sale price was 40% of the original price. What was the original price of the crib?',
 'Hilda has $210 worth of $10 and $12 stock shares. The numbers of $10 shares is five more than twice the number of $12 shares. How many $10 shares does she have?',
 'Three times the sum of a number and nine is 12. Find the number.',
 'Eve typed up a 4050 word document over two and a half hours. Find Eve’s typing rate in words per minute.',
 'A cashier has 30 bills, all of which are $10 or $20 bills. The total value of the money is $460. How many of the  $10 bill does the cashier have?',
 'Cindy and Richard leave their dor

In [120]:
#test_problem = problem_ramdom

# Closing instruction appended to every prompt; the bracketed number is what the score regex extracts.
instruction = (
    "Given the student's initial skill levels and the worksheet the student has received, "
    "what's the probability that the student can solve the problem correctly? "
    "Explain your reasoning and give a single number between 0 and 100 in square brackets."
)


## Experiment 1: Expertise Reversal Effect
The authors did not try to only create unique personas (duplicates were used). High temperature was used to introduce variance.
The idea was to generate a population of students that are somewhat different.

TODO:
- Test different instructions with students (the author tried to make sure each intervention group has at least 30 students to get reliable estimates)
- Generate more samples to get greater statistical significance

### Pre-Test

The author only simulated pre-tests for the expertise reversal effect to check that the results are consistent with the personas (e.g., the lower-expertise group should have lower pre-test scores).
However, they didn't actually need pre-tests to replicate these experiments because they knew all students' latent skill levels.
They were not trying to measure the ***absolute learning gain difference across groups***; they were interested in the general trend rather than the effect sizes (since there's no exact human experiment to compare to).

In [125]:
def try_convert_to_float(s):
    """
    Convert a bracketed score such as "[85]" to a float.

    Surrounding square brackets, whitespace, markdown asterisks and percent
    signs are ignored, so "[85]", "[85%]", "[**85**]" and " [ 85 ] " all give
    85.0. Anything that still is not a number (e.g. "[N/A]", "[70-80]", None)
    yields 0 instead of raising.

    Args:
        s: The value to convert, normally the "[...]" text matched in a model reply.

    Returns:
        The converted float value if successful, otherwise 0.
    """
    if isinstance(s, str):
        # Peel off decoration the model commonly wraps around the number.
        s = s.strip("[]*% \t\r\n")
    try:
        return float(s)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        return 0  # Unparseable answers are scored as 0 by the callers.

In [None]:
# TODO: run 100 sample ### failed due to the token limitation for post-test
# Pre-test: for each persona, ask the model for the probability of solving every problem in
# problems_ex1 and store the mean of the bracketed numbers as 'pretest_score'.
messages = []

# Alias, not a copy: scores written into `personas` below also appear in df_persona.
personas = df_persona

for i, row in personas.iterrows():
  test_scores = []
  for test_problem in problems_ex1:
    prompt = f'''
    Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
      1. Being able to set up systems of equations given a word problem: {row['level1']}
      2. Being able to solve systems of equations: {row['level2']}

    Now the student is asked to work on the following problem on a test: {test_problem}
    {instruction}
    '''

    user_message = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt,}],
        model="gemma2-9b-it", #"llama3-8b-8192" #"llama-3.2-1b-preview", #llama3-70b-8192 ", # The language model which will generate the completion.
        temperature=0, # Controls randomness: lowering results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive.
        max_tokens=1024, # The maximum number of tokens to generate. Requests can use up to 2048 tokens shared between prompt and completion.
        top_p=1, # Controls diversity via nucleus sampling: 0.5 means half of all likelihood-weighted options are considered.
        stop=None,
        stream=False, # If set, partial message deltas will be sent.
        )

    decoded_output = user_message.choices[0].message.content

    # Log the exchange as text: the prompt is the user turn, the reply the assistant turn.
    messages.append({"role": "user", "content": prompt})

    messages.append({"role": "assistant", "content": decoded_output})

    # The first "[...]" in the reply is taken as the probability estimate.
    score = re.findall(r'\[.*?\]', decoded_output)
    if score:
      test_scores.append(score[0])

  # Average once, after all problems; NaN marks a persona with no bracketed answer at all
  # instead of raising NameError or reusing the previous persona's average.
  if test_scores:
    avg_score = sum(try_convert_to_float(s) for s in test_scores) / len(test_scores)
  else:
    avg_score = float("nan")
  print(f"{decoded_output}")
  print("---------------------------------------------")
  print(f"Persona {i} got pre-test score = {avg_score}")
  personas.loc[i, 'pretest_score'] = round(avg_score, 2)

Here's a breakdown of why the student is likely to solve this problem correctly:

* **Understanding the Problem:** The problem is a straightforward word problem involving a simple relationship between the sugar amounts in two recipes.  It doesn't require complex algebraic manipulation.

* **Skill Level:**
    * The student has a good understanding of how to set up systems of equations (level 2), although they might not need to formally set one up for this problem.
    * Their strong ability to solve systems of equations (level 4) suggests they can handle the arithmetic involved in finding the difference in sugar amounts.

**Reasoning:**

This problem primarily tests the student's ability to translate words into mathematical operations.  They need to understand that "270 more grams" means adding 270 to the brownie recipe's sugar amount.  

**Probability:**

Given the student's skill level and the nature of the problem, the probability of them solving it correctly is high. I'd estimate i

In [None]:
# Display the persona table, now including the 'pretest_score' column written by the loop above.
personas

Unnamed: 0,level1,level2,total_expertise,expertise,expertise_reversal,variability,pretest_score,var_post_test,ere_post_test
0,2,4,6,low-expertise,low-expertise/worked example,low-variability/practice,66.88,47.86,55.0
1,4,3,7,high-expertise,high-expertise/practice,high-variability/worked example,60.62,45.5,58.75
2,5,5,10,high-expertise,high-expertise/practice,high-variability/practice,92.5,46.54,65.0
3,5,4,9,high-expertise,high-expertise/practice,high-variability/worked example,86.25,49.06,61.25
4,5,4,9,high-expertise,high-expertise/practice,high-variability/practice,86.25,50.26,57.5
5,3,5,8,high-expertise,high-expertise/worked example,high-variability/practice,57.5,48.86,58.75
6,1,3,4,low-expertise,low-expertise/practice,low-variability/practice,38.75,45.4,20.0
7,4,1,5,low-expertise,low-expertise/practice,low-variability/practice,13.12,43.04,20.0
8,1,1,2,low-expertise,low-expertise/worked example,low-variability/practice,15.0,39.52,20.0
9,4,1,5,low-expertise,low-expertise/worked example,low-variability/practice,13.12,39.56,40.0


### Post-Test

1.  divide the students in 2 groups
  *   high-expertise students (upper-median group)
  *   low-expertise students (lower-median group)
2.   randomly assign half of the students to the practice condition and the other half to the worked-example condition (4 experimental groups)
  *   low-expertise/practice
  *   low-expertise/worked example
  *   high-expertise/practice
  *   high-expertise/worked example

In [None]:
# Expertise-reversal post-test: each persona first "receives" either a practice worksheet or a
# worked-example worksheet (chosen by its 'expertise_reversal' condition), then the model estimates
# the probability of solving every problem in problems_ex1. The mean is stored as 'ere_post_test'.
messages = []

#personas = df_persona

for i, row in personas.iterrows():
  test_scores = []
  for test_problem in problems_ex1:
    if row['expertise_reversal'] in ["low-expertise/practice", "high-expertise/practice"]:
      prompt = f'''
      Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
        1. Being able to set up systems of equations given a word problem: {row['level1']}
        2. Being able to solve systems of equations: {row['level2']}

      Here's the instruction that the student receives. The student is asked to work on the following problems on their own:
        1. A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake?
        2. The size of a compressed file is 1.74 MiB, while the size of the original uncompressed file is 5.5 times greater. What is the size of the uncompressed file, in MiB?
        3. Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age.
        4. When Jenna spent 10 minutes on the elliptical trainer and then did circuit training for 20 minutes, her fitness app says she burned 278 calories. When she spent 20 minutes on the elliptical trainer and 30 minutes circuit training she burned 473 calories. How many calories does she burn for each minute on the elliptical trainer?
        5. Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn?
        6. Pam is 3 years older than her sister, Jan. The sum of their ages is 99. Find Pam's age.
        7. Lynn paid a total of $2,780 for 261 tickets to the theater. Student tickets cost $10 and adult tickets cost $15. How many student tickets did Lynn buy?
        8. Yumi wants to make 12 cups of party mix using candies and nuts. Her budget requires the party mix to cost her $1.29 per cup. The candies are $2.49 per cup and the nuts are $0.69 per cup. How many cups of candies should she use?

      Now the student is asked to work on the following problem on a test: {test_problem}
      {instruction}
      '''
    else:
      prompt = f'''
      Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
        1. Being able to set up systems of equations given a word problem: {row['level1']}
        2. Being able to solve systems of equations: {row['level2']}

      Here's the instruction that the student receives. The student is asked to study a problem and its solution. Here's the problem:
        A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake? Here's its solution:
          Step 1: Identify the amount of sugar needed for the brownie recipe, which is 350 grams.
          Step 2: Understand that the pound cake recipe requires 270 more grams of sugar than the brownie recipe.
          Step 3: Add the additional 270 grams of sugar to the 350 grams required for the brownie recipe.
          Step 4: The total amount of sugar needed for the pound cake recipe is 350 grams + 270 grams = 620 grams.

      The student is then asked to work on the following problem on their own:
        The size of a compressed file is 1.74 MiB, while the size of the original uncompressed file is 5.5 times greater. What is the size of the uncompressed file, in MiB?

      The student is then asked to study another problem and its solution. Here's the problem:
        Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age. Here's its solution:
          Step 1: Let's denote Devon's age as D and Cooper's age as C.
          Step 2: According to the problem, we have two equations. The first one is D = C + 26 (since Devon is 26 years older than Cooper). The second one is D + C = 50 (since the sum of their ages is 50).
          Step 3: We can substitute the first equation into the second one. So, instead of D in the second equation, we can put C + 26. We get C + 26 + C = 50.
          Step 4: Simplify the equation. Combine like terms to get 2C + 26 = 50.
          Step 5: Subtract 26 from both sides of the equation to get 2C = 24.
          Step 6: Divide both sides of the equation by 2 to get C = 12. So, Cooper is 12 years old.
          Step 7: Substitute C = 12 into the first equation (D = C + 26) to get D = 12 + 26 = 38.
          So, Devon is 38 years old.

      The student is then asked to work on the following problem on their own:
        When Jenna spent 10 minutes on the elliptical trainer and then did circuit training for 20 minutes, her fitness app says she burned 278 calories. When she spent 20 minutes on the elliptical trainer and 30 minutes circuit training she burned 473 calories. How many calories does she burn for each minute on the elliptical trainer?

      The student is then asked to study another problem and its solution. Here's the problem:
        Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn? Here's its solution:
          Step 1: Let's denote the amount Jim earned as J.
          Step 2: According to the problem, Debbie earned $1,600 more than Jim, so we can denote the amount Debbie earned as J + $1,600.
          Step 3: We know that the total amount they earned together is $7,200. So, we can set up the following equation: J + (J + $1,600) = $7,200.
          Step 4: Simplify the equation by combining like terms: 2J + $1,600 = $7,200.
          Step 5: To solve for J, subtract $1,600 from both sides of the equation: 2J = $7,200 - $1,600 = $5,600.
          Step 6: Divide both sides of the equation by 2 to solve for J: J = $5,600 / 2 = $2,800.
          Step 7: Substitute J = $2,800 back into the equation from Step 2 to find out how much Debbie earned: Debbie earned J + $1,600 = $2,800 + $1,600 = $4,400.
          So, Debbie earned $4,400 last month.

      The student is then asked to work on the following problem on their own:
        Pam is 3 years older than her sister, Jan. The sum of their ages is 99. Find Pam's age.

      The student is then asked to study another problem and its solution. Here's the problem:
        Lynn paid a total of $2,780 for 261 tickets to the theater. Student tickets cost $10 and adult tickets cost $15. How many student tickets did Lynn buy? Here's its solution:
          Step 1: Let's denote the number of student tickets as S and the number of adult tickets as A.
          Step 2: We know that the total number of tickets is 261, so we can write the equation: S + A = 261.
          Step 3: We also know that the total cost of the tickets is $2,780, with student tickets costing $10 and adult tickets costing $15. This gives us the equation: 10S + 15A = 2780.
          Step 4: To solve these two equations, we can first multiply the first equation by 10 to make the coefficients of S the same in both equations: 10S + 10A = 2610.
          Step 5: Now, subtract the new first equation from the second equation: 10S + 15A - (10S + 10A) = 2780 - 2610. This simplifies to 5A = 170.
          Step 6: Divide both sides of the equation by 5 to solve for A: A = 170 / 5 = 34.
          Step 7: Substitute A = 34 into the first equation: S + 34 = 261. Solve for S: S = 261 - 34 = 227.
          So, Lynn bought 227 student tickets.

      The student is then asked to work on the following problem on their own:
        Yumi wants to make 12 cups of party mix using candies and nuts. Her budget requires the party mix to cost her $1.29 per cup. The candies are $2.49 per cup and the nuts are $0.69 per cup. How many cups of candies should she use?

      Now the student is asked to work on the following problem on a test: {test_problem}
      {instruction}
      '''

    user_message = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt,}],
        model="gemma2-9b-it",#"llama3-8b-8192", #"llama-3.2-3b-preview",#"llama-guard-3-8b", #"llama-3.1-8b-instant",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
        )

    decoded_output = user_message.choices[0].message.content

    # Log the exchange as text: the prompt is the user turn, the reply the assistant turn.
    messages.append({"role": "user", "content": prompt})

    messages.append({"role": "assistant", "content": decoded_output})

    # The first "[...]" in the reply is taken as the probability estimate.
    score = re.findall(r'\[.*?\]', decoded_output)
    if score:
      test_scores.append(score[0])
    print(decoded_output)

  # Average once, after all problems; NaN marks a persona with no bracketed answer at all
  # instead of raising NameError or reusing the previous persona's average.
  if test_scores:
    avg_score = sum(try_convert_to_float(s) for s in test_scores) / len(test_scores)
  else:
    avg_score = float("nan")
  print("----------------------------------------------")
  print(f"Persona {i} got post-test score = {avg_score}")
  personas.loc[i, 'ere_post_test'] = round(avg_score,2)

Here's a breakdown of the student's likely success rate and the reasoning behind it:

**Strengths:**

* **Solving Systems:** The student demonstrates a good ability to solve systems of equations (skill level 4). This is crucial for the bread recipe problem.

**Weaknesses:**

* **Setting Up Systems:** The student struggles a bit with setting up systems of equations from word problems (skill level 2). This could be a hurdle in translating the bread recipe problem into a solvable format.

**Problem Analysis:**

The bread recipe problem requires:

1. **Defining Variables:**  Clearly representing the unknown volumes of each flour type.
2. **Formulating Equations:**  Creating equations based on the given relationships between the flour types and the total volume.
3. **Solving the System:**  Using the student's strong skill in this area to find the volume of bleached flour.

**Probability Assessment:**

While the student is strong at solving systems, their weaker skill in setting them up intr

In [None]:
# First 20 personas, including the 'ere_post_test' scores written by the post-test loop.
personas.head(20)

Unnamed: 0,level1,level2,total_expertise,expertise,expertise_reversal,variability,pretest_score,var_post_test,ere_post_test
0,2,4,6,low-expertise,low-expertise/worked example,low-variability/practice,66.88,57.86,55.0
1,4,3,7,high-expertise,high-expertise/practice,high-variability/worked example,60.62,51.0,58.75
2,5,5,10,high-expertise,high-expertise/practice,high-variability/practice,92.5,56.92,65.0
3,5,4,9,high-expertise,high-expertise/practice,high-variability/worked example,86.25,61.25,61.25
4,5,4,9,high-expertise,high-expertise/practice,high-variability/practice,86.25,63.16,57.5
5,3,5,8,high-expertise,high-expertise/worked example,high-variability/practice,57.5,64.32,58.75
6,1,3,4,low-expertise,low-expertise/practice,low-variability/practice,38.75,59.6,20.0
7,4,1,5,low-expertise,low-expertise/practice,low-variability/practice,13.12,54.38,20.0
8,1,1,2,low-expertise,low-expertise/worked example,low-variability/practice,15.0,49.92,20.0
9,4,1,5,low-expertise,low-expertise/worked example,low-variability/practice,13.12,46.47,40.0


In [None]:
# Summary statistics for the numeric columns (skill levels and test scores).
personas.describe()

Unnamed: 0,level1,level2,total_expertise,pretest_score,var_post_test,ere_post_test
count,40.0,40.0,40.0,40.0,40.0,40.0
mean,3.45,3.075,6.525,47.5185,47.84325,45.204
std,1.431334,1.456858,1.853444,25.426088,5.885669,23.128384
min,1.0,1.0,2.0,13.12,43.09,12.5
25%,2.0,2.0,5.0,22.86,44.5225,29.375
50%,4.0,3.0,6.5,42.19,45.2,40.94
75%,5.0,4.25,8.0,63.13,47.265,58.75
max,5.0,5.0,10.0,92.5,64.32,93.75


## Experiment 2: Variability

### Post-Test

Randomly divide these student personas into 4 experimental groups of equal size:
1. low-variability/practice
2. low-variability/worked example.
3. high-variability/practice.
4. high-variability/worked example.

In [131]:
messages = []

for i, row in personas.iterrows():
  test_scores = []
  for problem in range(len(problems_ex2)):
    test_problem = problems_ex2[problem]
    if (row['variability'] == "high-variability/practice"):
        # A prompt for the high-variability/practice group group in simulated expert evaluations replicating the Variability Effect. Note that problems with even numbers are different than problems with odd numbers in both formats and values.
      prompt = f'''
        Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
          1. Being able to set up systems of equations given a word problem: {row['level1']}
          2. Being able to solve systems of equations: {row['level2']}

        Here's the instruction that the student receives. The student is asked to work on the following problems on their own:
          1. A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake?
          2. The size of a compressed file is 1.74 MiB, while the size of the original uncompressed file is 5.5 times greater. What is the size of the uncompressed file, in MiB?
          3. Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age.
          4. When Jenna spent 10 minutes on the elliptical trainer and then did circuit training for 20 minutes, her fitness app says she burned 278 calories. When she spent 20 minutes on the elliptical trainer and 30 minutes circuit training she burned 473 calories. How many calories does she burn for each minute on the elliptical trainer?
          5. Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn?
          6. Lynn paid a total of $2,780 for 261 tickets to the theater. Student tickets cost $10 and adult tickets cost $15. How many student tickets did Lynn buy?

        Now the student is asked to work on the following problem on a test: {test_problem}
        {instruction}
      '''

    elif (row['variability'] == "low-variability/practice"):
        # A prompt for the low-variability/practice group in simulated expert evaluations replicating the Variability Effect. Note that problems with even numbers have the same format as problems with odd numbers.
      prompt = f'''
        Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
          1. Being able to set up systems of equations given a word problem: {row['level1']}
          2. Being able to solve systems of equations: {row['level2']}

        Here's the instruction that the student receives. The student is asked to work on the following problems on their own:
          1. A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake?
          2. A cookie recipe is asking for 400 grams of sugar, and a brownie recipe requires 70 more grams of sugar than a cookie recipe. How many grams of sugar are needed for the brownie?
          3. Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age.
          4. Pam is 3 years older than her sister, Jan. The sum of their ages is 99. Find Pam's age.
          5. Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn?
          6. A married couple together earn $75,000. The husband earns $15,000 more than five times what his wife earns. What does the wife earn?

        Now the student is asked to work on the following problem on a test: {test_problem}
        {instruction}
        '''

    elif (row['variability'] == "high-variability/work example"):
        # A prompt for the high-variability/worked-example group in simulated expert evaluations replicating the Variability Effect. Note that problems with even numbers are different than problems with odd numbers in both formats and values. All problems are presented with their step-by-step solutions.
      prompt = f'''
        Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
          1. Being able to set up systems of equations given a word problem: {row['level1']}
          2. Being able to solve systems of equations: {row['level2']}

        Here's the instruction that the student receives. The student is asked to study a problem and its solution. Here's the problem:
        A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake? Here's its solution:
          Step 1: Identify the amount of sugar needed for the brownie recipe, which is 350 grams.
          Step 2: Understand that the pound cake recipe requires 270 more grams of sugar than the brownie recipe.
          Step 3: Add the additional 270 grams of sugar to the 350 grams required for the brownie recipe.
          Step 4: The total amount of sugar needed for the pound cake recipe is 350 grams + 270 grams = 620 grams.

        The student is then asked to study another problem and its solution. Here's the problem:
        The size of a compressed file is 1.74 MiB, while the size of the original uncompressed file is 5.5 times greater. What is the size of the uncompressed file, in MiB? Here's its solution:
          Step 1: Identify the size of the compressed file, which is 1.74 MiB.
          Step 2: Identify the ratio of the uncompressed file size to the compressed file size, which is 5.5.
          Step 3: Multiply the size of the compressed file by the ratio to find the size of the uncompressed file.
          So, 1.74 MiB * 5.5 = 9.57 MiB.
          Therefore, the size of the uncompressed file is 9.57 MiB.

        The student is then asked to study another problem and its solution. Here's the problem:
        Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age. Here's its solution:
          Step 1: Let's denote Devon's age as D and Cooper's age as C.
          Step 2: According to the problem, we have two equations. The first one is D = C + 26 (since Devon is 26 years older than Cooper). The second one is D + C = 50 (since the sum of their ages is 50).
          Step 3: We can substitute the first equation into the second one. So, instead of D in the second equation, we can put C + 26. We get C + 26 + C = 50.
          Step 4: Simplify the equation. Combine like terms to get 2C + 26 = 50.
          Step 5: Subtract 26 from both sides of the equation to get 2C = 24.
          Step 6: Divide both sides of the equation by 2 to get C = 12. So, Cooper is 12 years old.
          Step 7: Substitute C = 12 into the first equation (D = C + 26) to get D = 12 + 26 = 38.
          So, Devon is 38 years old.

        The student is then asked to study another problem and its solution. Here's the problem:
        When Jenna spent 10 minutes on the elliptical trainer and then did circuit training for 20 minutes, her fitness app says she burned 278 calories. When she spent 20 minutes on the elliptical trainer and 30 minutes circuit training she burned 473 calories. How many calories does she burn for each minute on the elliptical trainer?
        Here's its solution:
          This problem can be solved using a system of linear equations. Let's denote the number of calories burned per minute on the elliptical trainer as E and the number of calories burned per minute during circuit training as C.
          From the problem, we can form two equations:
          10E + 20C = 278 (1) (from the first scenario)
          20E + 30C = 473 (2) (from the second scenario)
          To solve this system, we can multiply the first equation by 2 to make the coefficients of E in both equations the same:
          20E + 40C = 556 (3)
          Now we can subtract equation (1) from equation (3):
          20E + 30C - (20E + 40C) = 473 - 556
          This simplifies to:
          -10C = -83
          Dividing both sides by -10 gives:
          C = 8.3 calories/minute
          Substitute C = 8.3 into equation (1):
          10E + 20*8.3 = 278
          10E + 166 = 278
          Subtract 166 from both sides:
          10E = 112
          Divide both sides by 10:
          E = 11.2 calories/minute
          So, Jenna burns 11.2 calories for each minute on the elliptical trainer.

        The student is then asked to study another problem and its solution. Here's the problem:
        Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn? Here's its solution:
          Step 1: Let's denote the amount Jim earned as J.
          Step 2: According to the problem, Debbie earned $1,600 more than Jim, so we can denote the amount Debbie earned as J + $1,600.
          Step 3: We know that the total amount they earned together is $7,200. So, we can set up the following equation: J + (J + $1,600) = $7,200.
          Step 4: Simplify the equation by combining like terms: 2J + $1,600 = $7,200.
          Step 5: To solve for J, subtract $1,600 from both sides of the equation: 2J = $7,200 - $1,600 = $5,600.
          Step 6: Divide both sides of the equation by 2 to solve for J: J = $5,600 / 2 = $2,800.
          Step 7: Substitute J = $2,800 back into the equation from Step 2 to find out how much Debbie earned: Debbie earned J + $1,600 = $2,800 + $1,600 = $4,400.
          So, Debbie earned $4,400 last month.

        The student is then asked to study another problem and its solution. Here's the problem:
        Lynn paid a total of $2,780 for 261 tickets to the theater. Student tickets cost $10 and adult tickets cost $15. How many student tickets did Lynn buy? Here's its solution:
          Step 1: Let's denote the number of student tickets as S and the number of adult tickets as A.
          Step 2: We know that the total number of tickets is 261, so we can write the equation: S + A = 261.
          Step 3: We also know that the total cost of the tickets is $2,780, with student tickets costing $10 and adult tickets costing $15. This gives us the equation: 10S + 15A = 2780.
          Step 4: To solve these two equations, we can first multiply the first equation by 10 to make the coefficients of S the same in both equations: 10S + 10A = 2610.
          Step 5: Now, subtract the new first equation from the second equation: 10S + 15A - (10S + 10A) = 2780 - 2610. This simplifies to 5A = 170.
          Step 6: Divide both sides of the equation by 5 to solve for A: A = 170 / 5 = 34.
          Step 7: Substitute A = 34 into the first equation: S + 34 = 261. Solve for S: S = 261 - 34 = 227.
          So, Lynn bought 227 student tickets.

        Now the student is asked to work on the following problem on a test: {test_problem}
        {instruction}
        '''

    else:
        # A prompt for the low-variability/worked-example group in simulated expert evaluations replicating the Variability Effect. Note that problems with even numbers have the same format as problems with odd numbers. All problems are presented with their step-by-step solutions.
      prompt = f'''
        Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
            1. Being able to set up systems of equations given a word problem: {row['level1']}
            2. Being able to solve systems of equations: {row['level2']}

        Here's the instruction that the student receives. The student is asked to study a problem and its solution. Here's the problem:
        A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake? Here's its solution:
          Step 1: Identify the amount of sugar needed for the brownie recipe, which is 350 grams.
          Step 2: Understand that the pound cake recipe requires 270 more grams of sugar than the brownie recipe.
          Step 3: Add the additional 270 grams of sugar to the 350 grams required for the brownie recipe.
          Step 4: The total amount of sugar needed for the pound cake recipe is 350 grams + 270 grams = 620 grams.

        The student is then asked to study another problem and its solution. Here's the problem:
        A cookie recipe is asking for 400 grams of sugar, and a brownie recipe requires 70 more grams of sugar than a cookie recipe. How many grams of sugar are needed for the brownie? Here's its solution:
          Step 1: Identify the amount of sugar needed for the cookie recipe, which is 400 grams.
          Step 2: Understand that the brownie recipe needs 70 grams more sugar than the cookie recipe.
          Step 3: Add the extra 70 grams of sugar to the 400 grams needed for the cookie recipe.
          Step 4: The total amount of sugar needed for the brownie recipe is 400 grams + 70 grams = 470 grams.

        The student is then asked to study another problem and its solution. Here's the problem:
        Devon is 26 years older than his son Cooper. The sum of their ages is 50. Find Devon's age. Here's its solution:
          Step 1: Let's denote Devon's age as D and Cooper's age as C.
          Step 2: According to the problem, we have two equations. The first one is D = C + 26 (since Devon is 26 years older than Cooper). The second one is D + C = 50 (since the sum of their ages is 50).
          Step 3: We can substitute the first equation into the second one. So, instead of D in the second equation, we can put C + 26. We get C + 26 + C = 50.
          Step 4: Simplify the equation. Combine like terms to get 2C + 26 = 50.
          Step 5: Subtract 26 from both sides of the equation to get 2C = 24.
          Step 6: Divide both sides of the equation by 2 to get C = 12. So, Cooper is 12 years old.
          Step 7: Substitute C = 12 into the first equation (D = C + 26) to get D = 12 + 26 = 38.
          So, Devon is 38 years old.

        The student is then asked to study another problem and its solution. Here's the problem:
        Pam is 3 years older than her sister, Jan. The sum of their ages is 99. Find Pam's age. Here's its solution:
          Step 1: Let's denote Jan's age as x.
          Step 2: Since Pam is 3 years older than Jan, we can denote Pam's age as x + 3.
          Step 3: According to the problem, the sum of their ages is 99. So, we can write the equation: x + (x + 3) = 99.
          Step 4: Simplify the equation by combining like terms: 2x + 3 = 99.
          Step 5: Subtract 3 from both sides of the equation to isolate the term with x: 2x = 99 - 3, which simplifies to 2x = 96.
          Step 6: Divide both sides of the equation by 2 to solve for x: x = 96 / 2, which simplifies to x = 48.
          So, Jan is 48 years old.
          Step 7: To find Pam's age, add 3 to Jan's age: 48 + 3 = 51.
          So, Pam is 51 years old.

        The student is then asked to study another problem and its solution. Here's the problem:
        Last month Jim and Debbie earned $7,200. Debbie earned $1,600 more than Jim earned. How much did Debbie earn? Here's its solution:
          Step 1: Let's denote the amount Jim earned as J.
          Step 2: According to the problem, Debbie earned $1,600 more than Jim, so we can denote the amount Debbie earned as J + $1,600.
          Step 3: We know that the total amount they earned together is $7,200. So, we can set up the following equation: J + (J + $1,600) = $7,200.
          Step 4: Simplify the equation by combining like terms: 2J + $1,600 = $7,200.
          Step 5: To solve for J, subtract $1,600 from both sides of the equation: 2J = $7,200 - $1,600 = $5,600.
          Step 6: Divide both sides of the equation by 2 to solve for J: J = $5,600 / 2 = $2,800.
          Step 7: Substitute J = $2,800 back into the equation from Step 2 to find out how much Debbie earned: Debbie earned J + $1,600 = $2,800 + $1,600 = $4,400.
          So, Debbie earned $4,400 last month.

        The student is then asked to study another problem and its solution. Here's the problem:
        A married couple together earn $75,000. The husband earns $15,000 more than five times what his wife earns. What does the wife earn? Here's its solution:
          Step 1: Let's denote the wife's earnings as W.
          Step 2: According to the problem, the husband earns $15,000 more than five times what his wife earns. So, we can express the husband's earnings as 5W + $15,000.
          Step 3: The problem also states that the couple together earn $75,000. So, we can set up the following equation to represent the total earnings of the couple: W + (5W + $15,000) = $75,000.
          Step 4: Simplify the equation by combining like terms: 6W + $15,000 = $75,000.
          Step 5: Subtract $15,000 from both sides of the equation to isolate the term with W: 6W = $60,000.
          Step 6: Divide both sides of the equation by 6 to solve for W: W = $10,000.
          So, the wife earns $10,000.

      Now the student is asked to work on the following problem on a test: {test_problem}
      {instruction}
      '''

    user_message = [{"role": "user", "content": prompt}]

    user_message = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt,}],
        model="gemma2-9b-it",#"llama3-8b-8192", #"llama3-groq-70b-8192-tool-use-preview",#"llama-3.2-1b-preview",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
        )

    decoded_output = user_message.choices[0].message.content

    messages.append({"role": "user", "content": user_message})

    messages.append({"role": "assistant", "content": decoded_output})

    score = re.findall(r'\[.*?\]', decoded_output)
    if score:
      test_scores.append(score[0])
    #print(test_scores)

  sum_score = 0
  for s in test_scores:
    sum_score += try_convert_to_float(s)
    #print(sum_score)
    avg_score = sum_score / len(test_scores)
  print(decoded_output)
  print("---------------------------------------------------------")
  print(f"Persona {i} got variability post-test score = {avg_score}")

  personas.loc[i, 'var_post_test'] = round(avg_score,2)

Here's a breakdown of why the student likely has a good chance of solving this problem:

* **Strong Solving Skills:** The student rates a 4 out of 5 in solving systems of equations. This indicates they have a solid grasp of the mechanics of solving for variables.
* **Relevant Problem Type:** The problem presented on the test is very similar to problem 5 on the worksheet.  Both involve finding one person's earnings when given the total earnings and a difference between the two earners. This suggests the student has encountered this type of word problem before.
* **Setting Up Equations:** While the student's rating in setting up systems of equations is lower (2 out of 5), the test problem doesn't require a complex setup.  It's a straightforward scenario that can likely be translated into a single equation.

**Probability:**

Given these factors, it's reasonable to estimate the student has a high probability of solving the problem correctly.  

**[85]** 


Let me know if you'd like to exp

In [133]:
# Save the scored personas table to CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): absolute, user-specific path — this cell will not run unchanged on another machine.
personas.to_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_11122024_output.csv", index=False)

In [132]:
# Display the personas table, including the post-test score columns.
personas

Unnamed: 0,level1,level2,total_expertise,expertise,expertise_reversal,variability,pretest_score,var_post_test,ere_post_test
0,2,4,6,low-expertise,low-expertise/worked example,low-variability/practice,48.25,52.5,55.0
1,4,3,7,high-expertise,high-expertise/practice,low-variability/worked example,57.75,55.0,58.75
2,5,5,10,high-expertise,high-expertise/practice,high-variability/practice,95.0,92.5,65.0
3,5,4,9,high-expertise,high-expertise/practice,high-variability/worked example,80.0,50.0,61.25
4,5,4,9,high-expertise,high-expertise/practice,low-variability/practice,71.25,70.0,57.5
5,3,5,8,high-expertise,high-expertise/worked example,low-variability/worked example,74.38,71.67,58.75
6,1,3,4,low-expertise,low-expertise/practice,high-variability/practice,10.88,33.33,20.0
7,4,1,5,low-expertise,low-expertise/practice,high-variability/worked example,20.0,10.0,20.0
8,1,1,2,low-expertise,low-expertise/worked example,low-variability/practice,10.62,8.33,20.0
9,4,1,5,low-expertise,low-expertise/worked example,low-variability/worked example,20.0,10.0,40.0


In [21]:
# Save the personas table to CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): absolute, user-specific path — this cell will not run unchanged on another machine.
personas.to_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_output.csv", index=False)

## Plot

### Expertise Reversal Effect

pre-test
post-test

In [8]:
# Load saved simulation results into `personas` for the plots and statistics below.
# The first (commented-out) line is the alternative results file; only the second file is actually loaded.
# NOTE(review): absolute, user-specific paths.
#personas = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_output.csv", delimiter=',')
personas = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_11122024_output.csv", delimiter=',')

In [9]:
# Mean of every numeric column (skill levels, pre-test and both post-tests)
# per expertise-reversal condition, rounded to two decimals.
personas.groupby('expertise_reversal').mean(numeric_only=True).round(2)

Unnamed: 0_level_0,level1,level2,total_expertise,pretest_score,var_post_test,ere_post_test
expertise_reversal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high-expertise/practice,4.5,3.9,8.4,67.53,55.25,54.34
high-expertise/worked example,4.1,3.6,7.7,55.81,48.0,48.5
low-expertise/practice,2.8,2.5,5.3,28.61,32.08,28.46
low-expertise/worked example,2.4,2.3,4.7,28.64,36.7,41.1


In [10]:
# Standard error of the mean of every numeric column
# per expertise-reversal condition, rounded to two decimals.
personas.groupby('expertise_reversal').sem(numeric_only=True).round(2)

Unnamed: 0_level_0,level1,level2,total_expertise,pretest_score,var_post_test,ere_post_test
expertise_reversal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high-expertise/practice,0.31,0.35,0.37,5.76,4.56,4.24
high-expertise/worked example,0.31,0.43,0.21,4.94,3.79,4.45
low-expertise/practice,0.49,0.48,0.26,3.74,4.73,3.05
low-expertise/worked example,0.34,0.42,0.4,3.85,4.8,3.64


In [12]:
# Data clean: make sure the three score columns are numeric.
columns_to_clean = ['pretest_score', 'ere_post_test', 'var_post_test']

for column in columns_to_clean:
    # View each value as text, trim any leading/trailing '[', '%' or ']'
    # characters, then parse; values that still are not numbers become NaN.
    personas[column] = pd.to_numeric(
        personas[column].astype(str).str.strip('[%%]'),
        errors='coerce',
    )

In [13]:
# One frame per expertise-reversal condition (expertise level x instruction type)
practice_high_expert = personas.loc[personas['expertise_reversal'] == 'high-expertise/practice']
worked_example_high_expert = personas.loc[personas['expertise_reversal'] == 'high-expertise/worked example']

practice_low_expert = personas.loc[personas['expertise_reversal'] == 'low-expertise/practice']
worked_example_low_expert = personas.loc[personas['expertise_reversal'] == 'low-expertise/worked example']

# Mean pre-test score of each condition
avg_pretest_practice_high_expert = practice_high_expert['pretest_score'].mean()
avg_pretest__example_high_expert = worked_example_high_expert['pretest_score'].mean()
avg_pretest_practice_low_expert = practice_low_expert['pretest_score'].mean()
avg_pretest__example_low_expert = worked_example_low_expert['pretest_score'].mean()

# Mean expertise-reversal post-test score of each condition
avg_posttest_practice_high_expert = practice_high_expert['ere_post_test'].mean()
avg_posttest_example_high_expert = worked_example_high_expert['ere_post_test'].mean()
avg_posttest_practice_low_expert = practice_low_expert['ere_post_test'].mean()
avg_posttest_example_low_expert = worked_example_low_expert['ere_post_test'].mean()

In [15]:
# High-expertise learners: mean pre-test vs. post-test score for each
# instruction type, with standard-error bars.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name="Practice",
        mode="lines",
        x=["Pre-test", "Post-test"],
        y=[avg_pretest_practice_high_expert, avg_posttest_practice_high_expert],
        marker={"size": 10, "color": "cyan"},
        error_y={
            "type": "data",  # bar lengths are taken directly from `array`
            "array": [
                practice_high_expert["pretest_score"].sem(),
                practice_high_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.add_trace(
    go.Scatter(
        name="Worked Example",
        mode="lines",
        x=["Pre-test", "Post-test"],
        y=[avg_pretest__example_high_expert, avg_posttest_example_high_expert],
        marker={"size": 10, "color": "orange", "symbol": "square"},
        error_y={
            "type": "data",  # bar lengths are taken directly from `array`
            "array": [
                worked_example_high_expert["pretest_score"].sem(),
                worked_example_high_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.update_layout(
    title="High-expertise learner",
    yaxis_title="Score",
    legend_title="Instruction",
    width=600,
    height=400,
)
fig.show()

In [16]:
# Low-expertise learners: mean pre-test vs. post-test score for each
# instruction type, with standard-error bars.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name="Practice",
        mode="lines",
        x=["Pre-test", "Post-test"],
        y=[avg_pretest_practice_low_expert, avg_posttest_practice_low_expert],
        marker={"size": 10, "color": "cyan"},
        error_y={
            "type": "data",
            "array": [
                practice_low_expert["pretest_score"].sem(),
                practice_low_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.add_trace(
    go.Scatter(
        name="Worked Example",
        mode="lines",
        x=["Pre-test", "Post-test"],
        y=[avg_pretest__example_low_expert, avg_posttest_example_low_expert],
        marker={"size": 10, "color": "orange", "symbol": "square"},
        error_y={
            "type": "data",
            "array": [
                worked_example_low_expert["pretest_score"].sem(),
                worked_example_low_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.update_layout(
    title="Low-expertise learner",
    yaxis_title="Score",
    legend_title="Instruction",
    width=600,
    height=400,
)
fig.show()

In [17]:
# Expertise Reversal Effect: mean post-test score of each instruction type
# for low- vs. high-expertise learners, with standard-error bars.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name="Practice",
        mode="markers",
        x=["Low-expertise learners", "High-expertise learners"],
        y=[avg_posttest_practice_low_expert, avg_posttest_practice_high_expert],
        marker={"size": 10, "color": "cyan"},
        error_y={
            "type": "data",
            "array": [
                practice_low_expert["ere_post_test"].sem(),
                practice_high_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.add_trace(
    go.Scatter(
        name="Worked Example",
        mode="markers",
        x=["Low-expertise learners", "High-expertise learners"],
        y=[avg_posttest_example_low_expert, avg_posttest_example_high_expert],
        marker={"size": 10, "color": "orange", "symbol": "square"},
        error_y={
            "type": "data",
            "array": [
                worked_example_low_expert["ere_post_test"].sem(),
                worked_example_high_expert["ere_post_test"].sem(),
            ],
            "visible": True,
        },
    )
)

fig.update_layout(
    title="Expertise Reversal Effect",
    yaxis_title="Posttest score",
    legend_title="Instruction",
    width=600,
    height=400,
)
fig.show()

### Variability

In [21]:
def graph_var(personas):
    """Plot the Variability Effect for one set of simulation results.

    Draws the mean `var_post_test` score of the practice and worked-example
    groups under low and high variability, with the standard error of the
    mean as error bars, and shows the figure.

    Parameters
    ----------
    personas : pandas.DataFrame
        Results table with a `variability` column (condition label such as
        'low-variability/practice') and a numeric `var_post_test` column.

    Returns
    -------
    None
        The figure is displayed via `fig.show()`.
    """
    def _post_test_scores(condition):
        # `var_post_test` scores of the personas assigned to one variability condition.
        return personas.loc[personas['variability'] == condition, 'var_post_test']

    practice_low_variability = _post_test_scores('low-variability/practice')
    practice_high_variability = _post_test_scores('high-variability/practice')
    worked_example_low_variability = _post_test_scores('low-variability/worked example')
    worked_example_high_variability = _post_test_scores('high-variability/worked example')

    fig = go.Figure(data=[
        go.Scatter(
            x=['Low-variability', 'High-variability'],
            y=[practice_low_variability.mean(),
               practice_high_variability.mean()],
            mode='markers',
            marker=dict(size=10, color='cyan'),
            name='Practice',
            error_y=dict(
                type='data',
                array=[practice_low_variability.sem(),
                       practice_high_variability.sem()],
                visible=True)
        ),
        go.Scatter(
            x=['Low-variability', 'High-variability'],
            y=[worked_example_low_variability.mean(),
               worked_example_high_variability.mean()],
            mode='markers',
            marker=dict(size=10, color='orange', symbol='square'),
            name='Worked Example',
            error_y=dict(
                type='data',
                # Error bars come from the same variability groups as the means.
                # The earlier version read a notebook-level expertise-reversal
                # frame here, which gave a wrong bar (or a NameError on a fresh kernel).
                array=[worked_example_low_variability.sem(),
                       worked_example_high_variability.sem()],
                visible=True,
                )
        )
    ])

    fig.update_layout(
        title='Variability Effect',
        yaxis_title='Posttest score',
        legend_title='Instruction',
        width=600,
        height=400
    )
    fig.show()

In [23]:
# llm experiments results
# Load the saved result CSVs for the two models (file names indicate gemma2_9b_it and llama3_8b_8192).
# NOTE(review): absolute, user-specific paths.
gemma_result = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_11122024_output.csv", delimiter=',')
llama_result = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_11122024_output.csv", delimiter=',')

In [26]:
# Variability Effect plot for the Llama results
graph_var(llama_result)

In [27]:
# Variability Effect plot for the Gemma results
graph_var(gemma_result)

## 2 Sample t-test

### Expertise Reversal Effect
Two-sample t-tests on pre-test and post-test scores.
Low-expertise and high-expertise learners: worked-example vs. practice group.

#### High-expertise Learners: Worked example VS practice

In [101]:
# Reload the per-model result CSVs.
# NOTE(review): these file names differ from the ones loaded for the variability plots
# (no date suffix / "_40"), and the t-tests in this section read the practice_* /
# worked_example_* frames derived from `personas`, not these two frames — confirm which
# data the recorded statistics refer to.
gemma_result = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_output.csv", delimiter=',')
llama_result = pd.read_csv("/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_40_output.csv", delimiter=',')

In [57]:
# Pre-test scores, high-expertise learners: practice vs. worked-example group.
# equal_var=False selects Welch's t-test (no equal-variance assumption).
stats.ttest_ind(practice_high_expert['pretest_score'], worked_example_high_expert['pretest_score'], equal_var=False)

TtestResult(statistic=np.float64(1.5431552885017614), pvalue=np.float64(0.14058845981840987), df=np.float64(17.59378954459061))

In [77]:
# Pre-test scores, high-expertise learners: practice vs. worked-example group (Welch's t-test).
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
stats.ttest_ind(practice_high_expert['pretest_score'], worked_example_high_expert['pretest_score'], equal_var=False)

TtestResult(statistic=np.float64(1.7539816317865102), pvalue=np.float64(0.09745378140267887), df=np.float64(16.98870311571616))

In [58]:
# Post-test scores (ere_post_test), high-expertise learners: practice vs. worked-example group.
# Uses the default equal_var=True (pooled-variance t-test), unlike the pre-test comparison above.
stats.ttest_ind(practice_high_expert['ere_post_test'], worked_example_high_expert['ere_post_test'])

TtestResult(statistic=np.float64(0.9498445864977483), pvalue=np.float64(0.3547713034238801), df=np.float64(18.0))

In [None]:
# Post-test scores (ere_post_test), high-expertise learners: practice vs. worked-example group.
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
stats.ttest_ind(practice_high_expert['ere_post_test'], worked_example_high_expert['ere_post_test'])

TtestResult(statistic=np.float64(2.468885038793123), pvalue=np.float64(0.023795606820084254), df=np.float64(18.0))

### Low-expertise Learners: Worked example VS practice

In [30]:
# Gain score per persona (expertise-reversal post-test minus pre-test) in each condition
delta_practice_low = practice_low_expert['ere_post_test'].sub(practice_low_expert['pretest_score'])
delta_we_low = worked_example_low_expert['ere_post_test'].sub(worked_example_low_expert['pretest_score'])
delta_practice_high = practice_high_expert['ere_post_test'].sub(practice_high_expert['pretest_score'])
delta_we_high = worked_example_high_expert['ere_post_test'].sub(worked_example_high_expert['pretest_score'])

# Low-expertise learners: gain of the worked-example group vs. the practice group
stats.ttest_ind(delta_we_low, delta_practice_low)


TtestResult(statistic=np.float64(3.0523419263137206), pvalue=np.float64(0.0068573256824547655), df=np.float64(18.0))

In [18]:
# High-expertise learners: pre-to-post gain (post-test minus pre-test) of the
# worked-example group vs. the practice group.
stats.ttest_ind(delta_we_high, delta_practice_high)

TtestResult(statistic=np.float64(1.009336524898735), pvalue=np.float64(0.32618976379028974), df=np.float64(18.0))

In [46]:
# Pre-test scores, low-expertise learners: worked-example vs. practice group.
stats.ttest_ind(worked_example_low_expert['pretest_score'], practice_low_expert['pretest_score'])

TtestResult(statistic=np.float64(0.00465760607258286), pvalue=np.float64(0.9963350134768982), df=np.float64(18.0))

In [81]:
# Pre-test scores, low-expertise learners: worked-example vs. practice group.
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
stats.ttest_ind(worked_example_low_expert['pretest_score'], practice_low_expert['pretest_score'])

TtestResult(statistic=np.float64(-0.35756425859907576), pvalue=np.float64(0.724827686447484), df=np.float64(18.0))

In [52]:
# Post-test scores (ere_post_test), low-expertise learners: worked-example vs. practice group.
stats.ttest_ind(worked_example_low_expert['ere_post_test'], practice_low_expert['ere_post_test'])

TtestResult(statistic=np.float64(2.6587696655411928), pvalue=np.float64(0.015989646249249766), df=np.float64(18.0))

In [82]:
# Post-test scores (ere_post_test), low-expertise learners: worked-example vs. practice group.
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
stats.ttest_ind(worked_example_low_expert['ere_post_test'], practice_low_expert['ere_post_test'])

TtestResult(statistic=np.float64(-0.17200192431749325), pvalue=np.float64(0.8653550644924899), df=np.float64(18.0))

### Variability Effect
Two-sample t-tests on post-test scores (low-expertise and high-expertise learners):
worked-example vs. practice group.

In [59]:
# Variability post-test scores (var_post_test), low-expertise learners: worked-example vs. practice group.
# NOTE(review): these two frames are split by the 'expertise_reversal' assignment, not by the
# 'variability' column — confirm this is the intended grouping for the Variability Effect.
stats.ttest_ind(worked_example_low_expert['var_post_test'], practice_low_expert['var_post_test'])

TtestResult(statistic=np.float64(0.2912556721675631), pvalue=np.float64(0.7741854441342807), df=np.float64(18.0))

In [83]:
# Variability post-test scores (var_post_test), low-expertise learners: worked-example vs. practice group.
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
# NOTE(review): the frames are split by 'expertise_reversal', not by the 'variability' column — confirm the grouping.
stats.ttest_ind(worked_example_low_expert['var_post_test'], practice_low_expert['var_post_test'])

TtestResult(statistic=np.float64(-0.16597915339530048), pvalue=np.float64(0.8700234465011352), df=np.float64(18.0))

In [60]:
# Variability post-test scores (var_post_test), high-expertise learners: worked-example vs. practice group.
# NOTE(review): these two frames are split by the 'expertise_reversal' assignment, not by the
# 'variability' column — confirm this is the intended grouping for the Variability Effect.
stats.ttest_ind(worked_example_high_expert['var_post_test'], practice_high_expert['var_post_test'])

TtestResult(statistic=np.float64(-1.2075078349983608), pvalue=np.float64(0.24286449320886752), df=np.float64(18.0))

In [84]:
# Variability post-test scores (var_post_test), high-expertise learners: worked-example vs. practice group.
# Same statement as the previous cell; the recorded output differs, so it was presumably run on a different results file — confirm.
# NOTE(review): the frames are split by 'expertise_reversal', not by the 'variability' column — confirm the grouping.
stats.ttest_ind(worked_example_high_expert['var_post_test'], practice_high_expert['var_post_test'])

TtestResult(statistic=np.float64(-1.001759791998513), pvalue=np.float64(0.3297371417371765), df=np.float64(18.0))

## Optimization
TODO: 

In [None]:
# To revise.
# Module-level inputs read by utility(): the table of post-test problems and the
# persona text that opens the first prompt.
# `posttest_problems` is just another name for `test_problem` (no copy is made).
posttest_problems = test_problem
# NOTE(review): `row` is not assigned anywhere in this cell. Presumably it is left over
# from an earlier loop over the persona table (columns 'level1' / 'level2'); on a fresh
# kernel this f-string raises NameError unless `row` is defined first — confirm.
persona = f'''
        Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5):
            1. Being able to set up systems of equations given a word problem: {row['level1']}
            2. Being able to solve systems of equations: {row['level2']}
'''


In [None]:
# utility_string
def utility(worksheet: str):
    '''
    Evaluates the worksheet in terms of test performance. Returns final test score of the student.

    For every row of the notebook-level ``posttest_problems`` table, the model is asked
    (in one growing multi-turn conversation) how likely the student described by the
    notebook-level ``persona`` is to solve that row's ``'question'`` after receiving
    ``worksheet``. The persona and the worksheet are only included in the first turn.

    Args:
        worksheet: Text of the worksheet shown to the simulated student.

    Returns:
        The mean of all bracketed values (``[...]``) found in the model's replies,
        each converted with ``try_convert_to_float``.

    Raises:
        ValueError: If the replies contain no bracketed value (this also happens when
            ``posttest_problems`` is empty), instead of an opaque ZeroDivisionError.

    Notes:
        Relies on names defined elsewhere in the notebook: ``persona``,
        ``posttest_problems``, ``llama_model``, ``tokenizer`` and
        ``try_convert_to_float``.
        Assumes ``tokenizer`` / ``llama_model`` are a Hugging Face chat tokenizer and
        causal LM (the existing ``generate`` / ``decode`` calls have that shape) — TODO confirm.
    '''
    messages = []
    replies = []
    for j in range(len(posttest_problems)):
        # Named `question` so the notebook-level `test_problem` table is not shadowed.
        question = posttest_problems.iloc[j]['question']
        test_instruction = f'''Now the student is asked to work on the following problem on a test: {question} Given the student's initial skill levels and the worksheet the student has received, what's the probability that the student can solve the problem correctly? Explain your reasoning and give a single number between 0 and 100 in square brackets.'''
        if j == 0:
            user_message = f"""{persona}
              Here's a worksheet that the student receives:
              {worksheet}

              {test_instruction}
              """
        else:
            user_message = test_instruction
        messages.append({"role": "user", "content": user_message})

        # Tokenize the whole conversation so far. Previously `inputs` was never built
        # from `messages`, so the prompt assembled above did not reach the model.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(llama_model.device)

        # NOTE(review): in Hugging Face `generate`, `temperature` only takes effect when
        # sampling is enabled (do_sample=True); left unchanged here — confirm intent.
        outputs = llama_model.generate(**inputs, max_new_tokens=512, temperature=0.1)
        # Drop the prompt tokens so only the newly generated reply is decoded.
        output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)

        messages.append({"role": "assistant", "content": output})
        replies.append(output)

    # Only the model's replies are searched, so brackets that happen to occur in the
    # persona or the worksheet text cannot be mistaken for scores.
    scores = [match for reply in replies for match in re.findall(r'\[.*?\]', reply)]
    if not scores:
        raise ValueError("No bracketed score found in the model replies; cannot compute the utility.")

    avg_score = sum(try_convert_to_float(s) for s in scores) / len(scores)

    return avg_score

In [None]:
# Optimization prompt template: asks the model for a new instruction (here, a worksheet)
# given prior worksheets and the post-test scores they produced for a given student.
# The student's skill levels (1 and 1), the example worksheet and its score (20) are
# hard-coded in the text below.
# This is a plain string, not an f-string, so `{utility_string}` stays a literal
# placeholder until it is filled in explicitly (e.g. with str.format or str.replace).
# NOTE(review): no such substitution is visible in this section — confirm where it happens.
# The `\n` sequences in the solution line are real escapes and become newlines.
worksheet_prompt = '''
Here is an 8th-grade student with the following skill levels (each skill is rated on a scale from 1 to 5, where higher numbers indicate more proficiency):
    1. Being able to set up systems of equations given a word problem: 1
    2. Being able to solve systems of equations: 1

I have some worksheets along with the student's test scores after receiving the worksheets. The worksheets are arranged in ascending order based on their scores, where higher scores indicate better quality.

Worksheet:
You need to study a problem and its solution. Here's the problem:
A brownie recipe is asking for 350 grams of sugar, and a pound cake recipe requires 270 more grams of sugar than a brownie recipe. How many grams of sugar are needed for the pound cake? Here's its solution:
Step 1: Identify the amount of sugar needed for the brownie recipe, which is 350 grams.\nStep 2: Understand that the pound cake recipe requires 270 more grams of sugar than the brownie recipe.\nStep 3: Add the additional 270 grams of sugar to the 350 grams required for the brownie recipe.\nStep 4: The total amount of sugar needed for the pound cake recipe is 350 grams + 270 grams = 620 grams.
Test score:
20

Generate a new worksheet to further increase the test score of the student. You will be evaluated based on this score function:
```python
{utility_string}
```
The new worksheet should begin with <WORKSHEET> and end with </WORKSHEET>.
'''

In [None]:
# Calls utility() with the whole prompt template as the "worksheet" and stores the
# value it returns (an average score) under the name `utility_string`.
# NOTE(review): worksheet_prompt embeds `{utility_string}` inside a python code fence as
# "this score function", which suggests this name was meant to hold the source text of
# utility() rather than a score — confirm the intent before using it to fill the template.
utility_string = utility(worksheet_prompt)