In [3]:
import pandas as pd
import numpy as np
from scipy import stats

In [4]:
def tttest(personas):
    """Report Welch t-tests and Pearson correlations for the expertise-reversal data.

    The rows of ``personas`` are split into four groups by the exact label in
    the ``expertise_reversal`` column (high/low expertise crossed with
    practice/worked example). For each expertise level, the learning gain
    (``ere_post_test - pretest_score``) of the worked-example group is compared
    with that of the practice group using ``scipy.stats.ttest_ind`` with
    ``equal_var=False`` (Welch's t-test). Afterwards the Pearson correlation
    between ``pretest_score`` and ``ere_post_test`` is printed for the whole
    frame and for each of the four groups.

    Parameters
    ----------
    personas : pandas.DataFrame
        Must contain the columns ``expertise_reversal``, ``pretest_score`` and
        ``ere_post_test``.

    Returns
    -------
    None
        Every result is printed to stdout.

    Notes
    -----
    NOTE(review): ``stats.pearsonr`` is called on every group, so a group with
    fewer than two rows is expected to make it fail - confirm that all four
    labels are present in the input.
    """
    alpha = 0.05  # significance level shared by both t-tests

    def select_group(label):
        # Exact match on the condition label; rows with any other label are ignored.
        return personas[personas['expertise_reversal'] == label]

    def learning_gain(group):
        # Delta = post-test - pre-test
        return group['ere_post_test'] - group['pretest_score']

    def print_verdict(p_value):
        # A NaN p-value means the test is undefined (e.g. zero variance in both
        # samples, or missing scores). "nan < alpha" is False, so without this
        # guard it would be reported as "not significantly different".
        if np.isnan(p_value):
            print("Test inconclusive: the p-value is NaN, so no conclusion can be drawn.")
        elif p_value < alpha:
            print("Reject the null hypothesis: The means are significantly different.")
        else:
            print("Fail to reject the null hypothesis: The means are not significantly different.")

    # Expertise-reversal conditions
    practice_high_expert = select_group('high-expertise/practice')
    worked_example_high_expert = select_group('high-expertise/worked example')
    practice_low_expert = select_group('low-expertise/practice')
    worked_example_low_expert = select_group('low-expertise/worked example')

    delta_we_low = learning_gain(worked_example_low_expert)
    delta_practice_low = learning_gain(practice_low_expert)
    delta_we_high = learning_gain(worked_example_high_expert)
    delta_practice_high = learning_gain(practice_high_expert)

    # 1. Low-expertise learners: worked example VS practice
    t_stat, p_value = stats.ttest_ind(delta_we_low, delta_practice_low, equal_var=False)
    print(f"\nDelta mean low worked-example: {delta_we_low.mean()}")
    print(f"Delta mean low practice: {delta_practice_low.mean()}")

    print("\n<<<<< 1. Low-expertise Learners: worked-example VS practice >>>>>")
    print(f"T-statistic = {round(t_stat,2)}")
    print(f"P-value = {p_value}")
    print_verdict(p_value)

    # 2. High-expertise learners: worked example VS practice
    t_stat, p_value = stats.ttest_ind(delta_we_high, delta_practice_high, equal_var=False)
    print(f"\nDelta mean high worked-example: {delta_we_high.mean()}")
    print(f"Delta mean high practice: {delta_practice_high.mean()}")

    print("\n<<<<< 2. High-expertise Learners: worked example VS practice >>>>>")
    print(f"T-statistic = {round(t_stat,2)}")
    print(f"P-value = {round(p_value,2)}")
    print_verdict(p_value)

    # Pearson's correlation between pre-test and post-test, overall and per group
    print("\nPearson's correlation")
    correlation_sections = [
        ("Overall: Pre-Test VS Post-Test", personas),
        ("Low expert learners - Practice: Pre-Test VS Post-Test", practice_low_expert),
        ("Low expert learners - Worked-Example: Pre-Test VS Post-Test", worked_example_low_expert),
        ("High expert learners - Worked-Example: Pre-Test VS HE Post-Test", worked_example_high_expert),
        ("High expert learners - Practice: Pre-Test VS HE Post-Test", practice_high_expert),
    ]
    for title, group in correlation_sections:
        print(f"\n<<<<< {title} >>>>>")
        print(stats.pearsonr(group['pretest_score'], group['ere_post_test']))

In [5]:
# Raw LLM experiment outputs, one CSV per model.
# NOTE(review): these are absolute, machine-specific paths - the cell only runs
# where this directory layout exists.
gemma_result = pd.read_csv(
    "/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_output.csv",
    sep=',',
)
llama_result = pd.read_csv(
    "/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_40_output.csv",
    sep=',',
)

In [6]:
# Run the expertise-reversal t-tests and correlations on the Llama results.
print("Llama two sample ttest results")
tttest(llama_result)

Llama two sample ttest results

Delta mean low worked-example: 12.462
Delta mean low practice: -0.14900000000000002

<<<<< 1. Low-expertise Learners: worked-example VS practice >>>>>
T-statistic = 3.05
P-value = 0.007006705460378294
Reject the null hypothesis: The means are significantly different.

Delta mean high worked-example: -7.312
Delta mean high practice: -13.186000000000002

<<<<< 2. High-expertise Learners: worked example VS practice >>>>>
T-statistic = 1.01
P-value = 0.33
Fail to reject the null hypothesis: The means are not significantly different.

Pearson's correlation

<<<<< Overall: Pre-Test VS Post-Test >>>>>
PearsonRResult(statistic=np.float64(0.7616778755911491), pvalue=np.float64(1.144458248681121e-08))

<<<<< Low expert learners - Practice: Pre-Test VS Post-Test >>>>>
PearsonRResult(statistic=np.float64(0.5873525328458731), pvalue=np.float64(0.07420076971709387))

<<<<< Low expert learners - Worked-Example: Pre-Test VS Post-Test >>>>>
PearsonRResult(statistic=np.fl

In [34]:
# Run the expertise-reversal t-tests and correlations on the Gemma results.
print("Gemma two sample ttest results")
tttest(gemma_result)

Gemma two sample ttest results

Delta mean low worked-example: 2.1260000000000012
Delta mean low practice: -0.007999999999999119

<<<<< 1. Low-expertise Learners: worked-example VS practice >>>>>
T-statistic = 0.33
P-value = 0.7456025675081492
Fail to reject the null hypothesis: The means are not significantly different.

Delta mean high worked-example: -9.938999999999998
Delta mean high practice: -1.4370000000000012

<<<<< 2. High-expertise Learners: worked example VS practice >>>>>
T-statistic = -1.25
P-value = 0.23
Fail to reject the null hypothesis: The means are not significantly different.

Pearson's correlation

<<<<< Overall: Pre-Test VS Post-Test >>>>>
PearsonRResult(statistic=np.float64(0.8126006148571757), pvalue=np.float64(1.9240873162114128e-10))

<<<<< Low expert learners - Practice: Pre-Test VS Post-Test >>>>>
PearsonRResult(statistic=np.float64(0.597798628681474), pvalue=np.float64(0.06795818196627681))

<<<<< Low expert learners - Worked-Example: Pre-Test VS Post-Test >

In [35]:
def tttest_var(personas):
    """Report Welch t-tests for the variability conditions.

    The rows of ``personas`` are split into four groups by the exact label in
    the ``variability`` column (high/low variability crossed with
    practice/worked example), and three tests are run on ``var_post_test``
    using ``scipy.stats.ttest_ind`` with ``equal_var=False``:

    1. worked example: high vs low variability,
    2. practice: high vs low variability,
    3. "gain" (high minus low post-test score): worked example vs practice.

    Parameters
    ----------
    personas : pandas.DataFrame
        Must contain the columns ``variability`` and ``var_post_test``.

    Returns
    -------
    None
        Every result is printed to stdout.

    Notes
    -----
    The gain in test 3 subtracts the low-variability scores from the
    high-variability scores row by row after resetting both indexes, i.e. the
    two groups are paired purely by position. If the groups differ in size the
    unmatched positions become NaN and the test yields a NaN p-value, which is
    reported as inconclusive.
    NOTE(review): positional pairing presumably relies on both groups listing
    the same personas in the same order - confirm against the data.
    """
    alpha = 0.05  # significance level shared by all three t-tests

    def post_test_scores(label):
        # Exact match on the condition label; rows with any other label are ignored.
        return personas[personas['variability'] == label]['var_post_test']

    def print_verdict(p_value):
        # A NaN p-value means the test is undefined (e.g. unmatched rows in the
        # paired gain, or missing scores). "nan < alpha" is False, so without
        # this guard it would be reported as "not significantly different".
        if np.isnan(p_value):
            print("Test inconclusive: the p-value is NaN, so no conclusion can be drawn.")
        elif p_value < alpha:
            print("Reject the null hypothesis: The means are significantly different.")
        else:
            print("Fail to reject the null hypothesis: The means are not significantly different.")

    # Post-test scores per variability condition
    high_practice_scores = post_test_scores('high-variability/practice')
    high_we_scores = post_test_scores('high-variability/worked example')
    low_practice_scores = post_test_scores('low-variability/practice')
    low_we_scores = post_test_scores('low-variability/worked example')

    # Gain = high variability - low variability, paired by row position
    delta_we = high_we_scores.reset_index(drop=True) - low_we_scores.reset_index(drop=True)
    delta_practice = high_practice_scores.reset_index(drop=True) - low_practice_scores.reset_index(drop=True)

    # 1. Worked-example: High VS Low condition
    t_stat, p_value = stats.ttest_ind(high_we_scores, low_we_scores, equal_var=False)
    print("<<<<<< Worked-example condition: High VS Low >>>>>")
    print(f"T-statistic = {round(t_stat,2)}")
    print(f"P-value = {round(p_value,2)}")
    print_verdict(p_value)

    # 2. Practice: High VS Low condition
    t_stat, p_value = stats.ttest_ind(high_practice_scores, low_practice_scores, equal_var=False)
    print("\n<<<<<< Practice condition: High VS Low >>>>>")
    print(f"T-statistic: {round(t_stat,2)}")
    print(f"P-value: {round(p_value,2)}")
    print_verdict(p_value)

    # 3. Gain: Worked example VS practice
    t_stat, p_value = stats.ttest_ind(delta_we, delta_practice, equal_var=False)
    print(f"\ndelta mean worked example: {delta_we.mean()}")
    print(f"delta mean practice: {delta_practice.mean()}")

    print("\n<<<<< Gain: Worked example VS practice >>>>>")
    print(f"T-statistic: {round(t_stat,2)}")
    print(f"P-value: {p_value}")
    print_verdict(p_value)

In [36]:
# Results dated 11/12/2024, one CSV per model.
# NOTE(review): these are absolute, machine-specific paths - the cell only runs
# where this directory layout exists.
gemma = pd.read_csv(
    "/Users/ctivir/projects/ml/ed_simulator/unchk/gemma2_9b_it_11122024_output.csv",
    sep=',',
    index_col=False,
)
llama = pd.read_csv(
    "/Users/ctivir/projects/ml/ed_simulator/unchk/llama3_8b_8192_11122024_output.csv",
    sep=',',
    index_col=False,
)

In [37]:
# Run the variability t-tests on the Llama results.
print("Llama two sample ttest results")
tttest_var(llama)

Llama two sample ttest results
<<<<<< Worked-example condition: High VS Low >>>>>
T-statistic = 1.1
P-value = 0.29
Fail to reject the null hypothesis: The means are not significantly different.

<<<<<< Practice condition: High VS Low >>>>>
T-statistic: 0.79
P-value: 0.44
Fail to reject the null hypothesis: The means are not significantly different.

delta mean worked example: 6.997999999999999
delta mean practice: 6.634

<<<<< Gain: Worked example VS practice >>>>>
T-statistic: 0.04
P-value: 0.9699885834278519
Fail to reject the null hypothesis: The means are not significantly different.


In [38]:
# Run the variability t-tests on the Gemma results.
print("Gemma two sample ttest results")
tttest_var(gemma)

Gemma two sample ttest results
<<<<<< Worked-example condition: High VS Low >>>>>
T-statistic = 1.01
P-value = 0.33
Fail to reject the null hypothesis: The means are not significantly different.

<<<<<< Practice condition: High VS Low >>>>>
T-statistic: 0.95
P-value: 0.36
Fail to reject the null hypothesis: The means are not significantly different.

delta mean worked example: 10.248999999999999
delta mean practice: 9.748999999999999

<<<<< Gain: Worked example VS practice >>>>>
T-statistic: 0.03
P-value: 0.9737558398912209
Fail to reject the null hypothesis: The means are not significantly different.
