In [2]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np
import kaleido
import plotly.io as pio
pio.kaleido.scope.mathjax = None

In [27]:
df = pd.read_csv('../results/clip_analysis_20250425_121658.csv')

In [28]:
df['condition'] = np.where(df['word'].str.lower() == df['color'].str.lower(), 'congruent', 'incongruent')
df.loc[df['word'] == 'XXXX', 'condition'] = 'neutral'


In [47]:
class_names = df["class_name"].unique()
for class_set, class_set_df in df.groupby("class_set", ):
    fig = px.bar(
        class_set_df,
        x="probability",
        y="class_name",
        color="color",
        facet_col="word",
        text="word",
        color_discrete_map={"blue": "blue", "red": "red"},
        hover_data=df.columns.tolist(),
        labels={"probability": "Probability", "class_set": "Experiment Condition", 'class_name': 'Prompt'},
        barmode="group",
        category_orders={"class_name": class_names},
        title=f'Experiment "{class_set.replace('_', ' ').title()}"',
        orientation="h",
        #range_x=[0, 1],
    )
    # update x-range to be 0-1
    #fig.update_xaxes(range=[0, 1])
    fig.show()

# # remove facets = in the titles

# # the the x-axis does not need to be the same
# fig.update_xaxes(matches=None)
# for col in range(1, 4):
#     for row in range(2, 4):
#         fig.update_xaxes(matches=f"x{col}", row=row, col=col)  # x1 matches x2
# fig.update_layout(height=800, width=1200, title_text="CLIP Stroop Test")
# fig.show()
# import plotly.io as pio

# pio.kaleido.scope.mathjax = None
# fig.write_image("clip_stroop_test.pdf")

In [None]:
# anova
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm


In [60]:
import pingouin

In [72]:
# analyzing the wrod only and color only conditions:

def extended_anova(df, equation):
    model = ols(equation, data=df).fit()
    anova_table = anova_lm(model)
    # Calculate eta squared
    ss_total = anova_table['sum_sq'].sum()
    ss_error = anova_table.loc['Residual', 'sum_sq']
    anova_table['eta_sq'] = anova_table['sum_sq'] / ss_total
    anova_table['partial_eta_sq'] = anova_table['sum_sq'] / (anova_table['sum_sq'] + ss_error)
    anova_table['cohen_f'] = np.sqrt(anova_table['partial_eta_sq'] / (1 - anova_table['partial_eta_sq']))
    return anova_table
    



for class_set in ['word_only', 'color_only']:
    print(class_set)
    subset = df.query(f'(class_set == "{class_set}") & (word != "XXXX")').copy()
    subset.loc[:, 'response'] = np.where(subset['class_name'].str.contains('blue'), 'blue', 'red')
    anova_table = extended_anova(subset, 'probability ~  C(word) * C(response) + C(color)*C(response)')
    display(anova_table)



word_only


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),eta_sq,partial_eta_sq,cohen_f
C(word),1.0,1.729828e-15,1.729828e-15,7.230957e-10,0.999981,8.883044e-16,3.615478e-10,1.9e-05
C(response),1.0,1.18103e-05,1.18103e-05,4.936892,0.156385,6.064845e-06,0.7116865,1.571129
C(color),1.0,2.069306e-16,2.069306e-16,8.650027e-11,0.999993,1.062634e-16,4.325013e-11,7e-06
C(word):C(response),1.0,1.947046,1.947046,813896.2,1e-06,0.9998505,0.9999975,637.924832
C(color):C(response),1.0,0.0002745798,0.0002745798,114.7787,0.0086,0.0001410027,0.9828736,7.575577
Residual,2.0,4.784507e-06,2.392253e-06,,,2.456948e-06,0.5,1.0


color_only


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),eta_sq,partial_eta_sq,cohen_f
C(word),1.0,4.282768e-15,4.282768e-15,8.317015e-11,0.999994,2.249893e-15,4.158507e-11,6e-06
C(response),1.0,0.0001614999,0.0001614999,3.136284,0.218582,8.484178e-05,0.6106134,1.252255
C(color),1.0,5.50402e-16,5.50402e-16,1.068865e-11,0.999998,2.891462e-16,5.344326e-12,2e-06
C(word):C(response),1.0,1.902332,1.902332,36942.74,2.7e-05,0.9993638,0.9999459,135.909425
C(color):C(response),1.0,0.0009465404,0.0009465404,18.38155,0.050331,0.000497252,0.901872,3.031629
Residual,2.0,0.0001029881,5.149405e-05,,,5.410339e-05,0.5,1.0


In [77]:
# statistical analysis

subset = df.query('(class_set in ["word_only", "color_only"]) & (word == "XXXX")').copy()
subset.loc[:, 'response'] = np.where(subset['class_name'].str.contains('blue'), 'blue', 'red')
anova_table = extended_anova(subset, 'probability ~  C(color)*C(response)')
display(anova_table)


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),eta_sq,partial_eta_sq,cohen_f
C(color),1.0,1.667069e-15,1.667069e-15,4.467583e-11,0.999995,1.150703e-15,1.116896e-11,3e-06
C(response),1.0,0.005146677,0.005146677,137.9259,0.0003007143,0.003552518,0.9718163,5.872093
C(color):C(response),1.0,1.443444,1.443444,38682.9,4.00902e-09,0.9963445,0.9998966,98.33985
Residual,4.0,0.0001492592,3.731479e-05,,,0.0001030269,0.5,1.0


In [78]:
subset = df.query('(class_set in ["combined"]) & (word != "XXXX")').copy()
subset.loc[:, 'response_word'] = subset['class_name'].str.extract('word is (\\w+)')
subset.loc[:, 'response_color'] = subset['class_name'].str.extract('(\\w+) color')
anova_table = extended_anova(subset, 'probability ~  C(word) * C(response_word) + C(color)*C(response_color) + C(word)*C(response_color) + C(color)*C(response_word)')
display(anova_table)


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),eta_sq,partial_eta_sq,cohen_f
C(word),1.0,2.5940390000000003e-17,2.5940390000000003e-17,6.523285e-16,1.0,2.4553480000000002e-17,9.318978000000001e-17,9.653485e-09
C(response_word),1.0,0.004541195,0.004541195,0.1141984,0.745322,0.0042984,0.01605218,0.1277265
C(color),1.0,2.723118e-16,2.723118e-16,6.847883e-15,1.0,2.577526e-16,9.78269e-16,3.127729e-08
C(response_color),1.0,0.01098783,0.01098783,0.2763133,0.615363,0.01040037,0.03797436,0.198679
C(word):C(response_word),1.0,0.5885323,0.5885323,14.79995,0.006316,0.5570664,0.6788983,1.454056
C(color):C(response_color),1.0,0.1691197,0.1691197,4.252889,0.078105,0.1600777,0.3779375,0.7794585
C(word):C(response_color),1.0,5.62033e-05,5.62033e-05,0.001413356,0.971061,5.319839e-05,0.0002018673,0.01420944
C(color):C(response_word),1.0,0.004886975,0.004886975,0.1228938,0.736229,0.004625693,0.01725335,0.1325
Residual,7.0,0.2783608,0.03976583,,,0.2634783,0.5,1.0


In [79]:
# repeat for XXXX
subset = df.query('(class_set in ["combined"]) & (word == "XXXX")').copy()
subset.loc[:, 'response_word'] = subset['class_name'].str.extract('word is (\\w+)')
subset.loc[:, 'response_color'] = subset['class_name'].str.extract('(\\w+) color')
anova_table = extended_anova(subset, 'probability ~  C(color)*C(response_color) + C(color)*C(response_word)')
display(anova_table)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),eta_sq,partial_eta_sq,cohen_f
C(color),1.0,5.013351e-16,5.013351e-16,3.526719e-14,1.0,1.58144e-15,1.763359e-14,1.327915e-07
C(response_color),1.0,4.728045e-05,4.728045e-05,0.003326016,0.959254,0.0001491442,0.001660247,0.04077999
C(response_word),1.0,0.02591455,0.02591455,1.822999,0.309456,0.08174636,0.4768505,0.9547249
C(color):C(response_color),1.0,0.1506669,0.1506669,10.5989,0.0828,0.4752724,0.8412559,2.302053
C(color):C(response_word),1.0,0.1119523,0.1119523,7.875455,0.106984,0.3531487,0.7974777,1.984371
Residual,2.0,0.02843068,0.01421534,,,0.08968338,0.5,1.0
