# MathsSim experiment analysis

Analysis of the data collected with the MathsSim online experiment.

## Imports

In [None]:
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', FutureWarning)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from statannotations.Annotator import Annotator # https://github.com/trevismd/statannotations/tree/master
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.weightstats import ttest_ind
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd as tukeyhsd
from scipy import stats
from scipy.stats import rankdata
from scipy.optimize import curve_fit
from scikit_posthocs import posthoc_dunn
from sklearn.metrics import pairwise_distances 
from sklearn.metrics.pairwise import cosine_similarity
import ast
from tqdm.notebook import tqdm
from IPython.display import display, Markdown
import os
import subprocess
from shutil import which

In [None]:
%matplotlib inline
# Shared plot appearance: light grid, black axes without top/right spines, Arial text,
# and a default figure size of 11.7 x 8.27 inches.
custom = {'grid.color': '.8', 'axes.edgecolor': 'black', 'axes.spines.top': False, 'axes.spines.right': False, 'figure.figsize': (11.7,8.27), 'font.size':11, 'font.family': 'Arial', 'font.sans-serif': 'Arial'}
sns.set_theme(style="whitegrid", rc=custom)
# 'none' keeps text as text in SVG output instead of converting glyphs to paths
plt.rcParams['svg.fonttype'] = 'none'
figWidth = 7.677165 # figure width in inches (19.5 cm), for PLOS
# height / width of the default figure size above; used as figWidth*ratio to derive figure heights
ratio = 8.27/11.7

In [None]:
def rankOLS(y, X, **kws):
    """Build (without fitting) an OLS model on rank-transformed data.

    Parameters
    ----------
    y : array-like
        Dependent variable; replaced by its ranks.
    X : array-like
        Either a single predictor (1-D) or a sequence of predictors, one per
        row. Each predictor is ranked independently of the others.
    **kws
        Forwarded unchanged to ``sm.OLS``.

    Returns
    -------
    The unfitted ``sm.OLS`` model, with a constant column added to the ranked
    predictors; call ``.fit()`` on it to obtain results.
    """
    predictors = np.array(X)
    if predictors.ndim == 1:
        # a lone predictor becomes a single row so that it is ranked as a whole
        predictors = predictors.reshape(1, len(predictors))

    # rank row by row, then put observations on rows and predictors on columns
    rankedPredictors = np.array([rankdata(row) for row in predictors]).T
    rankedResponse = rankdata(y)

    design = sm.add_constant(rankedPredictors)
    return sm.OLS(endog=rankedResponse, exog=design, **kws)

In [None]:
subData = pd.read_csv('../Data/subDataEnglish.csv', encoding='utf-8', index_col='SubID')
expData = pd.read_csv('../Data/expDataEnglish.csv', encoding='utf-8')
stimData = pd.read_csv('../Data/pairSim/English/pairSim_50_maths.csv', encoding='utf-8', index_col='PairID')
vocData = pd.read_csv('../Data/finalVocab_English_preprocessed.csv', encoding='utf-8', index_col='word',
                      converters={'grammaticalForm': ast.literal_eval}, dtype={'mathsFrequency': float, 'nonMathsFrequency': float})

In [None]:
EdLevelToId = {'Bac+2':5, 'Bac+5 (master)':8, 'Bac+3 (licence)':6, 'Bac+4':7, 
                'Bac':3, 'Primaire':0, 'Bac+1':4, 'Bac+8 (doctorat)':9, 'Lycée':2, 'Collège':1}
edLevelOrder = ['Primary school', 'Medium school', 'High school', 'High school diploma', '1st year of college', '2nd year of college (bachelor)', '3rd year of college (licence)', '4th year of college', 'Graduate (master)', 'Graduate (PhD)']
edLevelOrderTwoLines = ['Primary school', 'Medium school', 'High school', 'High school diploma', '1st year of college', '2nd year of college\n(bachelor)', '3rd year of college\n(licence)', '4th year of college', 'Graduate\n(master)', 'Graduate\n(PhD)']
wordLevelOrder = ['Primary school', '6-7th grade', '8-9th grade', '10th grade', '11-12th grade', 'Bachelor', 'Licence', 'Master']
subData['EdLevelId'] = [EdLevelToId[l] for l in subData.EdLevel]
subData = subData[['Sex', 'Age', 'Major', 'EdLevelId', 'EdLevel', 'SelfAssessment', 'StimLevel']].copy()

In [None]:
# translate labels into English
EdLevel_FrToEn = {}
for i, x in enumerate(['Primaire', 'Collège', 'Lycée', 'Bac', 'Bac+1', 'Bac+2', 'Bac+3 (licence)', 'Bac+4', 'Bac+5 (master)', 'Bac+8 (doctorat)']):
    EdLevel_FrToEn[x] = edLevelOrder[i]

WordLevel_FrToEn = {}
for i, x in enumerate(['primary', '6-7th grade', '8-9th grade', '10th grade', '11-12th grade', 'bachelor', 'licence', 'master']):
    WordLevel_FrToEn[x] = wordLevelOrder[i]

vocData['levelName'] = [WordLevel_FrToEn[x.levelName] for x in vocData.itertuples()]
subData['EdLevel'] = [EdLevel_FrToEn[x.EdLevel] for x in subData.itertuples()]

In [None]:
# exclude participants
toExclude = []
for part in toExclude:
    expData = expData.loc[expData.SubID != part].copy()
    subData.drop(index=part, inplace=True)

In [None]:
# Exclude judgements on pairs of level >= 5 given by participants whose last maths
# classes were at the 'Bac' level (presumably level 5 is where university-level
# vocabulary starts -- confirm against the stimulus files).
# The participant filter uses the numeric EdLevelId: the EdLevel labels have already
# been translated to English above, so comparing EdLevel with the French label 'Bac'
# never matches and silently excludes nothing.
tmp = expData.join(subData, on="SubID").join(stimData, on='Question')
toDelete = tmp[(tmp.EdLevelId == EdLevelToId['Bac']) & (tmp.Level >= 5)].index
expData.drop(index=toDelete, inplace=True)

In [None]:
df = expData.join(subData, on="SubID").join(stimData, on="Question").join(vocData, on="Question")
df.rename(columns={'StimLevel': 'SubLevel', 'Level': 'PairLevel', 'levelId': 'WordLevelId', 'levelName': 'WordLevelName'}, inplace=True)
df.drop(['metaMaths', 'tooPolysemic', 'grammaticalForm', 'mathsFrequency', 'nonMathsFrequency'], axis=1, inplace=True)

In [None]:
display(df)

## Demographic data

### Data summary

In [None]:
subData['Sex'].value_counts()

In [None]:
subData['Age'].value_counts()

In [None]:
subData['Major'].value_counts()

In [None]:
subData['EdLevel'].value_counts()

In [None]:
subData['SelfAssessment'].value_counts()

In [None]:
len(subData)

In [None]:
stats.spearmanr(subData.SelfAssessment, subData.EdLevelId)

### Plots

In [None]:
majorsOrder = ["Mathematics", "Statistics", "Economics", "Engineering", "Natural Science", "Health and Life Science", "Psychology", "Humanities", "Law", "None"]

In [None]:
# distribution of education level
ax = sns.countplot(data=subData, x="EdLevel", color=sns.color_palette()[0], order=edLevelOrder)
ax.set(xlabel="Last classes followed in maths")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# distribution of self-assessed maths level
ax = sns.countplot(data=subData, x="SelfAssessment", color=sns.color_palette()[0])
ax.set(xlabel="Self-assessed maths-level")
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# distribution of majors
ax = sns.countplot(data=subData, x="Major", color=sns.color_palette()[0], order=majorsOrder)
ax.set(xlabel="College Major")
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# self-assessed maths level against education level
ax = sns.barplot(data=subData, x="EdLevel", y="SelfAssessment", errorbar="sd", color=sns.color_palette()[0],
                order=edLevelOrder)
ax.set(xlabel="Last classes followed in maths", ylabel="Self-assessed maths level",
       title="Self-assessed maths level against last classes followed in maths",
       ylim=[0,10])
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# self-assessed maths level against major
ax = sns.barplot(data=subData, x="Major", y="SelfAssessment", errorbar="sd", color=sns.color_palette()[0],
                order=majorsOrder)
ax.set(xlabel="College Major", ylabel="Self-assessed maths level", title="Self-assessed maths level against college major",
       ylim=[0,10])
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# self-assessed maths level against age
ax = sns.barplot(data=subData, x="Age", y="SelfAssessment", errorbar="sd", color=sns.color_palette()[0],
                order=["18-25", "25-40", "40-60", "60-more"])
ax.set(xlabel="Age", ylabel="Self-assessed maths level", title="Self-assessed maths level against age",
       ylim=[0,10])
plt.tight_layout()
plt.show()
plt.clf()

## Voc knowledge analysis

Questions:
- Overall, is our classification of words correct? Does it fit with the actual education of participants?
- Are some words misclassified?

In [None]:
# prepare data
vData = df.loc[df.Trial == 'VocKnowledge'].copy()
vData.drop(['Trial', 'RT', 'PresentationOrder', 'Training', 'SubLevel', 'word1', 'word2', 'PairLevel', 'Similarity', 'EuclideanDistance'], 
           axis=1, inplace=True)
vData['Answer'] = vData.Answer.astype(float)

In [None]:
vData

### Analysis of the average knowledge for each word

In [None]:
meanKnowledge = vData.groupby('Question').mean(numeric_only=True)
meanKnowledge['Count'] = vData.value_counts('Question')
meanKnowledge['WordLevelName'] = meanKnowledge.join(vocData, on='Question').levelName

In [None]:
meanKnowledge

In [None]:
# Set to True to export the per-word knowledge table to Excel.
saveVoc = False

if saveVoc:
    # The export table gets its own name: `df` is the merged trial-level table that
    # the similarity analysis still reads from, so it must not be rebound here.
    vocExport = meanKnowledge.copy()
    vocExport['STD'] = [np.std(vData[vData.Question == x].Answer) for x in vocExport.index]
    vocExport.to_excel('vocAnalyses/vocKnowledge.xlsx')

#### Relation between average knowledge and proposed classification

In [None]:
# Spearman's rank correlation analysis
res = stats.spearmanr(meanKnowledge.WordLevelId, meanKnowledge.Answer)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.3f} (p = {res.pvalue:.2e})"))

In [None]:
wordLevelOrder

In [None]:
ax = sns.pointplot(meanKnowledge, x="WordLevelName", y="Answer", errorbar='sd',
                 order=wordLevelOrder)
ax.set(ylim=[-0.1,8.1], yticks=[i for i in range(9)],
      xlabel="Estimated level of acquisition", ylabel="Mean knowledge rating per word (from 0 to 8)")
ax.text(0.5, 0.5, f"Spearman's $r_s$ = {res.statistic:.2f}\np = {res.pvalue:.2e}", 
       horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
plt.show()
plt.clf()

#### Distribution of average knowledge across words

In [None]:
ax = sns.violinplot(meanKnowledge, x="Answer", y="WordLevelName", cut=0, scale='count', 
                    order=wordLevelOrder, color=sns.color_palette()[0])
b = set(list(ax.get_children()))
ax = sns.pointplot(meanKnowledge, x="Answer", y="WordLevelName", errorbar=None, 
              order=wordLevelOrder, color=sns.color_palette()[5], markers='x', ax=ax)
f = set(list(ax.get_children()))-b
for e in f:
    e.set_zorder(100)
ax.set(xlim=[-0.1,8.1], xticks=[i for i in range(9)],
       xlabel="Mean knowledge rating per word (from 0 to 8)", ylabel="Estimated level of acquisition")
for level, levelData in meanKnowledge.groupby('WordLevelId'):
    ax.text(0.5,level,f"n = {len(levelData)}", horizontalalignment='center')
plt.show()
plt.clf()

In [None]:
g = sns.displot(meanKnowledge, x="Answer", col="WordLevelName", kind="kde",
               col_order=wordLevelOrder, col_wrap=4, facet_kws={'sharey':False})
g.set_axis_labels("Mean knowledge rating per word (from 0 to 8)", "Density")
g.set_titles(col_template="Estimated level of acquisition = {col_name}")
g.set(xlim=(-0.1, 8.1), xticks=[i for i in range(9)])
plt.tight_layout()
plt.show()
plt.clf()

### Variation of the average knowledge with self-reported maths education

#### Redo the analysis above for each self-reported maths education level

In [None]:
meanKnowledgeLevelDep = vData.groupby(['Question', 'EdLevel']).mean(numeric_only=True)
meanKnowledgeLevelDep['Count'] = vData.groupby(['Question', 'EdLevel']).count().SubID
meanKnowledgeLevelDep['WordLevelName'] = meanKnowledgeLevelDep.join(vocData, on='Question').levelName
for val in ['Question', 'EdLevel']:
    meanKnowledgeLevelDep[val] = meanKnowledgeLevelDep.index.get_level_values(val)

In [None]:
meanKnowledgeLevelDep

##### Relation between average knowledge and proposed classification

In [None]:
def annotateCorrelation(data, x=None, y=None, x_an=None, y_an=None, **kws):
    """Write the Spearman correlation between two columns on the current axes.

    Parameters
    ----------
    data : DataFrame
        Table holding the two columns to correlate.
    x, y : str
        Names of the columns of ``data`` passed to ``stats.spearmanr``.
    x_an, y_an : float
        Position handed to ``Axes.text`` for the annotation box.
    **kws
        Accepted and ignored, so that callers may pass extra keyword arguments.
    """
    corr = stats.spearmanr(data[x], data[y])
    label = f"Spearman's $r_s$ = {corr.statistic:.2f}\np = {corr.pvalue:.2e}"
    frame = {'edgecolor':'black', 'facecolor':'none'}
    plt.gca().text(x_an, y_an, label,
                   horizontalalignment='left', verticalalignment='center', bbox=frame)

In [None]:
g = sns.catplot(meanKnowledgeLevelDep, x="WordLevelName", y="Answer", col="EdLevel", errorbar='sd', kind='point',
                order=wordLevelOrder, col_order=edLevelOrder, col_wrap=4)
g.map_dataframe(annotateCorrelation, x="WordLevelId", y="Answer", x_an=.1, y_an=.5)
g.set(ylim=[-0.1,8.1], yticks=[i for i in range(9)],
      xlabel="Estimated level of acquisition", ylabel="Mean knowledge rating per word (from 0 to 8)")
g.set_titles(col_template="Self-reported education level: {col_name}")
plt.show()
plt.clf()

In [None]:
meanKnowledgeLevelDep

In [None]:
ax= sns.pointplot(meanKnowledgeLevelDep, x="EdLevel", y="Answer", hue="WordLevelName", 
                   order=edLevelOrder, hue_order=wordLevelOrder, palette=['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666'])
ax.set(xlabel="", ylabel="")
ax.set_xticklabels(labels=edLevelOrder, rotation = 30, ha='right')
ax.legend(title="Word grade", loc='center right', bbox_to_anchor=[1.38,.5])
#ax.text(ax.get_xlim()[0], ax.get_ylim()[1]+.2, "Familiarity rating", size=12, horizontalalignment="center", va="bottom")
#ax.text(ax.get_xlim()[1]+.2, ax.get_ylim()[0], "Participant education level", size=12, horizontalalignment="left", va="center")
fig = plt.gcf()
fig.set_size_inches(figWidth, figWidth*ratio)
plt.grid(axis='x')
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# article fig
ax= sns.pointplot(meanKnowledgeLevelDep, y="EdLevel", x="Answer", hue="WordLevelName", dodge=True, 
                  order=edLevelOrder, hue_order=wordLevelOrder, palette=['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666'])
ax.set(xlabel="", ylabel="")
ax.set_yticklabels(labels=edLevelOrderTwoLines)
leg = ax.legend(title="Word grade", loc='center right', bbox_to_anchor=[1,.5])
leg.remove()
#ax.text(ax.get_xlim()[0], ax.get_ylim()[1]+.2, "Familiarity rating", size=12, horizontalalignment="center", va="bottom")
#ax.text(ax.get_xlim()[1]+.2, ax.get_ylim()[0], "Participant education level", size=12, horizontalalignment="left", va="center")
fig = plt.gcf()
fig.set_size_inches(figWidth/1.4, figWidth*ratio)
plt.grid(axis='y')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(14)
#plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# Attach the word-level covariates, then replace each predictor by its z-scored
# ranks (rank transform, then centre on the mean and divide by the standard deviation).
dff = vData.join(vocData, on='Question', rsuffix="_r")
for predictor in ('WordLevelId', 'mathsFrequency', 'EdLevelId'):
    ranks = rankdata(dff[predictor])
    dff[predictor] = (ranks - np.mean(ranks)) / np.std(ranks)

In [None]:
model = ols('Answer ~ WordLevelId * EdLevelId', data=dff)
results = model.fit()
results.summary2()

In [None]:
model = ols('Answer ~ WordLevelId * mathsFrequency * EdLevelId', data=dff)
results = model.fit()
results.summary2()

### Participant analysis

In [None]:
vDataPerSub = vData.groupby('SubID').mean(numeric_only=True)

What is the mean knowledge of a given participant?

In [None]:
ax = sns.kdeplot(vDataPerSub, x="Answer")
ax.set(xlim=[-0.1,8.1], xticks=[i for i in range(9)],
       xlabel="Mean knowledge rating per participant (from 0 to 8)")
plt.show()
plt.clf()

Is it correlated with their self-reported education and maths level?

In [None]:
res = stats.spearmanr(vDataPerSub.EdLevelId, vDataPerSub.Answer)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})"))

In [None]:
ax = sns.pointplot(vDataPerSub, x="EdLevelId", y="Answer", errorbar='sd')
ax.set(ylim=[-0.1,8.1], yticks=[i for i in range(9)],
       xlabel="Reported education level", ylabel="Mean knowledge rating per participant (from 0 to 8)",
       xticks=[i for i in range(len(edLevelOrder))], xticklabels=edLevelOrder)
ax.tick_params(axis='x', rotation=45)
ax.text(8.5, 0.5, f"Spearman's $r_s$ = {res.statistic:.2f}\nN = {len(vDataPerSub)}\np = {res.pvalue:.2e}", 
        horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
plt.show()
plt.clf()

In [None]:
len(vDataPerSub.SelfAssessment)

In [None]:
res = stats.spearmanr(vDataPerSub.SelfAssessment, vDataPerSub.Answer)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})"))

In [None]:
ax = sns.pointplot(vDataPerSub, x="SelfAssessment", y="Answer", errorbar='sd')
ax.set(ylim=[-0.1,8.1], yticks=[i for i in range(9)], xticks=[i for i in range(10)],
       xlabel="Self-assessed maths level (from 1 to 10)", ylabel="Mean knowledge rating per participant (from 0 to 8)")
ax.text(8.5, 0.5, f"Spearman's $r_s$ = {res.statistic:.2f}\np = {res.pvalue:.2e}", 
        horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
plt.show()
plt.clf()

## Word similarity analysis

Questions:
- Do subjects agree on the similarities of math words? (split-half consistency)
- Is GloVe a good model of those similarities?
- Do similarities change with education (do they become more refined, or more similar to GloVe)?

In [None]:
# prepare data
pData = df.loc[df.Trial == 'SimilarityJudgement'].copy()
pData.drop(['Trial', 'RT', 'WordLevelId', 'WordLevelName'], 
           axis=1, inplace=True)
pData.rename({'Similarity': 'GloVeSimilarity', 'SubLevel': 'StimLevelCategory'}, axis=1, inplace=True)
pData['Training'] = [not i for i in pData.Training] # fix this unintuitive issue
pData['Answer'] = pData.Answer.astype(float)

In [None]:
# remove training data
pMathsData = pData.loc[~pData.Training].copy()
# remove unanswered questions
pMathsDataFiltered = pMathsData.dropna(subset=["Answer"]).copy()
# average over participants for each question
pMathsDataAgg = pMathsDataFiltered.groupby("Question").mean(numeric_only=True).join(stimData[['word1', 'word2']])

In [None]:
pMathsDataAgg['MeanKnowledge'] = [np.mean([vData[vData.Question == x.word1].Answer.mean(),vData[vData.Question == x.word2].Answer.mean()]) for x in pMathsDataAgg.itertuples()]
pMathsDataAgg['MeanFreq'] = [np.mean([vocData.loc[x.word1].mathsFrequency, vocData.loc[x.word2].mathsFrequency]) for x in pMathsDataAgg.itertuples()]

In [None]:
pData

### Sanity checks

#### Training questions

In [None]:
trainingData = pData.loc[pData.Training]

In [None]:
ax = sns.boxplot(trainingData, x="Question", y="Answer")
ax.tick_params(axis='x', rotation=45)
ax.set(xlabel="Training pair", ylabel="Distribution of estimated proximity")
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
g = sns.displot(trainingData, x="Answer", col="Question", kde=True,
                col_wrap=4, facet_kws={'sharey':False})
g.set_titles(col_template='Pair: "{col_name}"')
g.set(xlabel="Estimated proximity")
plt.tight_layout()
plt.show()
plt.clf()

#### Number of presentations of each pair

In [None]:
numPres = pMathsDataFiltered.groupby("Question").count()
numPresOrder = pMathsDataFiltered.groupby(["Question", "PresentationOrder"]).count()
numPresOrder['Order'] = numPresOrder.index.get_level_values('PresentationOrder')

In [None]:
ax = sns.boxplot(numPres, x='SubID', showmeans=True)
ax.set(xlabel="Number of presentations of each pair")
plt.show()
plt.clf()

In [None]:
ax = sns.histplot(numPres, x='SubID')
ax.set(xlabel="Number of presentation of each pair")
plt.show()
plt.clf()

In [None]:
ax = sns.boxplot(numPresOrder, x='SubID', y='Order', showmeans=True)
ax.set(xlabel="Number of presentations of each pair")
plt.show()
plt.clf()

In [None]:
ax = sns.histplot(numPresOrder, x='SubID', hue='Order')
ax.set(xlabel="Number of presentation of each pair")
plt.show()
plt.clf()

#### Effect of order of presentation of words

In [None]:
tmp = pMathsData.groupby(['Question', 'PresentationOrder'], as_index=False).mean(numeric_only=True)
orderPresentationData = tmp.pivot(index='Question', columns='PresentationOrder', values='Answer')
orderPresentationData

In [None]:
# correlation test
model = ols("word2_word1 ~ word1_word2", data=orderPresentationData)
results = model.fit()
results.summary()

In [None]:
ax = sns.regplot(orderPresentationData, x="word1_word2", y="word2_word1", 
                 line_kws={'color': sns.color_palette()[1]})
ax.set(xlabel="Mean human judged similarity per pair (word1-word2)", 
       ylabel="Mean human judged similarity per pair (word2-word1)")
ax.text(0.5, 4.8, f"$R^2$ = {results.rsquared:.2f}\nN = {len(orderPresentationData)}\np = {results.f_pvalue:.2e}", 
        horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
plt.show()
plt.clf()

### Overall analysis

#### Preprocessing

In [None]:
vectors = pd.read_csv('../Embeddings/English/GloVe/words_vec_50_maths.csv', index_col="word")

In [None]:
def pairLevelToCat(l):
    """Map a pair level onto the category that labels its band.

    Level 0 maps to 0; any other level maps to the first of the upper bounds
    2, 4 and 5 that it does not exceed; everything above 5 maps to 7.
    """
    if l == 0:
        return 0
    for upperBound in (2, 4, 5):
        if l <= upperBound:
            return upperBound
    return 7

In [None]:
# add categorical levels of predicted similarities
pMathsDataAgg['CategoricalSim'] = ['']*len(pMathsDataAgg)
# One table per category file: several pair levels map to the same category, so each
# file is read only once. The tables are kept under their own name so that the
# global `df` (merged trial-level data) is not overwritten by this cell.
selectedPairsByCat = {}
for l, levelData in pMathsDataAgg.groupby("PairLevel"):
    level = pairLevelToCat(l)
    if level not in selectedPairsByCat:
        selectedPairsByCat[level] = pd.read_csv(f"../Data/EnglishPairs/selectedPairs_{level}.csv", index_col="PairID")
    selectedPairs = selectedPairsByCat[level]
    for t in levelData.itertuples():
        pMathsDataAgg.at[t.Index, 'CategoricalSim'] = selectedPairs.loc[t.Index].SimCategory

In [None]:
catSimOrder = ["Furthest", "Orthogonal", "Average", "Closest"]

#### Compute noise ceiling

In [None]:
def crossVal(data, other, stimVar, respVar, groups):
    """Leave-one-group-out rank regression of each group against the mean of the rest.

    For every group of ``data`` (grouped by the ``groups`` column), the group's
    responses are regressed, via ``rankOLS``, on the per-stimulus mean response
    of all rows of ``other`` belonging to a different group.

    Parameters
    ----------
    data : DataFrame
        Rows whose groups are held out one at a time.
    other : DataFrame
        Rows used to compute the per-stimulus means (the held-out group removed).
    stimVar : str
        Column identifying the stimulus.
    respVar : str
        Column holding the response.
    groups : str
        Column identifying the group (fold) of each row.

    Returns
    -------
    tuple
        ``(mean R^2 over the retained folds, list of slopes, list of counts)``
        where each count is the number of rows of the held-out group that found
        no usable counterpart among the other groups. Folds with fewer than two
        distinct values on either side are skipped.
    """
    foldCol = respVar + 'fold'
    slopes, nUnmatched, rSquared = [], [], []

    for heldOut, heldOutData in data.groupby(groups):
        # per-stimulus mean response over every group except the held-out one
        restMeans = other[other[groups] != heldOut].groupby(stimVar).mean(numeric_only=True)

        # keep only the stimuli that have a response on both sides
        paired = restMeans.join(heldOutData.set_index(stimVar), how='inner', rsuffix='fold')
        paired = paired.dropna(subset=[respVar, foldCol], how='any')

        # skip folds without any variation on one of the two sides
        if paired[respVar].nunique() < 2 or paired[foldCol].nunique() < 2:
            continue

        nUnmatched.append(len(heldOutData) - len(paired))

        fit = rankOLS(paired[foldCol], paired[respVar], missing='drop').fit()
        rSquared.append(fit.rsquared)
        slopes.append(fit.params[1])

    return np.mean(rSquared), slopes, nUnmatched

In [None]:
noiseCeiling = {}
allA = []
allP = []
lab = []

noiseCeiling['Global'], a, p = crossVal(pMathsDataFiltered, pMathsDataFiltered, 'Question', 'Answer', 'SubID')

allA += a
allP += p
lab += ["Global"] * len(a)

for (level, levelId), levelData in pMathsDataFiltered.groupby(['EdLevel', 'EdLevelId']):
    cval, a, p = crossVal(levelData, pMathsDataFiltered, 'Question', 'Answer', 'SubID')
    
    noiseCeiling[level] = cval
    noiseCeiling[levelId] = cval
    allA += a
    allP += p
    lab += [level] * len(a)
    
noiseData = pd.DataFrame({"Level": lab, "Slopes": allA, "N": allP})

In [None]:
# Noise ceiling computed separately for each pair level.
# noiseCeilingWordLevel maps 'Global', each level name and each level id to a mean R^2;
# noiseDataWordLevel stores, per fold, the slope and the count returned by crossVal.
noiseCeilingWordLevel = {}
allA = []
allP = []
lab = []

# NOTE(review): same call as the one that fills noiseCeiling['Global'] in the
# previous cell, so the global cross-validation is run a second time here.
noiseCeilingWordLevel['Global'], a, p = crossVal(pMathsDataFiltered, pMathsDataFiltered, 'Question', 'Answer', 'SubID')

allA += a
allP += p
lab += ["Global"] * len(a)

# NOTE(review): groups and names are paired by position; this assumes that every
# pair level is present in pMathsDataFiltered -- if one is missing, the following
# names shift by one. TODO confirm, or look the name up from levelId.
for (levelId, levelData), level in zip(pMathsDataFiltered.groupby('PairLevel'), wordLevelOrder):
    cval, a, p = crossVal(levelData, pMathsDataFiltered, 'Question', 'Answer', 'SubID')
    
    # stored under both the level name and the level id
    noiseCeilingWordLevel[level] = cval
    noiseCeilingWordLevel[levelId] = cval
    allA += a
    allP += p
    lab += [level] * len(a)
    
noiseDataWordLevel = pd.DataFrame({"Level": lab, "Slopes": allA, "N": allP})

In [None]:
display(Markdown(rf"Overall noise ceiling: {noiseCeiling['Global']:.2f}"))

In [None]:
g = sns.displot(noiseData, x="Slopes", kind='kde', col="Level", cut=0,
                col_wrap=4, col_order=["Global"]+edLevelOrder, facet_kws={'sharey':False})
g.refline(x=0)
plt.show()
plt.clf()

In [None]:
# for each fold, number of trials that were unique to the fold (pairs presented only to the left-over participant)
ax = sns.boxplot(noiseData, x="N", y="Level", order=["Global"]+edLevelOrder)
plt.show()
plt.clf()

#### Correlation between rated similarity and our four categorical levels of predicted similarities

In [None]:
# numeric_only=True: the table also holds text columns (word1, word2), which
# cannot be averaged and make a plain .mean() fail on recent pandas versions
pMathsDataAgg.groupby("CategoricalSim").mean(numeric_only=True)

In [None]:
# Kruskal-Wallis
tmp = pMathsDataAgg.reset_index().pivot(index='Question', columns='CategoricalSim', values='Answer')
stats.kruskal(tmp.Average, tmp.Closest, tmp.Furthest, tmp.Orthogonal, nan_policy='omit')

In [None]:
# Dunn
posthoc_dunn(pMathsDataAgg, val_col="Answer", group_col="CategoricalSim", p_adjust="bonferroni")

In [None]:
ax = sns.boxplot(pMathsDataAgg, x="CategoricalSim", y="Answer", 
                 order=catSimOrder)
ax.set(xlabel="Categorical levels of GloVe predicted similarities (cosine)", ylabel="Distribution of human estimated similarity")
plt.show()
plt.clf()

#### Correlation between rated similarity and a continuous measure (cosine angle or Euclidean distance)

In [None]:
def quantileCut(df, cols, q=100):
    """Add, for each requested column, the mean of that column within its quantile bin.

    Parameters
    ----------
    df : DataFrame
        Input table; it is not modified.
    cols : str or iterable of str
        Column name(s) to bin. A single name may be given as a plain string.
    q : int, default 100
        Number of quantile bins, as in ``pd.qcut``.

    Returns
    -------
    DataFrame
        Copy of ``df`` with one extra column ``<col>Bins`` per requested column,
        holding the mean of ``<col>`` over the rows that share its quantile bin
        (NaN where ``<col>`` is missing). An existing ``<col>Bins`` column is
        overwritten, so the function can safely be re-applied.
    """

    def oneshot(df, col, q):
        try:
            bins = pd.qcut(df[col], q=q)
        except ValueError:
            # tied values can make the bin edges non-unique; ranking with
            # method='first' breaks ties by order of appearance so that the
            # bins can still be formed
            bins = pd.qcut(df[col].rank(method='first'), q=q)

        # integer bin label of each row (-1 where the value is missing)
        codes = bins.cat.codes.to_numpy()
        # only `col` is averaged, so non-numeric columns elsewhere in df are harmless
        binMeans = df[col].groupby(codes).transform('mean')

        out = df.copy()
        # positional assignment: independent of the index, even with duplicate labels
        out[col + 'Bins'] = binMeans.to_numpy()
        return out

    # a single column name (zero-dimensional input) is treated as a one-element list
    if np.ndim(cols) == 0:
        cols = [cols]

    for col in cols:
        df = oneshot(df, col, q)

    return df

In [None]:
# Keep a snapshot of pMathsDataAgg taken before the binned columns are added.
# The snapshot is only taken when it does not exist yet (or is None), so that
# re-running this cell does not replace it with an already-binned table.
try:
    needsSnapshot = pMathsDataAgg_cop is None
except NameError:
    needsSnapshot = True

if needsSnapshot:
    pMathsDataAgg_cop = pMathsDataAgg.copy()

In [None]:
pMathsDataAgg = quantileCut(pMathsDataAgg, ['EuclideanDistance', 'GloVeSimilarity', 'Answer'])

##### Cosine similarity

In [None]:
pMathsDataAgg

In [None]:
# Spearman's rank correlation analysis
res = stats.spearmanr(pMathsDataAgg.Answer, pMathsDataAgg.GloVeSimilarity)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})<br>$r_s^2$ = {res.statistic**2:.3f}"))

In [None]:
model = rankOLS(pMathsDataAgg.Answer, pMathsDataAgg.GloVeSimilarity)
results = model.fit()
results.summary2()

In [None]:
model = rankOLS(pMathsDataAgg.Answer, [pMathsDataAgg.GloVeSimilarity, pMathsDataAgg.MeanFreq])
results = model.fit()
results.summary2()

In [None]:
# article fig
g = sns.JointGrid(pMathsDataAgg, x="GloVeSimilarityBins", y="AnswerBins", xlim=(-0.3389345948961322, 0.9453314081360167), ylim=(-0.10624661054156131, 5.090714332061703))
g.plot_joint(sns.lineplot)
g.plot_marginals(sns.histplot, kde=True)
g.ax_joint.axvline(x=0, linestyle='--', color='.4')
#g.set_axis_labels("GloVe similarity", "Human similarity")
g.set_axis_labels("","")
# g.ax_joint.text(0.7, 0.5, f"N = {len(pMathsDataAgg.Answer)}\nSpearman's $r_s$ = {res.statistic:.2f}\np < .001", 
#                 horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
fig = plt.gcf()
fig.set_size_inches(figWidth/1.2, figWidth*ratio)
plt.tight_layout()
plt.show()
plt.clf()

##### Cosine similarity within the central 95% interquantile range only

In [None]:
lb, ub = pMathsDataAgg_cop.GloVeSimilarity.quantile([.025, .975])
iqPred = pMathsDataAgg_cop[(pMathsDataAgg_cop.GloVeSimilarity >= lb) & (pMathsDataAgg_cop.GloVeSimilarity <= ub)]
iqPred = quantileCut(iqPred, ['Answer', 'GloVeSimilarity', 'EuclideanDistance'])

In [None]:
res = stats.spearmanr(iqPred.AnswerBins, iqPred.GloVeSimilarityBins)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})<br>$r_s^2$ = {res.statistic**2:.3f}"))

In [None]:
model = rankOLS(iqPred.AnswerBins, [iqPred.GloVeSimilarityBins, iqPred.MeanFreq])
results = model.fit()
results.summary2()

In [None]:
g = sns.JointGrid(iqPred, x="GloVeSimilarityBins", y="AnswerBins")
g.plot_joint(sns.lineplot)
g.plot_marginals(sns.histplot, kde=True)
g.set_axis_labels("GloVe predicted similarity (cosine)", "Average human judged similarity by item")
plt.tight_layout()
plt.show()
plt.clf()

##### Cosine similarity for negative predicted similarities only

In [None]:
negPred = pMathsDataAgg_cop[pMathsDataAgg_cop.GloVeSimilarity <= 0]
negPred = quantileCut(negPred, ['Answer', 'GloVeSimilarity'])

In [None]:
res = stats.spearmanr(negPred.AnswerBins, negPred.GloVeSimilarityBins)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})<br>$r_s^2$ = {res.statistic**2:.3f}"))

In [None]:
model = rankOLS(negPred.AnswerBins, negPred.GloVeSimilarityBins)
results = model.fit()
results.summary2()

In [None]:
bic = results.bic
display(Markdown(rf"BIC = {bic}"))

In [None]:
g = sns.JointGrid(negPred, x="GloVeSimilarityBins", y="AnswerBins", height=11.7)
g.plot_joint(sns.lineplot)
g.plot_marginals(sns.histplot, kde=True)
g.set_axis_labels("GloVe predicted similarity (cosine)", "Average human judged similarity by item")
plt.tight_layout()
plt.show()
plt.clf()

##### Euclidean distance

In [None]:
model = rankOLS(pMathsDataAgg.AnswerBins, pMathsDataAgg.EuclideanDistanceBins)
results = model.fit()
results.summary2()

In [None]:
g = sns.JointGrid(pMathsDataAgg, x="EuclideanDistanceBins", y="AnswerBins", height=11.7)
g.plot_joint(sns.lineplot)
g.plot_marginals(sns.histplot, kde=True)
g.set_axis_labels("GloVe predicted distance (Euclidean)", "Average human judged similarity by item")
plt.tight_layout()
plt.show()
plt.clf()

##### Embedding pruning 

In [None]:
def prune(humanSim, embeddings):
    """
    Implementation of the pruning algorithm described in Manrique, N. F., Bao, W., Herbelot, A., & Hasson, U. (2023). Enhancing Interpretability using Human Similarity Judgements to Prune Word Embeddings (arXiv:2310.10262). arXiv. http://arxiv.org/abs/2310.10262

    Features are first ranked by how much the Spearman correlation between human
    and model (cosine) similarities drops when the feature is removed; nested
    feature sets of growing size are then evaluated, and the best one is kept.

    Parameters
    ----------
    humanSim : DataFrame
        Word-by-word table of human similarity judgements, labelled by word on
        both axes. Missing pairs may be NaN; they are left out of the correlations.
    embeddings : DataFrame
        One row per word (index) and one column per embedding feature.

    Returns
    -------
    featuresToKeep : ndarray
        Positions (0-based column indices of ``embeddings``) of the retained
        features, most important first.
    rho : float
        Spearman correlation obtained with the retained features.

    Raises
    ------
    ValueError
        If the correlation is undefined (NaN) for every candidate feature set.
    """
    # Put the human matrix in the row order of the embeddings on BOTH axes, so that
    # cell (i, j) refers to the same word pair as in the model similarity matrix.
    # Word pairs without a judgement become NaN and are omitted below.
    humanFlat = humanSim.reindex(index=embeddings.index, columns=embeddings.index).to_numpy().flatten()

    embeddings = embeddings.to_numpy()
    nfeatures = embeddings.shape[1]

    def fitToHuman(features):
        # Spearman's rho between human similarities and the cosine similarities of `features`
        modelSim = 1-pairwise_distances(features, features, metric='cosine')
        return stats.spearmanr(humanFlat, modelSim.flatten(), nan_policy='omit').statistic

    # Compute baseline Spearman's rho
    baseline = fitToHuman(embeddings)

    # Rank features: the larger the drop when a feature is removed, the more important it is
    diff = [baseline - fitToHuman(np.delete(embeddings, i, axis=1)) for i in range(nfeatures)]
    featuresImportance = np.argsort(diff)[::-1]

    # Construct pruned embeddings: evaluate the 1, 2, ..., nfeatures most important features
    a = []
    for i in range(nfeatures):
        toRemove = featuresImportance[i+1:]
        a.append(fitToHuman(np.delete(embeddings, toRemove, axis=1)))

    # NaN-aware maximum: a feature set whose similarities are constant (possible with
    # very few features) has an undefined rho, which must not be selected as the best.
    # On exact ties the smallest feature set wins.
    indexMax = int(np.nanargmax(a))
    featuresToKeep = featuresImportance[:indexMax+1]

    return featuresToKeep, a[indexMax]

In [None]:
w1 = []
w2 = []
sim = []
for l in pMathsDataFiltered.itertuples():
    if l.PresentationOrder == 'word1_word2':
        w1.append(l.word1)
        w2.append(l.word2)
    else:
        w1.append(l.word2)
        w2.append(l.word1)
    sim.append(l.Answer)
pMathsDataOrder = pd.DataFrame({'word1': w1, 'word2': w2, 'answer': sim})
pMathsDataOrderAgg = pMathsDataOrder.groupby(['word1', 'word2']).mean().reset_index()

humanSim = pd.pivot(pMathsDataOrderAgg, index='word1', columns='word2', values='answer')

In [None]:
featuresToKeep, corr = prune(humanSim, vectors.loc[humanSim.index])

In [None]:
# featuresToKeep holds 0-based column positions, while the embedding columns are
# selected by the labels str(i+1) (presumably '1', '2', ... -- confirm against the CSV header)
newEmbeddings = vectors[[str(i+1) for i in featuresToKeep]].copy()
# Label both axes of the similarity matrix with the vocabulary itself rather than
# assuming a fixed vocabulary size of 999 words. The column labels are passed as a
# plain array so that the column axis stays unnamed and stack() still produces the
# 'level_1' column renamed below.
newSim = pd.DataFrame(cosine_similarity(newEmbeddings), index=vectors.index, columns=vectors.index.to_numpy())
newSimStack = newSim.stack().reset_index().rename(columns={'word': 'word1', 'level_1': 'word2', 0: 'PrunedSim'}).set_index(['word1','word2'])
pMathsDataAggPruned = pMathsDataAgg.join(newSimStack, on=['word1', 'word2'])

In [None]:
newEmbeddings.to_csv('prunedEmbeddings.csv')

In [None]:
pMathsDataAggPruned = quantileCut(pMathsDataAggPruned, ['PrunedSim'])

In [None]:
# Spearman's rank correlation analysis
res = stats.spearmanr(pMathsDataAggPruned.AnswerBins, pMathsDataAggPruned.PrunedSimBins)
display(Markdown(rf"Spearman's $r_s$ coefficient: {res.statistic:.2f} (p = {res.pvalue:.2e})<br>$r_s^2$ = {res.statistic**2:.3f}"))

In [None]:
model = rankOLS(pMathsDataAggPruned.AnswerBins, pMathsDataAggPruned.PrunedSimBins)
results = model.fit()
results.summary2()

In [None]:
# Joint figure of binned human answers against binned pruned similarity:
# a line plot in the central panel, histograms with KDE in the margins.
g = sns.JointGrid(pMathsDataAggPruned, x="PrunedSimBins", y="AnswerBins")
g.plot_joint(sns.lineplot)
g.plot_marginals(sns.histplot, kde=True)
# Dashed vertical reference line at x = 0.
g.ax_joint.axvline(x=0, linestyle='--', color='.4')
#g.set_axis_labels("GloVe similarity", "Human similarity")
# Axis labels are blanked; the labelled variant is kept commented out above.
g.set_axis_labels("","")
# g.ax_joint.text(0.7, 0.5, f"N = {len(pMathsDataAgg.Answer)}\nSpearman's $r_s$ = {res.statistic:.2f}\np < .001", 
#                 horizontalalignment='center', verticalalignment='center', bbox={'edgecolor':'black', 'facecolor':'none'})
# Resize the figure relative to figWidth, then render and clear it.
fig = plt.gcf()
fig.set_size_inches(figWidth/1.2, figWidth*ratio)
plt.tight_layout()
plt.show()
plt.clf()

#### Quality of GloVe fit depending on education level

In [None]:
# Pair similarity tables from the '_nonmaths' and '_all' files, both indexed by
# PairID. The 50 in the file names follows the pairSim_{NumberDim}_{corpus}
# naming used by the dimension sweep later in the notebook.
nonMathsData = pd.read_csv('../Data/pairSim/English/pairSim_50_nonmaths.csv', encoding='utf-8', index_col='PairID')
allData = pd.read_csv('../Data/pairSim/English/pairSim_50_all.csv', encoding='utf-8', index_col='PairID')

In [None]:
# global fit global corpus
# Attach the all-corpora similarity of each pair to every participant answer by
# matching the Question column against the PairID index. Columns of the
# participant table whose names clash with the similarity table get "_part".
globalFit = pMathsDataFiltered.join(allData, on="Question", lsuffix="_part")
# rankOLS(y, X): rank-based OLS with Answer as the dependent variable and
# Similarity as the predictor.
model = rankOLS(globalFit.Answer,globalFit.Similarity)
results = model.fit()
# Last expression of the cell: the notebook displays the regression summary.
results.summary2()

In [None]:
# global fit non-maths corpus
# Attach the non-maths-corpus similarity of each pair to every participant
# answer by matching the Question column against the PairID index. Columns of
# the participant table whose names clash with the similarity table get "_part".
nonmathsFit = pMathsDataFiltered.join(nonMathsData, on="Question", lsuffix="_part")
# rankOLS takes (y, X): the human answer is the dependent variable and the
# corpus similarity the predictor, as in the all-corpora fit and the per-level
# fits of this notebook.
model = rankOLS(nonmathsFit.Answer, nonmathsFit.Similarity)
results = model.fit()
# Last expression of the cell: the notebook displays the regression summary.
results.summary2()

In [None]:
# global fit maths corpus
# Attach the maths-corpus similarity of each pair (stimData) to every
# participant answer by matching the Question column against the PairID index.
# Columns of the participant table whose names clash with the similarity table
# get "_part".
mathsFit = pMathsDataFiltered.join(stimData, on="Question", lsuffix="_part")
# rankOLS takes (y, X): the human answer is the dependent variable and the
# corpus similarity the predictor, as in the all-corpora fit and the per-level
# fits of this notebook.
model = rankOLS(mathsFit.Answer, mathsFit.Similarity)
results = model.fit()
# Last expression of the cell: the notebook displays the regression summary.
results.summary2()

In [None]:
# education level

edLevel = []
mathsR, nonMathsR, allR = [], [], []
# Each similarity table is paired with the list collecting its R² values.
fitTargets = ((stimData, mathsR), (nonMathsData, nonMathsR), (allData, allR))

# One rank-based OLS (Answer ~ Similarity) per EdLevelId group and per corpus.
for level, levelData in pMathsDataFiltered.groupby('EdLevelId'):
    edLevel.append(int(level))
    for sims, simList in fitTargets:
        dat = levelData.join(sims, on="Question", rsuffix="Sim")
        model = rankOLS(dat.Answer, dat.Similarity)
        result = model.fit()
        simList.append(result.rsquared)

# Long table with one row per (level, training corpus); Fit is R² in percent.
fitByCorpus = {'Maths Corpus': mathsR, 'Non Maths Corpus': nonMathsR, 'All Corpora': allR}
diffGloVeEdLevel = (
    pd.DataFrame(fitByCorpus, index=edLevel)
    .stack()
    .to_frame('Fit')
    .reset_index()
    .rename(columns={'level_0': 'Level', 'level_1': 'Training Corpus'})
)
diffGloVeEdLevel['Fit'] = diffGloVeEdLevel['Fit'].to_numpy() * 100
# Noise ceiling of each row's level, also scaled to percent.
diffGloVeEdLevel['NoiseCeiling'] = [noiseCeiling[lvl]*100 for lvl in diffGloVeEdLevel['Level']]

In [None]:
# level of acquisition of the pair

wordLevel = []
mathsR, nonMathsR, allR = [], [], []
# Each similarity table is paired with the list collecting its R² values.
fitTargets = ((stimData, mathsR), (nonMathsData, nonMathsR), (allData, allR))

# One rank-based OLS (Answer ~ Similarity) per PairLevel group and per corpus.
for level, levelData in pMathsDataFiltered.groupby('PairLevel'):
    wordLevel.append(int(level))
    for sims, simList in fitTargets:
        dat = levelData.join(sims, on="Question", rsuffix="Sim")
        model = rankOLS(dat.Answer, dat.Similarity)
        result = model.fit()
        simList.append(result.rsquared)

# Long table with one row per (level, training corpus); Fit is R² in percent.
fitByCorpus = {'Maths Corpus': mathsR, 'Non Maths Corpus': nonMathsR, 'All Corpora': allR}
diffGloVeWordLevel = (
    pd.DataFrame(fitByCorpus, index=wordLevel)
    .stack()
    .to_frame('Fit')
    .reset_index()
    .rename(columns={'level_0': 'Level', 'level_1': 'Training Corpus'})
)
diffGloVeWordLevel['Fit'] = diffGloVeWordLevel['Fit'].to_numpy() * 100
# Noise ceiling of each row's level, also scaled to percent.
diffGloVeWordLevel['NoiseCeiling'] = [noiseCeilingWordLevel[lvl]*100 for lvl in diffGloVeWordLevel['Level']]

In [None]:
# article fig
# Fit (R² in percent) per education level and training corpus, with the noise
# ceiling drawn as a dashed grey line.
# Zero-based dense rank of Level, used as x for the noise-ceiling line —
# presumably so that it lines up with the categorical positions of the
# pointplot; confirm against the Level values.
diffGloVeEdLevel['rank'] = diffGloVeEdLevel['Level'].rank(method='dense')-1
ax = sns.pointplot(diffGloVeEdLevel, x='Level', y='Fit', hue='Training Corpus')
sns.lineplot(diffGloVeEdLevel, x='rank', y='NoiseCeiling', ax=ax, linestyle='--', color='grey', sort=False, legend=False)
# Blank axis labels; one tick per entry of edLevelOrder except the first,
# labelled with edLevelOrder[1:].
ax.set(ylim=[0,55], ylabel='', xlabel='',
       xticks=[i for i in range(len(edLevelOrder)-1)])
ax.set_xticklabels(edLevelOrder[1:], rotation = 45, ha='right')
# Create the legend and remove it straight away, so the figure has no legend.
leg = plt.legend()
leg.remove()
plt.grid(axis='x')
fig = plt.gcf()
fig.set_size_inches(figWidth/1.4, ratio*figWidth)
# Uniform font size for the title, axis labels and tick labels.
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(12)
plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# article fig
# Fit (R² in percent) per pair level and training corpus, with the noise
# ceiling drawn as a dashed grey line.
ax = sns.pointplot(diffGloVeWordLevel, x='Level', y='Fit', hue='Training Corpus')#, sort=False)
# The noise-ceiling line uses Level directly as x.
# NOTE(review): this assumes the Level values coincide with the pointplot positions — confirm.
sns.lineplot(diffGloVeWordLevel, x='Level', y='NoiseCeiling',legend=False, linestyle='--', color='grey', sort=False, ax=ax,
            label='Noise ceiling')
# Blank axis labels; one tick per entry of wordLevelOrder except the last,
# labelled with wordLevelOrder[:-1].
ax.set(ylim=[0,55], ylabel='', xlabel='',
       xticks=[i for i in range(len(wordLevelOrder)-1)])
# ax.set(ylim=[0,100], ylabel='% of explained variance', xlabel='Estimated level of acquisition of words of the pair',
#        xticks=[i for i in range(len(wordLevelOrder))])
ax.set_xticklabels(wordLevelOrder[:-1], rotation = 30, ha='right')
# Create the legend and remove it straight away, so the figure has no legend.
leg = plt.legend()
leg.remove()
plt.grid(axis='x')
fig = plt.gcf()
fig.set_size_inches(figWidth/1.4, ratio*figWidth)
# Uniform font size for the title, axis labels and tick labels.
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(14)
plt.tight_layout()
plt.show()
plt.clf()

## Optimisation of the number of dimensions of GloVe vectors

In [None]:
# Embedding sizes to load: every size from 1 to 49, then 50 to 500 in steps of 50.
indices = list(range(1, 50)) + list(range(50, 501, 50))

# Read the pair similarity table of every (size, corpus) combination and tag
# each row with the settings of the file it came from.
dimTables = []
for i in indices:
    for corpus in ("maths", "nonmaths", "all"):
        dat = pd.read_csv(f'../Data/pairSim/English/pairSim_{i}_{corpus}.csv', encoding='utf-8')
        dat = dat.assign(NumberDim=[i] * len(dat), TrainingCorpus=[corpus] * len(dat))
        dimTables.append(dat)

# Single long table holding all the loaded similarities.
GloVeDims = pd.concat(dimTables)

In [None]:
# R² of the rank-based OLS (Answer ~ Similarity) for every combination of
# embedding size and training corpus.
r = []
corpus = []
nDims = []

for (n, c), corpusData in GloVeDims.groupby(['NumberDim', 'TrainingCorpus']):
    # Index the similarities by PairID so they can be matched on Question.
    simsByPair = corpusData.set_index('PairID')
    dat = pMathsDataFiltered.join(simsByPair, on="Question", rsuffix="Sim")
    model = rankOLS(dat.Answer, dat.Similarity)
    result = model.fit()
    nDims.append(n)
    corpus.append(c)
    r.append(result.rsquared)

# Fit is R² expressed in percent.
GloVeDimsSubs = pd.DataFrame({'NumberDim': nDims, 'TrainingCorpus': corpus, 'Fit': np.array(r) * 100})

In [None]:
# Display the table of fits (one row per NumberDim / TrainingCorpus combination).
GloVeDimsSubs

In [None]:
# article fig
# Fit (R² in percent) against the number of embedding dimensions, restricted to
# the rows whose TrainingCorpus is "maths".
ax = sns.lineplot(GloVeDimsSubs.loc[GloVeDimsSubs.TrainingCorpus == "maths"], x="NumberDim", y="Fit")
# Dashed horizontal line at the 'Global' noise ceiling, scaled to percent.
ax.axhline(y=noiseCeiling['Global']*100, color='grey', linestyle='--')
# Axis labels are blanked; the labelled variant is kept commented out below.
ax.set(xlabel="", ylabel="", ylim=[0,50])
#ax.set(ylim=[0,100], ylabel="% of explained variance", xlabel="Number of dimensions of GloVe vectors")
fig = plt.gcf()
fig.set_size_inches(figWidth/1.4, ratio*figWidth/1.4)
# Uniform font size for the title, axis labels and tick labels.
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(10)
plt.tight_layout()
plt.show()
plt.clf()