In [None]:
import os

import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
import plotly.graph_objects as go
import plotly.express as px
from statistics import mean

In [2]:
#Load Notebook with hierarchies Created

Hierarchy=pd.read_excel('./AIMLESS/Hierarchy_Generation_Results.xlsx',sheet_name='RDoC_4o_1')
Hierarchy.head()

Unnamed: 0,level_0,level_1,level_2,description
0,Negative Valence Systems,,,Negative Valence Systems are primarily respons...
1,Negative Valence Systems,Fear,,A response to threat that results in an emotio...
2,Negative Valence Systems,Fear,Phobia,"An intense, irrational fear of specific object..."
3,Negative Valence Systems,Fear,Panic,"Sudden, intense fear or discomfort that may in..."
4,Negative Valence Systems,Fear,Conditioned Fear,Fear that is learned through association with ...


In [7]:

Hierarchy=pd.read_csv('./results/generated_openai_hierarchy.csv')
Hierarchy.head()

Unnamed: 0,level_1,level_2,level_3,level_4,level_5,level_6,description
0,Cognitive Function,,,,,,Cognitive Function encompasses various mental ...
1,Cognitive Function,Attention,,,,,The ability to sustain concentration on a part...
2,Cognitive Function,Attention,Selective Attention,,,,The focus on one particular stimulus or task w...
3,Cognitive Function,Attention,Selective Attention,Focused Attention,,,Concentrating on a specific stimulus while ign...
4,Cognitive Function,Attention,Selective Attention,Filtering,,,The ability to select relevant inputs from a f...


In [9]:
#Isolate the First level of the tree (for comparison across this level ONLY)

Tops=Hierarchy[Hierarchy['level_2'].isna()].copy()
Tops.head(10)

Unnamed: 0,level_1,level_2,level_3,level_4,level_5,level_6,description
0,Cognitive Function,,,,,,Cognitive Function encompasses various mental ...
44,Emotion Regulation,,,,,,"The process of monitoring, evaluating, and mod..."
66,Social Interaction,,,,,,Methods to amplify feelings of motivation and ...
113,Communication and Language,,,,,,Focusing on the underlying interests of the pa...
146,Behavioral Control,,,,,,Expressions conveying emotions like pride or e...
162,Physical Health and Activity,,,,,,Self-Reflection involves the capacity to think...
224,Attention and Perception,,,,,,The intake of water to meet daily hydration ne...
315,Adaptive Functioning and Life Skills,,,,,,Using the level difference in sound reaching e...
361,Motivation and Goal-Directed Behavior,,,,,,Ability to clean various surfaces effectively.
413,Self-Concept and Identity,,,,,,Objectives set to be achieved over an extended...


In [10]:
#Check the data looks/makes sense

Summaries=list(Tops['description'])
Summaries

['Cognitive Function encompasses various mental processes that allow an individual to carry out any task, from the simplest to the most complex. It includes processes like perception, memory, judgment, and reasoning.',
 "The process of monitoring, evaluating, and modulating one's emotional reactions to achieve a goal or meet situational demands.",
 'Methods to amplify feelings of motivation and engagement.',
 'Focusing on the underlying interests of the parties involved rather than their initial positions.',
 'Expressions conveying emotions like pride or embarrassment.',
 "Self-Reflection involves the capacity to think about one's actions and the impact they have on oneself and others.",
 'The intake of water to meet daily hydration needs.',
 'Using the level difference in sound reaching each ear for localization.',
 'Ability to clean various surfaces effectively.',
 'Objectives set to be achieved over an extended period, requiring sustained effort and planning.']

In [11]:
#Download and sync an embedding space from Huggingface (using 384 parameters in this case example)

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(Summaries, convert_to_tensor=True)

#For visualization purposes to see what it looks like
embeddf = model.encode(Summaries)
print(embeddf)

[[ 0.06662201  0.00855916 -0.036182   ...  0.11996334  0.02672595
  -0.0407065 ]
 [ 0.00103635  0.06531614 -0.0544176  ...  0.07222036 -0.03509864
  -0.03115866]
 [ 0.01878467  0.01473565  0.0211373  ...  0.05483231 -0.02937611
  -0.00212246]
 ...
 [ 0.0539593  -0.05789973 -0.0130451  ...  0.04886086 -0.05555789
   0.03703918]
 [-0.02544525  0.03005852  0.11798873 ... -0.00395616  0.01199988
  -0.04168724]
 [ 0.01091021  0.05785342  0.01567116 ... -0.01319768 -0.03616991
   0.02635377]]


In [6]:
#Create a pandas dataframe (in case we want to use it anywhere later)

# embed_df=pd.DataFrame(embeddf)
# print(embeddings.shape)

# Save Scoring Matrix for later if needed
#embed_df.to_csv('VectorEmbed1.csv',index=False)

# embed_df.head()

In [12]:
# Compare the similarity scores of all items in this level of the hierarchy

#query='what is the adhd rates among young kids?'
#query=Tops['description'].iloc[0]


# A quick example to encode the first item, to see how it scores across the other items at this level of the tree
query=Summaries[0]
print(query)
query_enc=model.encode(query, convert_to_tensor=True)

Cognitive Function encompasses various mental processes that allow an individual to carry out any task, from the simplest to the most complex. It includes processes like perception, memory, judgment, and reasoning.


In [17]:
embeddings

  nonzero_finite_vals = torch.masked_select(


tensor([[ 0.0666,  0.0086, -0.0362,  ...,  0.1200,  0.0267, -0.0407],
        [ 0.0010,  0.0653, -0.0544,  ...,  0.0722, -0.0351, -0.0312],
        [ 0.0188,  0.0147,  0.0211,  ...,  0.0548, -0.0294, -0.0021],
        ...,
        [ 0.0540, -0.0579, -0.0130,  ...,  0.0489, -0.0556,  0.0370],
        [-0.0254,  0.0301,  0.1180,  ..., -0.0040,  0.0120, -0.0417],
        [ 0.0109,  0.0579,  0.0157,  ..., -0.0132, -0.0362,  0.0264]],
       device='mps:0')

In [69]:
# Now score the results against the embedding space

cos_scores=util.cos_sim(query_enc, embeddings)[0]
#cos_scores

In [14]:
# Retrieve Scores 

#(top returns is the number of comparisons) - careful since it is based on list, cannot excede list length
#Top_Returns=5
Top_Returns=len(Summaries)

top_results=torch.topk(cos_scores, k=Top_Returns)
retrievals=list(zip(top_results[1].tolist(),top_results[0].tolist()))

print("Index for retrieval and similarity scores")
retrievals

Index for retrieval and similarity scores


[(0, 1.0),
 (1, 0.3300507664680481),
 (9, 0.24111489951610565),
 (5, 0.2180522233247757),
 (2, 0.20977811515331268),
 (8, 0.1907094419002533),
 (3, 0.1144225001335144),
 (4, 0.10928411781787872),
 (7, 0.044849902391433716),
 (6, -0.02589181438088417)]

In [15]:
#Enumerate the list we are interested in comparing so the results are easier to evaluate

Summaries_Enumerated=dict(enumerate(Summaries))
Summaries_Enumerated

{0: 'Cognitive Function encompasses various mental processes that allow an individual to carry out any task, from the simplest to the most complex. It includes processes like perception, memory, judgment, and reasoning.',
 1: "The process of monitoring, evaluating, and modulating one's emotional reactions to achieve a goal or meet situational demands.",
 2: 'Methods to amplify feelings of motivation and engagement.',
 3: 'Focusing on the underlying interests of the parties involved rather than their initial positions.',
 4: 'Expressions conveying emotions like pride or embarrassment.',
 5: "Self-Reflection involves the capacity to think about one's actions and the impact they have on oneself and others.",
 6: 'The intake of water to meet daily hydration needs.',
 7: 'Using the level difference in sound reaching each ear for localization.',
 8: 'Ability to clean various surfaces effectively.',
 9: 'Objectives set to be achieved over an extended period, requiring sustained effort and pla

In [16]:
for result in retrievals:
    print("Score:  ", result[1])
    print("Grouping Text:  ", Summaries_Enumerated.get(result[0]))
    print("\n")

Score:   1.0
Grouping Text:   Cognitive Function encompasses various mental processes that allow an individual to carry out any task, from the simplest to the most complex. It includes processes like perception, memory, judgment, and reasoning.


Score:   0.3300507664680481
Grouping Text:   The process of monitoring, evaluating, and modulating one's emotional reactions to achieve a goal or meet situational demands.


Score:   0.24111489951610565
Grouping Text:   Objectives set to be achieved over an extended period, requiring sustained effort and planning.


Score:   0.2180522233247757
Grouping Text:   Self-Reflection involves the capacity to think about one's actions and the impact they have on oneself and others.


Score:   0.20977811515331268
Grouping Text:   Methods to amplify feelings of motivation and engagement.


Score:   0.1907094419002533
Grouping Text:   Ability to clean various surfaces effectively.


Score:   0.1144225001335144
Grouping Text:   Focusing on the underlying i

# compare hierarchies

In [3]:
def level_embeddings(df, level):
    level=df.query(str(level) + '.notnull()')[str(level)].unique()
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(level, convert_to_tensor=False)
    return embeddings, level


In [5]:
def cos_scores(hierarchy_1, hierarchy_2):

    top_scores = []
    hierarchy_1_embeddings = []
    hierarchy_2_embeddings = []


    for x in hierarchy_1:
        cat_scores = []
        for y in hierarchy_2:
            hierarchy_1_embeddings.append(x)
            hierarchy_2_embeddings.append(y)
            cat_scores.append(util.cos_sim(x, y)[0][0])
        top_cat_score = np.max([x.numpy() for x in cat_scores])
        top_scores.append(top_cat_score)
        mean_score = np.mean(top_scores)
        
    return mean_score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings

In [117]:
gpt_5x=pd.read_csv('./results/generated_separate_openai_hierarchy_max5_5.csv')
gpt_10x = pd.read_csv('./results/generated_separate_openai_hierarchy_max10_2.csv')
gpt_5x_2 = pd.read_csv('./results/generated_separate_openai_hierarchy_max5_4.csv')


claude_5x_1 = pd.read_csv('./results/generated_separate_anthropic_hierarchy_max5_1.csv')
claude_5x_2 = pd.read_csv('./results/generated_separate_anthropic_hierarchy_max5_2.csv')

claude_10x_1 = pd.read_csv('./results/generated_separate_anthropic_hierarchy_max10_1.csv')
claude_10x_2 = pd.read_csv('./results/generated_separate_anthropic_hierarchy_max10_2.csv')
claude_10x_3 = pd.read_csv('./results/generated_separate_anthropic_hierarchy_max10_3.csv')



In [118]:
gpt_10x_embeddings = {}
for level in range(1, 11):
    gpt_10x_embeddings['level_' + str(level) + '_embeddings'],gpt_10x_embeddings['level_' + str(level) + '_categories'] =level_embeddings(gpt_10x, level='level_' + str(level))

gpt_5x_embeddings = {}
for level in range(1, 6):
    gpt_5x_embeddings['level_' + str(level) + '_embeddings'],gpt_5x_embeddings['level_' + str(level) + '_categories'] =level_embeddings(gpt_5x, level='level_' + str(level))

gpt_5x_2_embeddings = {}
for level in range(1, 6):
    gpt_5x_2_embeddings['level_' + str(level) + '_embeddings'],gpt_5x_2_embeddings['level_' + str(level) + '_categories'] =level_embeddings(gpt_5x_2, level='level_' + str(level))

claude_5x_1_embeddings = {}
for level in range(1, 6):
    claude_5x_1_embeddings['level_' + str(level) + '_embeddings'],claude_5x_1_embeddings['level_' + str(level) + '_categories'] =level_embeddings(claude_5x_1, level='level_' + str(level))

claude_5x_2_embeddings = {}
for level in range(1, 6):
    claude_5x_2_embeddings['level_' + str(level) + '_embeddings'],claude_5x_2_embeddings['level_' + str(level) + '_categories'] =level_embeddings(claude_5x_2, level='level_' + str(level))

claude_10x_1_embeddings = {}
for level in range(1, len(claude_10x_1.columns.to_list())):
    claude_10x_1_embeddings['level_' + str(level) + '_embeddings'],claude_10x_1_embeddings['level_' + str(level) + '_categories'] =level_embeddings(claude_10x_1, level='level_' + str(level))

claude_10x_2_embeddings = {}
for level in range(1, len(claude_10x_2.columns.to_list())):
    claude_10x_2_embeddings['level_' + str(level) + '_embeddings'],claude_10x_2_embeddings['level_' + str(level) + '_categories'] =level_embeddings(claude_10x_2, level='level_' + str(level))

claude_10x_3_embeddings = {}
for level in range(1, len(claude_10x_3.columns.to_list())):
    claude_10x_3_embeddings['level_' + str(level) + '_embeddings'],claude_10x_3_embeddings['level_' + str(level) + '_categories'] =level_embeddings(claude_10x_3, level='level_' + str(level))


# claude_5x_embeddings, claude_5x_level2 =level_embeddings(claude_5x_1, level='level_2')
# claude_5x_embeddings, claude_5x_level2 =level_embeddings(claude_5x_1, level='level_2')
# claude_5x_embeddings, claude_5x_level2 =level_embeddings(claude_5x_1, level='level_2')

gpt4 5 iterations

In [119]:
gpt_5x_scores = {}
for level in range(1,6):
    score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings = cos_scores(gpt_5x_embeddings['level_' + str(level) + '_embeddings'], gpt_5x_2_embeddings['level_' + str(level) + '_embeddings'])
    gpt_5x_scores['level_' + str(level) + '_score'] = [float(np_float) for np_float in top_scores]

levels = [1, 2, 3, 4, 5, 6]
# mean_scores = [np.mean(gpt_5x_scores['level_' + str(level) + '_score'][0]) for level in levels]


all_levels = []
all_scores = []
means = []

for level in range(1,6):
    x = [level] * len(gpt_5x_scores['level_' + str(level) + '_score'])
    y = gpt_5x_scores['level_' + str(level) + '_score']
    mean_score = mean(y)
    all_levels.extend(x)
    all_scores.extend(y)
    means.append(mean_score)

viz_df = pd.DataFrame(all_levels, all_scores)
fig = px.strip(viz_df, x = all_levels, y = all_scores )

fig.update_traces(marker_color = 'lightslategrey')
fig.add_trace(go.Scatter(x = levels, 
                         y = means,
                         name = 'mean',
                        #  marker_color = 'darkblue',
                         line=dict(color='darkblue', width=2, dash='dash'),
                         ))

fig.update_layout(height = 800, width = 800, title = 'Cosine Similarity Scores for GPT Hierarchies',
                  xaxis_title = 'Tree Level', yaxis_range = [0,1.01])

fig.show()


claude 5 iterations

In [120]:
claude_5x_scores = {}
for level in range(1,6):
    score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings = cos_scores(claude_5x_1_embeddings['level_' + str(level) + '_embeddings'], claude_5x_2_embeddings['level_' + str(level) + '_embeddings'])
    claude_5x_scores['level_' + str(level) + '_score'] = [float(np_float) for np_float in top_scores]

levels = [1, 2, 3, 4, 5, 6]


all_levels = []
all_scores = []
means = []

for level in range(1,6):
    x = [level] * len(claude_5x_scores['level_' + str(level) + '_score'])
    y = claude_5x_scores['level_' + str(level) + '_score']
    mean_score = mean(y)
    all_levels.extend(x)
    all_scores.extend(y)
    means.append(mean_score)

viz_df = pd.DataFrame(all_levels, all_scores)
fig = px.strip(viz_df, x = all_levels, y = all_scores)

fig.update_traces(marker_color = 'lightslategrey')
fig.add_trace(go.Scatter(x = levels, 
                         y = means,
                         name = 'mean',
                        #  marker_color = 'darkblue',
                         line=dict(color='darkblue', width=2, dash='dash'),
                         ))
fig.update_layout(height = 800, width = 800, title = 'Cosine Similarity Scores for Claude Hierarchies',
                  xaxis_title = 'Tree Level',yaxis_range = [0,1.01])

fig.show()

In [166]:
# depths = min(len(claude_10x_1.columns.to_list()), len(claude_10x_2.columns.to_list()))
# depths = {}

claude_10x_1_2_scores = {}
claude_10x_1_2_scores['depth'] = min(len(claude_10x_1.columns.to_list()), len(claude_10x_2.columns.to_list())) - 1

for level in range(1,claude_10x_1_2_scores['depth'] + 1):
    score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings = cos_scores(claude_10x_1_embeddings['level_' + str(level) + '_embeddings'], claude_10x_2_embeddings['level_' + str(level) + '_embeddings'])
    claude_10x_1_2_scores['level_' + str(level) + '_score'] = [float(np_float) for np_float in top_scores]

claude_10x_2_3_scores = {}
claude_10x_2_3_scores['depth'] = min(len(claude_10x_2.columns.to_list()), len(claude_10x_3.columns.to_list())) - 1

for level in range(1,claude_10x_2_3_scores['depth'] + 1):
    score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings = cos_scores(claude_10x_2_embeddings['level_' + str(level) + '_embeddings'], claude_10x_3_embeddings['level_' + str(level) + '_embeddings'])
    claude_10x_2_3_scores['level_' + str(level) + '_score'] = [float(np_float) for np_float in top_scores]

claude_10x_3_1_scores = {}
claude_10x_3_1_scores['depth'] = min(len(claude_10x_3.columns.to_list()), len(claude_10x_1.columns.to_list())) - 1

for level in range(1,claude_10x_3_1_scores['depth'] + 1):
    score, top_scores, hierarchy_1_embeddings, hierarchy_2_embeddings = cos_scores(claude_10x_3_embeddings['level_' + str(level) + '_embeddings'], claude_10x_1_embeddings['level_' + str(level) + '_embeddings'])
    claude_10x_3_1_scores['level_' + str(level) + '_score'] = [float(np_float) for np_float in top_scores]

# levels = list(range(1,max(claude_10x_1_2_scores['depth'], claude_10x_2_3_scores['depth'], claude_10x_3_1_scores['depth']) + 1))

levels = []
all_levels = []
all_scores = []
means = []

for comp in [claude_10x_1_2_scores, claude_10x_2_3_scores, claude_10x_3_1_scores]:
    depth = comp['depth']
    print(depth)
    for level in range(1,depth + 1):
        x = [level] * len(comp['level_' + str(level) + '_score'])
        y = comp['level_' + str(level) + '_score']
        mean_score = mean(y)
        all_levels.extend(x)
        all_scores.extend(y)
        levels.append(level)
        means.append(mean_score)

results_df = pd.DataFrame({'all_levels': all_levels, 'all_scores': all_scores})

corr_means = [] 
corr_levels = []

for level in np.unique(all_levels):
    corr_means.append(mean(results_df.query('all_levels == @level')['all_scores'].to_list()))
    corr_levels.append(level)

# mean_df = pd.DataFrame({'mean': corr_means, 'level': corr_levels})

viz_df = pd.DataFrame(all_levels, all_scores)
fig = px.strip(viz_df, x = all_levels, y = all_scores)

fig.update_traces(marker_color = 'lightslategrey')
fig.add_trace(go.Scatter(x = corr_levels, y = corr_means, name = 'mean', marker_color = 'darkblue'))

fig.update_layout(height = 800, width = 800, title = 'Cosine Similarity Scores for Claude Hierarchies',
                  xaxis_title = 'Tree Level',yaxis_range = [0,1.01])

fig.show()

6
7
6


In [145]:
results_df

Unnamed: 0,0
1.000000,1
1.000000,1
1.000000,1
1.000000,1
1.000000,1
...,...
0.462860,6
0.419196,6
0.456009,6
0.265769,6


In [164]:
mean_df

Unnamed: 0,mean,level
0,1.0,1
1,0.925512,2
2,0.826795,3
3,0.760533,4
4,0.680774,5
5,0.500115,6
6,0.042835,7


In [157]:
results_df.query('all_levels == 2')['all_scores'].to_list()

[1.000000238418579,
 1.000000238418579,
 1.0,
 0.9999998807907104,
 0.7424114942550659,
 1.0,
 0.9999999403953552,
 1.000000238418579,
 1.0,
 0.6912596821784973,
 1.0,
 1.0000001192092896,
 1.0000001192092896,
 1.0,
 1.0,
 1.0000001192092896,
 0.9999999403953552,
 1.0000001192092896,
 1.0,
 0.9999998807907104,
 1.0,
 1.0000001192092896,
 1.000000238418579,
 0.3179624676704407,
 0.5453002452850342,
 0.7796064019203186,
 1.000000238418579,
 1.0,
 1.000000238418579,
 1.0000001192092896,
 1.0000001192092896,
 1.0000001192092896,
 1.0,
 0.9999999403953552,
 0.4763067364692688,
 1.0,
 1.0000001192092896,
 0.8229025602340698,
 0.5240507125854492,
 0.5769823789596558,
 1.0,
 1.0000001192092896,
 1.0,
 0.5113064050674438,
 1.0,
 1.000000238418579,
 1.000000238418579,
 1.0,
 0.9999998807907104,
 0.9999998807907104,
 1.0,
 0.9999999403953552,
 1.000000238418579,
 1.0,
 1.0,
 0.7071607708930969,
 1.0000001192092896,
 0.5605977773666382,
 1.0,
 1.0,
 1.0000001192092896,
 0.9999999403953552,
 1.0000

In [139]:
pd.DataFrame(levels, means)


AttributeError: 'DataFrame' object has no attribute 'sorted'

In [108]:
depths['claude_10x_1_2_scores']

7