In [1]:
import os
import pandas as pd
import numpy as np
import json

In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

  from tqdm.autonotebook import tqdm, trange


In [3]:
#Clean Data for Processing

def LoadandProcess(fp):
    df=pd.read_csv(fp)
    df.fillna(value='')
    source=df.columns.to_list()
    source.pop()
    
    df=df.astype(str)
    df["TextSet"] = df[source].agg('|'.join, axis=1)
    df['TextSet2'] = df['TextSet'].str.replace('|nan', '')
    X=df['TextSet2'].tolist()
    y=df['description'].tolist()
    return (source,X,y)

In [4]:
# Extract groupings of text for level comparisons

def TextCleanup(source,X,y):
    Master_dict=dict()
    
    #Iterate through depth of the existing levels
    for depth in range(1,len(source)+1):
        # Iterate through the full tree, accumulating text for each pass to assign the the category
        for idx,item in enumerate(X):
            current_item=item.split('|')[:depth]
            ci="|".join(current_item)
            
            #first see if item exist, if not create new list
            if Master_dict.get(ci) is None:
                newlist=[]
                newlist.append(y[idx])
                Master_dict[ci]=newlist
            # otherwise, if exists- append to existing list
            else:
                current_list=Master_dict.get(ci)
                current_list.append(y[idx])
                Master_dict[ci]=list(set(current_list))
    return Master_dict


In [8]:
fp='./results/generated_separate_openai_hierarchy_max5_3.csv'
source,X,y=LoadandProcess(fp)
Master_dict=TextCleanup(source,X,y)
print(len(Master_dict))

687


# Embeddings

In [11]:
# Load embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def getEmbeddings(Master_dict):
    EmbedDict=dict()
    for k,v in Master_dict.items():
        texts=' '.join(list(v))
        embeddings = model.encode(texts, convert_to_tensor=False)
        EmbedDict[k]=embeddings.tolist()
    return EmbedDict



## Embed Text and save to disk

In [13]:


for file in os.listdir('./results/'):
    if file[-4:]=='.csv':
        fp=os.path.join('./results',file)
        source,X,y=LoadandProcess(fp)
        Master_dict=TextCleanup(source,X,y)
        filename=file[:-4]+'_Corpus.json'
        fsave=os.path.join(os.getcwd(),'Embeddings',filename)
        with open(fsave, "w") as f:
            json.dump(Master_dict, f, indent=4)

        Embeddings=getEmbeddings(Master_dict)
        filename2=file[:-4]+'_Embeddings.json'
        fsave=os.path.join(os.getcwd(),'Embeddings',filename2)
        with open(fsave, "w") as f:
            json.dump(Embeddings, f, indent=4)


# Compare Embeddings

In [21]:
Embeddings=dict()


for file in os.listdir('./Embeddings/'):
    if file[-15:]=='Embeddings.json':
        print(file)
        fp=os.path.join('./Embeddings',file)
        with open(fp) as f:
            Embedding = json.load(f)
        Embeddings[file[:-16]]=Embedding

generated_separate_anthropic_hierarchy_max10_1_Embeddings.json
generated_separate_anthropic_hierarchy_max5_2_Embeddings.json
psych_openai_hierarchy_Embeddings.json
generated_separate_openai_hierarchy_max10_2_Embeddings.json
generated_separate_openai_hierarchy_max5_Embeddings.json
generated_separate_anthropic_hierarchy_max5_3_Embeddings.json
HiTOP_openai_hierarchy_Embeddings.json
DSM_openai_hierarchy_Embeddings.json
generated_separate_openai_hierarchy_max10_1_Embeddings.json
generated_separate_anthropic_hierarchy_max5_1_Embeddings.json
generated_separate_openai_hierarchy_max5_4_Embeddings.json
HiTOP_hierarchy_Embeddings.json
generated_separate_openai_hierarchy_max5_3_Embeddings.json
generated_separate_anthropic_hierarchy_max10_2_Embeddings.json
rdoc_anthropic_hierarchy_Embeddings.json
generated_separate_openai_hierarchy_max5_2_Embeddings.json
generated_separate_anthropic_hierarchy_max10_3_Embeddings.json
generated_separate_openai_hierarchy_max5_5_Embeddings.json
RDoC_hierarchy_Embedding

In [22]:
Embeddings.keys()

dict_keys(['generated_separate_anthropic_hierarchy_max10_1', 'generated_separate_anthropic_hierarchy_max5_2', 'psych_openai_hierarchy', 'generated_separate_openai_hierarchy_max10_2', 'generated_separate_openai_hierarchy_max5', 'generated_separate_anthropic_hierarchy_max5_3', 'HiTOP_openai_hierarchy', 'DSM_openai_hierarchy', 'generated_separate_openai_hierarchy_max10_1', 'generated_separate_anthropic_hierarchy_max5_1', 'generated_separate_openai_hierarchy_max5_4', 'HiTOP_hierarchy', 'generated_separate_openai_hierarchy_max5_3', 'generated_separate_anthropic_hierarchy_max10_2', 'rdoc_anthropic_hierarchy', 'generated_separate_openai_hierarchy_max5_2', 'generated_separate_anthropic_hierarchy_max10_3', 'generated_separate_openai_hierarchy_max5_5', 'RDoC_hierarchy'])

In [None]:
# Pick two embeddings to compare:
e1=Embeddings.get('generated_separate_anthropic_hierarchy_max10_1')
e2=Embeddings.get('generated_separate_anthropic_hierarchy_max10_2')


In [62]:
# Visualize some keys:

e1.keys()

dict_keys(['Cognitive Function', 'Emotion Regulation', 'Social Interaction', 'Communication and Language', 'Behavioral Control', 'Physical Health and Activity', 'Attention and Perception', 'Adaptive Functioning and Life Skills', 'Motivation and Goal-Directed Behavior', 'Self-Concept and Identity', 'Cognitive Function|Memory', 'Cognitive Function|Executive Function', 'Cognitive Function|Learning', 'Cognitive Function|Processing Speed', 'Cognitive Function|Reasoning and Problem Solving', 'Emotion Regulation|Emotional Awareness', 'Emotion Regulation|Emotional Expression', 'Emotion Regulation|Emotional Control', 'Emotion Regulation|Emotional Recovery', 'Social Interaction|Social Awareness', 'Social Interaction|Social Communication', 'Social Interaction|Social Relationships', 'Social Interaction|Social Initiative', 'Communication and Language|Receptive Language', 'Communication and Language|Expressive Language', 'Communication and Language|Pragmatic Communication', 'Communication and Langua

In [61]:


# Compare two sets:

ValueCompare='Cognitive Function'
v1=e1[ValueCompare]
local_scores=list()

for k,v in e2.items():
    cos_scores=util.cos_sim(v1, v)[0][0]
    top_results=torch.topk(cos_scores, k=1)
    top=top_results[0].tolist()
    local_scores.append((k,top))

print(ValueCompare)
#print(local_scores)
df=pd.DataFrame(local_scores)
df.sort_values(by=1, ascending=False).head()

Cognitive Function


Unnamed: 0,0,1
10,Cognitive Function|Memory,0.809959
0,Cognitive Function,0.733615
36,Attention and Perception|Sensory Processing,0.698068
53,Cognitive Function|Memory|Working Memory,0.691716
4,Behavioral Control,0.67844


In [60]:
# Compare two sets:

ValueCompare='Communication and Language'
v1=e1[ValueCompare]
local_scores=list()

for k,v in e2.items():
    cos_scores=util.cos_sim(v1, v)[0][0]
    top_results=torch.topk(cos_scores, k=1)
    top=top_results[0].tolist()
    local_scores.append((k,top))

print(ValueCompare)
#print(local_scores)
df=pd.DataFrame(local_scores)
df.sort_values(by=1, ascending=False).head()

Communication and Language


Unnamed: 0,0,1
90,Communication and Language|Receptive Language|...,0.854902
23,Communication and Language|Receptive Language,0.851444
3,Communication and Language,0.833586
267,Communication and Language|Receptive Language|...,0.828037
24,Communication and Language|Expressive Language,0.816236


In [58]:
# Compare two sets:

ValueCompare='Self-Concept and Identity|Self-Awareness|Emotional Self-Awareness'
v1=e1[ValueCompare]
local_scores=list()

for k,v in e2.items():
    cos_scores=util.cos_sim(v1, v)[0][0]
    top_results=torch.topk(cos_scores, k=1)
    top=top_results[0].tolist()
    local_scores.append((k,top))

print(ValueCompare)
#print(local_scores)
df=pd.DataFrame(local_scores)
df.sort_values(by=1, ascending=False).head()

Self-Concept and Identity|Self-Awareness|Emotional Self-Awareness


Unnamed: 0,0,1
171,Self-Concept and Identity|Identity Development...,0.739083
462,Self-Concept and Identity|Identity Development...,0.738611
52,Self-Concept and Identity|Identity Development,0.733867
711,Self-Concept and Identity|Self-Evaluation|Self...,0.694942
463,Self-Concept and Identity|Identity Development...,0.684712


In [70]:
# Pick two embeddings to compare:
e1=Embeddings.get('generated_separate_anthropic_hierarchy_max10_1')
e2=Embeddings.get('generated_separate_openai_hierarchy_max10_2')

In [71]:
# Compare two sets:

ValueCompare='Motivation and Goal-Directed Behavior|Persistence and Effort'
v1=e1[ValueCompare]
local_scores=list()

for k,v in e2.items():
    cos_scores=util.cos_sim(v1, v)[0][0]
    top_results=torch.topk(cos_scores, k=1)
    top=top_results[0].tolist()
    local_scores.append((k,top))

print(ValueCompare)
#print(local_scores)
df=pd.DataFrame(local_scores)
df.sort_values(by=1, ascending=False).head()

Motivation and Goal-Directed Behavior|Persistence and Effort


Unnamed: 0,0,1
221,Motivation and Goal-Directed Behavior|Intrinsi...,0.654951
453,Motivation and Goal-Directed Behavior|Intrinsi...,0.62395
367,Motivation and Goal-Directed Behavior|Intrinsi...,0.618462
461,Motivation and Goal-Directed Behavior|Intrinsi...,0.577968
9,Self-Concept and Identity,0.574817


In [73]:
# Compare two sets:

ValueCompare='Communication and Language'
v1=e1[ValueCompare]
local_scores=list()

for k,v in e2.items():
    cos_scores=util.cos_sim(v1, v)[0][0]
    top_results=torch.topk(cos_scores, k=1)
    top=top_results[0].tolist()
    local_scores.append((k,top))

print(ValueCompare)
#print(local_scores)
df=pd.DataFrame(local_scores)
df.sort_values(by=1, ascending=False).head()

Communication and Language


Unnamed: 0,0,1
68,Communication and Language|Verbal Communicatio...,0.771443
160,Communication and Language|Verbal Communicatio...,0.757738
23,Communication and Language|Verbal Communication,0.747757
308,Communication and Language|Verbal Communicatio...,0.720011
163,Communication and Language|Auditory Processing...,0.717953
