# Steps

1. Preprocess document (and construct)
- Tokenize into clauses
- Tokenize into words, remove stop words, lemmatize : useful for lexicon count as well

2. Encode construct:
- construct prototype: I want to die
- each token: I wish I didn't wake up tomorrow
- weighted centroid

3. Encode doc


In [None]:
import pandas as pd
import numpy as np
import os


import re
import seaborn as sns

# catpro
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, './../catpro')

from catpro.text.utils import stop_words
from catpro.text.utils import clean
from catpro.text.utils.tokenizer import spacy_tokenizer
from catpro.text.utils.lemmatizer import spacy_lemmatizer

from catpro.text.embeddings import vectorize, cosine_similarity

In [None]:
import datetime

# Run timestamp (UTC) used to tag the files written by this notebook.
# datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
# produces exactly the same formatted string.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%dT%H-%M-%S')
ts

In [None]:
input_dir = './data/input/'
output_dir = './data/output/'

In [None]:
import os


# Encode docs and lexicon (skip if done with this specific lexicon and dataset)

In [None]:
df = pd.read_csv(input_dir+'BurstingStudy_DailyData_02_15_2021_clean_preprocessed.csv', index_col = 0)    

In [None]:
df.columns

In [None]:
# df = pd.read_csv(input_dir+'train10_train_concurrent_metadata_100perconstruct_with_messages.csv', index_col = 0)


In [None]:
docs = df['SI_DescribeText_clean'].values

In [None]:
approach_embedding_names = [  # lexicon centroid weighted by distance to construct label
   # 'w_w_glove', 
    # 'w_c_psychbert',
    # 'w_w_minilm', 
    'w_c_minilm',           
    # 'wl_w_minilm',
    # 'wl_c_minilm'
]
    

embedding_name_type = {
#    model_name:embedding_type 
    # 'glove': 'word',
    'all-MiniLM-L6-v2': 'sentence',
    # 'all-MiniLM-L6-v2': 'document',
    # 'mnaylor/psychbert-cased': 'document',# need to fix  
}


    

# Functions

In [None]:
def df_similarity_token_category(embeddings_tokens_doc, constructs_d, df, docs_clean, summary_stats = None):
    '''
    Score every document against every construct and store per-document summaries in `df`.

    For each document, the cosine similarity between each construct embedding and the
    embeddings of the document's tokens (words or clauses) is computed. The per-token
    scores are then reduced to their mean, median and max.

    Args:
        embeddings_tokens_doc: indexable by document position; item i holds the token
            embeddings of document i.
        constructs_d: dict mapping construct name -> construct embedding.
        df: DataFrame that receives the `<construct>_mean`, `<construct>_median` and
            `<construct>_max` columns. It is modified in place.
        docs_clean: tokenized documents, indexable by position.
        summary_stats: accepted but not used.

    Returns:
        The same `df` object with the summary columns added.
    '''
    construct_names = list(constructs_d.keys())
    summaries = {'mean': [], 'median': [], 'max': []}

    for doc_idx, _ in enumerate(docs_clean):
        doc_embeddings = embeddings_tokens_doc[doc_idx]
        # each token is a row, and each column is a construct measured for that token
        token_scores = pd.DataFrame(docs_clean[doc_idx], columns = ['token'])
        for name in construct_names:
            construct_embedding = np.array(constructs_d.get(name), dtype=float)
            doc_embeddings = np.array(doc_embeddings, dtype=float)
            if doc_embeddings.shape[0] == 0:
                # happens when the document is an empty string: score a zero vector instead
                doc_embeddings = [np.zeros(construct_embedding.shape[0])]
            similarities = cosine_similarity(construct_embedding, doc_embeddings)
            token_scores[name] = np.array(similarities, dtype = float)[0]

        token_scores = token_scores[construct_names].astype(float)
        summaries['mean'].append(token_scores.mean())
        summaries['median'].append(token_scores.median())
        summaries['max'].append(token_scores.max())

    # one frame per statistic (rows = documents), columns suffixed with the statistic name
    summary_frames = []
    for stat in ('mean', 'median', 'max'):
        frame = pd.concat(summaries[stat], axis=1).T
        frame.columns = [col + '_' + stat for col in frame.columns]
        summary_frames.append(frame)

    feature_vectors = pd.concat(summary_frames, axis=1)
    df[feature_vectors.columns.tolist()] = feature_vectors.values

    return df

# Preprocessing

### TODO: remove words with negation 1-3 words prior

In [None]:
%%time
run_this = True

if run_this:
    # Normalize each raw document: missing values become '', doubled sentence
    # punctuation is collapsed, and repeated spaces are removed.
    docs_clean = []
    for raw_doc in docs:
        text = str(raw_doc)
        if text == 'nan':
            text = ''
        text = text.replace('!.', '!').replace('?.', '?').replace('....', '...')
        docs_clean.append(clean.remove_multiple_spaces(text))

In [None]:
# more_stop_words = ['ca', 'nt','like', "'", "´", "n’t"]

In [None]:
%%time
# Word-level preprocessing (used for word-word similarity); currently switched off.
run_this = False 

if run_this:
    # NOTE(review): `more_stop_words` is only assigned in a commented-out line of this
    # notebook; define it before enabling this cell, otherwise it raises NameError.
    # words: tokenize by words, remove stop words and lemmatize for word-word similarity
    docs_clean_w_w = stop_words.remove(list(docs_clean), language = 'en', sws = 'nltk', remove_punct=True, extend_stopwords=more_stop_words)
    docs_clean_w_w = [clean.remove_multiple_spaces(doc) for doc in docs_clean_w_w]
    docs_clean_w_w = spacy_lemmatizer(docs_clean_w_w, language ='en') #this takes 22s for 5200 docs
    # store the lemmatized documents next to the original rows
    df['docs_clean_w_w'] = docs_clean_w_w


### Tokenize into clauses

In [None]:
df.shape

In [None]:
%%time 
# 1 min for 1300 counseling sessions. 3s for 2300 sentences.



import re

run_this = True

if run_this:
    # Split every cleaned document into clauses and keep the result next to the rows.
    clause_tokenizer_options = dict(
        language = 'en',
        model = 'en_core_web_sm',
        token = 'clause',  # clause tokenization
        lowercase = False,
        display_tree = False,
        remove_punct = True,
        clause_remove_conj = True,
    )
    docs_clean_clauses = spacy_tokenizer(docs_clean, **clause_tokenizer_options)
    df['docs_clean_clauses'] = docs_clean_clauses

    # Persist the tokenized dataset, tagged with the run timestamp.
    tokenized_csv_path = input_dir+f'BurstingStudy_DailyData_02_15_2021_clean_preprocessed_tokenized_{ts}.csv'
    df.to_csv(tokenized_csv_path)

In [None]:
df

In [None]:
ts

In [None]:
# df[['docs_clean_clauses','docs_clean_w_w']].iloc[::10]


In [None]:
# [[n,i] for n,i in zip(docs, docs_clean_w_c)][::10]

# Encode embeddings and compute similarity

### Construct (Lexicon)

In [None]:

catpro_text_dir = './../catpro/catpro/text/data/'
lexicon_df = pd.read_csv(catpro_text_dir+'lexicons/suicidal_thoughts_and_behaviors/OsirisRankinFirstPassForDanLowMarch_3_ 2023_daniel_added_prototypes.csv', index_col = 0)


In [None]:
seed_tokens

In [None]:
other_tokens

In [None]:
# Construct names are the lexicon columns that carry no '_add' / '_remove' / '_examples' marker.
annotation_markers = ('_add', '_remove', '_examples')
constructs = [n for n in lexicon_df.columns if not any(marker in n for marker in annotation_markers)]
lexicon = {}

# Values of '<construct>_remove' that flag a row as a seed token.
seed_labels = ['seed_token', 'construct label', 'prototype']
# Labels that must not be used as seed tokens.
excluded_seed_tokens = {'active_si', 'passive_si', 'thwarted belongingness'}

for construct in constructs:
    # Address the two columns directly instead of substring-matching column names,
    # which would also pick up columns of any construct whose name contains this one.
    tokens_col = lexicon_df[construct]
    remove_flags = lexicon_df[construct+'_remove']

    # The previous code called .remove() on a numpy array inside a bare `except`,
    # so the excluded labels were never actually removed; filter a list instead.
    # NaN entries are dropped as well, as is already done for other_tokens.
    seed_tokens = [
        n for n in tokens_col[remove_flags.isin(seed_labels)].values
        if str(n) != 'nan' and n not in excluded_seed_tokens
    ]
    if len(seed_tokens)==0:
        print(construct, 'has 0 seed_tokens')

    # Rows whose remove-flag is empty or '0' are kept as regular lexicon tokens.
    other_tokens = tokens_col[remove_flags.astype(str).isin(['nan', '0'])].values
    other_tokens = [n for n in other_tokens if str(n)!='nan']
    if len(other_tokens)==0:
        print(construct, 'has 0 other_tokens')

    lexicon[construct] = {
        'seed_tokens':seed_tokens,
        'other_tokens':other_tokens,
    }


    
    

#     if '_add' not in n or '_remove' not in n or '_examples' not in n   

In [None]:
# # load risk factor and encode

# input_dir_catpro = './../data/'
# # lexicon = pd.read_csv(input_dir_catpro+'lexicons/suicidal_thoughts_and_behaviors/suicide_risk_lexicon_thesauri_questionnaires_23-03-16T19-35-06.csv', index_col = 0)
# import json

# with open(input_dir_catpro+'lexicons/suicidal_thoughts_and_behaviors/suicide_risk_lexicon_thesauri_questionnaires_23-03-16T19-35-06.json') as f:
#     lexicon = json.load(f)


In [None]:
lexicon.keys()

In [None]:
# df = pd.read_csv('./../data/lexicons/suicidal_thoughts_and_behaviors/suicide_risk_lexicon_thesauri_questionnaires_23-03-16T19-35-06.csv', index_col = 0)

# print(len(df.columns.tolist()))
# print(df.columns.tolist())

In [None]:
# Constructs to score in this run; commented-out entries are deliberately excluded.
constructs_to_measure = ['abuse_physical',
 'abuse_sexual',
 'active_si',
 'aggression_irritability',
 'agitation',
 # 'alcohol_use',
 # 'anhedonia_uninterested',
 'anxiety',
 # 'barriers_to_treatment',
 # 'bully',
 'burdensomeness',
 'defeat_failure',
 'depressed_mood',
 # 'desire_to_escape',
 # 'discrimination',
 # 'eating_disorder',
 'emotional_pain',
 'emptiness',
 'entrapment',
 'fatigue_tired',
 'finances_work',
 'gender_sexual_identity',
 'grief_bereavement',
 'guilt',
 'hopelessness',
 'impulsivity',
 'loneliness_isolated',
 # 'panic',
 'passive_si',
 # 'perfectionism',
 'relationships',
 'rumination',
 'self-injury',
 'shame_self-disgust',
 'sleep_issues',
 # 'social_withdrawl',
 'substance_use',
 # 'thwarted_belongingness'
                        ]

# Fail early with a readable message: lexicon.get(c) would silently store None for a
# construct that is absent from the lexicon and only crash in a later cell.
missing_constructs = [c for c in constructs_to_measure if c not in lexicon]
if missing_constructs:
    raise KeyError(f'constructs_to_measure not found in lexicon: {missing_constructs}')

# Restrict the lexicon to the selected constructs, in the order listed above.
lexicon_final = {c: lexicon[c] for c in constructs_to_measure}

lexicon = lexicon_final.copy()


In [None]:
for c in lexicon.keys():
    print(c,lexicon.get(c).get('seed_tokens'))

In [None]:
# # TODO: merge active_si and passive_si, change name of self-injury


# constructs_to_measure = ['self-injury',
#  'active_si',
#  'passive_si',
#  'bully',
#  'abuse_physical',
#  'abuse_sexual',
#  'relationships',
#  'grief_bereavement',
#  'loneliness_isolated',
#  'anxiety',
#  'depressed_mood',
#  'gender_sexual_identity',
#  'eating_disorder',
#  'substance_use']



# # values = ['']*len(constructs_to_measure)
# # ctl_tags = dict(zip(constructs_to_measure, values))
# ctl_tags_d = {'self-injury': 'self_harm',
#  # 'active_si': 'suicide',
#  # 'passive_si': 'suicide',
#     'suicide':'suicide',
#  'bully': 'bully',
#  'abuse_physical': 'abuse_physical',
#  'abuse_sexual': 'abuse_sexual',
#  'relationships': 'relationship',
#  'grief_bereavement': 'bereavement',
#  'loneliness_isolated': 'isolated',
#  'anxiety': 'anxiety_stress',
#  'depressed_mood': 'depressed',
#  'gender_sexual_identity': 'gender_sexual_identity',
#  'eating_disorder': 'eating_body_image',
#  'substance_use': 'substance'}

# ctl_tags = np.unique(list(ctl_tags_d.values())).tolist()

In [None]:
# # merge active and passive
# suicide_tokens = lexicon['active_si']+lexicon['passive_si']
# suicide_tokens = [n for n in suicide_tokens if str(n)!='nan']
# lexicon['suicide'] = suicide_tokens

# # rename lexicon constructs
# for k_old, k_new in ctl_tags_d.items():
#     lexicon[k_new] = lexicon.pop(k_old)

# # keep only ctl constructs
# lexicon_constructs_not_in_ctl = set(lexicon.keys()) - set(ctl_tags_d.values())
# lexicon_constructs_not_in_ctl
# for construct in lexicon_constructs_not_in_ctl:
#     del lexicon[construct]

# # for construct-word to doc analyses, have a single construct 
# # lexicon_prototypes = dict(zip(lexicon.keys(), ['']*len(lexicon.keys())))
# lexicon_prototypes = {'suicide': 'suicide',
#  'self_harm': 'I cut myself',
#  'bully': "bullied",
#  'abuse_physical': 'physical abuse',
#  'abuse_sexual': 'sexual abuse and rape',
#  'relationship': 'relationship',
#  'bereavement': "grieving and mourning",
#  'isolated': 'lonely',
#  'anxiety_stress': 'anxious',
#  'depressed': 'depressed',
#  'gender_sexual_identity': 'gender and sexual orientation',
#  'eating_body_image': 'eating disorder',
#  'substance': 'drugs'}

### Encode

Encode each token only once, because the same token can appear in multiple lexicons.

In [None]:
embeddings_lexicon_tokens_d = {}

model_name = 'all-MiniLM-L6-v2'
embedding_type = 'sentence'
list_of_lists = False

In [None]:
import pickle

In [None]:
embeddings_dir = input_dir

In [None]:
# prior_encoded_embeddings = pd.read_csv(input_dir_catpro+'lexicons/suicidal_thoughts_and_behaviors/tokens_embeddings_22-12-02T17-43-57.csv', index_col = 0)
# prior_encoded_embeddings

In [None]:
# encode all lexicon tokens

# Collect the seed and other tokens of every construct.
tokens_all = []
for construct in lexicon.keys():
    construct_tokens = lexicon.get(construct)
    tokens_all.extend(construct_tokens.get('seed_tokens'))
    tokens_all.extend(construct_tokens.get('other_tokens'))

# A token can appear in several constructs; keep only its first occurrence so that
# each token is encoded once (dict.fromkeys preserves insertion order).
tokens_all = list(dict.fromkeys(tokens_all))
    


In [None]:
%%time 

#20 sec
run_this = True

if not run_this:
    # Reuse the token embeddings cached by a previous run.
    with open(embeddings_dir+f'embeddings_lexicon-tokens_{model_name}.pickle', 'rb') as handle:
        embeddings_lexicon_tokens_d = pickle.load(handle)
else:
    # Encode every lexicon token and map token -> embedding.
    embeddings_lexicon_tokens = vectorize(
        tokens_all,
        list_of_lists=list_of_lists,
        embedding_type = embedding_type,
        model_name = model_name,
    ) # 10 s for list of tokens for 5200 docs
    embeddings_lexicon_tokens_d = {
        token: embedding for token, embedding in zip(tokens_all, embeddings_lexicon_tokens)
    }

    # Cache the mapping so later runs can skip the encoding step.
    with open(embeddings_dir+f'embeddings_lexicon-tokens_{model_name}.pickle', 'wb') as handle:
        pickle.dump(embeddings_lexicon_tokens_d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# lexicon_prototypes.values()

In [None]:
# %%time 

# run_this = False

# if run_this:

#     embeddings_lexicon_prototypes = vectorize(list(lexicon_prototypes.values()), list_of_lists=False, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs

#     for k,v in zip(list(lexicon_prototypes.values()), embeddings_lexicon_prototypes):
#         embeddings_lexicon_tokens_d[k]=v
#     with open(embeddings_dir+f'embeddings_{model_name}.pickle', 'wb') as handle:
#         pickle.dump(embeddings_lexicon_tokens_d, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [None]:
### Create weighted lexicon

### Doc

In [None]:
# df = pd.read_csv(input_dir+f'train10_train_concurrent_metadata_100perconstruct_with_messages_preprocessed_23-03-20T17-50-34.csv')
# docs_clean_clauses = df['docs_clean_clauses'].values

In [None]:
# df[['message','docs_clean_clauses','docs_clean_w_w']].iloc[::10]


In [None]:
# lexicon_remove = pd.read_csv('./../data/lexicons/suicidal_thoughts_and_behaviors/suicide_risk_lexicon_thesauri_questionnaires_23-03-16T19-35-06_osiris_v01.csv', index_col = 0,encoding='cp1252')
# # suicide_list = lexicon_remove['active_si'].tolist()+lexicon_remove['passive_si'].tolist()
# nans= ['nan']*(lexicon_remove.shape[1]+6)
# lexicon_remove['suicide'] = suicide_list_temp+nans
# lexicon_remove['suicide']

In [None]:


# lexicon_remove_d = {}                            
# for l in lexicon.keys():
#     l=     ctl_tags_d_inv.get(l)
#     if l=='suicide':
#         lexicon_remove_d[l] = lexicon_remove_i
#         continue

#     lexicon_remove_i = lexicon_remove[[l,l+'_remove']].replace('seed_token', 0).replace('construct label', 1)
#     lexicon_remove_i['self-injury_remove'] = lexicon_remove_i['self-injury_remove'].astype(float)
    
    
#     lexicon_remove_i = lexicon_remove_i[lexicon_remove_i[l+'_remove']>=1][l].values
#     lexicon_remove_i = [n for n in lexicon_remove_i if str(n)!= 'nan' ]
#     print(lexicon_remove_i)    
#     # to_remove_i = []
#     lexicon_remove_d[l] = lexicon_remove_i
    
                             
                             
                             

# Encode

In [None]:
method = 'word_clause'
method.startswith('word_')

In [None]:
%%time

def construct_text_similarity(
    constructs = None,
    lexicon = None,
    construct_prototype_d = None,
    embeddings_construct_d = None,
    docs = None,
    embeddings_docs_d = None,
    method = 'word_clause'
):
    '''
    Compare every document with every construct and summarize the similarities.

    A doc is composed of tokens. For each (doc, construct) pair, the cosine similarity
    between the construct embedding(s) and the embeddings of the doc's tokens is
    computed and reduced to three summary statistics: mean, median and max.

    Args:
        constructs: iterable of construct names.
        lexicon: dict construct -> list of tokens; used by 'lexicon_*' methods.
        construct_prototype_d: dict construct -> prototype token; used by 'word_*' methods.
        embeddings_construct_d: dict token -> embedding vector.
        docs: sequence of tokenized docs; the doc at position i is paired with
            embeddings_docs_d.get(i).
        embeddings_docs_d: dict doc position -> embeddings of that doc's tokens.
        method: name starting with 'word_' (one prototype per construct) or
            'lexicon_' (all lexicon tokens of the construct).
            'wlexicon_*' (weighted lexicon) is not implemented.

    Returns:
        DataFrame with one row per doc: column 'doc' (str(doc)) followed by
        '<construct>_mean', '<construct>_median' and '<construct>_max' for each construct.
        A doc without token embeddings is scored against a zero vector ('word_*')
        or gets a similarity of 0 ('lexicon_*').

    Raises:
        NotImplementedError: for 'wlexicon_*' methods.
        ValueError: for an unknown method, or a prototype embedding that is not 1-D.
        KeyError: if a lexicon token has no entry in embeddings_construct_d.
    '''
    # Validate the method up front: the stub used to break out of the loop and return
    # rows without any feature, and an unknown prefix left the embedding undefined.
    if method.startswith('wlexicon_'):
        raise NotImplementedError("'wlexicon_*' (weighted lexicon) methods are not implemented yet")
    if not method.startswith(('word_', 'lexicon_')):
        raise ValueError(f"unknown method {method!r}; expected a 'word_*' or 'lexicon_*' method")
    use_prototype = method.startswith('word_')

    rows = []
    for i, doc in enumerate(docs):
        if i%200==0:
            print(i)  # coarse progress indicator

        # The doc embeddings do not depend on the construct: convert them once per doc.
        doc_embeddings = np.array(embeddings_docs_d.get(i), dtype=float)
        # 0-d array: no entry for this doc; zero rows: empty doc (e.g. empty string)
        doc_has_no_embeddings = doc_embeddings.ndim == 0 or doc_embeddings.shape[0] == 0

        row = {'doc': str(doc)}
        for construct in constructs:
            if use_prototype:
                prototype = construct_prototype_d.get(construct)
                embedding_construct = np.array(embeddings_construct_d.get(prototype), dtype=float)
                if embedding_construct.ndim != 1:
                    raise ValueError(
                        f'prototype embedding for construct {construct!r} must be 1-D, '
                        f'got shape {embedding_construct.shape}'
                    )
                if doc_has_no_embeddings:
                    doc_vectors = [np.zeros(embedding_construct.shape[0])]
                else:
                    doc_vectors = doc_embeddings
                cosine_scores_docs_i = cosine_similarity([embedding_construct], doc_vectors)
            else:
                # the construct is represented by a list of token embeddings
                lexicon_tokens = lexicon.get(construct)
                missing_tokens = [t for t in lexicon_tokens if t not in embeddings_construct_d]
                if missing_tokens:
                    raise KeyError(f'no embedding for tokens of construct {construct!r}: {missing_tokens}')
                embedding_construct = np.array(
                    [embeddings_construct_d[t] for t in lexicon_tokens], dtype=float
                )
                if len(doc) == 0 or doc_has_no_embeddings:
                    cosine_scores_docs_i = [0]
                else:
                    try:
                        cosine_scores_docs_i = cosine_similarity(embedding_construct, doc_embeddings)
                    except Exception as err:
                        # best effort: keep going, but show what went wrong
                        print('broke, returning cosine_similarity = 0', repr(err))
                        cosine_scores_docs_i = [0]

            row[construct+"_mean"] = np.mean(cosine_scores_docs_i)
            row[construct+"_median"] = np.median(cosine_scores_docs_i)
            row[construct+"_max"] = np.max(cosine_scores_docs_i)

        rows.append(row)

    # Build the frame once instead of concatenating one single-row frame per doc.
    return pd.DataFrame(rows)


In [None]:
df['docs_clean_clauses'].astype(str).value_counts()

In [None]:
df['docs_clean_clauses'] = df['docs_clean_clauses'].astype(str)

In [None]:
import ast

# The column holds stringified lists of clauses; parse them back with literal_eval,
# which only accepts Python literals (eval would execute arbitrary expressions).
docs_clean_clauses = [ast.literal_eval(n) for n in df['docs_clean_clauses'].tolist()]

# Unique, non-empty documents in sorted order. np.unique is not suitable for a list
# of lists: it flattens rectangular input into unique clauses and cannot build a
# regular array from ragged input. Documents are compared as tuples of clauses.
unique_docs = {tuple(clauses) for clauses in docs_clean_clauses if clauses}
docs_clean_clauses_unique = [list(clauses) for clauses in sorted(unique_docs)]

In [None]:
len(docs_clean_clauses_unique)

In [None]:



%%time 

# 12 m for 1300 docs, each one tokenized


run_this = True

if run_this:
    model_name = 'all-MiniLM-L6-v2'
    embedding_type = 'sentence'
    list_of_lists = True
    verbose = True

    # Encode the clauses of every unique document (list of lists in, list of embeddings out).
    embeddings_tokens_docs_unique = vectorize(docs_clean_clauses_unique, list_of_lists=list_of_lists, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs
    # Key each embedding by the position of its document in docs_clean_clauses_unique.
    embeddings_tokens_docs_unique_d = dict(enumerate(embeddings_tokens_docs_unique))
    with open(embeddings_dir+f'embeddings_docs_tokenized_unique_{model_name}_{ts}.pickle', 'wb') as handle:
        # Dump the dict built just above; the previous code pickled the undefined
        # name `embeddings_tokens_docs_d`.
        pickle.dump(embeddings_tokens_docs_unique_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # NOTE(review): the file name contains `ts`, so this only finds a cache written
    # with the same timestamp value.
    with open(embeddings_dir+f'embeddings_docs_tokenized_unique_{model_name}_{ts}.pickle', 'rb') as handle:
        embeddings_tokens_docs_unique_d = pickle.load(handle)

# Link back to documents

### Encode docs (list of lists is faster)

In [None]:
# %%time 

# # 12 m for 1300 docs, each one tokenized


# run_this = False

# if run_this:
#     model_name = 'all-MiniLM-L6-v2'
#     embedding_type = 'sentence'
#     list_of_lists = True
#     verbose = True

#     embeddings_tokens_docs = vectorize(docs_clean_clauses, list_of_lists=list_of_lists, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs    
#     embeddings_tokens_docs_d = dict(zip(list(range(len(docs_clean_clauses))), embeddings_tokens_docs))
#     with open(embeddings_dir+f'embeddings_docs_{model_name}_train10_train_concurrent_metadata_100perconstruct_with_messages_{ts}.pickle', 'wb') as handle:
#         pickle.dump(embeddings_tokens_docs_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
# else:
#     with open(embeddings_dir+'embeddings_docs_all-MiniLM-L6-v2_train10_train_concurrent_metadata_100perconstruct_with_messages.pickle', 'rb') as handle:
#         embeddings_tokens_docs_d = pickle.load(handle)

In [None]:
# constructs = lexicon.keys()
# construct_prototype_d = lexicon_prototypes
# embeddings_construct_d = embeddings_lexicon_tokens_d
# docs = docs_clean_clauses
# embeddings_docs_d = embeddings_tokens_docs_d
# method = 'word_clause'

### TODO: Encode tokens from words

In [None]:
# %%time 



# feature_vectors_d = {}

# for method in ['word_word']:
    
#     if method == 'word_word':
#         docs = df['docs_clean_w_w'].values
#     print('=========================')
#     print(method)
#     feature_vectors_all = construct_text_similarity(
#         constructs = lexicon.keys(),
#         construct_prototype_d = lexicon_prototypes,
#         embeddings_construct_d = embeddings_lexicon_tokens_d,
#         docs = docs,
#         embeddings_docs_d = ######TODO,
#         method = method,
#                                                    )
#     feature_vectors_d[method]=feature_vectors_all


### todo: wlexicon for weightedLexicon

### word_ methods for prototype and lexicon_ 

In [None]:
construct_pr

In [None]:
# constructs = lexicon.keys()
# lexicon
# construct_prototype_d = lexicon_prototypes
# embeddings_construct_d = embeddings_lexicon_tokens_d
# docs = docs_clean_clauses
# embeddings_docs_d = embeddings_tokens_docs_unique_d
# method = 'lexicon_clause'

In [None]:
# construct -> seed tokens only
lexicon_seed = {
    name: entry.get('seed_tokens')
    for name, entry in ((key, lexicon.get(key)) for key in lexicon.keys())
}

# construct -> seed tokens followed by all other tokens
lexicon_all = {
    name: list(entry.get('seed_tokens')) + list(entry.get('other_tokens'))
    for name, entry in ((key, lexicon.get(key)) for key in lexicon.keys())
}



# Prototypes - clause

In [None]:
%%time 



# method name -> DataFrame of per-document similarity features
feature_vectors_d = {}

methods_to_run = ['lexicon_clause']  # ['word_clause', 'lexicon_clause']

for method in methods_to_run:
    print('=========================')
    print(method)
    similarity_kwargs = dict(
        constructs = lexicon.keys(),
        lexicon = lexicon_seed,  # SEED tokens only
        construct_prototype_d = None,
        embeddings_construct_d = embeddings_lexicon_tokens_d,
        docs = docs_clean_clauses_unique,
        embeddings_docs_d = embeddings_tokens_docs_unique_d,
        method = method,
    )
    feature_vectors_all = construct_text_similarity(**similarity_kwargs)
    feature_vectors_d[method] = feature_vectors_all


### map back onto DF

In [None]:
# Keep the per-construct max similarity plus the document key used for merging.
max_cols = [name + '_max' for name in lexicon.keys()]
selected_cols = max_cols + ['doc']
method_i_max = feature_vectors_d.get(method)[selected_cols]
# 'doc' becomes 'docs_clean_clauses' so it lines up with the column of the same name in df
method_i_max = method_i_max.rename(columns={'doc': 'docs_clean_clauses'})
# df_cts = df.copy()
# df_cts[method_i_max.columns] = 

In [None]:
print(method_i_max.shape)
print(df.shape)

In [None]:
df_cts = df.merge(method_i_max, on='docs_clean_clauses', how='outer')
print(df_cts.shape)
df_cts

In [None]:
df_cts.to_csv(input_dir+f'dataset_cts_protoypes_{ts}.csv')


In [None]:
import seaborn as sns
import plotly.express as px

In [None]:
threshold = 0.45

In [None]:

# Keep only similarities above `threshold` (every other cell becomes NaN), then rank
# the constructs by the sum of the similarities that passed, largest first.
# Uses the `threshold` variable instead of repeating the literal 0.45.
df_cts_threshold = df_cts[df_cts[max_cols]>threshold]
cts_rank = df_cts_threshold[max_cols].sum().sort_values()[::-1]
cts_rank

In [None]:
np.round(0,2)

In [None]:
def _construct_display_name(column_name):
    '''Turn a '<construct>_max' column name into a readable label, e.g. 'active_si_max' -> 'Active SI'.'''
    label = column_name.replace('_' ,' ').replace(' max', '').capitalize()
    # Upper-case "si" only as a whole word and only after capitalize(): capitalize()
    # lower-cases everything but the first letter, which used to undo the replacement,
    # and a plain str.replace also matched "si" inside longer words.
    return re.sub(r'\bsi\b', 'SI', label)


# Ranking table: one row per construct with the summed above-threshold similarity.
cts_rank_df = pd.DataFrame(cts_rank).reset_index()
cts_rank_df.columns = ['Construct', 'Sum > 0.45']
cts_rank_df['Sum > 0.45'] = [np.round(n,1) for n in cts_rank_df['Sum > 0.45'].values]
cts_rank_df['Construct'] = [_construct_display_name(n) for n in cts_rank_df['Construct'].values]
cts_rank_df.to_csv(output_dir+'rank_cts_045_prototypes.csv')
cts_rank_df

In [None]:
output_dir

In [None]:
import time
time.sleep(1)

In [None]:

# Box plot of the max similarity per construct, ordered by rank; the original text is shown on hover.
plot_columns = cts_rank.index.tolist()+["SI_DescribeText_clean"]
fig = px.box(
    df_cts[plot_columns],
    hover_data=["SI_DescribeText_clean"],
    points="all",
    title='Prototype-Clause Similarity',
)
fig.update_layout(
    xaxis_title="Constructs (prototypes)",
    yaxis_title="Max. cosine similarity<br>with document clauses",
    template='simple_white',
)

savefig=True
if savefig:
    time.sleep(5)
    print('done')
    # append the figure to the (timestamped) html file
    html_path = output_dir+f'cts_{method}_prototypes_{ts}.html'
    with open(html_path, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

## Remove documents below threshold

In [None]:
# Copy of the max-similarity columns where values at or below the threshold are blanked out.
df_cts_nan = df_cts.copy()[max_cols]
at_or_below_threshold = df_cts_nan<=threshold
df_cts_nan[at_or_below_threshold] = np.nan
# keep the original text alongside the scores (used as hover text in the plots)
df_cts_nan['SI_DescribeText_clean'] = df_cts['SI_DescribeText_clean']

In [None]:
df_cts_nan

In [None]:

# Same box plot as before, but only with the similarities that exceed the threshold.
plot_columns = cts_rank.index.tolist()+["SI_DescribeText_clean"]
fig = px.box(
    df_cts_nan[plot_columns],
    hover_data=["SI_DescribeText_clean"],
    points='all',
    title='Prototype-Clause Similarity',
)
fig.update_layout(
    xaxis_title="Constructs (prototypes)",
    yaxis_title="Max. cosine similarity<br>with document clauses",
    template='simple_white',
)

savefig=True
if savefig:
    # append the figure to the (timestamped) html file
    html_path = output_dir+f'cts_{method}_prototypes_nan_{ts}.html'
    with open(html_path, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
fig.show()

In [None]:
df_cts.columns

In [None]:
other_cols = ['ID', 'time_sent', 'time_complete', 'date_sent', 'date_complete',
       'survey_numb', 'day_study', 'Daily_SI_DesireKill', 'Daily_SI_Urge',
       'Daily_SI_Intent', 'Daily_SI_ResistUrge', 'Daily_SI_DesireDie',
       'Day_SI_Percent', 'Day_SI_Time', 'Day_SI_Describe',
       'Daily_SI_Interfere', 'Daily_SI_Cope', 'Daily_SI_Images', 'Day_SI_Plan',
       'SI_DescribeText', 'Day_NSSI', 'Daily_SA', 'Daily_Functioning',
       'Daily_Affect_Feel', 'Daily_Affect_Aware', 'Daily_Affect_Pos',
       'Daily_Affect_Neg', 'Daily_Affect_Stress', 'Daily_Affect_Temper',
       'Daily_Affect_Anger', 'Daily_Impul_SaidWithoutThink',
       'Daily_Impul_Money', 'Daily_Impul_Impatient', 'Daily_Impul_Decision',
       'Daily_Impul_Upset', 'Daily_Impul_ActEmotions', 'Daily_Impul_Impulse',
       'Daily_Impul_Irrespon', 'Daily_Impul_Dangerous', 'Daily_Impul_Ate',
       'Daily_SocialSupport_Family', 'Daily_SocialSupport_Friends',
       'Daily_SI_Desire_Tomorrow', 'Daily_SI_Urge_Tomorrow',
       'Daily_SI_ResistUrge_Tomorrow', 'SI_DescribeText_clean', 'word_count',
       'skipped', 'Day_SI_Describe_transformed', 'docs_clean_clauses',]

In [None]:
# Copy of the max-similarity columns where values at or below the threshold are set to 0.
df_cts_zero = df_cts.copy()[max_cols]
at_or_below_threshold = df_cts_zero<=threshold
df_cts_zero[at_or_below_threshold] = 0

# bring the remaining dataset columns back before exporting
df_cts_zero[other_cols] = df_cts[other_cols]

zero_csv_path = input_dir+f'dataset_cts_protoypes_zero_{ts}.csv'
df_cts_zero.to_csv(zero_csv_path)



In [None]:
# Blank out (NaN) the similarities below the threshold.
# The original line had an unbalanced bracket (SyntaxError) and compared with
# `== np.nan`, which is never true; mask() expresses the apparent intent.
# NOTE(review): the masking cell above uses `<=`; confirm which bound is wanted.
df_cts_nan = df_cts[max_cols].mask(df_cts[max_cols] < threshold)

# Lexicon - clause

In [None]:
%%time 



# method name -> DataFrame of per-document similarity features
feature_vectors_d = {}

methods_to_run = ['lexicon_clause']  # ['word_clause', 'lexicon_clause']

for method in methods_to_run:
    print('=========================')
    print(method)
    similarity_kwargs = dict(
        constructs = lexicon.keys(),
        lexicon = lexicon_all,  # seed tokens plus all other tokens
        construct_prototype_d = None,
        embeddings_construct_d = embeddings_lexicon_tokens_d,
        docs = docs_clean_clauses_unique,
        embeddings_docs_d = embeddings_tokens_docs_unique_d,
        method = method,
    )
    feature_vectors_all = construct_text_similarity(**similarity_kwargs)
    feature_vectors_d[method] = feature_vectors_all


In [None]:
# Keep the per-construct max similarity plus the document key used for merging.
max_cols = [name + '_max' for name in lexicon.keys()]
selected_cols = max_cols + ['doc']
method_i_max = feature_vectors_d.get(method)[selected_cols]
# 'doc' becomes 'docs_clean_clauses' so it lines up with the column of the same name in df
method_i_max = method_i_max.rename(columns={'doc': 'docs_clean_clauses'})


In [None]:
df['docs_clean_clauses'] = df['docs_clean_clauses'].astype(str)

In [None]:
# Attach the similarity features to the dataset rows via the stringified clause list.
df_cts = df.merge(method_i_max, on='docs_clean_clauses', how='outer')
# Keep only similarities above `threshold` (every other cell becomes NaN) and rank the
# constructs by the sum of what is left. Uses `threshold` instead of the literal 0.45.
df_cts_threshold = df_cts[df_cts[max_cols]>threshold]
cts_rank = df_cts_threshold[max_cols].sum().sort_values()[::-1]

In [None]:
df_cts.to_csv(input_dir+f'dataset_cts_alltokens_{ts}.csv')

In [None]:

# Box plot of the max similarity per construct (full lexicon), ordered by rank.
plot_columns = cts_rank.index.tolist()+["SI_DescribeText_clean"]
fig = px.box(df_cts[plot_columns], hover_data=["SI_DescribeText_clean"], points="all")

savefig=True
for template in ("simple_white",):  # other plotly templates can be added here
    fig.update_layout(template=template)
    if not savefig:
        continue
    # append the figure to the (timestamped) html file
    html_path = output_dir+f'cts_{method}_alltokens_{ts}.html'
    with open(html_path, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

### Without documents below threshold

In [None]:
# Copy of the max-similarity columns where values at or below the threshold are blanked out.
df_cts_nan = df_cts.copy()[max_cols]
at_or_below_threshold = df_cts_nan<=threshold
df_cts_nan[at_or_below_threshold] = np.nan
df_cts_nan['SI_DescribeText_clean'] = df_cts['SI_DescribeText_clean']

# Box plot restricted to the similarities that exceed the threshold.
plot_columns = cts_rank.index.tolist()+["SI_DescribeText_clean"]
fig = px.box(
    df_cts_nan[plot_columns],
    hover_data=["SI_DescribeText_clean"],
    points='all',
    title='Lexicon-Clause Similarity',
)
fig.update_layout(
    xaxis_title="Constructs (lexicon)",
    yaxis_title="Max. cosine similarity<br>with document clauses",
    template='simple_white',
)

savefig=True
if savefig:
    # append the figure to the (timestamped) html file
    html_path = output_dir+f'cts_{method}_alltokens_nan_{ts}.html'
    with open(html_path, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

### Colorcode

In [None]:
# # https://plotly.com/python/line-and-scatter/

# # Need to make more longform

# data = df_cts[cts_rank.index.tolist()+["SI_DescribeText_clean"]+['Day_SI_Describe_transformed']]
# fig = px.scatter(data, y="count", x="nation", color="Day_SI_Describe_transformed")
# fig.update_traces(marker_size=10)
# fig.update_layout(scattermode="group")
# fig = px.box(data, hover_data=["SI_DescribeText_clean"],points="all",color_discrete_sequence = 'Day_SI_Describe_transformed')


In [None]:
# `data` is otherwise only assigned in commented-out code, which made this cell fail
# with NameError. Build it from the ranked similarity columns, the text and the rating.
data = df_cts[cts_rank.index.tolist()+["SI_DescribeText_clean", 'Day_SI_Describe_transformed']].copy()
# NOTE(review): astype(int) raises if the rating column contains missing values.
data['Day_SI_Describe_transformed']=data['Day_SI_Describe_transformed'].astype(int)

In [None]:
# fig = px.box(data, hover_data=["SI_DescribeText_clean"],points="all",color = 'Day_SI_Describe_transformed')
# fig.show()

In [None]:
# Box plot of the similarities, colored by the 'Day_SI_Describe_transformed' column.
fig = px.box(
    data,
    hover_data=["SI_DescribeText_clean"],
    points="all",
    color = 'Day_SI_Describe_transformed',
)

savefig=True
for template in ("simple_white",):  # other plotly templates can be added here
    fig.update_layout(template=template)
    if not savefig:
        continue
    # append the figure to the (timestamped) html file
    html_path = output_dir+f'cts_{method}_alltokens_colorcode-intent_{ts}.html'
    with open(html_path, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

In [None]:
# fig = px.parallel_coordinates(df_cts[cts_rank.index.tolist()+["SI_DescribeText_clean"]])
# fig.show()

# Concurrent validity (once we have human judgements)

In [None]:
import dcor
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr, pointbiserialr
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def cohend(d1, d2):
    """Return Cohen's d for two samples, using the pooled standard deviation.

    Args:
        d1: First sample (sequence or array of numbers).
        d2: Second sample (sequence or array of numbers).

    Returns:
        (mean(d1) - mean(d2)) divided by the pooled standard deviation, where
        each sample variance is computed with ddof=1.
    """
    size_a = len(d1)
    size_b = len(d2)
    # Unbiased (ddof=1) sample variances.
    var_a = np.var(d1, ddof=1)
    var_b = np.var(d2, ddof=1)
    # Pool the variances, weighting each by its degrees of freedom.
    weighted_var_sum = (size_a - 1) * var_a + (size_b - 1) * var_b
    pooled_sd = np.sqrt(weighted_var_sum / (size_a + size_b - 2))
    mean_gap = np.mean(d1) - np.mean(d2)
    return mean_gap / pooled_sd

In [None]:
# Concurrent validity: for each scoring method, correlate every construct's
# per-document max score with the human tags in `df`, on a class-balanced sample.
dcor_all = []  # only filled by the distance-correlation code that is commented out below

r_all = []  # one list per method, holding one point-biserial r per construct

for method in ['word_clause', 'lexicon_clause']:
    print(method)
    # Index directly: an unknown method raises KeyError here instead of a
    # TypeError on None a few lines later.
    feature_vectors_all = feature_vectors_d[method]
    tags = df[constructs]

    # Keep the '<construct>_max' columns and rename them to the bare construct
    # names so they can be indexed with the same keys as `tags`. Selecting with
    # a list already fixes the column order, so no separate order check is needed.
    max_cols = [n+'_max' for n in constructs]
    method_i_max = feature_vectors_all[max_cols].copy()
    method_i_max.columns = constructs
    # dcor
    # dcor_i = dcor.distance_correlation(tags,method_i_max)
    # dcor_all.append(dcor_i)
    # print('dcor', dcor_i)

    r_method_i = []
    for construct in constructs:
        # Balance the classes: every positive row plus an equally sized random
        # draw of negatives (unseeded, so results vary between runs).
        y_true_1 = tags[tags[construct]==1]
        negatives = tags[tags[construct]==0]
        # Cap the draw at the number of negatives: sample() raises ValueError
        # when asked for more rows than exist without replacement.
        y_true_0 = negatives.sample(n=min(len(y_true_1), len(negatives)))
        y_true_1_indexes = y_true_1.index.tolist()
        y_true_0_indexes = y_true_0.index.tolist()
        sampled_indexes = y_true_1_indexes + y_true_0_indexes
        y_true = pd.concat([y_true_1,y_true_0],axis=0)[construct].values
        # Explicit label-based lookup of the same rows, positives first.
        y_pred = method_i_max.loc[sampled_indexes, construct].values

        # r, p = spearmanr(y_true,y_pred)
        r, p = pointbiserialr(y_true,y_pred)

        # Scores plus a 0/1 'Truth' column, in the same positives-first order.
        df_i = method_i_max.loc[sampled_indexes, [construct]].copy()
        df_i['Truth'] = [1]*len(y_true_1_indexes) + [0]*len(y_true_0_indexes)
        y_pred_0 = df_i[df_i['Truth']==0][construct].values
        y_pred_1 = df_i[df_i['Truth']==1][construct].values
        cohens_d = cohend(y_pred_1,y_pred_0)
        # r comes from pointbiserialr, so it is labelled r_pb rather than rho.
        title_i = f"{construct}: r_pb={np.round(r,2)} (p={np.round(p, 4)}) Cohen's d={np.round(cohens_d,2)}"

        # Score density for tagged vs. untagged documents.
        sns.kdeplot(data=df_i,x = construct, hue='Truth')
        plt.title(title_i)
        plt.show()
        r_method_i.append(r)

    r_all.append(r_method_i)
    # NOTE(review): this summarises every method processed so far (all of
    # r_all), not only the current `method` - confirm that is intended.
    stats = np.round([np.mean(r_all), np.std(r_all),np.min(r_all),np.max(r_all)],2)
    print(f'stats: {stats[0]} ± {stats[1]} ({stats[2]}-{stats[3]})')
    print()

In [None]:
r

In [None]:
r_all

In [None]:
stats = np.round([np.mean(r_all), np.std(r_all),np.min(r_all),np.max(r_all)],2)

In [None]:
sns.kdeplot(data=df_i,x = construct, hue='Truth')

In [None]:
df_i

In [None]:
# max_cols = [n+'_max' for n in constructs]

# for c in constructs:
#     print(c)
#     feature_vectors_all_max_docs = feature_vectors_all[['doc']+max_cols].sort_values([construct+'_max'])[::-1]['doc'].values[:5]
#     [print('. '.join(eval(n)), '\n') for n in feature_vectors_all_max_docs]
#     # print(feature_vectors_all_max_docs)
    

In [None]:
# for idx in feature_vectors_all.index:
#     truth = dict(zip(constructs, df[constructs].iloc[idx,:].values))
#     print(truth)
    
#     features = feature_vectors_all.iloc[idx,:].values
#     print('. '.join(eval(features[0])))
#     print(features[1:])
          
#     max_cols = [n+'_max' for n in constructs]
    

In [None]:
import dcor

In [None]:
%%time 


In [None]:
# %%time 

# tags = df[constructs]

# max_cols = [n+'_max' for n in constructs]
# word_clause_max = feature_vectors_all[max_cols]
# assert [n+'_max' for n in constructs] == max_cols 
# word_clause_max.columns = constructs

In [None]:
word_clause_max

In [None]:
dcor_lexicon_clause_max = dcor.distance_correlation(tags,lexicon_clause_max) #0.45

In [None]:
dcor_lexicon_clause_max

In [None]:
word_clause_max.isna().sum()

In [None]:
assert tags.columns.tolist() == word_clause_max.columns.tolist()

In [None]:



# Spearman correlation between the tags and the word_clause_max scores, per
# construct, on a balanced sample (all positives + equally many random negatives).
for construct in constructs:
    construct_tags = tags[construct]
    y_true_1 = tags[construct_tags == 1]
    y_true_0 = tags[construct_tags == 0].sample(n=len(y_true_1))
    y_true_1_indexes = y_true_1.index.tolist()
    y_true_0_indexes = y_true_0.index.tolist()
    y_true = pd.concat([y_true_1, y_true_0], axis=0)[construct].values
    # Scores for the same rows, positives first.
    balanced_scores = word_clause_max[construct][y_true_1_indexes + y_true_0_indexes]
    y_pred = balanced_scores.values
    r, p = spearmanr(y_true, y_pred)
    print(construct, np.round(r, 2), np.round(p, 4))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overlay the score densities of tagged (1) and sampled untagged (0) documents.
# Uses whatever `construct`, `y_true_1_indexes` and `y_true_0_indexes` currently
# hold from a previously run cell.
# The original line read `sns.kdeplotword_clause_max[...]`, which raises
# AttributeError; the call parentheses were missing.
sns.kdeplot(word_clause_max[construct][y_true_1_indexes].values)
sns.kdeplot(word_clause_max[construct][y_true_0_indexes].values)

In [None]:
# Spearman correlation per construct over all rows (no class balancing).
for construct in constructs:
    y_true, y_pred = tags[construct].values, word_clause_max[construct]
    r, p = spearmanr(y_true, y_pred)
    print(construct, np.round(r, 2), np.round(p, 4))
    

In [None]:
# Duplicate of the cell directly above: Spearman correlation between the tags
# and the word_clause_max scores for each construct, using every row.
for construct in constructs:
    y_true = tags[construct].values
    y_pred = word_clause_max[construct]
    # r is the correlation coefficient, p its p-value
    r, p = spearmanr(y_true,y_pred)
    print(construct, np.round(r,2), np.round(p, 4))

In [None]:
df

In [None]:
# %%time
# # embeddings_tokens_docs ={}

# feature_vectors_all = []

# for i, doc in enumerate(docs_clean_clauses[:10]):
#     if i%100==0:
#         print(i)
#     embeddings_tokens_doc_i = vectorize(doc, list_of_lists=list_of_lists, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs    

In [None]:
%%time
# embeddings_tokens_docs ={}

# Rebinds feature_vectors_all to an empty list (earlier cells use the same
# name for a per-method DataFrame of features).
feature_vectors_all = []


# Encode only the first 10 entries of docs_clean_clauses.
# NOTE(review): list_of_lists, embedding_type and model_name must already be
# defined by a previously run cell.
embeddings_tokens_doc_i = vectorize(docs_clean_clauses[:10], list_of_lists=list_of_lists, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs

In [None]:
len(embedding_construct_prototype.shape) == 1

In [None]:
len(feature_vectors_all)

In [None]:

    
#     for i in docs_embeddings:
        
    
    
    
    
    
# # def df_similarity_token_category(embeddings_tokens_doc, constructs_d, df, docs_clean, summary_stats = None):
# #     '''
# #     embeddings_tokens_doc
# #     '''
# # for each doc, it creates a value (e.g., mean across tokens --either words or clauses) for each construct    

# feature_vectors_mean = []
# feature_vectors_median = []
# feature_vectors_max = []

# constructs = list(constructs_d.keys())

# for i, doc in enumerate(docs_clean):
#     embeddings_tokens_doc_i = embeddings_tokens_doc[i]
#     df_scores_category_all = pd.DataFrame(docs_clean[i], columns = ['token'])
#     for category in constructs:
#         embedding_category = constructs_d.get(category)
#         embedding_category = np.array(embedding_category, dtype=float)
#         embeddings_tokens_doc_i = np.array(embeddings_tokens_doc_i, dtype=float)
#         if embeddings_tokens_doc_i.shape[0] == 0: #happens when there is an empty str
#             embeddings_tokens_doc_i = [np.zeros(embedding_category.shape[0])]
#         cosine_scores = cosine_similarity(embedding_category, embeddings_tokens_doc_i)
#         # each token is a row, and each col is a construct being measured for that token.             
#         df_scores_category_all[category] = np.array(cosine_scores, dtype = float)[0]#pd.DataFrame(cosine_scores, columns = ['category'])


#         # df_scores_category = pd.DataFrame([docs_clean[i], np.array(cosine_scores[0])]).T
#         # df_scores_category.columns = ['token', category]
#         # df_scores_category = df_scores_category.sort_values(by='token')
#         # # df_scores_category_all= df_scores_category_all.merge(df_scores_category, on='token', how = 'outer')
#         # df_scores_category_all.append(df_scores_category)
#     # df_scores_category_all = pd.concat(df_scores_category_all, axis=1)
#     df_scores_category_all = df_scores_category_all[constructs].astype(float)

#     # display(df_scores_category_all)
#     feature_vectors_mean.append(df_scores_category_all.mean())
#     feature_vectors_median.append(df_scores_category_all.median())
#     feature_vectors_max.append(df_scores_category_all.max())

# feature_vectors_mean = pd.concat(feature_vectors_mean,axis=1).T
# feature_vectors_mean.columns = [n+'_mean' for n in feature_vectors_mean.columns]

# feature_vectors_median = pd.concat(feature_vectors_median,axis=1).T
# feature_vectors_median.columns = [n+'_median' for n in feature_vectors_median.columns]

# feature_vectors_max = pd.concat(feature_vectors_max,axis=1).T
# feature_vectors_max.columns = [n+'_max' for n in feature_vectors_max.columns]

# feature_vectors = pd.concat([feature_vectors_mean, feature_vectors_median, feature_vectors_max],axis=1)
# df[feature_vectors.columns.tolist()] = feature_vectors.values
# # feature_cols = list(set(feature_vectors.columns)-set(['subreddit','author','date','docs','docs_clean']))
# # feature_cols.sort()
# # feature_vectors= feature_vectors[['subreddit','author','date','docs','docs_clean']+feature_cols]


In [None]:
df = pd.read_csv(input_dir+f'train10_train_concurrent_metadata_100perconstruct_with_messages_preprocessed_23-03-20T17-50-34.csv')

In [None]:
# Encode every document in docs_final and save the result to npy_filepath.
# NOTE(review): docs_final, embedding_type, model_name and npy_filepath are
# assigned in the pipeline cell further down, so that cell (or an equivalent)
# must have been run first.
lexicon_embeddings = {}  # not used again in this cell
print('encoding...')
embeddings_tokens_doc = vectorize(docs_final, list_of_lists=True, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs
# encode tokens of each doc
np.save(npy_filepath, embeddings_tokens_doc)


In [None]:
lexicon.keys()

In [None]:
constructs_suicide_risk_lexicon_weighted_centroid = pd.read_csv('./../data/lexicons/suicidal_thoughts_and_behaviors/weighted_centroids_22-12-04T01-06-02.csv', index_col = 0)

In [None]:
%%time

# Feature-extraction pipeline: for every configured approach, encode the
# documents (or reuse cached embeddings), obtain one embedding per construct,
# and write the similarity features returned by df_similarity_token_category
# to a timestamped CSV.
run_this = True

lexicons_dir = './../data/lexicons/'
embeddings_path = lexicons_dir+'embeddings_tokens_all-MiniLM-L6-v2_22-12-02T17-32-05.json'
tokens_path = lexicons_dir+'suicidal_thoughts_and_behaviors/concurrent_validity_tokens_cosine_similarity_22-12-02T17-43-57.csv'

if run_this:
    for approach_embedding_name in approach_embedding_names:
        # The model is inferred from a substring of the approach name.
        if 'glove' in approach_embedding_name:
            model_name = 'glove'
        elif 'minilm' in approach_embedding_name:
            model_name = 'all-MiniLM-L6-v2'
        elif 'psychbert' in approach_embedding_name:
            # Original note: cannot run on Mac M1, will run on colab.
            model_name = 'mnaylor/psychbert-cased'
        else:
            # Without this branch an unmatched name silently reused the model
            # of the previous iteration (or raised NameError on the first one).
            raise ValueError(f'no embedding model known for approach: {approach_embedding_name}')

        embedding_type = embedding_name_type.get(model_name)
        print('approach_embedding_name: ', approach_embedding_name, 'model_name:', model_name, 'embedding_type: ', embedding_type)

        # NOTE(review): file names below use the size of docs_clean, while the
        # documents actually encoded are docs_final - confirm they match.
        len_docs = len(docs_clean)

        # Pick the pre-tokenized document column: '_w' -> words, '_c' -> clauses.
        if '_w' in approach_embedding_name:
            docs_final = df['docs_clean_w_w'].values
        elif '_c' in approach_embedding_name:
            docs_final = df['docs_clean_w_c'].values
        else:
            # Same reasoning as for model_name: fail loudly instead of reusing
            # a stale docs_final.
            raise ValueError(f'no document tokenization known for approach: {approach_embedding_name}')
        df['docs_final']=docs_final

        # Last two '_'-separated parts of the approach name, e.g. 'c_minilm'.
        type_of_document_tokenization = '_'.join(approach_embedding_name.split('_')[-2:])

        # Document embeddings are cached on disk; recompute only when the cache
        # cannot be read.
        npy_filepath = output_dir+f'army_starrs_{len(constructs)}constructs_{len_docs}docs_{type_of_document_tokenization}_embeddings.npy'
        try:
            embeddings_tokens_doc = np.load(npy_filepath,
                                            allow_pickle=True)
        except (OSError, ValueError, EOFError, pickle.UnpicklingError) as load_error:
            # Narrower than the former bare `except:` so KeyboardInterrupt and
            # programming errors are no longer swallowed.
            print('did not find: ',npy_filepath)
            print('reason: ', repr(load_error))
            print('encoding...')
            embeddings_tokens_doc = vectorize(docs_final, list_of_lists=True, embedding_type = embedding_type, model_name = model_name) # 10 s for list of tokens for 5200 docs
            # encode tokens of each doc
            np.save(npy_filepath, embeddings_tokens_doc)
        else:
            print('loaded from prior run')

        filename = model_name.split('/')[-1]

        if approach_embedding_name.startswith('wl_'):
            # centroid weighted by cosine sim to construct label

            constructs_d = {}
            # Load embeddings for construct
            # NOTE(review): lexicons_embeddings is only needed by the
            # commented-out centroid recomputation below.
            with open(embeddings_path, 'r') as json_file:
                lexicons_embeddings = json.load(json_file)

            lexicons_tokens = pd.read_csv(tokens_path, index_col = 0)
            for construct in lexicons_tokens['construct'].unique():
                # Use the precomputed weighted centroid for this construct.
                weighted_centroid = constructs_suicide_risk_lexicon_weighted_centroid[construct].values
                constructs_d[construct]=weighted_centroid
                # How the centroid could be recomputed from the raw tokens:
                # lexicons_tokens_i = lexicons_tokens[lexicons_tokens['construct']==construct]
                # tokens_i = lexicons_tokens_i['token'].values
                # scores_i = lexicons_tokens_i['score'].values
                # embeddings_i = np.array([lexicons_embeddings.get(token) for token in tokens_i])
                # weighted_centroid = np.average(embeddings_i, axis=0, weights=scores_i)

            print('loaded dict of construct embeddings')

        else:
            constructs_cache_path = output_dir+f'constructs{len(constructs)}_{approach_embedding_name}.pkl'
            try:
                # Pickle can execute arbitrary code on load: only read cache
                # files written by this notebook.
                with open(constructs_cache_path, 'rb') as f:
                    constructs_d = pickle.load(f)
            except (OSError, EOFError, pickle.UnpicklingError):
                print('encoding construct embeddings...')
                # encode constructs
                constructs_d = {}
                embeddings_constructs = vectorize(constructs, embedding_type = embedding_type, model_name = model_name)
                for category, embedding in zip(constructs, embeddings_constructs):
                    constructs_d[category] = embedding
                with open(constructs_cache_path, 'wb') as f:
                    pickle.dump(constructs_d, f)
            else:
                print('loaded dict of construct embeddings')

        # compute similarity (extract features)
        feature_vectors = df_similarity_token_category(embeddings_tokens_doc, constructs_d, df, docs_final, summary_stats = None)
        feature_vectors.to_csv(output_dir+f'feature_vectors_{len(constructs)}constructs_{len_docs}docs_{approach_embedding_name}_{ts}.csv')
