# Predict severity using construct-text similarity on suicide risk lexicon




In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import sys
import os
# TODO: !pip install construct-tracker

# Add the sibling construct-tracker source directories to the import path so the
# `construct_tracker` import below can be resolved.
# NOTE(review): presumably a stand-in for pip-installing the package (see the TODO above) — confirm.
sys.path.append('./../construct-tracker/src/')
sys.path.append('./../construct-tracker/src/construct_tracker/')

from construct_tracker import lexicon
# TODO remove
# from importlib import reload
# reload(lexicon)



In [None]:
# Full lexicon: used further down for token counting via `srl.extract`.
srl = lexicon.load_lexicon(name='srl_v1-0')
# Prototypes lexicon: used further down as the CTS lexicon via `srl_prototypes.to_dict()`.
srl_prototypes = lexicon.load_lexicon(name='srl_prototypes_v1-0')

In [None]:
# Where the notebook is running; selects the input/output directories below.
# Supported values: 'colab', 'openmind', 'local'.
location = 'local'

if location == 'colab':
    from google.colab import drive
    project_name = 'concept_tracker'
    drive.mount('/content/drive')
    input_dir = f'/content/drive/MyDrive/datum/{project_name}/data/ctl/'
    output_dir = f'/content/drive/MyDrive/datum/{project_name}/data/output/lexicon_paper/'
    # The rest of the notebook reads `output_features_dir` / `output_ml_dir`, which were
    # previously only defined for 'local' (NameError at os.makedirs otherwise).
    # NOTE(review): mirrors the 'local' layout (features saved next to the inputs) — confirm paths.
    output_features_dir = input_dir
    output_ml_dir = output_dir
elif location == 'openmind':
    input_dir = '/nese/mit/group/sig/projects/dlow/ctl/'
    output_dir = '/home/dlow/datum/lexicon/data/output/mpnet/'
    # NOTE(review): same fallback as for 'colab' — confirm paths.
    output_features_dir = input_dir
    output_ml_dir = output_dir
elif location == 'local':
    input_dir = './data/input/ctl/'
    output_features_dir = './data/input/ctl/'
    output_ml_dir = './data/output/ml_performance/cts/'
else:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unknown location {location!r}; expected 'colab', 'openmind' or 'local'"
    )

# Make sure both output directories exist before anything is written to them.
os.makedirs(output_features_dir, exist_ok=True)
os.makedirs(output_ml_dir, exist_ok=True)

In [None]:
# When True, the validation set is replaced by a random 20% hold-out of the
# (balanced) training set, similar to other models that do 5-fold CV.
# Kept False: setting it to True had bad performance.
balanced_validation_set = False

# Read each split, drop rows with missing values and renumber rows from 0.
train = (
    pd.read_csv(input_dir+'train10_train_30perc_text_y_balanced_regression.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)
val = (
    pd.read_csv(input_dir+'train10_val_15perc_text_y_regression.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)
test = (
    pd.read_csv(input_dir+'train10_test_15perc_text_y_regression.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)

print(train.shape, val.shape, test.shape)

if balanced_validation_set:
    # Imported lazily so scikit-learn is only needed when this option is on.
    from sklearn.model_selection import train_test_split

    train, val = train_test_split(train, test_size=0.2, random_state=42)
    print(train.shape, val.shape, test.shape)

In [None]:
# Show how many rows carry each value of the target `y`, for train, val and test in turn.
for split_df in (train, val, test):
    display(split_df['y'].value_counts())

### 1. Count tokens using lexicon

In [None]:
# Echo the (rows, columns) shape of the training split as the cell output.
train.shape

In [None]:
# Count whether lexicon tokens appear in each document, for the train and test splits.
#
# Timing notes:
# About 75 sec for 4160 reddits posts
# About 76 sec for 5654 CTL convos (just texter) (already lemmatized lexicon)

load = True   # True: skip extraction (reloading saved counts is still a TODO)
toy = False   # True: only process the first 40 rows of each split (quick smoke run)


if load:
    # TODO: reload previously saved outputs instead of recomputing, e.g.:
    # counts = pd.read_csv(save_dir+'suicide_risk_lexicon_counts.csv')
    # with open(save_dir+'suicide_risk_lexicon_matches_construct2doc.json', 'r') as json_file:
    #     matches_construct2doc = json.load(json_file)
    # Can do the same for matches_by_construct, matches_doc2construct
    pass
else:
    for split, filename in [
        (train, 'train10_train_30perc_text_y_balanced_regression'),
        (test, 'train10_test_15perc_text_y_regression'),
    ]:
        # Strip carriage returns from the text column. This writes back into the
        # `train` / `test` DataFrame itself, because `split` is the same object.
        split['text'] = split['text'].str.replace('\r', '', regex=False)

        if toy:
            split = split.iloc[:40]

        # Built from the cleaned column, so no second '\r' pass is needed. The previous
        # code cleaned `documents` before assigning it, which raised NameError on the
        # first iteration and reused the previous split's documents afterwards.
        # Always a NumPy array, in toy mode as well.
        documents = split['text'].values

        # srl = lexicon.lemmatize_tokens(srl)
        counts, matches_by_construct, matches_doc2construct, matches_construct2doc = srl.extract(
            documents,
            documents_df=split,
            normalize=False,
            save_dir=output_features_dir,
            save_append_to_filename=filename,
        )
        display(counts)


In [None]:
# Echo the directory that is passed as `save_dir` to the feature-extraction calls.
output_features_dir

In [None]:
# # Interpret counts: visualize matches in context  


# n = 2
# highlight_constructs = ['Lethal means for suicide', 'Passive suicidal ideation', 'Direct self-injury', 'Panic', 'Depressed mood']
# for construct in highlight_constructs:
# 	print(f'Matches for {construct}:')
# 	lexicon.highlight_matches(documents, construct,n, matches_construct2doc, random_seed=42)
# 	print()



### 2. Construct-Text Similarity (CTS)
Find similar tokens to the tokens in the lexicon

We'll use only the most prototypical tokens for CTS (rated 3/3 by raters), stored in `srl_prototypes`, so that similarity is not computed against tokens with low prototypicality.

### Do not save doc embeddings (too heavy). Use sentence tokenization.

In [None]:
from construct_tracker import cts
from importlib import reload
reload(cts)  # pick up local edits to the cts module without restarting the kernel

# Timing notes:
# ~5k CTL convos for 50 constructs - 60m
# 56 min for 50 constructs, chat without responder, all-MiniLM-L6-v2, preloading lexicon tokens, with lemmatization
# Encoding document clause tokens is what takes the longest.
# computing similarity between 50 constructs and 5353 documents...#  06:50

load = False  # True: read previously saved CTS scores instead of recomputing them
toy = False   # True: only process the first 5 rows of each split (quick smoke run)

if load:
    # Each variable is read from the file of the matching split. The names were
    # previously swapped: X_train_df pointed at the test file and vice versa.
    X_train_df = pd.read_csv(output_features_dir+'cts-scores_count-sum_thresh-03_24-08-14T16-48-28/train10_train_30perc_text_y_balanced_regression_cts-scores.csv')
    X_test_df = pd.read_csv(output_features_dir+'cts-scores_count-sum_thresh-03_train10_test_15perc_text_y_regression_24-08-14T22-50-58/cts_scores.csv')

else:
    # NOTE: `features`, `lexicon_dict_final_order`, `cosine_similarities` and `documents`
    # are overwritten on every iteration, so after the loop they refer to the last
    # split (test). Results are saved under `save_dir` by cts.measure.
    for split, filename in [
        (train, 'train10_train_30perc_text_y_balanced_regression'),
        (test, 'train10_test_15perc_text_y_regression'),
    ]:
        if toy:
            documents = split['text'].iloc[:5].values
            split = split.iloc[:5]
        else:
            documents = split['text'].values
        # 31 sec for 42 reddit posts (relatively short) and 50 constructs.
        lexicon_dict = srl_prototypes.to_dict()
        features, lexicon_dict_final_order, cosine_similarities = cts.measure(
            lexicon_dict,
            documents,
            documents_df=split,  # pass the DF so it can concat
            # You can store and reload embeddings for lexicon tokens
            # stored_embeddings_path = './data/input/lexicons/embeddings_lexicon-tokens_all-mpnet-base-v2.pickle',
            document_representation='sentence',
            count_if_exact_match=False,
            embeddings_model="avsolatorio/GIST-small-Embedding-v0",
            similarity_threshold=0,
            save_dir=output_features_dir,
            save_doc_embeddings=False,
            # document_embeddings_path = output_features_dir+'/embeddings_',
            save_append_to_filename=filename,
        )


In [None]:

# features2 = pd.concat([split, features], axis=1)
# features2




In [None]:
# features2.to_csv(output_features_dir+'cts-scores_count-sum_thresh-03_train10_test_15perc_text_y_regression_24-08-14T22-50-58/cts_scores2.csv', index=False)

In [None]:
# Feature columns to plot: the `_max` score column of selected constructs.
highlight_constructs_max = [
    'Passive suicidal ideation_max',
    'Active suicidal ideation & suicidal planning_max',
    'Lethal means for suicide_max',
    'Direct self-injury_max',
    'Suicide exposure_max',
    'Other suicidal language_max',
    'Depressed mood_max',
]

In [None]:
# Bar plot of the selected `_max` scores, summed within each value of the target `y`.
#
# `features2` and `threshold` were referenced here without being defined anywhere
# in live code (their only definitions are commented out), so this cell raised NameError.
# NOTE(review): threshold mirrors `similarity_threshold = 0` passed to cts.measure — confirm.
threshold = 0
# cts.measure is given `documents_df=split` "so it can concat"; if `y` is nevertheless
# missing from its output, attach the split columns here.
if 'y' in features.columns:
    features2 = features
else:
    features2 = pd.concat([split, features], axis=1)

features2[highlight_constructs_max + ['y']].groupby('y').sum().plot.bar()
plt.ylabel(f'Sum of cosine similarities > {threshold}')


In [None]:
# # Creating the new dataframe based on the conditions
# result = counts[highlight_constructs].where(counts[highlight_constructs] >= 1, features_threshold)
# result['subreddit']=subreddits


# # Bar plot of counts split by subreddit
# result[highlight_constructs+['subreddit']].groupby('subreddit').sum().plot.bar()
# plt.ylabel(f'Sum of cosine similarities')


In [None]:
# # If below threshold then replace with NaN
# threshold = 0.45 # depends on embeddings used

# features_threshold = result[highlight_constructs]
# features_threshold[features_threshold <= threshold] = np.nan
# features_threshold['subreddit'] = subreddits

# features_threshold[highlight_constructs+['subreddit']].groupby('subreddit').sum().plot.bar()
# plt.ylabel(f'Sum of cosine similarities > {threshold}')

In [None]:
# # If below threshold then replace with NaN
# threshold = 0.70 # depends on embeddings used
# features_threshold = result[highlight_constructs_max]
# features_threshold[features_threshold <= threshold] = np.nan
# features_threshold['subreddit'] = subreddits

# features_threshold[highlight_constructs_max+['subreddit']].groupby('subreddit').sum().plot.bar()
# plt.ylabel(f'Sum of cosine similarities > {threshold}')

In [None]:
# TODO: tokenization is messing up spaces, which doesn't allow highlighting matches in context
# TODO: highlight exact match if possible: replace values in cosine_similarities

# Interpret scores: for one document, look up the highest-similarity phrase per construct.
highlight_constructs = [
    'Lethal means for suicide',
    'Passive suicidal ideation',
    'Direct self-injury',
    'Panic',
    'Depressed mood',
]
doc_id = 10  # position of the document to inspect

for construct in highlight_constructs:
    print(f'Matches for {construct}:')

    (
        most_similar_lexicon_token,
        most_similar_document_token,
        highest_similarity,
    ) = cts.get_highest_similarity_phrase(
        doc_id,
        construct,
        documents,
        features['documents_tokenized'].tolist(),
        cosine_similarities,
        lexicon_dict_final_order,
    )
    print()



# Models