In [None]:
from transformers import AutoConfig, AutoModelForCausalLM
import torch
from vllm import LLM, SamplingParams

import numpy as np
import pandas as pd
import os
import re
import json
import re
import random
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score

# Read Data and Model

Competition data and annotations

In [None]:
# Load the full set of competition narratives.
narrs = pd.read_csv('../data/raw/nvdrs-youth-restricted.csv')
print('Number of Narratives', narrs.shape)

# Load the annotations, then drop rows flagged iaa == 1, rows without a
# `relevant` label, and repeated (uid, Person) pairs (first occurrence kept).
iaa_narrs = (
    pd.read_csv('../data/interim/annotations.csv')
    .loc[lambda frame: frame['iaa'] != 1]
    .dropna(subset=['relevant'])
    .drop_duplicates(['uid', 'Person'])
)
print('Number of Annotations', iaa_narrs.shape)

Set parameters for GPU use

_This is an NVIDIA RTX A6000 with 49140 MiB of GPU memory; the models we load use most of it._

In [None]:
# GPU selection is done through environment variables.
# NOTE(review): these are set after torch/vllm were imported in the first cell;
# presumably CUDA has not been initialised yet at this point -- confirm.
os.environ.update({
    "NVIDIA_VISIBLE_DEVICES": '00000000:98:00.0',
    "CUDA_VISIBLE_DEVICES": "4",
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
})

# All relative paths used after this cell resolve against the project root.
os.chdir('/shared/0/projects/nvdrs')

# NOTE(review): cache_dir is not referenced by any other cell visible in this notebook.
cache_dir = "/shared/4/models/"

Download model from huggingface (10 seconds)

In [None]:
# Authenticate with the Hugging Face Hub so the model weights can be downloaded.
# `your_token_here` is a placeholder: substitute a personal access token locally
# and never commit or share the notebook with the real value in this cell.
!huggingface-cli login --token your_token_here

In [None]:
# Hugging Face id of the checkpoint to load; alternatives are kept below, commented out.
# NOTE(review): the prompt templates in this notebook use Llama 3 special tokens,
# so switching to another model family presumably needs a different template -- confirm.
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
#model_dir = "Qwen/Qwen2.5-7B-Instruct"
#model_dir = "mistralai/Mistral-7B-Instruct-v0.3"

# Release cached GPU memory held by PyTorch before handing the device to vLLM.
torch.cuda.empty_cache()

llm = LLM(
    model=model_dir,
    enforce_eager=True,
)

Evaluation Function

In [None]:
def evaluate_code(code, agg):
    """Print a confusion matrix and precision/recall/F1 for one code.

    Parameters
    ----------
    code : str
        Name of the annotated column in ``agg``; the model's column is
        expected under ``code + '_llm'``.
    agg : pandas.DataFrame
        One row per evaluated unit, holding both columns. Each column is
        binarised with ``> 0`` before scoring, so counts and 0/1 flags
        are treated alike.

    Returns
    -------
    None
        Results are printed only.
    """
    y_true = agg[code] > 0
    y_pred = agg[code + '_llm'] > 0

    print('Confusion Matrix')
    # Rows are model predictions, columns are human annotations.
    print(pd.crosstab(y_pred, y_true))
    print()

    print('Performance')
    for metric in (precision_score, recall_score, f1_score):
        print(metric.__name__, metric(y_true, y_pred))
    print()

# Turn Narratives into Sentences

Llama prompt format: <https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/>

In [None]:
# Zero-shot Llama 3 chat template used for the sentence-splitting step.
#   {prompt} -- task instruction, appended to the system message
#   {narr}   -- the narrative text, placed in the user turn after "Narrative:"
# The literal begins with a newline before <|begin_of_text|> and ends with
# "Answer:\n"; both are part of the text sent to the model, so changing the
# whitespace changes the model input.
# NOTE(review): this name is reassigned further down to a "Sentence:" variant,
# so the sentence-splitting cells must run before that later cell.
llama3_template_zeroshot = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

I am a researcher studying suicide risk factors. You are a helpful AI question answering assistant, who answers all my questions.
{prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Narrative: {narr}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Answer:
"""

Prompt to split narratives (paragraphs) into sentences

In [None]:
# Instruction inserted into the template's {prompt} slot for sentence splitting.
# The downstream parsing keeps only output lines that start with a 1-2 digit
# number followed by a period.
sent_prompt = 'Split the following narrative into sentences. Format your output as a list of all sentences in the narrative.'

Run inference on LE Narratives (9 mins)

In [None]:
# Run inference: one sentence-splitting prompt per LE narrative, greedy decoding.
le_sents = llm.generate(
    [llama3_template_zeroshot.format(narr=x, prompt=sent_prompt) for x in narrs.NarrativeLE],
    sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Process outputs: keep only the lines that look like numbered list items
# ("1.", "27.", ...). A raw string is used so `\d` is a regex escape rather than
# an invalid Python string escape. The kept lines still carry their number prefix.
numbered_item = re.compile(r'^\d{1,2}\.')
final_le_sents = []
for output in le_sents:
    lines = output.outputs[0].text.split('\n')
    final_le_sents.append([line for line in lines if numbered_item.search(line)])

# Create DF with narrative UID and each sentence.
# Outputs are paired with uids by position (llm.generate returns one result per
# prompt, in prompt order), so this also works if `narrs` has a non-default index.
uids = []
sents = []
for uid, narrative_sents in zip(narrs.uid, final_le_sents):
    for sentence in narrative_sents:
        uids.append(uid)
        sents.append(sentence)
le_sents_df = pd.DataFrame({'uid': uids, 'type': 'LE', 'sentence': sents})

le_sents_df.to_csv('../data/interim/competition-le-sentences.csv', index=None)

Run inference on CME Narratives (8 mins)

In [None]:
# Run inference: one sentence-splitting prompt per CME narrative, greedy decoding.
cme_sents = llm.generate(
    [llama3_template_zeroshot.format(narr=x, prompt=sent_prompt) for x in narrs.NarrativeCME],
    sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Process outputs: keep only the lines that look like numbered list items
# ("1.", "27.", ...). A raw string is used so `\d` is a regex escape rather than
# an invalid Python string escape. The kept lines still carry their number prefix.
numbered_item = re.compile(r'^\d{1,2}\.')
final_cme_sents = []
for output in cme_sents:
    lines = output.outputs[0].text.split('\n')
    final_cme_sents.append([line for line in lines if numbered_item.search(line)])

# Create DF with narrative UID and each sentence.
# Outputs are paired with uids by position (llm.generate returns one result per
# prompt, in prompt order), so this also works if `narrs` has a non-default index.
uids = []
sents = []
for uid, narrative_sents in zip(narrs.uid, final_cme_sents):
    for sentence in narrative_sents:
        uids.append(uid)
        sents.append(sentence)
# These rows come from the CME narratives, so they are tagged 'CME' (was 'LE').
cme_sents_df = pd.DataFrame({'uid': uids, 'type': 'CME', 'sentence': sents})

cme_sents_df.to_csv('../data/interim/competition-cme-sentences.csv', index=None)

Read the split sentences back from file (lets the two sentence-splitting cells above be skipped on later runs)

In [None]:
# Reload the per-sentence tables written by the two sentence-splitting cells.
# Each file has the columns uid, type and sentence.
le_sents_df = pd.read_csv('../data/interim/competition-le-sentences.csv')
cme_sents_df = pd.read_csv('../data/interim/competition-cme-sentences.csv')

# Relevance Prompt

Prompt template for sentences as input

In [None]:
# Sentence-level variant of the zero-shot template: identical to the earlier
# definition except that the user turn is introduced with "Sentence:" instead
# of "Narrative:". The {narr} slot now receives a single sentence.
# NOTE(review): this overwrites the earlier `llama3_template_zeroshot`; re-running
# the sentence-splitting cells after this one would use this variant instead.
llama3_template_zeroshot = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

I am a researcher studying suicide risk factors. You are a helpful AI question answering assistant, who answers all my questions.
{prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Sentence: {narr}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Answer:
"""

Prompt to test whether a sentence mentions an online space

In [None]:
# Yes/No instruction for the relevance step. Downstream cells count a sentence
# as relevant only when the model output is exactly the string 'Yes'.
prompt_relevant = "Does the following sentence talk about an online space? This includes social media, web searches, messaging, chat, email, viewing or posting content, online gaming, online schooling, or cyberbullying. This does not include texting. Answer Yes or No with no explanation."
prompt_relevant

Examine outputs on one narrative

In [None]:
target_uid = 'azpu'

# LE sentences of the chosen narrative, each followed by the model's answer.
le_target_sents = le_sents_df.sentence[le_sents_df.uid==target_uid]
prompts_list = [llama3_template_zeroshot.format(narr=sent, prompt=prompt_relevant)
                for sent in le_target_sents]
le_relevant = llm.generate(prompts_list,
                           sampling_params=SamplingParams(max_tokens=8192, temperature=0))
for sent, result in zip(le_target_sents, le_relevant):
    print(sent)
    print(result.outputs[0].text)

# Same inspection for the CME sentences of that narrative.
cme_target_sents = cme_sents_df.sentence[cme_sents_df.uid==target_uid]
prompts_list = [llama3_template_zeroshot.format(narr=sent, prompt=prompt_relevant)
                for sent in cme_target_sents]
cme_relevant = llm.generate(prompts_list,
                            sampling_params=SamplingParams(max_tokens=8192, temperature=0))
for sent, result in zip(cme_target_sents, cme_relevant):
    print(sent)
    print(result.outputs[0].text)

Evaluate performance on test set (4 mins)

_Note: Poor performance on the neutral and withdraw codes._

In [None]:
# One (uid, annotator) pair per annotation row.
ids = iaa_narrs.uid.tolist()
annotators = iaa_narrs.Person.tolist()

# Run inference on LE narratives.
# Sentences are looked up once per uid instead of re-scanning the whole sentence
# table for every annotation; order is the same (annotation order, then the
# sentence order of the table).
le_by_uid = le_sents_df.groupby('uid', sort=False)['sentence'].agg(list).to_dict()
le_rows = [(uid, person, sent)
           for uid, person in zip(ids, annotators)
           for sent in le_by_uid.get(uid, [])]
le_uids = [row[0] for row in le_rows]
le_annotators = [row[1] for row in le_rows]
le_sents = [row[2] for row in le_rows]
prompts_list = [llama3_template_zeroshot.format(narr = x, prompt=prompt_relevant) for x in le_sents]
le_relevant = llm.generate(prompts_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
print(Counter([x.outputs[0].text for x in le_relevant]))

# Run inference on CME narratives
cme_by_uid = cme_sents_df.groupby('uid', sort=False)['sentence'].agg(list).to_dict()
cme_rows = [(uid, person, sent)
            for uid, person in zip(ids, annotators)
            for sent in cme_by_uid.get(uid, [])]
cme_uids = [row[0] for row in cme_rows]
cme_annotators = [row[1] for row in cme_rows]
cme_sents = [row[2] for row in cme_rows]
prompts_list = [llama3_template_zeroshot.format(narr = x, prompt=prompt_relevant) for x in cme_sents]
cme_relevant = llm.generate(prompts_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
print(Counter([x.outputs[0].text for x in cme_relevant]))

# Create combined dataset with both LE and CME sentences + labels
sents_labeled = pd.concat([pd.DataFrame({'uid':le_uids, 'type':'LE', 'sentence':le_sents,
                                         'person':le_annotators,
                                         'relevant_llm':[x.outputs[0].text for x in le_relevant]}),
                            pd.DataFrame({'uid':cme_uids, 'type':'CME', 'sentence':cme_sents,
                                          'person':cme_annotators,
                                          'relevant_llm':[x.outputs[0].text for x in cme_relevant]})])
# Only an exact 'Yes' counts as relevant; any other output is treated as 0.
sents_labeled['relevant_llm'] = sents_labeled['relevant_llm'].apply(lambda x: 1 if x=='Yes' else 0)
sents_labeled = sents_labeled.reset_index(drop=True)

# Assign labels to each narrative (relevant_llm = 1 if any sentence was classified as relevant).
# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would pair every annotator's prediction row with every
# annotator's label whenever a narrative has more than one annotator.
narr_preds = sents_labeled.groupby(['uid','person'])['relevant_llm'].sum().reset_index()
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})
agg = narr_preds.merge(iaa_labels[['uid','person','relevant']], on=['uid','person'], how='left')
agg2 = narr_preds.merge(iaa_labels, on=['uid','person'], how='left')

# Evaluate performance
evaluate_code('relevant', agg)
# For every annotated code, show how often the relevance filter kept the narrative.
for c in ['disclosure', 'disclosure_violent', 'sharing', 'conflict',
       'withdraw', 'harm_passive', 'harm_active', 'victim', 'neutral',
       'time_intensive', 'relationship', 'le_search']:
    print(pd.crosstab(agg2[c],agg2['relevant_llm']>0))

Generate predictions on all sentences (25 mins)

In [None]:
ids = narrs.uid.tolist()

# LE: collect (uid, sentence) pairs in narrative order, then classify every sentence.
le_pairs = [(uid, sent) for uid in ids for sent in le_sents_df.sentence[le_sents_df.uid==uid]]
le_uids = [uid for uid, _ in le_pairs]
le_sents = [sent for _, sent in le_pairs]
prompts_list = [llama3_template_zeroshot.format(narr=sent, prompt=prompt_relevant) for sent in le_sents]
le_relevant = llm.generate(prompts_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
le_answers = [result.outputs[0].text for result in le_relevant]
print(Counter(le_answers))

# CME: same procedure on the CME sentence table.
cme_pairs = [(uid, sent) for uid in ids for sent in cme_sents_df.sentence[cme_sents_df.uid==uid]]
cme_uids = [uid for uid, _ in cme_pairs]
cme_sents = [sent for _, sent in cme_pairs]
prompts_list = [llama3_template_zeroshot.format(narr=sent, prompt=prompt_relevant) for sent in cme_sents]
cme_relevant = llm.generate(prompts_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
cme_answers = [result.outputs[0].text for result in cme_relevant]
print(Counter(cme_answers))

# Stack both sources into one sentence-level table.
le_frame = pd.DataFrame({'uid':le_uids, 'type':'LE', 'sentence':le_sents, 'relevant_llm':le_answers})
cme_frame = pd.DataFrame({'uid':cme_uids, 'type':'CME', 'sentence':cme_sents, 'relevant_llm':cme_answers})
sents_labeled_final = pd.concat([le_frame, cme_frame])

# Binarise: only an exact 'Yes' counts as relevant.
sents_labeled_final['relevant_llm'] = sents_labeled_final['relevant_llm'].apply(
    lambda answer: 1 if answer=='Yes' else 0)
sents_labeled_final = sents_labeled_final.reset_index(drop=True)

# Disclosure Prompt

Prompt to see whether a sentence discusses suicidal ideation

In [None]:
# Yes/No instruction for the disclosure step. Downstream cells count a sentence
# as a disclosure only when the model output is exactly the string 'Yes'.
prompt_disclosure = 'In the following sentence, was V thinking about suicide, planning to kill or hurt themselves, or indicating suicidal ideation? Answer Yes or No with no explanation.'
prompt_disclosure

Evaluate performance on test set

In [None]:
code = 'disclosure'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_disclosure)
               for x in sents_labeled[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled[code+'_llm'] = 'n'
sents_labeled.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
sents_labeled[code+'_llm'] = sents_labeled[code+'_llm'].apply(lambda x: 1 if x=='Yes' else 0)

# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would cross-pair annotators of the same narrative.
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})
agg = (sents_labeled.groupby(['uid','person'])[code+'_llm'].sum().reset_index()
       .merge(iaa_labels[['uid','person',code,'disclosure_violent']], on=['uid','person'], how='left'))
# The gold label counts either disclosure code as a positive.
agg['disclosure'] = agg['disclosure'] + agg['disclosure_violent']
evaluate_code(code,agg)

Generate predictions on all sentences that mention online spaces (15 seconds)

In [None]:
code = 'disclosure'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled_final.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_disclosure)
               for x in sents_labeled_final[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled_final[code+'_llm'] = 'n'
sents_labeled_final.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
sents_labeled_final[code+'_llm'] = sents_labeled_final[code+'_llm'].apply(lambda x: 1 if x=='Yes' else 0)

# Summarise the labels just written (this previously inspected the test-set frame `sents_labeled`).
Counter(sents_labeled_final[code+'_llm'])

# Sharing Prompt

Prompt to see whether a sentence discusses sharing intimate information or emotions online

In [None]:
# Multiple-choice instruction for the sharing step; the model is asked to answer
# with a single letter. The cells below treat letters F, G, H and I as a positive
# 'sharing' label; every other output (including A-E, J, K) is treated as negative.
prompt_sharing = """In the following sentence, which of the following is true? Give only the letter with no explanation.

A. V posted on social media or messaged someone indicating they were thinking about suicide or planning to kill or hurt themselves
B. V searched how to kill or hurt themselves online
C. V had an interpersonal issue (argument, breakup, conflict, etc.)
D. V left a suicide note 
E. V messaged someone online
F. V talked about non-suicidal self-harm online
G. V posted about their thoughts or feelings online.
H. V revealed something about themselves online
I. V posted something private or personal online
J. Someone reported V's suicidal ideation online
K. None of the above."""
print(prompt_sharing)

Evaluate performance on test set

In [None]:
code = 'sharing'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_sharing)
               for x in sents_labeled[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled[code+'_llm'] = 'n'
sents_labeled.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
# Options F-I of the prompt are the ones counted as 'sharing'.
sents_labeled[code+'_llm'] = sents_labeled[code+'_llm'].apply(lambda x: 1 if x in ['F','G','H','I'] else 0)

# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would cross-pair annotators of the same narrative.
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})
agg = (sents_labeled.groupby(['uid','person'])[['disclosure_llm',code+'_llm']].sum().reset_index()
       .merge(iaa_labels[['uid','person',code]], on=['uid','person'], how='left'))
evaluate_code(code,agg)

Generate predictions on all sentences that mention online spaces (30 seconds)

In [None]:
code = 'sharing'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled_final.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_sharing)
               for x in sents_labeled_final[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled_final[code+'_llm'] = 'n'
sents_labeled_final.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
# Options F-I of the prompt are the ones counted as 'sharing'.
sents_labeled_final[code+'_llm'] = sents_labeled_final[code+'_llm'].apply(lambda x: 1 if x in ['F','G','H','I'] else 0)

# Summarise the labels just written (this previously inspected the test-set frame `sents_labeled`).
Counter(sents_labeled_final[code+'_llm'])

# Conflict Prompt

Prompt to see whether a sentence discusses a conflict that started or progressed online

In [None]:
# Multiple-choice instruction for the conflict step; the model is asked to answer
# with a single letter. The cells below treat letters C, D, E and F as a positive
# 'conflict' label; A, B, G and any other output are treated as negative.
# NOTE(review): option C contains the typo "somene". It is part of the model
# input, so correcting it may change predictions -- re-run the evaluation if fixed.
prompt_conflict = """In the following sentence, which of the following is true? Give only the letter with no explanation.

A. V posted on social media or messaged someone indicating they were thinking about suicide or planning to kill or hurt themselves
B. V was being bullied or harassed online. 
C. V argued with somene online.
D. Something happened online which led to a conflict.
E. V posted about an interpersonal conflict online. 
F. An online relationship was in trouble or ended. 
G. None of the above."""
print(prompt_conflict)

Evaluate performance on test set

In [None]:
code = 'conflict'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_conflict)
               for x in sents_labeled[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled[code+'_llm'] = 'n'
sents_labeled.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
# Options C-F of the prompt are the ones counted as 'conflict'.
sents_labeled[code+'_llm'] = sents_labeled[code+'_llm'].apply(lambda x: 1 if x in ['C','D','E','F'] else 0)

# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would cross-pair annotators of the same narrative.
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})
agg = (sents_labeled.groupby(['uid','person'])[['disclosure_llm',code+'_llm']].sum().reset_index()
       .merge(iaa_labels[['uid','person',code]], on=['uid','person'], how='left'))
evaluate_code(code,agg)

Generate predictions on all sentences that mention online spaces (30 seconds)

In [None]:
code = 'conflict'

# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled_final.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_conflict)
               for x in sents_labeled_final[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
sents_labeled_final[code+'_llm'] = 'n'
sents_labeled_final.loc[relevant_mask,code+'_llm'] = [x.outputs[0].text for x in codes]
# Options C-F of the prompt are the ones counted as 'conflict'.
sents_labeled_final[code+'_llm'] = sents_labeled_final[code+'_llm'].apply(lambda x: 1 if x in ['C','D','E','F'] else 0)

# Summarise the labels just written (this previously inspected the test-set frame `sents_labeled`).
Counter(sents_labeled_final[code+'_llm'])

# Withdraw Prompt

Prompt to see whether a sentence discusses withdrawal from online spaces

In [None]:
# Multiple-choice instruction for the withdraw step; the model is asked to answer
# with a single letter. The cells below treat letters B and C as a positive
# 'withdraw' label; A, D and any other output are treated as negative.
prompt_withdraw = """In the following sentence, which of the following is true? Give only the letter with no explanation.

A. V posted on social media or messaged someone indicating they were thinking about suicide or planning to kill or hurt themselves
B. Someone took away V's access to internet, phone, computer, gaming, social media, or other devices
C. V had stopped using social media, deleted an account, or withdrew from an online account
D. None of the above"""
print(prompt_withdraw)

Evaluate performance on test set (all narratives, not just those that mention online spaces) (4 minutes)

In [None]:
code = 'withdraw'

# Unlike the other codes, every sentence is classified here, not only those
# that passed the relevance filter.
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_withdraw)
               for x in sents_labeled.sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Options B and C of the prompt are the ones counted as 'withdraw'.
sents_labeled[code+'_llm'] = [x.outputs[0].text for x in codes]
sents_labeled[code+'_llm'] = sents_labeled[code+'_llm'].apply(lambda x: 1 if x in ['B','C'] else 0)

# Compare model and annotator labels at the narrative level: a narrative is
# positive if any of its sentences is (max over the 0/1 sentence labels).
# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would cross-pair annotators of the same narrative.
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})
agg = (sents_labeled.groupby(['uid','person'])[['disclosure_llm',code+'_llm']].max().reset_index()
       .merge(iaa_labels[['uid','person',code]], on=['uid','person'], how='left'))
evaluate_code(code,agg)

Generate predictions on all sentences (not just those that mention online spaces) (34 minutes)

In [None]:
code = 'withdraw'

# Unlike the other codes, every sentence is classified here, not only those
# that passed the relevance filter.
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_withdraw)
               for x in sents_labeled_final.sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))

# Options B and C of the prompt are the ones counted as 'withdraw'.
sents_labeled_final[code+'_llm'] = [x.outputs[0].text for x in codes]
sents_labeled_final[code+'_llm'] = sents_labeled_final[code+'_llm'].apply(lambda x: 1 if x in ['B','C'] else 0)

# Summarise the labels just written (this previously inspected the test-set frame `sents_labeled`).
Counter(sents_labeled_final[code+'_llm'])

# Harm / Victimization Prompt

Prompt to see whether a sentence discusses harm by or to the decedent in online spaces

In [None]:
# Multiple-choice instruction shared by the harm_passive and victim codes; the
# model is asked to answer with a single letter. In the cells below:
#   harm_passive = 1 for letters B, G, H
#   victim       = 1 for letter E
# NOTE(review): option F (V harmed or bullied someone) is not mapped to any
# code in the cells visible in this notebook.
prompt_harm = """In the following sentence, which of the following is true? Give only the letter with no explanation.

A. V posted on social media or messaged someone indicating they were thinking about suicide or planning to kill or hurt themselves
B. V searched how to kill or hurt themselves online
C. V argued with someone
D. V talked about non-suicidal self-harm online
E. V was bullied, harassed, or harmed online
F. V harmed, threatened, acted inappropriately towards, or bullied someone online
G. V was on a forum for suicide or self harm
H. V viewed other violent or explicit content online
I. None of the above."""
print(prompt_harm)

Evaluate performance on test set

In [None]:
# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_harm)
               for x in sents_labeled[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
harm_answers = [x.outputs[0].text for x in codes]

# Predictions are grouped per (uid, person), so the annotations are joined on both
# keys; joining on uid alone would cross-pair annotators of the same narrative.
iaa_labels = iaa_narrs.rename(columns={'Person': 'person'})

# The same model answers feed two codes; each maps a different set of letters to 1.
harm_code_letters = {'harm_passive': ['B','G','H'], 'victim': ['E']}
for code, positive_letters in harm_code_letters.items():
    # Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
    sents_labeled[code+'_llm'] = 'n'
    sents_labeled.loc[relevant_mask,code+'_llm'] = harm_answers
    sents_labeled[code+'_llm'] = sents_labeled[code+'_llm'].apply(lambda x: 1 if x in positive_letters else 0)
    agg = (sents_labeled.groupby(['uid','person'])[['disclosure_llm',code+'_llm']].sum().reset_index()
           .merge(iaa_labels[['uid','person',code]], on=['uid','person'], how='left'))
    print(code)
    evaluate_code(code,agg)

Generate predictions on all sentences that mention online spaces (30 seconds)

In [None]:
# Only sentences that passed the relevance filter are sent to the model.
relevant_mask = sents_labeled_final.relevant_llm==1
prompt_list = [llama3_template_zeroshot.format(narr = x, prompt = prompt_harm)
               for x in sents_labeled_final[relevant_mask].sentence]
codes = llm.generate(prompt_list, sampling_params=SamplingParams(max_tokens=8192, temperature=0))
harm_answers = [x.outputs[0].text for x in codes]

# The same model answers feed two codes; each maps a different set of letters to 1.
harm_code_letters = {'harm_passive': ['B','G','H'], 'victim': ['E']}
for code, positive_letters in harm_code_letters.items():
    # Non-relevant sentences keep the placeholder 'n', which binarises to 0 below.
    sents_labeled_final[code+'_llm'] = 'n'
    sents_labeled_final.loc[relevant_mask,code+'_llm'] = harm_answers
    sents_labeled_final[code+'_llm'] = sents_labeled_final[code+'_llm'].apply(lambda x: 1 if x in positive_letters else 0)
    # Summarise the labels just written (this previously inspected the test-set frame `sents_labeled`).
    print(code, Counter(sents_labeled_final[code+'_llm']))

# Assemble Final Dataset

In [None]:
# Collapse sentence-level 0/1 labels to one row per narrative: a narrative is
# positive for a code if any of its sentences is (max over sentences).
label_columns = [
    'relevant_llm',
    'disclosure_llm',
    'sharing_llm',
    'conflict_llm',
    'withdraw_llm',
    'harm_passive_llm',
    'victim_llm',
]
per_narrative = sents_labeled_final.groupby(['uid'])[label_columns]
narrs_labeled = per_narrative.max().reset_index()

In [None]:
# For each predicted code: value counts and the share of narratives flagged.
for var in ['relevant','disclosure', 'sharing','conflict','withdraw', 'harm_passive','victim']:
    predicted = narrs_labeled[var+'_llm']
    print(var)
    print(Counter(predicted))
    print(np.mean(predicted))
    print()

In [None]:
# Persist the narrative-level predictions (one row per uid, one 0/1 column per code).
# Only narratives with at least one extracted sentence appear in this table.
narrs_labeled.to_csv('../data/processed/competition_predictions.csv',index=None)