In [None]:
import json
import re
from pathlib import Path

import nltk
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration

In [None]:
# Load the feature table and build one lower-cased free-text field per row
# from the two narrative columns.
df = pd.read_csv('../input/features_Z140Hep.csv')
# A missing narrative is treated as an empty string: with plain `+`, a NaN in
# either column would turn the whole combined text into NaN and the row would
# be silently lost in the keyword filter further down.
# The separator carries a trailing space so the sentence tokenizer can see a
# boundary between the end of one narrative and the start of the other.
df['text'] = (
    df['NarrativeLE'].fillna('') + '. ' + df['NarrativeCME'].fillna('')
).str.lower()

In [None]:
# Keep only the row identifier and the combined narrative text.
selected_columns = ['uid', 'text']
new_df = df[selected_columns]
new_df

In [6]:
# List of keywords that hint at a timing reference in the narrative text.
# Unit words carry a leading space (presumably so they only match at the start
# of a word, e.g. ' min' but not 'admin' -- confirm intent); weekday and
# part-of-day names are matched anywhere.
keywords = [' second',
            ' seconds',
            ' second(s)',
            ' minute',
            ' minutes',
            ' minute(s)',
            ' min',
            ' mins',
            ' min(s)',
            ' hour',
            ' hours',
            ' hour(s)',
            ' hr',
            ' hrs',
            ' hr(s)',
            ' day',
            ' days',
            ' week',
            ' weeks',
            ' week(s)',
            ' month',
            ' months',
            # The comma after the next entry was missing, which fused it with
            # ' year' into the single string ' month(s) year'.
            ' month(s)',
            ' year',
            ' years',
            ' year(s)',
            'sunday',
            'monday',
            'tuesday',
            'wednesday',
            'thursday',
            'friday',
            'saturday',
            'morning',
            'afternoon',
            'evening'
            ]

In [None]:
# Filter rows where any keyword exists in the text.
# Keywords such as ' second(s)' contain regex metacharacters, so each one is
# escaped before being joined into an alternation; otherwise '(s)' would be
# read as a capture group instead of literal parentheses.
keyword_pattern = '|'.join(re.escape(kw) for kw in keywords)
# .copy() gives an independent frame so later column assignments do not write
# into a slice of the original frame (SettingWithCopyWarning).
new_df = new_df[new_df['text'].str.contains(keyword_pattern, case=False, na=False)].copy()

In [None]:
# Record, per row, which keywords occur in the text as a comma-separated string.
# `assign` returns a new frame instead of writing a column into what may be a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
new_df = new_df.assign(
    matched_keywords=new_df['text'].apply(
        lambda text: ', '.join(kw for kw in keywords if kw in text.lower())
    )
)
# Preview only the first rows rather than dumping the whole table of narratives.
new_df.head()

In [None]:
# Load the FLAN-T5 tokenizer and model used for all prompt-based questions.
agent_model_name = 'google/flan-t5-xl'
agent_tokenizer = AutoTokenizer.from_pretrained(agent_model_name)
agent_model = T5ForConditionalGeneration.from_pretrained(agent_model_name)

# Pick the best available device: CUDA first, then Apple MPS, otherwise CPU.
# Previously 'mps' was the unconditional fallback, which fails on machines that
# have neither CUDA nor MPS.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
agent_model = agent_model.to(device)
agent_model.eval()

def model_QA(query, question):
    """Ask the loaded seq2seq model `question` about `query` and return its answer.

    Parameters
    ----------
    query : str
        The sentence the model should inspect.
    question : str or list/tuple of str
        The instruction text. A list or tuple is accepted so existing callers
        that wrap the prompt in a list keep working; its items are joined with
        newlines instead of being interpolated as a Python list repr (brackets
        and escaped ``\\n``), which is what the model would otherwise receive.

    Returns
    -------
    str
        The decoded first generated sequence, with special tokens removed.
    """
    if isinstance(question, (list, tuple)):
        question = "\n".join(str(part) for part in question)

    input_text = f"""
    question: {question}
    given sentence: {query}
    answer:
    """
    inputs = agent_tokenizer(input_text, return_tensors="pt").to(device)
    # NOTE(review): do_sample=True with temperature makes the output
    # non-deterministic between runs; set a seed or disable sampling if
    # reproducible answers are required.
    output_ids = agent_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # passed explicitly rather than inferred
        max_length=500,
        num_beams=4,
        early_stopping=True,
        do_sample=True,
        temperature=0.9,
    )
    answer = agent_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

In [20]:
# Prompt asking for the verbatim relative-timing phrase in a sentence, or
# 'None' when there is none.
question = (
    "\n"
    "Given the sentence, check if it contains a relative timing reference.\n"
    "If it does, return the exact wording from the given sentence. Otherwise, return 'None'.\n"
)
# The prompt is also kept wrapped in a one-element list.
questions = [question]

In [21]:
# Pull the three columns out of the filtered frame as plain, parallel lists.
uuid_ls, text_ls, keyword_ls = (
    new_df[column].tolist() for column in ('uid', 'text', 'matched_keywords')
)

In [22]:
def split_sentence(paragraph):
    """Return the sentences that NLTK's `sent_tokenize` finds in `paragraph`."""
    return nltk.sent_tokenize(paragraph)

In [None]:
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested to work across NLTK versions.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# For every narrative, ask the model sentence by sentence for a relative timing
# reference and keep the sentences for which it returned one.
results = {}

# `questions` is a list of prompt strings; join it into one plain string so the
# model receives the prompt text itself rather than a Python list repr.
timing_prompt = "\n".join(questions)

for uuid, paragraph, keyword_matched in tqdm(
    zip(uuid_ls, text_ls, keyword_ls), total=len(text_ls)
):
    useful_sentence = []
    relative_timing = []
    for sentence in split_sentence(paragraph):
        answer = model_QA(sentence, timing_prompt)
        # Treat empty output and variants such as "none" / "None." as "no
        # timing reference"; an exact comparison with 'None' lets them through.
        if answer.strip().strip(".'\"").lower() in ('', 'none'):
            continue
        useful_sentence.append(sentence)
        relative_timing.append(answer)

    # Keyed by uid: a repeated uid overwrites the earlier entry.
    results[uuid] = {
        "useful_sentence": useful_sentence,
        "relative_timing": relative_timing,
        "keyword_matched": keyword_matched
    }


In [27]:
# Prompt asking the model to pick, for one sentence, the best-fitting category
# from the list below ('Other' when none fits). It is passed as the `question`
# argument of model_QA in the categorisation loop.
# The text is consumed verbatim at runtime, so its wording is left untouched.
question2 = """
Given the following sentence, choose the option that best describes its content.
If no option seems appropriate, select 'Other':

Argument with family : The victim had an argument or conflict with family member
Depressed mood or mental health : The victim was depressed or had a mental health condition
Drug : The victim took drugs
Alcohol : The victim drank alcohol
Weapon: The victim got, bought or purchased a weapon
Relationship problem with partner : The victim had relationship Problem with a partner, such as break up, divorce
Love message:  The victim sent or spoke "I love you" related messages to someone
School problem : The victim had problems at or related to school
Job problem : The victim had job problems, such as losting jobs, cannot get a job
Financial problem : The victim had financial problems
Death of friend or family : A family member or friend of the victim died
History of suicide attempt: The victim attempted to suicide before, such as cutting their wrists, overdosing on pills, or hanging themselves.
Thought of suicide: The victim thought or plan to suicide before.
Death of victim : The victim died, such as cutting their wrists, overdosing on pills, hanging, or shot themselves
Other: other
"""
# NOTE: this rebinds `question2` from a str to a one-element list holding that str.
question2 = [question2]

In [None]:
# Classify every retained sentence into one of the categories described by the
# `question2` prompt and store the answers next to the sentences.
# `question2` may be a list wrapping the prompt string; flatten it to a plain
# string so the model receives the prompt text rather than a Python list repr.
if isinstance(question2, (list, tuple)):
    category_prompt = "\n".join(question2)
else:
    category_prompt = question2

for key in tqdm(results):
    values = results[key]
    # One category answer per useful sentence, in the same order.
    values['category'] = [
        model_QA(sentence, category_prompt)
        for sentence in values['useful_sentence']
    ]

In [None]:
# Save to a JSON file
output_file = "../checkpoint/results.json"
# Create the target directory if needed; open() alone raises FileNotFoundError
# when it does not exist, which would lose the results of the long model run.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
with open(output_file, "w") as json_file:
    json.dump(results, json_file, indent=4)

print(f"Data successfully saved to {output_file}")