In [7]:
import pandas as pd
import json

# Load the BioC JSON file into a plain Python object.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII characters
# in the UTF-8 JSON text on some systems.
file_path = './BioRED/Test.BioC.JSON'  # Replace with your file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Function to combine passages and entities, and format relations
def process_documents(documents):
    """Flatten BioC documents into per-document text, entities and relations.

    For every document, the texts of all its passages are joined with a single
    space, the annotations of all its passages are gathered into one entity
    list, and every relation is reduced to its two endpoints and its type.

    Relation endpoints are returned exactly as stored in
    ``relation['infons']['entity1']`` / ``['entity2']``. In the BioRED file this
    notebook loads, those values are concept identifiers (they correspond to an
    annotation's ``infons['identifier']``), not annotation ``id`` values.
    Looking them up against annotation ``id`` -- as an earlier version did --
    never resolves them correctly and can silently substitute the text of an
    unrelated annotation whenever a numeric identifier (e.g. ``'2'``) happens
    to equal some annotation id. Resolving identifiers to mention texts is left
    to the caller.

    Args:
        documents: Iterable of document dicts. Each needs a ``'passages'`` list
            whose items have ``'text'`` and, optionally, ``'annotations'``.
            ``'relations'`` is optional and treated as empty when missing,
            mirroring how missing ``'annotations'`` are handled.

    Returns:
        A tuple ``(combined_passages, combined_entities, document_relations)``
        of three lists with one item per input document:
        the joined passage text (str), a list of entity dicts with keys
        ``'id'``, ``'text'``, ``'type'``, ``'identifier'``, and a list of
        relation dicts with keys ``'entity1'``, ``'entity2'``, ``'relation'``.

    Raises:
        KeyError: If a document, passage, annotation or relation lacks one of
            the required keys read below.
    """
    combined_passages = []
    combined_entities = []
    document_relations = []

    for document in documents:
        # Collect texts and annotations (entities) across all passages
        all_texts = []
        all_entities = []
        for passage in document['passages']:
            all_texts.append(passage['text'])
            for ann in passage.get('annotations', []):
                ann_infons = ann['infons']
                all_entities.append({
                    'id': ann['id'],
                    'text': ann['text'],
                    'type': ann_infons['type'],
                    'identifier': ann_infons['identifier'],
                })

        # One combined passage string and one entity list per document
        combined_passages.append(' '.join(all_texts))
        combined_entities.append(all_entities)

        # Keep relation endpoints as the identifiers stored in the data; they
        # live in a different namespace than annotation ids (see docstring)
        relations = []
        for relation in document.get('relations', []):
            rel_infons = relation['infons']
            relations.append({
                'entity1': rel_infons['entity1'],
                'entity2': rel_infons['entity2'],
                'relation': rel_infons['type'],
            })
        document_relations.append(relations)

    return combined_passages, combined_entities, document_relations

# Flatten every document of the loaded collection into three parallel lists
passages, entities, relations = process_documents(data['documents'])

# One row per document; columns appear in the order given here
df_final_combined_with_types = pd.DataFrame(
    dict(passage=passages, entities=entities, relations=relations)
)

# Function to map entity identifiers to their texts
def map_entities_to_texts(entity_list, identifier):
    """Return the text of the first entity carrying ``identifier``.

    Args:
        entity_list: Sequence of entity dicts with ``'identifier'`` and
            ``'text'`` keys.
        identifier: Value compared for equality against each entity's
            ``'identifier'``.

    Returns:
        The ``'text'`` of the first matching entity in list order, or
        ``identifier`` itself when no entity matches.
    """
    matching_texts = (
        candidate['text']
        for candidate in entity_list
        if candidate['identifier'] == identifier
    )
    # Fall back to the identifier so unresolved endpoints stay visible downstream
    return next(matching_texts, identifier)

# For every row, rebuild its relations with both endpoints passed through
# map_entities_to_texts against that row's own entity list
true_relations = [
    [
        {
            'entity1': map_entities_to_texts(row_entities, rel['entity1']),
            'entity2': map_entities_to_texts(row_entities, rel['entity2']),
            'relation': rel['relation'],
        }
        for rel in row_relations
    ]
    for row_entities, row_relations in zip(
        df_final_combined_with_types['entities'],
        df_final_combined_with_types['relations'],
    )
]

# Store the resolved relations next to the raw ones
df_final_combined_with_types['true_relations'] = true_relations

# Show the row at position 99 as a plain dict for a quick sanity check
df_final_combined_with_types.iloc[99].to_dict()


{'passage': 'Aconitine-induced Ca2+ overload causes arrhythmia and triggers apoptosis through p38 MAPK signaling pathway in rats. Aconitine is a major bioactive diterpenoid alkaloid with high content derived from herbal aconitum plants. Emerging evidence indicates that voltage-dependent Na(+) channels have pivotal roles in the cardiotoxicity of aconitine. However, no reports are available on the role of Ca(2+) in aconitine poisoning. In this study, we explored the importance of pathological Ca(2+) signaling in aconitine poisoning in vitro and in vivo. We found that Ca(2+) overload lead to accelerated beating rhythm in adult rat ventricular myocytes and caused arrhythmia in conscious freely moving rats. To investigate effects of aconitine on myocardial injury, we performed cytotoxicity assay in neonatal rat ventricular myocytes (NRVMs), as well as measured lactate dehydrogenase level in the culture medium of NRVMs and activities of serum cardiac enzymes in rats. The results showed that 

In [8]:
# Summarise the columns, non-null counts and dtypes of the combined frame
df_final_combined_with_types.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   passage         100 non-null    object
 1   entities        100 non-null    object
 2   relations       100 non-null    object
 3   true_relations  100 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB


In [9]:
# Preview the first rows: raw `relations` next to the resolved `true_relations`
df_final_combined_with_types.head()

Unnamed: 0,passage,entities,relations,true_relations
0,A novel SCN5A mutation manifests as a malignan...,"[{'id': '0', 'text': 'SCN5A', 'type': 'GeneOrG...","[{'entity1': 'D001919', 'entity2': '6331', 're...","[{'entity1': 'bradycardia', 'entity2': 'SCN5A'..."
1,Allelic expression imbalance of human mu opioi...,"[{'id': '0', 'text': 'human', 'type': 'Organis...","[{'entity1': '4988', 'entity2': 'D010146', 're...","[{'entity1': 'mu opioid receptor', 'entity2': ..."
2,Genetic polymorphisms in the carbonyl reductas...,"[{'id': '0', 'text': 'carbonyl reductase 3', '...","[{'entity1': 'D066126', 'entity2': 'C010013', ...","[{'entity1': 'cardiotoxic', 'entity2': 'doxoru..."
3,Debrisoquine phenotype and the pharmacokinetic...,"[{'id': '0', 'text': 'Debrisoquine', 'type': '...","[{'entity1': '153', 'entity2': 'D008790', 'rel...","[{'entity1': 'beta-1 adrenoceptor', 'entity2':..."
4,The first founder DGUOK mutation associated wi...,"[{'id': '0', 'text': 'DGUOK', 'type': 'GeneOrG...","[{'entity1': '1716', 'entity2': 'C580039', 're...","[{'entity1': 'DGUOK', 'entity2': 'hepatocerebr..."


In [10]:
# Re-importing pandas as the code execution state was reset
import pandas as pd

# Assuming df_final_combined_with_types is already defined and loaded
# If not, it should be reloaded or redefined here

# Build df_for_prompting from df_final_combined_with_types

# Columns carried over unchanged
passages = df_final_combined_with_types['passage']
true_relations = df_final_combined_with_types['true_relations']

# Reduce each row's entity dicts to just their 'text' values
list_of_entities = df_final_combined_with_types['entities'].apply(
    lambda row_entities: [item['text'] for item in row_entities]
)

# Assemble the prompting frame from the three aligned Series
df_for_prompting = pd.DataFrame(
    dict(
        passage=passages,
        list_of_entities=list_of_entities,
        true_relations=true_relations,
    )
)

# Peek at the first row to verify the result
df_for_prompting.head(1)


Unnamed: 0,passage,list_of_entities,true_relations
0,A novel SCN5A mutation manifests as a malignan...,"[SCN5A, long QT syndrome, tachycardia, bradyca...","[{'entity1': 'bradycardia', 'entity2': 'SCN5A'..."


In [34]:
from src.Model_Parameter import get_model_response

import re
import os
import csv
import json
from typing import List
import yaml
import openai
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
from langchain import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser

In [35]:
import requests
import xml.etree.ElementTree as ET
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import nltk
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages used by this notebook; the log output below
# reports both as already up to date on this machine
nltk.download('punkt')  # Download the 'punkt' resource
nltk.download('wordnet')  # Download the 'wordnet' resource

[nltk_data] Downloading package punkt to /Users/sanket/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanket/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
import nltk
# Fetch the 'omw-1.4' NLTK data package
# NOTE(review): presumably needed alongside 'wordnet' for meteor_score -- confirm
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/sanket/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [52]:
# Work on an independent copy of the first two rows so that adding columns
# later does not touch df_for_prompting
df = df_for_prompting.iloc[:2].copy()

In [53]:
# Display the two-row working sample before any model output is attached
df

Unnamed: 0,passage,list_of_entities,true_relations
0,A novel SCN5A mutation manifests as a malignan...,"[SCN5A, long QT syndrome, tachycardia, bradyca...","[{'entity1': 'bradycardia', 'entity2': 'SCN5A'..."
1,Allelic expression imbalance of human mu opioi...,"[human, mu opioid receptor, OPRM1, A118G, mu o...","[{'entity1': 'mu opioid receptor', 'entity2': ..."


In [56]:

# Start a wall-clock timer for this cell and define the relation-extraction
# prompt that is filled in once per row of `df`.
import time
# Start time
start_time = time.time()
# The template is rendered with str.format, so literal braces are doubled
# ({{ / }}) and the only placeholders are {passage} and {list_of_entities}.
# NOTE(review): after formatting, the requested "JSON format" reads
# `{[{...}, {...}]}` -- an array wrapped directly in braces, which is not valid
# JSON. The label "passsage" is also misspelled. Both are left untouched here
# because changing the prompt text changes what the model is asked.
template = (
    """
    Instruction: As a skilled medical domain expert, you are tasked to analyze the relation between entities from the given passage.
    The input will contain a list of all the entities present in the passage. 
    You should find the possible relation between different pairs of  entities.
    The relation can only be from the following list: ['Association', 'Positive_Correlation', 'Bind', 'Negative_Correlation', 'Comparison', 'Conversion', 'Cotreatment', 'Drug_Interaction'].
    Your response must only contain the following JSON format:
    {{[{{
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"}},
      
      {{
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"}}]
      
    }}


    passsage: {passage}
    
    entity list: {list_of_entities}

    """
)

# NOTE: this rebinds `tqdm`, which the imports cell took from tqdm.notebook,
# to the plain console progress bar.
from tqdm import tqdm  # Import tqdm library

# Requires `df`, `template`, `start_time` and `get_model_response` from earlier
# cells / earlier in this cell.

# One model response per row of `df`, in row order
model_outputs = []

# Use tqdm as a wrapper around the iteration for tracking progress
for index, row in tqdm(df.iterrows(), total=len(df)):
    prompt = template.format(
        passage=row['passage'],
        list_of_entities=row['list_of_entities']
    )
    # Print the input prompt
    print("Input Prompt:")
    print(prompt)

    # NOTE(review): assumes get_model_response returns a nested sequence whose
    # [0][1] element is the generated text -- confirm against
    # src.Model_Parameter. The recorded run printed an empty string here.
    output = get_model_response(prompt)[0][1]
    print("Model Output:")
    print(output)

    model_outputs.append(output)

# Stop the clock right after the model calls and report the elapsed time.
# Previously start_time/end_time were captured but never used.
end_time = time.time()
print("Elapsed: {:.1f} s for {} prompts".format(end_time - start_time, len(df)))

# Attach the responses as a new 'model_output' column of `df`
df['model_output'] = model_outputs

# Keep the preview as the cell's last expression so the notebook renders it.
# Previously it was followed by another statement and was never displayed.
df.head(15)

  0%|                                                     | 0/2 [00:00<?, ?it/s]

Input Prompt:

    Instruction: As a skilled medical domain expert, you are tasked to analyze the relation between entities from the given passage.
    The input will contain a list of all the entities present in the passage. 
    You should find the possible relation between different pairs of  entities.
    The relation can only be from the following list: ['Association', 'Positive_Correlation', 'Bind', 'Negative_Correlation', 'Comparison', 'Conversion', 'Cotreatment', 'Drug_Interaction'].
    Your response must only contain the following JSON format:
    {[{
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"},
      
      {
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"}]
      
    }


    passsage: A novel SCN5A mutation manifests as a malignant form of long QT syndrome with perinatal onset of tachycardia/b

 50%|██████████████████████                      | 1/2 [14:21<14:21, 861.41s/it]

Model Output:

Input Prompt:

    Instruction: As a skilled medical domain expert, you are tasked to analyze the relation between entities from the given passage.
    The input will contain a list of all the entities present in the passage. 
    You should find the possible relation between different pairs of  entities.
    The relation can only be from the following list: ['Association', 'Positive_Correlation', 'Bind', 'Negative_Correlation', 'Comparison', 'Conversion', 'Cotreatment', 'Drug_Interaction'].
    Your response must only contain the following JSON format:
    {[{
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"},
      
      {
      "entity1" : "name of the entity",
      "entity2" : "name of the entity",
      "relation" : "correct relation from given options"}]
      
    }


    passsage: Allelic expression imbalance of human mu opioid receptor (OPRM1) caused by variant A118G. As a 

100%|████████████████████████████████████████████| 2/2 [14:22<00:00, 431.29s/it]

Model Output:






In [57]:
# Display the sample again, now including the 'model_output' column
df

Unnamed: 0,passage,list_of_entities,true_relations,model_output
0,A novel SCN5A mutation manifests as a malignan...,"[SCN5A, long QT syndrome, tachycardia, bradyca...","[{'entity1': 'bradycardia', 'entity2': 'SCN5A'...",
1,Allelic expression imbalance of human mu opioi...,"[human, mu opioid receptor, OPRM1, A118G, mu o...","[{'entity1': 'mu opioid receptor', 'entity2': ...",


In [11]:
# Load previously saved model responses from the Results directory.
# NOTE(review): this rebinds `data`, which earlier in the notebook held the
# parsed BioC JSON dict; re-running the process_documents(data['documents'])
# cell after this one will no longer see that dict.
data = pd.read_parquet('./Results/biored_re_llama2_7b_chat_model_response.parquet')

In [12]:
# Preview the first rows of the loaded results frame
data.head()

Unnamed: 0,record_id,prompt,ground_truth,mapped_ground_true,model_response,answer,mapped_answer,time
0,15485686,\nInstruction: \nAs a skilled medical domain e...,"[{'entity1': 'bradycardia', 'entity1_identifie...","[{'entity1': 'bradycardia', 'entity1_identifie...",Here are the entity relationships reported in ...,"[{'entity1_identifier': '', 'entity2_identifie...","[{'entity1_identifier': '', 'entity2_identifie...",12-04-2023-14-31-37
1,16046395,\nInstruction: \nAs a skilled medical domain e...,"[{'entity1': 'mu opioid receptor', 'entity1_id...","[{'entity1': 'mu opioid receptor', 'entity1_id...",Here are the entity relations found in the pas...,"[{'entity1_identifier': '9606', 'entity2_ident...","[{'entity1_identifier': '9606', 'entity2_ident...",12-04-2023-14-34-27
2,18457324,\nInstruction: \nAs a skilled medical domain e...,"[{'entity1': 'cardiotoxic', 'entity1_identifie...","[{'entity1': 'cardiotoxic', 'entity1_identifie...",Sure! Here are the entity relations reported b...,"[{'entity1_identifier': '', 'entity2_identifie...","[{'entity1_identifier': '', 'entity2_identifie...",12-04-2023-14-36-18
3,1848636,\nInstruction: \nAs a skilled medical domain e...,"[{'entity1': 'beta-1 adrenoceptor', 'entity1_i...","[{'entity1': 'beta-1 adrenoceptor', 'entity1_i...",Here are the entity relationships reported in ...,"[{'entity1_identifier': 'D003647', 'entity2_id...","[{'entity1_identifier': 'D003647', 'entity2_id...",12-04-2023-14-37-15
4,19394258,\nInstruction: \nAs a skilled medical domain e...,"[{'entity1': 'DGUOK', 'entity1_identifier': '1...","[{'entity1': 'DGUOK', 'entity1_identifier': '1...",Here are the entity relations based on the inf...,"[{'entity1_identifier': '1716', 'entity2_ident...","[{'entity1_identifier': '1716', 'entity2_ident...",12-04-2023-14-37-59


In [64]:
# Inspect the 'mapped_answer' value stored under index label 0
# NOTE(review): this cell's execution count (64) is out of sequence with its
# neighbours, so the output shown below may come from a different state of
# `data` -- re-run to confirm.
data['mapped_answer'][0]

array([{'entity1': 'interleukin 12B', 'entity1_identifier': '3593', 'entity2': 'hepatitis C virus infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'IL12B', 'entity1_identifier': '3593', 'entity2': 'HCV-infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'Interleukin-12', 'entity1_identifier': '3593', 'entity2': 'HCV-infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'IL-12', 'entity1_identifier': '3593', 'entity2': 'HCV-infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'IL12B', 'entity1_identifier': '3593', 'entity2': 'HCV infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'IL-12', 'entity1_identifier': '3593', 'entity2': 'HCV infection', 'entity2_identifier': 'D006526', 'relation': 'Association'},
       {'entity1': 'IL12B', 'entity1_identifier': '3593', 'entity2': 'HCV infection',

In [13]:
# Export the results frame to CSV next to the parquet file.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as an extra leading column.
# NOTE(review): the list/array-valued columns are written as their string
# form -- confirm downstream readers of the CSV expect that.
data.to_csv('./Results/biored_re_llama2_7b_chat_model_response.csv')