In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# First, let's load the JSON file and take a look at its structure to understand it better.
import json

# Load the JSON data from the uploaded file
file_path = '/content/drive/MyDrive/Capstone Project/BioRED/Train.BioC.JSON'
with open(file_path, 'r') as file:
    biored_data = json.load(file)

# Now let's inspect the overall structure of the JSON data.
biored_data_structure = {key: type(value) for key, value in biored_data.items()}

biored_data_structure, list(biored_data.keys())


({'source': str, 'date': str, 'key': str, 'documents': list},
 ['source', 'date', 'key', 'documents'])

In [3]:
# Let's inspect the first document in the 'documents' list to understand its structure.
first_document_structure = {key: type(value) for key, value in biored_data['documents'][0].items()}
first_document_example = biored_data['documents'][0]

first_document_structure, first_document_example.keys()


({'id': str, 'passages': list, 'relations': list},
 dict_keys(['id', 'passages', 'relations']))

In [4]:
# Inspect the structure of a single passage and the relations in the first document.
passages_structure = {key: type(value) for key, value in first_document_example['passages'][0].items()}
relations_structure = {key: type(value) for key, value in first_document_example['relations'][0].items()}

passages_structure, first_document_example['passages'][0].keys(), relations_structure, first_document_example['relations'][0].keys()


({'offset': int, 'text': str, 'annotations': list},
 dict_keys(['offset', 'text', 'annotations']),
 {'id': str, 'infons': dict},
 dict_keys(['id', 'infons']))

In [5]:
# Let's take a closer look at an example annotation from the first passage and an example relation.
example_annotation = first_document_example['passages'][0]['annotations'][0]
example_relation = first_document_example['relations'][0]

example_annotation, example_relation


({'id': '0',
  'infons': {'identifier': '3175', 'type': 'GeneOrGeneProduct'},
  'text': 'Hepatocyte nuclear factor-6',
  'locations': [{'offset': 0, 'length': 27}]},
 {'id': 'R0',
  'infons': {'entity1': '3175',
   'entity2': 'D003924',
   'type': 'Association',
   'novel': 'No'}})

In [6]:
from typing import List, Dict


def preprocess_biored_data_fixed(data: Dict) -> List[Dict]:
    """
    Preprocess the BioRED data for relation extraction with error handling for missing entities.

    Parameters:
    - data (Dict): The loaded JSON data from the BioRED dataset.

    Returns:
    - List[Dict]: A list of preprocessed data suitable for relation extraction,
                   including tokenized text, entity markers, and relation labels.
    """

    # A list to hold all preprocessed examples
    preprocessed_examples = []

    # Token markers for entities, these can be adjusted as needed
    entity_start_marker = "[E]"
    entity_end_marker = "[/E]"

    # Loop over each document in the dataset
    for document in data['documents']:
        # Loop over each passage in the document
        for passage in document['passages']:
            # Make a copy of the passage text to modify it with entity markers
            marked_text = passage['text']

            # Keep track of adjustments needed due to added markers
            offset_adjustment = 0

            # Sort annotations by offset to handle text modification correctly
            annotations = sorted(passage['annotations'], key=lambda x: x['locations'][0]['offset'])

            # Mark the entities in the text with special tokens
            for annotation in annotations:
                entity_text = annotation['text']
                entity_offset = annotation['locations'][0]['offset'] + offset_adjustment
                entity_length = annotation['locations'][0]['length']

                # Insert entity end marker first to not mess up the start offset
                marked_text = (
                    marked_text[:entity_offset + entity_length] +
                    entity_end_marker +
                    marked_text[entity_offset + entity_length:]
                )
                # Insert entity start marker
                marked_text = (
                    marked_text[:entity_offset] +
                    entity_start_marker +
                    marked_text[entity_offset:]
                )

                # Update offset adjustment
                offset_adjustment += len(entity_start_marker) + len(entity_end_marker)

            # Create examples from relations
            for relation in document['relations']:
                try:
                    # Extract entity IDs from the relation
                    entity1_id = relation['infons']['entity1']
                    entity2_id = relation['infons']['entity2']

                    # Extract the text for both entities
                    entity1_text = next(
                        ann['text'] for ann in annotations if ann['infons']['identifier'] == entity1_id
                    )
                    entity2_text = next(
                        ann['text'] for ann in annotations if ann['infons']['identifier'] == entity2_id
                    )

                    # Create a new example for the relation
                    example = {
                        'text': marked_text,
                        'entity1': entity1_text,
                        'entity2': entity2_text,
                        'relation': relation['infons']['type']
                    }
                    preprocessed_examples.append(example)
                except StopIteration:
                    # If an entity in the relation cannot be found in annotations, skip this relation
                    continue

    return preprocessed_examples

# Run the preprocessing with the fixed function on the first 5 documents to create examples
preprocessed_data_fixed = preprocess_biored_data_fixed({'documents': biored_data['documents']})
print(len(preprocessed_data_fixed))  # Show the first 3 examples

preprocessed_data_fixed[:5]


## Include type as well from the raw dataset

4498


[{'text': '[E]Hepatocyte nuclear factor-6[/E]: associations between genetic variability and [E]type II diabetes[/E] and between genetic variability and estimates of [E]insulin[/E] secretion.',
  'entity1': 'Hepatocyte nuclear factor-6',
  'entity2': 'type II diabetes',
  'relation': 'Association'},
 {'text': 'The transcription factor hepatocyte nuclear factor (HNF)-6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young. We therefore [E]tested the hypothesis that variab[/E]ility in the HNF-6 gene is associated with subsets of Type II (non-insulin-[E]dependent) diabetes mel[/E]litus and estimates of insulin secretion in glucose tolerant subjects.   W[E]e clo[/E]ned the coding region as well as the[E] intron-exon boundaries of the HNF-6 gene. We the[/E]n examined them on[E] genomi[/E]c DNA in six M[E]ODY pro[/E]bands without mutations in the MODY1, MODY3 and MODY4 genes and in 54 patients with late-onset [E]Type [/E]II diabetes by 

In [7]:
from typing import List, Dict

def preprocess_biored_data_fixed(data: Dict) -> List[Dict]:
    """
    Preprocess the BioRED data for relation extraction with error handling for missing entities,
    and include the type of entities from the JSON data.

    Parameters:
    - data (Dict): The loaded JSON data from the BioRED dataset.

    Returns:
    - List[Dict]: A list of preprocessed data suitable for relation extraction,
                   including tokenized text, entity markers, relation labels, and types of entities.
    """

    # A list to hold all preprocessed examples
    preprocessed_examples = []

    # Token markers for entities, these can be adjusted as needed
    entity_start_marker = "[E]"
    entity_end_marker = "[/E]"

    # Loop over each document in the dataset
    for document in data['documents']:
        # Loop over each passage in the document
        for passage in document['passages']:
            # Make a copy of the passage text to modify it with entity markers
            marked_text = passage['text']

            # Keep track of adjustments needed due to added markers
            offset_adjustment = 0

            # Sort annotations by offset to handle text modification correctly
            annotations = sorted(passage['annotations'], key=lambda x: x['locations'][0]['offset'])

            # Mark the entities in the text with special tokens
            for annotation in annotations:
                entity_text = annotation['text']
                entity_offset = annotation['locations'][0]['offset'] + offset_adjustment
                entity_length = annotation['locations'][0]['length']
                entity_type = annotation['infons']['type']  # Extract the type

                # Insert entity end marker first to not mess up the start offset
                marked_text = (
                    marked_text[:entity_offset + entity_length] +
                    entity_end_marker +
                    marked_text[entity_offset + entity_length:]
                )
                # Insert entity start marker
                marked_text = (
                    marked_text[:entity_offset] +
                    entity_start_marker +
                    marked_text[entity_offset:]
                )

                # Update offset adjustment
                offset_adjustment += len(entity_start_marker) + len(entity_end_marker)

            # Create examples from relations
            for relation in document['relations']:
                try:
                    # Extract entity IDs from the relation
                    entity1_id = relation['infons']['entity1']
                    entity2_id = relation['infons']['entity2']

                    # Find the corresponding annotation for each entity
                    annotation1 = next(
                        ann for ann in annotations if ann['infons']['identifier'] == entity1_id
                    )
                    annotation2 = next(
                        ann for ann in annotations if ann['infons']['identifier'] == entity2_id
                    )

                    # Extract the text and type for both entities
                    entity1_text = annotation1['text']
                    entity2_text = annotation2['text']
                    entity1_type = annotation1['infons']['type']
                    entity2_type = annotation2['infons']['type']

                    # Create a new example for the relation
                    example = {
                        'text': marked_text,
                        'entity1': entity1_text,
                        'entity2': entity2_text,
                        'entity1_type': entity1_type,
                        'entity2_type': entity2_type,
                        'relation': relation['infons']['type']
                    }
                    preprocessed_examples.append(example)
                except StopIteration:
                    # If an entity in the relation cannot be found in annotations, skip this relation
                    continue

    return preprocessed_examples

# Example usage:
preprocessed_data_fixed = preprocess_biored_data_fixed({'documents': biored_data['documents']})
print(len(preprocessed_data_fixed))
preprocessed_data_fixed[:5]


4498


[{'text': '[E]Hepatocyte nuclear factor-6[/E]: associations between genetic variability and [E]type II diabetes[/E] and between genetic variability and estimates of [E]insulin[/E] secretion.',
  'entity1': 'Hepatocyte nuclear factor-6',
  'entity2': 'type II diabetes',
  'entity1_type': 'GeneOrGeneProduct',
  'entity2_type': 'DiseaseOrPhenotypicFeature',
  'relation': 'Association'},
 {'text': 'The transcription factor hepatocyte nuclear factor (HNF)-6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young. We therefore [E]tested the hypothesis that variab[/E]ility in the HNF-6 gene is associated with subsets of Type II (non-insulin-[E]dependent) diabetes mel[/E]litus and estimates of insulin secretion in glucose tolerant subjects.   W[E]e clo[/E]ned the coding region as well as the[E] intron-exon boundaries of the HNF-6 gene. We the[/E]n examined them on[E] genomi[/E]c DNA in six M[E]ODY pro[/E]bands without mutations in the MODY

In [8]:
import pandas as pd

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(preprocessed_data_fixed, columns=['text','entity1', 'entity2', 'entity1_type', 'entity2_type', 'relation'])

# Display the first few rows of the DataFrame
df


Unnamed: 0,text,entity1,entity2,entity1_type,entity2_type,relation
0,[E]Hepatocyte nuclear factor-6[/E]: associatio...,Hepatocyte nuclear factor-6,type II diabetes,GeneOrGeneProduct,DiseaseOrPhenotypicFeature,Association
1,The transcription factor hepatocyte nuclear fa...,hepatocyte nuclear factor (HNF)-6,maturity-onset diabetes,GeneOrGeneProduct,DiseaseOrPhenotypicFeature,Association
2,The transcription factor hepatocyte nuclear fa...,glucose,insulin,ChemicalEntity,GeneOrGeneProduct,Positive_Correlation
3,The transcription factor hepatocyte nuclear fa...,glucose,maturity-onset diabetes,ChemicalEntity,DiseaseOrPhenotypicFeature,Association
4,We have identified a type II Ca2+-dependent le...,type II Ca2+-dependent lectin,mannose,GeneOrGeneProduct,ChemicalEntity,Bind
...,...,...,...,...,...,...
4493,A 46-year old man with a chronic hepatitis C v...,simvastatin,hepatitis C virus infection,ChemicalEntity,DiseaseOrPhenotypicFeature,Negative_Correlation
4494,A 46-year old man with a chronic hepatitis C v...,simvastatin,rhabdomyolysis,ChemicalEntity,DiseaseOrPhenotypicFeature,Positive_Correlation
4495,A 46-year old man with a chronic hepatitis C v...,telaprevir,hepatitis C virus infection,ChemicalEntity,DiseaseOrPhenotypicFeature,Negative_Correlation
4496,A 46-year old man with a chronic hepatitis C v...,telaprevir,simvastatin,ChemicalEntity,ChemicalEntity,Drug_Interaction
