In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# First, let's load the JSON file and take a look at its structure to understand it better.
import json

# Load the JSON data from the uploaded file
file_path = './BioRED/Test.BioC.JSON'  # Replace with your file path
with open(file_path, 'r') as file:
    biored_data = json.load(file)

# Now let's inspect the overall structure of the JSON data.
biored_data_structure = {key: type(value) for key, value in biored_data.items()}

biored_data_structure, list(biored_data.keys())


({'source': str, 'date': str, 'key': str, 'documents': list},
 ['source', 'date', 'key', 'documents'])

In [2]:
# Let's inspect the first document in the 'documents' list to understand its structure.
first_document_structure = {key: type(value) for key, value in biored_data['documents'][0].items()}
first_document_example = biored_data['documents'][0]

first_document_structure, first_document_example.keys()


({'id': str, 'passages': list, 'relations': list},
 dict_keys(['id', 'passages', 'relations']))

In [3]:
# Inspect the structure of a single passage and the relations in the first document.
passages_structure = {key: type(value) for key, value in first_document_example['passages'][0].items()}
relations_structure = {key: type(value) for key, value in first_document_example['relations'][0].items()}

passages_structure, first_document_example['passages'][0].keys(), relations_structure, first_document_example['relations'][0].keys()


({'offset': int, 'text': str, 'annotations': list},
 dict_keys(['offset', 'text', 'annotations']),
 {'id': str, 'infons': dict},
 dict_keys(['id', 'infons']))

In [4]:
# Let's take a closer look at an example annotation from the first passage and an example relation.
example_annotation = first_document_example['passages'][0]['annotations'][0]
example_relation = first_document_example['relations'][0]

example_annotation, example_relation


({'id': '0',
  'infons': {'identifier': '6331', 'type': 'GeneOrGeneProduct'},
  'text': 'SCN5A',
  'locations': [{'offset': 8, 'length': 5}]},
 {'id': 'R0',
  'infons': {'entity1': 'D001919',
   'entity2': '6331',
   'type': 'Association',
   'novel': 'Novel'}})

In [5]:
from typing import List, Dict


def preprocess_biored_data_fixed(data: Dict) -> List[Dict]:
    """
    Preprocess the BioRED data for relation extraction with error handling for missing entities.

    Parameters:
    - data (Dict): The loaded JSON data from the BioRED dataset.

    Returns:
    - List[Dict]: A list of preprocessed data suitable for relation extraction,
                   including tokenized text, entity markers, and relation labels.
    """

    # A list to hold all preprocessed examples
    preprocessed_examples = []

    # Token markers for entities, these can be adjusted as needed
    entity_start_marker = "[E]"
    entity_end_marker = "[/E]"

    # Loop over each document in the dataset
    for document in data['documents']:
        # Loop over each passage in the document
        for passage in document['passages']:
            # Make a copy of the passage text to modify it with entity markers
            marked_text = passage['text']

            # Keep track of adjustments needed due to added markers
            offset_adjustment = 0

            # Sort annotations by offset to handle text modification correctly
            annotations = sorted(passage['annotations'], key=lambda x: x['locations'][0]['offset'])

            # Mark the entities in the text with special tokens
            for annotation in annotations:
                entity_text = annotation['text']
                entity_offset = annotation['locations'][0]['offset'] + offset_adjustment
                entity_length = annotation['locations'][0]['length']

                # Insert entity end marker first to not mess up the start offset
                marked_text = (
                    marked_text[:entity_offset + entity_length] +
                    entity_end_marker +
                    marked_text[entity_offset + entity_length:]
                )
                # Insert entity start marker
                marked_text = (
                    marked_text[:entity_offset] +
                    entity_start_marker +
                    marked_text[entity_offset:]
                )

                # Update offset adjustment
                offset_adjustment += len(entity_start_marker) + len(entity_end_marker)

            # Create examples from relations
            for relation in document['relations']:
                try:
                    # Extract entity IDs from the relation
                    entity1_id = relation['infons']['entity1']
                    entity2_id = relation['infons']['entity2']

                    # Extract the text for both entities
                    entity1_text = next(
                        ann['text'] for ann in annotations if ann['infons']['identifier'] == entity1_id
                    )
                    entity2_text = next(
                        ann['text'] for ann in annotations if ann['infons']['identifier'] == entity2_id
                    )

                    # Create a new example for the relation
                    example = {
                        'text': marked_text,
                        'entity1': entity1_text,
                        'entity2': entity2_text,
                        'relation': relation['infons']['type']
                    }
                    preprocessed_examples.append(example)
                except StopIteration:
                    # If an entity in the relation cannot be found in annotations, skip this relation
                    continue

    return preprocessed_examples

# Run the preprocessing with the fixed function on the first 5 documents to create examples
preprocessed_data_fixed = preprocess_biored_data_fixed({'documents': biored_data['documents']})
print(len(preprocessed_data_fixed))  # Show the first 3 examples

preprocessed_data_fixed[:5]


## Include type as well from the raw dataset

1123


[{'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'bradycardia',
  'entity2': 'SCN5A',
  'relation': 'Association'},
 {'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'tachycardia',
  'entity2': 'SCN5A',
  'relation': 'Association'},
 {'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'long QT syndrome',
  'entity2': 'SCN5A',
  'relation': 'Association'},
 {'text': 'OBJECTIVE: Congenital long QT syndrome (LQTS) with in utero onset of the rhythm disturbances is associated with a poor prognosis. In this study [E]we investigated [/E]a [E]newb[/E]orn patient with fetal bradycardia, 2:1 atrioventricular block and ventricu

In [6]:
from typing import List, Dict

def preprocess_biored_data_fixed(data: Dict) -> List[Dict]:
    """
    Preprocess the BioRED data for relation extraction with error handling for missing entities,
    and include the type of entities from the JSON data.

    Parameters:
    - data (Dict): The loaded JSON data from the BioRED dataset.

    Returns:
    - List[Dict]: A list of preprocessed data suitable for relation extraction,
                   including tokenized text, entity markers, relation labels, and types of entities.
    """

    # A list to hold all preprocessed examples
    preprocessed_examples = []

    # Token markers for entities, these can be adjusted as needed
    entity_start_marker = "[E]"
    entity_end_marker = "[/E]"

    # Loop over each document in the dataset
    for document in data['documents']:
        # Loop over each passage in the document
        for passage in document['passages']:
            # Make a copy of the passage text to modify it with entity markers
            marked_text = passage['text']

            # Keep track of adjustments needed due to added markers
            offset_adjustment = 0

            # Sort annotations by offset to handle text modification correctly
            annotations = sorted(passage['annotations'], key=lambda x: x['locations'][0]['offset'])

            # Mark the entities in the text with special tokens
            for annotation in annotations:
                entity_text = annotation['text']
                entity_offset = annotation['locations'][0]['offset'] + offset_adjustment
                entity_length = annotation['locations'][0]['length']
                entity_type = annotation['infons']['type']  # Extract the type

                # Insert entity end marker first to not mess up the start offset
                marked_text = (
                    marked_text[:entity_offset + entity_length] +
                    entity_end_marker +
                    marked_text[entity_offset + entity_length:]
                )
                # Insert entity start marker
                marked_text = (
                    marked_text[:entity_offset] +
                    entity_start_marker +
                    marked_text[entity_offset:]
                )

                # Update offset adjustment
                offset_adjustment += len(entity_start_marker) + len(entity_end_marker)

            # Create examples from relations
            for relation in document['relations']:
                try:
                    # Extract entity IDs from the relation
                    entity1_id = relation['infons']['entity1']
                    entity2_id = relation['infons']['entity2']

                    # Find the corresponding annotation for each entity
                    annotation1 = next(
                        ann for ann in annotations if ann['infons']['identifier'] == entity1_id
                    )
                    annotation2 = next(
                        ann for ann in annotations if ann['infons']['identifier'] == entity2_id
                    )

                    # Extract the text and type for both entities
                    entity1_text = annotation1['text']
                    entity2_text = annotation2['text']
                    entity1_type = annotation1['infons']['type']
                    entity2_type = annotation2['infons']['type']

                    # Create a new example for the relation
                    example = {
                        'text': marked_text,
                        'entity1': entity1_text,
                        'entity2': entity2_text,
                        'entity1_type': entity1_type,
                        'entity2_type': entity2_type,
                        'relation': relation['infons']['type']
                    }
                    preprocessed_examples.append(example)
                except StopIteration:
                    # If an entity in the relation cannot be found in annotations, skip this relation
                    continue

    return preprocessed_examples

# Example usage:
preprocessed_data_fixed = preprocess_biored_data_fixed({'documents': biored_data['documents']})
print(len(preprocessed_data_fixed))
preprocessed_data_fixed[:5]


1123


[{'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'bradycardia',
  'entity2': 'SCN5A',
  'entity1_type': 'DiseaseOrPhenotypicFeature',
  'entity2_type': 'GeneOrGeneProduct',
  'relation': 'Association'},
 {'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'tachycardia',
  'entity2': 'SCN5A',
  'entity1_type': 'DiseaseOrPhenotypicFeature',
  'entity2_type': 'GeneOrGeneProduct',
  'relation': 'Association'},
 {'text': 'A novel [E]SCN5A[/E] mutation manifests as a malignant form of [E]long QT syndrome[/E] with perinatal onset of [E]tachycardia[/E]/[E]bradycardia[/E].',
  'entity1': 'long QT syndrome',
  'entity2': 'SCN5A',
  'entity1_type': 'DiseaseOrPhenotypicFeature',
  'entity2_type': 'GeneOrGeneProduct',
  'relation': 'Association'},
 {'te

In [7]:
import pandas as pd

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(preprocessed_data_fixed, columns=['text','entity1', 'entity2', 'entity1_type', 'entity2_type', 'relation'])

# Display the first few rows of the DataFrame
df


Unnamed: 0,text,entity1,entity2,entity1_type,entity2_type,relation
0,A novel [E]SCN5A[/E] mutation manifests as a m...,bradycardia,SCN5A,DiseaseOrPhenotypicFeature,GeneOrGeneProduct,Association
1,A novel [E]SCN5A[/E] mutation manifests as a m...,tachycardia,SCN5A,DiseaseOrPhenotypicFeature,GeneOrGeneProduct,Association
2,A novel [E]SCN5A[/E] mutation manifests as a m...,long QT syndrome,SCN5A,DiseaseOrPhenotypicFeature,GeneOrGeneProduct,Association
3,OBJECTIVE: Congenital long QT syndrome (LQTS) ...,bradycardia,Na(v)1.5,DiseaseOrPhenotypicFeature,GeneOrGeneProduct,Association
4,OBJECTIVE: Congenital long QT syndrome (LQTS) ...,bradycardia,V1763M,DiseaseOrPhenotypicFeature,SequenceVariant,Positive_Correlation
...,...,...,...,...,...,...
1118,Aconitine is a major bioactive diterpenoid alk...,Aconitine,cardiotoxicity,ChemicalEntity,DiseaseOrPhenotypicFeature,Positive_Correlation
1119,Aconitine is a major bioactive diterpenoid alk...,Aconitine,myocardial injury,ChemicalEntity,DiseaseOrPhenotypicFeature,Positive_Correlation
1120,Aconitine is a major bioactive diterpenoid alk...,Aconitine,MAPK,ChemicalEntity,GeneOrGeneProduct,Association
1121,Aconitine is a major bioactive diterpenoid alk...,Aconitine,Ca(2+),ChemicalEntity,ChemicalEntity,Positive_Correlation


In [13]:
pip install wordcloud

Collecting wordcloud
  Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/ac/66/937d1d73389c0b501c928c4d8513653063d2b40272dff70d0e283d8b9144/wordcloud-1.9.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading wordcloud-1.9.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.3-cp311-cp311-macosx_11_0_arm64.whl (168 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

In [15]:
# 1. Descriptive Statistics
print("Descriptive Statistics:")
df.describe()

Descriptive Statistics:


Unnamed: 0,text,entity1,entity2,entity1_type,entity2_type,relation,entity_pair
count,1123,1123,1123,1123,1123,1123,1123
unique,176,469,575,4,4,8,13
top,BACKGROUND: Respiratory syncytial virus (RSV) ...,PPARa,mitochondrial antiviral-signaling protein,GeneOrGeneProduct,DiseaseOrPhenotypicFeature,Association,GeneOrGeneProduct & DiseaseOrPhenotypicFeature
freq,45,28,15,400,446,585,196


In [16]:
# 2. Relation Distribution
print("\nRelation Distribution:")
df['relation'].value_counts()






Relation Distribution:


relation
Association             585
Positive_Correlation    312
Negative_Correlation    190
Cotreatment              16
Comparison                9
Bind                      5
Drug_Interaction          4
Conversion                2
Name: count, dtype: int64

In [17]:
print("\nentity1_type Distribution:")
df['entity1_type'].value_counts()


entity1_type Distribution:


entity1_type
GeneOrGeneProduct             400
ChemicalEntity                370
DiseaseOrPhenotypicFeature    252
SequenceVariant               101
Name: count, dtype: int64

In [18]:
print("\nentity2_type Distribution:")
df['entity2_type'].value_counts()


entity2_type Distribution:


entity2_type
DiseaseOrPhenotypicFeature    446
GeneOrGeneProduct             356
ChemicalEntity                245
SequenceVariant                76
Name: count, dtype: int64

In [19]:
# 3. Entity Analysis
print("\nEntity Analysis:")
df['entity1'].value_counts()
df['entity2'].value_counts()


Entity Analysis:


entity2
mitochondrial antiviral-signaling protein    15
breast cancer                                12
ribavirin                                    12
malignancy                                   11
ethanol                                      11
                                             ..
c.2187delA                                    1
c.2992C>T                                     1
rs1800255                                     1
rs1801184                                     1
Ca(2+)                                        1
Name: count, Length: 575, dtype: int64

In [20]:
# 4. Text Analysis
print("\nAverage text length:", df['text'].str.len().mean())


Average text length: 1690.6402493321461


In [21]:
# 5. Co-occurrence Analysis
# Example: Count pairs of entities
# Combine entity1_type and entity2_type into a new column 'entity_pair'
df['entity_pair'] = df['entity1'] + ' & ' + df['entity2']

# Now count the occurrences of each unique pair
entity_pair_counts = df['entity_pair'].value_counts().reset_index()
entity_pair_counts.columns = ['entity_pair', 'Count']
entity_pair_counts

Unnamed: 0,entity_pair,Count
0,cyclophosphamide & Wegener's granulomatosis,2
1,CenpH & cyclin B1,2
2,NDP & retinopathy of prematurity,2
3,NDP & Norrie disease,2
4,ephedrine & hypotension,2
...,...,...
1075,IL-5 & mitochondrial antiviral-signaling protein,1
1076,IL-4 & mitochondrial antiviral-signaling protein,1
1077,IL-1b & mitochondrial antiviral-signaling protein,1
1078,TIR-domain-containing adapter-inducing interfe...,1


In [24]:
# 8. Duplication Check
print("\nDuplicates:", df.duplicated().sum())


Duplicates: 1


In [25]:
# To sort the DataFrame and then find and print out the duplicate rows
df_sorted = df.sort_values(by=['entity1', 'entity2', 'relation'], ascending=True)
duplicates_sorted = df_sorted[df_sorted.duplicated(keep=False)]
print(f"Number of duplicate rows: {duplicates_sorted.shape[0]}")
duplicates_sorted


Number of duplicate rows: 2


Unnamed: 0,text,entity1,entity2,entity1_type,entity2_type,relation,entity_pair
737,BACKGROUND: Studies in mice have shown that PP...,PPARa,lipid,GeneOrGeneProduct,ChemicalEntity,Association,PPARa & lipid
763,BACKGROUND: Studies in mice have shown that PP...,PPARa,lipid,GeneOrGeneProduct,ChemicalEntity,Association,PPARa & lipid


In [26]:
# 9. Missing Values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
text            0
entity1         0
entity2         0
entity1_type    0
entity2_type    0
relation        0
entity_pair     0
dtype: int64
