In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.matcher import Matcher
import random
import os
import re

# Load a base spaCy model
nlp = spacy.load("en_core_web_sm")  # Can be changed to en_core_web_trf for better accuracy


In [12]:
file_path = "/content/CDR_TrainingSet.PubTator.txt"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

print(f"Total lines in file: {len(lines)}")
print("First 5 lines:")
for line in lines[:5]:
    print(line)

Total lines in file: 11923
First 5 lines:
227508|t|Naloxone reverses the antihypertensive effect of clonidine.

227508|a|In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate receptors. As naloxone and clonidine do not appear to interact with the same receptor si

In [15]:
def preprocess_pubtator(file_path):
    """
    Process PubTator dataset into a structured format.
    Returns a list of (text, entities) tuples.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    data = []
    current_id = None
    current_text = ""
    current_entities = []

    for line in lines:
        line = line.strip()

        if not line:
            if current_id and current_text:
                data.append((current_text, current_entities))
            current_id, current_text, current_entities = None, "", []
            continue

        if "|t|" in line or "|a|" in line:
            parts = line.split("|")
            if len(parts) < 3:
                continue
            pmid, section, text = parts[0], parts[1], parts[2]
            if pmid != current_id and current_id is not None:
                data.append((current_text, current_entities))
                current_entities = []
            current_id = pmid
            current_text += text + " "

        elif re.match(r"^\d+\t\d+\t\d+\t", line):
            parts = line.split("\t")
            if len(parts) < 4:
                continue
            start, end, label = int(parts[1]), int(parts[2]), parts[3]
            entity_type = parts[4]  # Either "Chemical" or "Disease"
            current_entities.append((start, end, entity_type))

    return data

# Load training, dev, and test data
train_data = preprocess_pubtator("/content/CDR_TrainingSet.PubTator.txt")
dev_data = preprocess_pubtator("/content/CDR_DevelopmentSet.PubTator.txt")
test_data = preprocess_pubtator("/content/CDR_TestSet.PubTator.txt")

print(f"Training samples: {len(train_data)}")
print(f"Development samples: {len(dev_data)}")
print(f"Test samples: {len(test_data)}")

Training samples: 500
Development samples: 500
Test samples: 500


In [20]:
def convert_to_spacy(data, output_file, nlp):
    """
    Convert data into spaCy DocBin format and save to disk.
    """
    db = DocBin()

    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []

        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)

        doc.ents = ents
        db.add(doc)

    db.to_disk(output_file)

# Load spaCy model
nlp = spacy.blank("en")  # Blank English model

# Convert datasets
convert_to_spacy(train_data, "/content/train.spacy", nlp)
convert_to_spacy(dev_data, "/content/dev.spacy", nlp)
convert_to_spacy(test_data, "/content/test.spacy", nlp)

print("SpaCy dataset files saved!")


SpaCy dataset files saved!


In [22]:
!python -m spacy init config config.cfg --pipeline "ner" --optimize "efficiency"
!python -m spacy train config.cfg --output ./output --paths.train /content/train.spacy --paths.dev /content/train.spacy --gpu-id -1



[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     63.90    1.31    0.85    2.89    0.01
  0     200       1189.64   6743.44   60.37   73.17   51.38    0.60
  0     400        226.71   3478.13   77.37   80.21   74.72    0.77
  1     600        203.42   2513.70   84.36   87.13   81.77    0.84
  1     800        215.85   1813.25   88.07   92.30   84.22    0.88
  2    1000        219.28   1913.97   90.34   91.02   89.66    0.90
  2    1200        211.33   1180.20   91.72   94.50   89.10    0.92
  2    1400        220.58   1249.61   92.73   95.07   90.5

In [24]:
!python -m spacy evaluate /content/output/model-best /content/test.spacy


[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   83.40 
NER R   74.25 
NER F   78.56 
SPEED   22482 

[1m

               P       R       F
Chemical   88.04   76.20   81.69
Disease    78.21   71.93   74.94



In [25]:
nlp_ner = spacy.load("./output/model-best")

text = "Naloxone reverses the antihypertensive effect of clonidine."
doc = nlp_ner(text)

for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


Naloxone -> Chemical
clonidine -> Chemical


In [26]:
# Example: Load the first article from your dataset
with open("/content/CDR_TrainingSet.PubTator.txt", "r") as file:
    articles = file.read().split("\n\n")  # Split by articles (assuming articles are separated by double newlines)

first_article = articles[0]  # Select the first article
print(first_article)  # Check the content

227508|t|Naloxone reverses the antihypertensive effect of clonidine.
227508|a|In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate receptors. As naloxone and clonidine do not appear to interact with the same receptor site, the observed functional antagonism sugg

In [27]:
# Load your trained model
nlp_ner = spacy.load("./output/model-best")

# Process the first article
doc = nlp_ner(first_article)

In [28]:
from spacy import displacy

# Render the entities in the article
displacy.render(doc, style="ent", jupyter=True)  # Use `jupyter=True` if you're in a Jupyter notebook

In [29]:
# Define custom colors for entity labels
colors = {
    "CHEMICAL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "DISEASE": "linear-gradient(90deg, #ff9561, #ffcc00)",
}
options = {"colors": colors}

# Render with custom colors
displacy.render(doc, style="ent", options=options, jupyter=True)