In [1]:
%pip install torch transformers huggingface_hub requests beautifulsoup4 lxml

import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from huggingface_hub import hf_hub_download
import requests
from bs4 import BeautifulSoup

# 1. Load the BioBERT tokenizer and a token-classification model built on it.
# NOTE: this checkpoint only provides the pre-trained BioBERT encoder. The
# token-classification head ('classifier.weight' / 'classifier.bias') is newly
# initialized when loaded this way, so its predictions are not meaningful
# until the model is fine-tuned on an NER task.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(BIOBERT_CHECKPOINT)

# 2. Fetch sample PubMed abstracts (simplified for demo)
def fetch_pubmed_abstracts(query="breast cancer biomarkers", max_results=10, timeout=30):
    """Search PubMed through NCBI E-utilities and return abstract texts.

    Args:
        query: PubMed search term. It is passed as a request parameter, so
            spaces and reserved characters (``&``, ``#``, ...) are encoded
            correctly.
        max_results: Maximum number of PMIDs requested from ``esearch``.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of abstract strings, one per returned article that has an
        abstract. The sections of a structured abstract are joined with a
        single space. Returns an empty list when the search matches nothing.

    Raises:
        requests.HTTPError: If either E-utilities call returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    search = requests.get(
        f"{base_url}/esearch.fcgi",
        params={"db": "pubmed", "term": query, "retmax": max_results, "retmode": "json"},
        timeout=timeout,
    )
    search.raise_for_status()
    ids = search.json()["esearchresult"]["idlist"]
    if not ids:
        # efetch needs at least one id; nothing to download.
        return []

    # Fetch every record in one request (comma-separated ids) instead of one
    # request per PMID: fewer round-trips and no burst against NCBI's rate limit.
    fetched = requests.get(
        f"{base_url}/efetch.fcgi",
        params={"db": "pubmed", "id": ",".join(ids), "retmode": "xml"},
        timeout=timeout,
    )
    fetched.raise_for_status()
    soup = BeautifulSoup(fetched.text, "xml")

    abstracts = []
    for article in soup.find_all(["PubmedArticle", "PubmedBookArticle"]):
        # Prefer the primary <Abstract>; fall back to the whole record so an
        # article that only carries another abstract container is still used.
        container = article.find("Abstract") or article
        sections = [node.text for node in container.find_all("AbstractText")]
        if sections:
            abstracts.append(" ".join(sections))
    return abstracts

# 3. Process text with BioBERT to extract entities
def _merge_wordpieces(tokens, predictions, special_tokens=frozenset()):
    """Rebuild whole-word entity strings from WordPiece tokens and label ids.

    A word counts as an entity when its first piece has a label id > 0.
    Its ``##`` continuation pieces are always appended to it, regardless of
    their own label, so a word is never emitted with pieces missing.
    """
    entities = []
    current_entity = ""
    for token, pred in zip(tokens, predictions):
        if token in special_tokens:
            # [CLS] / [SEP] / [PAD] are never entities, and they end any open one.
            if current_entity:
                entities.append(current_entity)
                current_entity = ""
            continue
        if token.startswith("##"):
            # Continuation piece: only meaningful while an entity word is
            # open. Otherwise it belongs to a non-entity word and is ignored.
            if current_entity:
                current_entity += token[2:]
            continue
        # A new word starts here, so whatever entity was open is complete.
        if current_entity:
            entities.append(current_entity)
            current_entity = ""
        if pred > 0:
            current_entity = token
    if current_entity:
        entities.append(current_entity)
    return entities


def extract_entities(text):
    """Return the words in ``text`` that the model labels as entities.

    Uses the module-level ``tokenizer`` and ``model``. Input is truncated to
    512 tokens. Simplification kept from the original: any predicted label id
    greater than 0 is treated as "entity" (e.g. gene/protein), so label 0 is
    assumed to mean "not an entity".

    Args:
        text: Raw text, e.g. a PubMed abstract.

    Returns:
        A list of entity strings, one per word, in order of appearance
        (duplicates are kept).
    """
    # Local import: the file installs torch but does not import it at the top.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only - skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return _merge_wordpieces(tokens, predictions, set(tokenizer.all_special_tokens))

# 4. Load sample GEO/TCGA data (mocked for demo - replace with real data)
# Each record is (gene symbol, differential expression as log fold change).
_mock_expression = [
    ("BRCA1", 5.2),
    ("HER2", 3.8),
    ("TP53", -4.1),
]
geo_data = pd.DataFrame(_mock_expression, columns=["Gene", "Expression_Diff"])

# 5. Main discovery logic: keep every extracted entity that is also a gene
# listed in the GEO/TCGA table.
known_genes = set(geo_data["Gene"])
abstracts = fetch_pubmed_abstracts()
potential_biomarkers = {
    entity
    for abstract in abstracts
    for entity in extract_entities(abstract)
    if entity in known_genes
}

# 6. Rank the matched genes by expression difference, largest first
# (simplified scoring).
is_candidate = geo_data["Gene"].isin(potential_biomarkers)
results = geo_data[is_candidate].sort_values("Expression_Diff", ascending=False)
print("Potential Biomarkers for Breast Cancer:")
print(results[["Gene", "Expression_Diff"]])

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Potential Biomarkers for Breast Cancer:
Empty DataFrame
Columns: [Gene, Expression_Diff]
Index: []
