<a href="https://colab.research.google.com/github/christophergaughan/Bioinformatics-Code/blob/main/Vasculogic_query_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and the token-classification (NER) model; both are
# resolved from the same checkpoint name.
# NOTE(review): the commented-out extraction code further down filters on
# "B-DRUG" / "B-BIOMARKER" labels — confirm this checkpoint emits those labels
# before relying on it for drug/biomarker extraction.
model_name = "dslim/bert-base-NER"  # Pre-trained NER model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Reaching this line means both loads returned without raising; no inference
# is run here.
print(f"Loaded model: {model_name}")


In [None]:
from google.colab import files

# Upload the Excel file through google.colab's upload helper; the call's
# result is kept in `uploaded`.
# NOTE(review): later cells read '/content/Updated_indications_and_assets.xlsx',
# so the uploaded file presumably has to carry exactly that name — confirm.
uploaded = files.upload()

In [None]:
import pandas as pd

# Read the uploaded workbook into the DataFrame `data`, which the rest of the
# notebook uses.
file_path = '/content/Updated_indications_and_assets.xlsx'
data = pd.read_excel(file_path)

# Print the first rows as a quick look at the contents.
print(data.head())

# Print the column labels; later cells index columns by these exact strings.
print(data.columns)


In [None]:
import pandas as pd

# Load the Excel file into a pandas DataFrame.
# This repeats the load done in the previous cell (same path, same variable),
# so running it simply rebinds `data` to a fresh copy of the workbook.
file_path = '/content/Updated_indications_and_assets.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows to understand the structure
print(data.head())

# Display the column names to reference later
print(data.columns)


## 1. `Example Marketed Therapies`
* Content: List of drugs or therapies that are commonly marketed to treat the  disorder.
* PubMed Query: `"Marketed therapies for [Disorder]"` or `"FDA-approved drugs for [Disorder]"`.
* Example Data: `"Aspirin; Ibuprofen; Paracetamol"`.

## 2. `Clinical Efficacy`
* Content: Summary of the efficacy of the therapies used for the disorder, such as survival rates, progression-free survival, or effectiveness in managing symptoms.
* PubMed Query: `"Clinical efficacy of [Drug Name] for [Disorder]"` or `"Effectiveness of [Drug Name] in [Disorder]"`.
* Example Data: `"Improves survival by 20%; Reduces relapse rate by 30%"`.

## 3. `Biomarkers`
* Content: Key biomarkers associated with the disorder, which may indicate disease progression or therapeutic targets.
* PubMed Query: `"Biomarkers for [Disorder]"` or `"Genetic markers for [Disorder]"`.
* Example Data: `"BRCA1; BRCA2; HER2"`.


# Sample Data Flow
For a disorder like "Breast Cancer", we might populate the columns as:

* `Disorder`: `"Breast Cancer"`
* `Example Marketed Therapies`: `"Trastuzumab (Herceptin); Tamoxifen; Palbociclib"`
* `Clinical Efficacy`: `"Improves survival by 15%; 40% progression-free survival"`
* `Biomarkers`: `"HER2; BRCA1; BRCA2"`

In [None]:
!pip install biopython


In [None]:
from Bio import Entrez

# Reaching this line means the `from Bio import Entrez` import above succeeded.
print("Biopython installed successfully!")


In [None]:
from transformers import pipeline

# Build the t5-small summarization pipeline. Use the first GPU (device=0)
# when CUDA is available and fall back to the CPU (device=-1) otherwise, so
# the cell does not fail on a CPU-only runtime.
import torch

summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)


In [None]:
def chunk_text(text, max_length=512):
    """Yield consecutive pieces of *text* holding at most *max_length* words each.

    Words are the whitespace-separated items returned by ``str.split``, so any
    run of whitespace becomes a single space in the output. The limit counts
    words, not model tokens.
    """
    words = text.split()
    offsets = range(0, len(words), max_length)
    yield from (" ".join(words[offset:offset + max_length]) for offset in offsets)

# Example usage with placeholder text. The generator is materialised into a
# list so that `chunks` can be iterated by the following cell.
text = "Your very long input text here..."
chunks = list(chunk_text(text, max_length=512))


In [None]:
# Summarise every chunk separately and collect the results in order.
summaries = []
for chunk in chunks:
    # The pipeline returns a list with one dict per input; take its
    # 'summary_text'. do_sample=False disables sampling during generation.
    summary = summarizer(chunk, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
    summaries.append(summary)

# Join the per-chunk summaries with single spaces into one string.
final_summary = " ".join(summaries)
print(final_summary)


In [None]:
# Print the current column labels so the exact header strings used by the
# following cells can be checked.
print(data.columns)


In [None]:
# Cast the column to str so text can be written into it later.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string "nan", which later null checks will treat as populated — confirm.
data['Clinical Efficacy (PI*)'] = data['Clinical Efficacy (PI*)'].astype(str)


In [None]:
# Shorten two column names in place. Cells further down still address the
# original long names, so they only work if this cell has not been run (or the
# workbook has been reloaded since).
data.rename(columns={
    'Clinical Efficacy (PI*)': 'Clinical Efficacy',
    'Example Marketed Therapies (Brand Names)': 'Marketed Therapies',
    'Biomarkers': 'Biomarkers'  # maps the name to itself, i.e. no change
}, inplace=True)


In [None]:
# Cast the three result columns to str. These are the short names, so the
# lookups require the rename cell above to have been run.
data['Clinical Efficacy'] = data['Clinical Efficacy'].astype(str)
data['Marketed Therapies'] = data['Marketed Therapies'].astype(str)
data['Biomarkers'] = data['Biomarkers'].astype(str)


In [None]:
# Make sure the three result columns (short names) exist; any that is missing
# is created with the same placeholder string in every row.
required_columns = ['Clinical Efficacy', 'Marketed Therapies', 'Biomarkers']
for col in required_columns:
    if col not in data.columns:
        data[col] = "No data available"


In [None]:
# Same existence check as the previous cell, but for the original long column
# names. If the rename cell has been run, the two long names no longer exist
# and are re-created here as placeholder-only columns next to the renamed ones.
required_columns = [
    'Biomarkers',
    'Example Marketed Therapies (Brand Names)',
    'Clinical Efficacy (PI*)'
]

for col in required_columns:
    if col not in data.columns:
        data[col] = "No data available"  # Add placeholder values


In [None]:
# Cast the three result columns (original long names) to str so that text can
# be assigned to individual cells later.
data['Biomarkers'] = data['Biomarkers'].astype(str)
data['Example Marketed Therapies (Brand Names)'] = data['Example Marketed Therapies (Brand Names)'].astype(str)
data['Clinical Efficacy (PI*)'] = data['Clinical Efficacy (PI*)'].astype(str)


In [None]:
# # Ensure compatible column data types
# data['Biomarkers'] = data['Biomarkers'].astype(str)
# data['Example Marketed Therapies (Brand Names)'] = data['Example Marketed Therapies (Brand Names)'].astype(str)
# data['Clinical Efficacy (PI*)'] = data['Clinical Efficacy (PI*)'].astype(str)

# # Populate the DataFrame
# for index, row in data.iterrows():
#     disorder = row['Disorder']

#     # Query PubMed for Example Marketed Therapies
#     therapy_query = f"Marketed therapies for {disorder}"
#     therapy_ids = search_pubmed(therapy_query)
#     therapy_articles = fetch_pubmed_details(therapy_ids)
#     chunks = list(chunk_text(therapy_articles, max_length=512))

#     therapies = []
#     for chunk in chunks:
#         inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
#         outputs = model(**inputs)
#         predictions = torch.argmax(outputs.logits, dim=2)
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
#         labels = [model.config.id2label[p.item()] for p in predictions[0]]
#         therapies.extend([token for token, label in zip(tokens, labels) if label == "B-DRUG"])
#     data.at[index, 'Example Marketed Therapies (Brand Names)'] = "; ".join(set(therapies))  # Deduplicate therapies

#     # Query PubMed for Clinical Efficacy
#     efficacy_query = f"Clinical efficacy of therapies for {disorder}"
#     efficacy_ids = search_pubmed(efficacy_query)
#     efficacy_articles = fetch_pubmed_details(efficacy_ids)
#     efficacy_chunks = list(chunk_text(efficacy_articles, max_length=512))
#     summaries = [summarizer(chunk, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
#                  for chunk in efficacy_chunks]
#     data.at[index, 'Clinical Efficacy (PI*)'] = " ".join(summaries)

#     # Query PubMed for Biomarkers
#     biomarker_query = f"Biomarkers for {disorder}"
#     biomarker_ids = search_pubmed(biomarker_query)
#     biomarker_articles = fetch_pubmed_details(biomarker_ids)
#     biomarker_chunks = list(chunk_text(biomarker_articles, max_length=512))

#     biomarkers = []
#     for chunk in biomarker_chunks:
#         inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
#         outputs = model(**inputs)
#         predictions = torch.argmax(outputs.logits, dim=2)
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
#         labels = [model.config.id2label[p.item()] for p in predictions[0]]
#         biomarkers.extend([token for token, label in zip(tokens, labels) if label == "B-BIOMARKER"])
#     data.at[index, 'Biomarkers'] = "; ".join(set(biomarkers))  # Deduplicate biomarkers

# # Save the updated DataFrame back to Excel
# updated_file_path = '/content/Updated_indications_and_assets_filled4.xlsx'
# data.to_excel(updated_file_path, index=False)


In [None]:
# Inspect the 'Disorder' column, which every PubMed query below is built from.
print(data['Disorder'].isnull().sum())  # Number of missing (null) entries
print(data['Disorder'].unique())  # Distinct values, to spot unexpected entries


## 1. The Biomarkers column is mostly missing even though the data appears to be retrievable; we can deal with this. It *might* be due to issues with:

* PubMed Query Relevance: The queries used for `Biomarkers` may not match  PubMed's indexed terms for specific biomarkers.
* NER Model Limitations: The model may not be well-suited for extracting specific scientific terms like `biomarkers` from abstracts.
* Chunking or Processing Errors: Biomarker-related information might be split across chunks or overlooked during processing.

We *can deal* with each of these.


Strategies to Improve Biomarker Retrieval
1. Refine PubMed Queries
The default query format ("Biomarkers for [Disorder]") may not align with how biomarkers are described in PubMed. Surprise.

Enhanced Query Examples:

* "`Biomarkers in [Disorder]`" (e.g., "`Biomarkers in Lung Cancer`")
* "`Genetic markers for [Disorder]`" (e.g., "`Genetic markers for Leukemia`")
* "`Molecular biomarkers for [Disorder]`"

Implementation: Update the query construction for biomarkers:

In [None]:
# Assuming 'data' is your DataFrame
# Builds the refined query string for every row. Nothing is done with it inside
# the loop, so after the loop only the last row's `disorder` and
# `biomarker_query` remain bound.
for index, row in data.iterrows():
    disorder = row['Disorder']  # Assuming 'Disorder' is the column name
    biomarker_query = f"Molecular biomarkers for {disorder}"

In [None]:
# Assuming 'data' is your DataFrame
# Template loop: the query string is built per row, but the processing step is
# only a placeholder comment, so no search is performed here.
for index, row in data.iterrows():
    disorder = row['Disorder']  # Assuming 'Disorder' is the column name
    biomarker_query = f"Molecular biomarkers for {disorder}"
    # ... (rest of your code to process the query) ...

## 2. Use Synonyms and Keywords
Some disorders may have synonyms or related terms. For example:

"`Cancer`" → "`Tumor`"

"`Leukemia`" → "`Blood cancer`"

**Create a mapping of synonyms and keywords to improve the coverage of PubMed queries:**




In [None]:
# Maps a disorder name to the list of search terms that should be
# OR-combined when querying PubMed for that disorder. Lookups are exact-match
# on the key.
keyword_mapping = {
    "Lung Cancer": ["Lung Cancer", "Pulmonary Tumors"],
    "Leukemia": ["Leukemia", "Blood Cancer"],
    "Solid Tumors": ["lung cancer", "breast cancer", "colorectal cancer", "prostate cancer", "liver cancer"],
    "Hematologic Cancers": ["leukemia", "lymphoma", "multiple myeloma"],
    "Rare Cancers": ["sarcomas", "neuroendocrine tumors", "pediatric cancers"],
    "Immuno-oncology": ["checkpoint inhibitors", "CAR-T therapies"],
    "Neurology and Psychiatry": ["neurological disorders", "schizophrenia", "bipolar disorder", "major depression", "PTSD"],
    "Psychiatric Disorders": ["schizophrenia", "bipolar disorder", "major depression", "PTSD"],
    "Neurodegenerative Diseases": ["Alzheimer's Disease", "Parkinson's Disease", "ALS"],
    "Neurodevelopmental Disorders": ["autism", "ADHD"],
    "Seizure Disorders": ["epilepsy", "Dravet syndrome"],
    "Movement Disorders": ["Huntington's Disease", "dystonia", "Tourette syndrome", "Benign Tremor", "Parkinson's Disease"],
    "Cardiovascular Diseases": [
        "Atherosclerosis", "Coronary Artery Disease", "Low-Density Lipoprotein (LDL)", "High-Density Lipoprotein (HDL)",
        "Heart Failure", "Ventricular Hypertrophy", "Hypertension", "Arrhythmia", "Atrial Fibrillation",
        "Venous Thromboembolism", "Peripheral Artery Disease", "Arteriovenous Malformation"
    ],
    "Autoimmune and Inflammatory Diseases": ["Rheumatoid Arthritis", "Psoriasis", "Psoriatic Arthritis", "Lupus", "Multiple Sclerosis", "Ankylosing Spondylitis"],
    "Inflammatory Bowel Disease": ["Crohn's Disease", "Ulcerative Colitis"],
    "Autoimmune Skin Disorders": ["atopic dermatitis", "vitiligo"],
    "Metabolic Disorders": [
        "Diabetes Type 1", "Diabetes Type 2", "Obesity", "Dyslipidemia", "Hypercholesterolemia",
        "Hyperglycemia", "Hypoglycemia", "Non-Alcoholic Steatohepatitis", "NASH", "MASH", "Gout",
        "Hyperthyroidism", "Hypothyroidism"
    ],
    "Rare Metabolic Disorders": ["lysosomal storage disease"],
    "Infectious Diseases": [
        "HIV", "AIDS", "COVID-19", "SARS", "Hepatitis B", "Hepatitis C", "Tuberculosis",
        "Bacterial Infections", "antibiotic-resistant infections", "Fungal Infections",
        "ESKAPE", "Malaria", "Dengue"
    ],
    "Respiratory Diseases": ["Chronic Obstructive Pulmonary Disease", "COPD", "Asthma", "Cystic Fibrosis", "Idiopathic Pulmonary Fibrosis", "IPF", "Allergic Rhinitis"],
    "Rare Diseases": [
        "Orphan Indications", "Genetic Disorders", "Cystic Fibrosis", "Duchenne Muscular Dystrophy",
        "Rare Neurodegenerative and Neuromuscular Disorders", "Spinal Muscular Atrophy",
        "Inherited Metabolic Disorders", "Gaucher's Disease", "Fabry Disease", "Rare Autoimmune Disorders"
    ],
    "Hematology": [
        "Hemophilia", "Bleeding Disorders", "Sickle Cell Disease", "Thalassemia", "Beta Thalassemia",
        "Myelodysplastic Syndromes", "Anemia", "Aplastic Anemia"
    ],
    "Endocrine Disorders": ["Growth Disorders", "Growth Hormone Deficiency", "Osteoporosis", "Cushing's Syndrome", "Acromegaly"]
}

# Construct the biomarker query for a single disorder.
# `disorder` is not assigned in this cell: it is whatever value an earlier
# loop left bound (i.e. one row only), and the name is undefined if no such
# loop has run. A disorder without a mapping entry falls back to its own name.
keywords = keyword_mapping.get(disorder, [disorder])
biomarker_query = f"Biomarkers for ({' OR '.join(keywords)})"



In [None]:
import re
from Bio import Entrez  # Import Entrez module

def extract_biomarkers(text):
    """Return candidate biomarker symbols found in *text*.

    A candidate is a run of uppercase letters/digits, optionally followed by
    one hyphenated run of the same kind (e.g. "BRCA1", "HER2", "PD-L1").
    Matches without any letter (years, counts) and single-character matches
    are discarded because they cannot be gene/protein symbols.

    Args:
        text (str): Free text to scan, e.g. PubMed abstracts.

    Returns:
        list[str]: Candidates in order of appearance; duplicates are kept.
    """
    # Example regex for genes/proteins (can be expanded)
    pattern = r'\b[A-Z0-9]+(?:-[A-Z0-9]+)?\b'
    return [
        match
        for match in re.findall(pattern, text)
        if len(match) > 1 and any(char.isalpha() for char in match)
    ]

def search_pubmed(query): # Define search_pubmed
    """Search PubMed for *query* and return the IDs of the matching articles.

    Args:
        query (str): Search term passed to ``Entrez.esearch``.

    Returns:
        The ``"IdList"`` entry of the parsed search result; at most 5 IDs
        because the search is issued with ``retmax=5``.
    """
    Entrez.email = "your_email@example.com"  # Replace with your email
    handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]

def fetch_pubmed_details(id_list): # Define fetch_pubmed_details
    """Fetch the abstracts for the given PubMed IDs as one block of plain text.

    Args:
        id_list: PubMed IDs (strings); they are joined with commas for the
            request.

    Returns:
        str: Whatever ``Entrez.efetch`` returns for ``rettype="abstract"``,
        ``retmode="text"``, or an empty string when *id_list* is empty.
    """
    if not id_list:
        # Nothing to fetch; do not send a request with an empty id parameter.
        return ""
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
    try:
        records = handle.read()
    finally:
        # Release the connection even when reading the response fails.
        handle.close()
    return records
# Example usage
# Assuming you have data loaded in a DataFrame called 'data'
# Runs search -> fetch -> regex extraction for every row. The extracted list is
# only bound to `biomarkers`; it is not written back into `data`, so after the
# loop only the last row's result is still available.
for index, row in data.iterrows():
    disorder = row['Disorder']  # Get the disorder from your data
    # search_pubmed and fetch_pubmed_details are the functions defined above
    biomarker_query = f"Molecular biomarkers for {disorder}"
    biomarker_ids = search_pubmed(biomarker_query) # IDs of the matching PubMed articles
    biomarker_articles = fetch_pubmed_details(biomarker_ids) # abstract text for those IDs
    biomarkers = extract_biomarkers(biomarker_articles)


## Biomarker Extraction

In [None]:
def shorten_text(text, max_words=1000):
    """Keep only the leading words of a text.

    The text is split on whitespace and the first ``max_words`` items are
    re-joined with single spaces, so original spacing and line breaks are
    not preserved.

    Args:
        text (str): The text to shorten.
        max_words (int, optional): Number of leading words to keep.
            Defaults to 1000.

    Returns:
        str: The shortened text.
    """
    leading_words = text.split()[:max_words]
    return " ".join(leading_words)


In [None]:
# Tokenize one chunk into PyTorch tensors, truncated and padded to exactly 512
# tokens. `chunk` is not assigned in this cell; it is whatever value an
# earlier loop left bound.
inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512, padding="max_length")


In [None]:
# Rebind `model` to a token-classification model built from the SciBERT
# checkpoint.
# NOTE(review): `tokenizer` is not reloaded in this cell, so it still comes
# from the checkpoint loaded earlier — confirm the vocabularies match before
# running inference with this pair.
model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased")


In [None]:
# Attach human-readable names to class indices 0-4 (BIO tags for drugs and
# biomarkers).
# NOTE(review): this only changes the index -> name lookup; it presumably does
# not resize or train the classification head — confirm the head really has
# these five outputs and was trained for them.
model.config.id2label = {0: "O", 1: "B-DRUG", 2: "I-DRUG", 3: "B-BIOMARKER", 4: "I-BIOMARKER"}


In [None]:
# Import libraries
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from Bio import Entrez
import time
from http.client import IncompleteRead
import re

# Initialize summarizer pipeline on the first GPU when CUDA is available and
# on the CPU (device=-1) otherwise, so the cell also runs on CPU-only runtimes.
summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)

# Load the tokenizer and NER model from the same SciBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased")
# Attach BIO tag names to class indices 0-4.
# NOTE(review): assigning id2label after loading only changes the
# index -> name lookup; it presumably neither resizes nor trains the
# classification head. Confirm that this checkpoint ships a head fine-tuned
# for these five labels, otherwise the "B-DRUG"/"B-BIOMARKER" filters below
# operate on untrained predictions.
model.config.id2label = {0: "O", 1: "B-DRUG", 2: "I-DRUG", 3: "B-BIOMARKER", 4: "I-BIOMARKER"}

# Helper functions
def chunk_text(text, max_length=512):
    """Yield consecutive pieces of *text* holding at most *max_length* words each.

    Words are the whitespace-separated items returned by ``str.split``, so any
    run of whitespace becomes a single space in the output. The limit counts
    words, not model tokens.
    """
    words = text.split()
    offsets = range(0, len(words), max_length)
    yield from (" ".join(words[offset:offset + max_length]) for offset in offsets)

def shorten_text(text, max_words=1000):
    """Return the first *max_words* whitespace-separated words of *text*.

    The kept words are re-joined with single spaces, so original spacing and
    line breaks are not preserved.
    """
    leading_words = text.split()[:max_words]
    return " ".join(leading_words)

def fetch_pubmed_details_with_retry(id_list, retries=3, delay=5):
    """Fetch PubMed abstracts as plain text, retrying when a request fails.

    Args:
        id_list: PubMed IDs (strings); they are joined with commas for the
            request.
        retries (int): Maximum number of attempts.
        delay (int | float): Seconds to wait between two attempts.

    Returns:
        str: The text returned by ``Entrez.efetch``, or the placeholder
        ``"No data available"`` when *id_list* is empty or every attempt
        failed.
    """
    if not id_list:
        # Nothing to fetch; skip the request (and the retry delays) entirely.
        return "No data available"
    ids = ",".join(id_list)
    for attempt in range(retries):
        try:
            handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
            try:
                return handle.read()
            finally:
                # Release the connection even when reading fails part-way.
                handle.close()
        except Exception as e:  # best effort; also covers http.client.IncompleteRead
            if attempt + 1 < retries:
                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(delay)
            else:
                # No further attempt follows, so do not wait again.
                print(f"Attempt {attempt + 1} failed: {e}. Giving up.")
    return "No data available"

def search_pubmed(query):
    """Search PubMed for *query* and return the IDs of the matching articles.

    Args:
        query (str): Search term passed to ``Entrez.esearch``.

    Returns:
        The ``"IdList"`` entry of the parsed search result; at most 5 IDs
        because the search is issued with ``retmax=5``.
    """
    Entrez.email = "your_email@example.com"  # Replace with your email
    handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]

def extract_biomarkers(text):
    """Return candidate biomarker symbols found in *text*.

    A candidate is a run of uppercase letters/digits, optionally followed by
    one hyphenated run of the same kind (e.g. "BRCA1", "HER2", "PD-L1").
    Matches without any letter (years, counts) and single-character matches
    are discarded because they cannot be gene/protein symbols.

    Args:
        text (str): Free text to scan, e.g. PubMed abstracts.

    Returns:
        list[str]: Candidates in order of appearance; duplicates are kept.
    """
    pattern = r'\b[A-Z0-9]+(?:-[A-Z0-9]+)?\b'  # Regex for genes/proteins
    return [
        match
        for match in re.findall(pattern, text)
        if len(match) > 1 and any(char.isalpha() for char in match)
    ]

# Define keyword mapping: disorder name -> list of search terms that are
# OR-combined when querying PubMed for that disorder. Lookups are exact-match
# on the key; a disorder without an entry is searched under its own name.
keyword_mapping = {
    "Lung Cancer": ["Lung Cancer", "Pulmonary Tumors"],
    "Leukemia": ["Leukemia", "Blood Cancer"],
    "Solid Tumors": ["lung cancer", "breast cancer", "colorectal cancer", "prostate cancer", "liver cancer"],
    "Hematologic Cancers": ["leukemia", "lymphoma", "multiple myeloma"],
    "Rare Cancers": ["sarcomas", "neuroendocrine tumors", "pediatric cancers"],
    "Immuno-oncology": ["checkpoint inhibitors", "CAR-T therapies"],
    "Neurology and Psychiatry": ["neurological disorders", "schizophrenia", "bipolar disorder", "major depression", "PTSD"],
    "Psychiatric Disorders": ["schizophrenia", "bipolar disorder", "major depression", "PTSD"],
    "Neurodegenerative Diseases": ["Alzheimer's Disease", "Parkinson's Disease", "ALS"],
    "Neurodevelopmental Disorders": ["autism", "ADHD"],
    "Seizure Disorders": ["epilepsy", "Dravet syndrome"],
    "Movement Disorders": ["Huntington's Disease", "dystonia", "Tourette syndrome", "Benign Tremor", "Parkinson's Disease"],
    "Cardiovascular Diseases": [
        "Atherosclerosis", "Coronary Artery Disease", "Low-Density Lipoprotein (LDL)", "High-Density Lipoprotein (HDL)",
        "Heart Failure", "Ventricular Hypertrophy", "Hypertension", "Arrhythmia", "Atrial Fibrillation",
        "Venous Thromboembolism", "Peripheral Artery Disease", "Arteriovenous Malformation"
    ],
    "Autoimmune and Inflammatory Diseases": ["Rheumatoid Arthritis", "Psoriasis", "Psoriatic Arthritis", "Lupus", "Multiple Sclerosis", "Ankylosing Spondylitis"],
    "Inflammatory Bowel Disease": ["Crohn's Disease", "Ulcerative Colitis"],
    "Autoimmune Skin Disorders": ["atopic dermatitis", "vitiligo"],
    "Metabolic Disorders": [
        "Diabetes Type 1", "Diabetes Type 2", "Obesity", "Dyslipidemia", "Hypercholesterolemia",
        "Hyperglycemia", "Hypoglycemia", "Non-Alcoholic Steatohepatitis", "NASH", "MASH", "Gout",
        "Hyperthyroidism", "Hypothyroidism"
    ],
    "Rare Metabolic Disorders": ["lysosomal storage disease"],
    "Infectious Diseases": [
        "HIV", "AIDS", "COVID-19", "SARS", "Hepatitis B", "Hepatitis C", "Tuberculosis",
        "Bacterial Infections", "antibiotic-resistant infections", "Fungal Infections",
        "ESKAPE", "Malaria", "Dengue"
    ],
    "Respiratory Diseases": ["Chronic Obstructive Pulmonary Disease", "COPD", "Asthma", "Cystic Fibrosis", "Idiopathic Pulmonary Fibrosis", "IPF", "Allergic Rhinitis"],
    "Rare Diseases": [
        "Orphan Indications", "Genetic Disorders", "Cystic Fibrosis", "Duchenne Muscular Dystrophy",
        "Rare Neurodegenerative and Neuromuscular Disorders", "Spinal Muscular Atrophy",
        "Inherited Metabolic Disorders", "Gaucher's Disease", "Fabry Disease", "Rare Autoimmune Disorders"
    ],
    "Hematology": [
        "Hemophilia", "Bleeding Disorders", "Sickle Cell Disease", "Thalassemia", "Beta Thalassemia",
        "Myelodysplastic Syndromes", "Anemia", "Aplastic Anemia"
    ],
    "Endocrine Disorders": ["Growth Disorders", "Growth Hormone Deficiency", "Osteoporosis", "Cushing's Syndrome", "Acromegaly"]

}

# Load data: re-read the workbook so the run starts from the original columns.
file_path = "/content/Updated_indications_and_assets.xlsx"
data = pd.read_excel(file_path)

# Ensure correct dtype for columns: the three result columns are cast to str
# so that text can be written into individual cells below.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string "nan" — confirm, since later null checks would then see them as filled.
for column in ['Example Marketed Therapies (Brand Names)', 'Clinical Efficacy (PI*)', 'Biomarkers']:
    data[column] = data[column].astype(str)

# Process data
# Map every disorder to its list of PubMed search keywords; a disorder with
# no entry in keyword_mapping is searched under its own name.
data['Query Term'] = data['Disorder'].map(lambda d: keyword_mapping.get(d, [d]))

# The three result columns are filled independently of each other: an empty
# search result for one column writes the placeholder for that column only
# and must not skip the remaining columns of the row.
for index, row in data.iterrows():
    if pd.isna(row['Disorder']):
        # Without a disorder name no query can be built; leave the row as is.
        continue
    disorder = row['Query Term']
    search_terms = ' OR '.join(disorder)

    # Example Marketed Therapies: NER over the fetched abstracts, keeping the
    # tokens labelled "B-DRUG".
    therapy_query = f"Marketed therapies for ({search_terms})"
    therapy_ids = search_pubmed(therapy_query)
    therapies = []
    if therapy_ids:
        therapy_articles = shorten_text(fetch_pubmed_details_with_retry(therapy_ids))
        for chunk in chunk_text(therapy_articles):
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
            with torch.no_grad():  # inference only, no gradients needed
                outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            labels = [model.config.id2label[p.item()] for p in predictions[0]]
            therapies.extend([token for token, label in zip(tokens, labels) if label == "B-DRUG"])
    data.at[index, 'Example Marketed Therapies (Brand Names)'] = "; ".join(set(therapies)) if therapies else "No data available"

    # Clinical Efficacy: one short summary per chunk of the fetched abstracts.
    efficacy_query = f"Clinical efficacy of therapies for ({search_terms})"
    efficacy_ids = search_pubmed(efficacy_query)
    summaries = []
    if efficacy_ids:
        efficacy_articles = shorten_text(fetch_pubmed_details_with_retry(efficacy_ids))
        summaries = [summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
                     for chunk in chunk_text(efficacy_articles)]
    data.at[index, 'Clinical Efficacy (PI*)'] = " ".join(summaries) if summaries else "No data available"

    # Biomarkers: NER first, then the regex extractor as a fallback when the
    # model labels nothing as "B-BIOMARKER".
    biomarker_query = f"Biomarkers for ({search_terms})"
    biomarker_ids = search_pubmed(biomarker_query)
    biomarkers = []
    if biomarker_ids:
        biomarker_articles = shorten_text(fetch_pubmed_details_with_retry(biomarker_ids))
        for chunk in chunk_text(biomarker_articles):
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
            with torch.no_grad():  # inference only, no gradients needed
                outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            labels = [model.config.id2label[p.item()] for p in predictions[0]]
            biomarkers.extend([token for token, label in zip(tokens, labels) if label == "B-BIOMARKER"])
        if not biomarkers:
            biomarkers = extract_biomarkers(biomarker_articles)
    data.at[index, 'Biomarkers'] = "; ".join(set(biomarkers)) if biomarkers else "No data available"

# Save updated data (without the DataFrame index) and report where it went.
updated_file_path = "/content/Updated_indications_and_assets_fixedV2.xlsx"
data.to_excel(updated_file_path, index=False)
print(f"Updated file saved to: {updated_file_path}")


In [None]:
import matplotlib.pyplot as plt

# Example: Distribution of populated rows per column
# The processing cell casts the result columns to str and writes the
# placeholder "No data available", so a plain notna() would count those cells
# as populated. Treat the placeholder and stringified missing values ("nan")
# as empty as well.
placeholder_values = ["No data available", "nan"]
populated_counts = (data.notna() & ~data.astype(str).isin(placeholder_values)).sum()
populated_counts.plot(kind="bar", title="Populated Data Counts")
plt.show()


In [None]:
# Remove duplicates and standardize formatting for therapies.
# `therapies` and `index` are not assigned in this cell: they hold whatever
# the processing loop left bound, so this rewrites a single row only.
# str.capitalize() upper-cases the first character and lower-cases the rest.
cleaned_therapies = [therapy.capitalize() for therapy in set(therapies)]
data.at[index, 'Example Marketed Therapies (Brand Names)'] = "; ".join(cleaned_therapies) if cleaned_therapies else "No data available"


In [None]:
# Summarise a single chunk and keep only the first returned summary.
# `chunk` is not assigned in this cell; it is whatever value an earlier loop
# left bound. Note that `summaries` is rebound here to the raw pipeline output
# (a list of dicts), not a list of strings as in earlier cells.
summaries = summarizer(chunk, max_length=50, min_length=20, do_sample=False)
key_summary = " ".join([s['summary_text'] for s in summaries[:1]])  # Only keep the first summary


In [None]:
# Import libraries
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from Bio import Entrez
import time
from http.client import IncompleteRead
import re

# Initialize summarizer pipeline on the first GPU when CUDA is available and
# on the CPU (device=-1) otherwise, so the cell also runs on CPU-only runtimes.
summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)

# Load the tokenizer and NER model from the same SciBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased")
# Attach BIO tag names to class indices 0-4.
# NOTE(review): assigning id2label after loading only changes the
# index -> name lookup; it presumably neither resizes nor trains the
# classification head. Confirm that this checkpoint ships a head fine-tuned
# for these five labels before trusting the predicted tags.
model.config.id2label = {0: "O", 1: "B-DRUG", 2: "I-DRUG", 3: "B-BIOMARKER", 4: "I-BIOMARKER"}

# Helper functions
def chunk_text(text, max_length=512):
    """Yield consecutive pieces of *text* holding at most *max_length* words each.

    Words are the whitespace-separated items returned by ``str.split``, so any
    run of whitespace becomes a single space in the output. The limit counts
    words, not model tokens.
    """
    words = text.split()
    offsets = range(0, len(words), max_length)
    yield from (" ".join(words[offset:offset + max_length]) for offset in offsets)

def shorten_text(text, max_words=1000):
    """Return the first *max_words* whitespace-separated words of *text*.

    The kept words are re-joined with single spaces, so original spacing and
    line breaks are not preserved.
    """
    leading_words = text.split()[:max_words]
    return " ".join(leading_words)

def fetch_pubmed_details_with_retry(id_list, retries=3, delay=5):
    """Fetch PubMed abstracts as plain text, retrying when a request fails.

    Args:
        id_list: PubMed IDs (strings); they are joined with commas for the
            request.
        retries (int): Maximum number of attempts.
        delay (int | float): Seconds to wait between two attempts.

    Returns:
        str: The text returned by ``Entrez.efetch``, or the placeholder
        ``"No data available"`` when *id_list* is empty or every attempt
        failed.
    """
    if not id_list:
        # Nothing to fetch; skip the request (and the retry delays) entirely.
        return "No data available"
    ids = ",".join(id_list)
    for attempt in range(retries):
        try:
            handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
            try:
                return handle.read()
            finally:
                # Release the connection even when reading fails part-way.
                handle.close()
        except Exception as e:  # best effort; also covers http.client.IncompleteRead
            if attempt + 1 < retries:
                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(delay)
            else:
                # No further attempt follows, so do not wait again.
                print(f"Attempt {attempt + 1} failed: {e}. Giving up.")
    return "No data available"

def search_pubmed(query):
    """Search PubMed for *query* and return the IDs of the matching articles.

    Args:
        query (str): Search term passed to ``Entrez.esearch``.

    Returns:
        The ``"IdList"`` entry of the parsed search result; at most 5 IDs
        because the search is issued with ``retmax=5``.
    """
    Entrez.email = "your_email@example.com"  # Replace with your email
    handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]

def extract_biomarkers(text):
    """Return gene/protein-like symbols found in `text`, in order of appearance.

    A candidate is a run of capital letters and digits, optionally followed
    by one hyphenated run of the same kind (e.g. "BRCA1", "HER2", "PD-L1").
    Candidates that are a single character or contain no letter at all
    (years, page numbers, counts) are dropped, since they are never symbols.
    Duplicates are kept; callers de-duplicate as needed.
    """
    pattern = r'\b[A-Z0-9]+(?:-[A-Z0-9]+)?\b'  # Regex for genes/proteins
    candidates = re.findall(pattern, text)
    return [
        candidate
        for candidate in candidates
        if len(candidate) > 1 and any(ch.isalpha() for ch in candidate)
    ]

# Disorder label (as it appears in the spreadsheet's 'Disorder' column)
# -> list of search terms that are OR-ed together in the PubMed queries.
keyword_mapping = {
    # --- Oncology ---
    "Lung Cancer": ["Lung Cancer", "Pulmonary Tumors"],
    "Leukemia": ["Leukemia", "Blood Cancer"],
    "Solid Tumors": [
        "lung cancer",
        "breast cancer",
        "colorectal cancer",
        "prostate cancer",
        "liver cancer",
    ],
    "Hematologic Cancers": ["leukemia", "lymphoma", "multiple myeloma"],
    "Rare Cancers": ["sarcomas", "neuroendocrine tumors", "pediatric cancers"],
    "Immuno-oncology": ["checkpoint inhibitors", "CAR-T therapies"],

    # --- Neurology and psychiatry ---
    "Neurology and Psychiatry": [
        "neurological disorders",
        "schizophrenia",
        "bipolar disorder",
        "major depression",
        "PTSD",
    ],
    "Psychiatric Disorders": [
        "schizophrenia",
        "bipolar disorder",
        "major depression",
        "PTSD",
    ],
    "Neurodegenerative Diseases": [
        "Alzheimer's Disease",
        "Parkinson's Disease",
        "ALS",
    ],
    "Neurodevelopmental Disorders": ["autism", "ADHD"],
    "Seizure Disorders": ["epilepsy", "Dravet syndrome"],
    "Movement Disorders": [
        "Huntington's Disease",
        "dystonia",
        "Tourette syndrome",
        "Benign Tremor",
        "Parkinson's Disease",
    ],

    # --- Cardiovascular ---
    "Cardiovascular Diseases": [
        "Atherosclerosis",
        "Coronary Artery Disease",
        "Low-Density Lipoprotein (LDL)",
        "High-Density Lipoprotein (HDL)",
        "Heart Failure",
        "Ventricular Hypertrophy",
        "Hypertension",
        "Arrhythmia",
        "Atrial Fibrillation",
        "Venous Thromboembolism",
        "Peripheral Artery Disease",
        "Arteriovenous Malformation",
    ],

    # --- Autoimmune and inflammatory ---
    "Autoimmune and Inflammatory Diseases": [
        "Rheumatoid Arthritis",
        "Psoriasis",
        "Psoriatic Arthritis",
        "Lupus",
        "Multiple Sclerosis",
        "Ankylosing Spondylitis",
    ],
    "Inflammatory Bowel Disease": ["Crohn's Disease", "Ulcerative Colitis"],
    "Autoimmune Skin Disorders": ["atopic dermatitis", "vitiligo"],

    # --- Metabolic ---
    "Metabolic Disorders": [
        "Diabetes Type 1",
        "Diabetes Type 2",
        "Obesity",
        "Dyslipidemia",
        "Hypercholesterolemia",
        "Hyperglycemia",
        "Hypoglycemia",
        "Non-Alcoholic Steatohepatitis",
        "NASH",
        "MASH",
        "Gout",
        "Hyperthyroidism",
        "Hypothyroidism",
    ],
    "Rare Metabolic Disorders": ["lysosomal storage disease"],

    # --- Infectious and respiratory ---
    "Infectious Diseases": [
        "HIV",
        "AIDS",
        "COVID-19",
        "SARS",
        "Hepatitis B",
        "Hepatitis C",
        "Tuberculosis",
        "Bacterial Infections",
        "antibiotic-resistant infections",
        "Fungal Infections",
        "ESKAPE",
        "Malaria",
        "Dengue",
    ],
    "Respiratory Diseases": [
        "Chronic Obstructive Pulmonary Disease",
        "COPD",
        "Asthma",
        "Cystic Fibrosis",
        "Idiopathic Pulmonary Fibrosis",
        "IPF",
        "Allergic Rhinitis",
    ],

    # --- Rare diseases, hematology, endocrine ---
    "Rare Diseases": [
        "Orphan Indications",
        "Genetic Disorders",
        "Cystic Fibrosis",
        "Duchenne Muscular Dystrophy",
        "Rare Neurodegenerative and Neuromuscular Disorders",
        "Spinal Muscular Atrophy",
        "Inherited Metabolic Disorders",
        "Gaucher's Disease",
        "Fabry Disease",
        "Rare Autoimmune Disorders",
    ],
    "Hematology": [
        "Hemophilia",
        "Bleeding Disorders",
        "Sickle Cell Disease",
        "Thalassemia",
        "Beta Thalassemia",
        "Myelodysplastic Syndromes",
        "Anemia",
        "Aplastic Anemia",
    ],
    "Endocrine Disorders": [
        "Growth Disorders",
        "Growth Hormone Deficiency",
        "Osteoporosis",
        "Cushing's Syndrome",
        "Acromegaly",
    ],
}

# Text written to a cell when nothing usable was found. It is also the string
# fetch_pubmed_details_with_retry returns once its retries are exhausted.
_NO_DATA = "No data available"

# chunk_text splits on whitespace words, while the NER tokenizer truncates at
# 512 word-piece tokens. Smaller word chunks keep truncation from silently
# dropping the tail of each chunk.
_NER_WORDS_PER_CHUNK = 256


def _fetch_articles(query):
    """Return the shortened abstract text for `query`, or None if unavailable."""
    ids = search_pubmed(query)
    if not ids:
        return None
    text = shorten_text(fetch_pubmed_details_with_retry(ids))
    # A failed fetch comes back as the sentinel string; do not treat it as
    # abstract text to be tagged or summarised.
    if not text or text == _NO_DATA:
        return None
    return text


def _tag_tokens(text, wanted_label):
    """Return the tokens of `text` that the NER model labels `wanted_label`."""
    special_tokens = set(tokenizer.all_special_tokens)
    hits = []
    for chunk in chunk_text(text, max_length=_NER_WORDS_PER_CHUNK):
        # No padding: a single sequence needs none, and padding would only
        # add [PAD] positions that could be mistaken for entities.
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():  # inference only; skip gradient bookkeeping
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [model.config.id2label[p.item()] for p in predictions[0]]
        hits.extend(
            token
            for token, label in zip(tokens, labels)
            if label == wanted_label and token not in special_tokens
        )
    return hits


def _join_unique(items):
    """Join `items` with "; ", dropping duplicates but keeping first-seen order."""
    return "; ".join(dict.fromkeys(items)) if items else _NO_DATA


# Load data
file_path = "/content/Updated_indications_and_assets.xlsx"
data = pd.read_excel(file_path)

# Ensure correct dtype for columns
for column in ['Example Marketed Therapies (Brand Names)', 'Clinical Efficacy (PI*)', 'Biomarkers']:
    data[column] = data[column].astype(str)

# Process data
data['Query Term'] = data['Disorder'].map(lambda d: keyword_mapping.get(d, [d]))

for index, row in data.iterrows():
    disorder = row['Query Term']
    terms = ' OR '.join(disorder)

    # Each column is filled independently: a missing result for one query
    # must not prevent the remaining columns of the row from being filled.

    # Example Marketed Therapies
    therapy_articles = _fetch_articles(f"Marketed therapies for ({terms})")
    therapies = _tag_tokens(therapy_articles, "B-DRUG") if therapy_articles else []
    data.at[index, 'Example Marketed Therapies (Brand Names)'] = _join_unique(therapies)

    # Clinical Efficacy
    efficacy_articles = _fetch_articles(f"Clinical efficacy of therapies for ({terms})")
    summaries = []
    if efficacy_articles:
        summaries = [
            summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
            for chunk in chunk_text(efficacy_articles)
        ]
    data.at[index, 'Clinical Efficacy (PI*)'] = " ".join(summaries) if summaries else _NO_DATA

    # Biomarkers
    biomarker_articles = _fetch_articles(f"Biomarkers for ({terms})")
    biomarkers = []
    if biomarker_articles:
        # Fall back to the regex extractor when the NER model tags nothing.
        biomarkers = (
            _tag_tokens(biomarker_articles, "B-BIOMARKER")
            or extract_biomarkers(biomarker_articles)
        )
    data.at[index, 'Biomarkers'] = _join_unique(biomarkers)

# Save updated data
updated_file_path = "/content/Updated_indications_and_assets_fixedV2_expt.xlsx"
data.to_excel(updated_file_path, index=False)
print(f"Updated file saved to: {updated_file_path}")


In [None]:
# # Import libraries
# import pandas as pd
# import torch
# from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
# from Bio import Entrez
# import time
# from http.client import IncompleteRead
# import re

# # Initialize summarizer pipeline
# summarizer = pipeline("summarization", model="t5-small", device=0)  # Use GPU

# # Load the tokenizer and NER model
# tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=10)  # Adjust num_labels as per the model

# # Helper functions
# def chunk_text(text, max_length=512):
#     tokens = text.split()
#     for i in range(0, len(tokens), max_length):
#         yield " ".join(tokens[i:i + max_length])

# def shorten_text(text, max_words=1000):
#     return " ".join(text.split()[:max_words])

# def fetch_pubmed_details_with_retry(id_list, retries=3, delay=5):
#     ids = ",".join(id_list)
#     for attempt in range(retries):
#         try:
#             handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
#             records = handle.read()
#             handle.close()
#             return records
#         except IncompleteRead as e:
#             print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
#             time.sleep(delay)
#         except Exception as e:
#             print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
#             time.sleep(delay)
#     return "No data available"

# def search_pubmed(query):
#     Entrez.email = "your_email@example.com"  # Replace with your email
#     handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
#     record = Entrez.read(handle)
#     handle.close()
#     return record["IdList"]

# # Regex-based biomarker extraction fallback
# def extract_biomarkers(text):
#     # Example regex for genes/proteins (can be expanded)
#     pattern = r'\b[A-Z0-9]+(?:-[A-Z0-9]+)?\b'  # Matches terms like "BRCA1", "HER2", etc.
#     return re.findall(pattern, text)

# # Define keyword_mapping
# keyword_mapping = {
#     "Lung Cancer": ["Lung Cancer", "Pulmonary Tumors"],
#     "Leukemia": ["Leukemia", "Blood Cancer"],
#     "Solid Tumors": ["lung cancer", "breast cancer", "colorectal cancer", "prostate cancer", "liver cancer"],
#     "Hematologic Cancers": ["leukemia", "lymphoma", "multiple myeloma"],
#     "Rare Cancers": ["sarcomas", "neuroendocrine tumors", "pediatric cancers"],
#     "Immuno-oncology": ["checkpoint inhibitors", "CAR-T therapies"],
#     "Neurology and Psychiatry": ["neurological disorders", "schizophrenia", "bipolar disorder", "major depression", "PTSD"],
#     "Psychiatric Disorders": ["schizophrenia", "bipolar disorder", "major depression", "PTSD"],
#     "Neurodegenerative Diseases": ["Alzheimer's Disease", "Parkinson's Disease", "ALS"],
#     "Neurodevelopmental Disorders": ["autism", "ADHD"],
#     "Seizure Disorders": ["epilepsy", "Dravet syndrome"],
#     "Movement Disorders": ["Huntington's Disease", "dystonia", "Tourette syndrome", "Benign Tremor", "Parkinson's Disease"],
#     "Cardiovascular Diseases": [
#         "Atherosclerosis", "Coronary Artery Disease", "Low-Density Lipoprotein (LDL)", "High-Density Lipoprotein (HDL)",
#         "Heart Failure", "Ventricular Hypertrophy", "Hypertension", "Arrhythmia", "Atrial Fibrillation",
#         "Venous Thromboembolism", "Peripheral Artery Disease", "Arteriovenous Malformation"
#     ],
#     "Autoimmune and Inflammatory Diseases": ["Rheumatoid Arthritis", "Psoriasis", "Psoriatic Arthritis", "Lupus", "Multiple Sclerosis", "Ankylosing Spondylitis"],
#     "Inflammatory Bowel Disease": ["Crohn's Disease", "Ulcerative Colitis"],
#     "Autoimmune Skin Disorders": ["atopic dermatitis", "vitiligo"],
#     "Metabolic Disorders": [
#         "Diabetes Type 1", "Diabetes Type 2", "Obesity", "Dyslipidemia", "Hypercholesterolemia",
#         "Hyperglycemia", "Hypoglycemia", "Non-Alcoholic Steatohepatitis", "NASH", "MASH", "Gout",
#         "Hyperthyroidism", "Hypothyroidism"
#     ],
#     "Rare Metabolic Disorders": ["lysosomal storage disease"],
#     "Infectious Diseases": [
#         "HIV", "AIDS", "COVID-19", "SARS", "Hepatitis B", "Hepatitis C", "Tuberculosis",
#         "Bacterial Infections", "antibiotic-resistant infections", "Fungal Infections",
#         "ESKAPE", "Malaria", "Dengue"
#     ],
#     "Respiratory Diseases": ["Chronic Obstructive Pulmonary Disease", "COPD", "Asthma", "Cystic Fibrosis", "Idiopathic Pulmonary Fibrosis", "IPF", "Allergic Rhinitis"],
#     "Rare Diseases": [
#         "Orphan Indications", "Genetic Disorders", "Cystic Fibrosis", "Duchenne Muscular Dystrophy",
#         "Rare Neurodegenerative and Neuromuscular Disorders", "Spinal Muscular Atrophy",
#         "Inherited Metabolic Disorders", "Gaucher's Disease", "Fabry Disease", "Rare Autoimmune Disorders"
#     ],
#     "Hematology": [
#         "Hemophilia", "Bleeding Disorders", "Sickle Cell Disease", "Thalassemia", "Beta Thalassemia",
#         "Myelodysplastic Syndromes", "Anemia", "Aplastic Anemia"
#     ],
#     "Endocrine Disorders": ["Growth Disorders", "Growth Hormone Deficiency", "Osteoporosis", "Cushing's Syndrome", "Acromegaly"]
# }

# # Load data
# file_path = "/content/Updated_indications_and_assets.xlsx"
# data = pd.read_excel(file_path)

# # Ensure columns have the correct dtype
# data['Example Marketed Therapies (Brand Names)'] = data['Example Marketed Therapies (Brand Names)'].astype(str)
# data['Clinical Efficacy (PI*)'] = data['Clinical Efficacy (PI*)'].astype(str)
# data['Biomarkers'] = data['Biomarkers'].astype(str)

# # Process data
# data['Query Term'] = data['Disorder'].map(lambda d: keyword_mapping.get(d, [d]))

# for index, row in data.iterrows():
#     disorder = row['Query Term']

#     # Example Marketed Therapies
#     therapy_query = f"Marketed therapies for ({' OR '.join(disorder)})"
#     therapy_ids = search_pubmed(therapy_query)
#     if not therapy_ids:
#         data.at[index, 'Example Marketed Therapies (Brand Names)'] = "No data available"
#         continue
#     therapy_articles = shorten_text(fetch_pubmed_details_with_retry(therapy_ids))
#     therapies = []
#     for chunk in chunk_text(therapy_articles):
#         # Ensure truncation and padding for the model's input size
#         inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
#         outputs = model(**inputs)
#         predictions = torch.argmax(outputs.logits, dim=2)
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
#         labels = [model.config.id2label[p.item()] for p in predictions[0]]
#         therapies.extend([token for token, label in zip(tokens, labels) if label == "B-DRUG"])
#         data.at[index, 'Example Marketed Therapies (Brand Names)'] = "; ".join(set(therapies)) if therapies else "No data available"

#     # Clinical Efficacy
#     efficacy_query = f"Clinical efficacy of therapies for ({' OR '.join(disorder)})"
#     efficacy_ids = search_pubmed(efficacy_query)
#     if not efficacy_ids:
#         data.at[index, 'Clinical Efficacy (PI*)'] = "No data available"
#         continue
#     efficacy_articles = shorten_text(fetch_pubmed_details_with_retry(efficacy_ids))
#     summaries = [summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
#                  for chunk in chunk_text(efficacy_articles)]
#     data.at[index, 'Clinical Efficacy (PI*)'] = " ".join(summaries) if summaries else "No data available"

#     # Biomarkers
#     biomarker_query = f"Biomarkers for ({' OR '.join(disorder)})"
#     biomarker_ids = search_pubmed(biomarker_query)
#     if not biomarker_ids:
#         data.at[index, 'Biomarkers'] = "No data available"
#         continue
#     biomarker_articles = shorten_text(fetch_pubmed_details_with_retry(biomarker_ids))
#     biomarkers = []

#     # Extract biomarkers using NER
#     for chunk in chunk_text(biomarker_articles):
#         inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
#         outputs = model(**inputs)
#         predictions = torch.argmax(outputs.logits, dim=2)
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
#         labels = [model.config.id2label[p.item()] for p in predictions[0]]
#         biomarkers.extend([token for token, label in zip(tokens, labels) if label == "B-BIOMARKER"])

#     # Fallback to regex if NER model misses biomarkers
#     if not biomarkers:
#         biomarkers = extract_biomarkers(biomarker_articles)

#     # Deduplicate and store results
#     data.at[index, 'Biomarkers'] = "; ".join(set(biomarkers)) if biomarkers else "No data available"

# # Save the updated data
# updated_file_path = "/content/Updated_indications_and_assets_fixedV2.xlsx"
# data.to_excel(updated_file_path, index=False)
# print(f"Updated file saved to: {updated_file_path}")
