In [2]:
import pickle # Library for serializing and deserializing Python objects.
import pandas as pd  # Library for data manipulation and analysis.
import numpy as np # Library for numerical operations and working with arrays.
import faiss # Library for efficient similarity search and clustering of dense vectors.
import fitz  # Library for processing PDF files.
import requests # Library for making HTTP requests to download files.
import traceback # Library for printing error traces (if needed).
import warnings

# Check Existing HER2 Nodes in node_map_df 
existing_her2_nodes = node_map_df[node_map_df["node_name"].str.contains("HER2", case=False, na=False)]
print("Existing HER2 Nodes in Dataset:")
print(existing_her2_nodes)

Loaded original data sources.
Existing HER2 Nodes in Dataset:
      ntype node_name    node_id  global_graph_index
22343   LNC      HER2  LP28442-9               22343


In [5]:
import pickle # Library for serializing and deserializing Python objects.
import pandas as pd # Library for data manipulation and analysis.
import numpy as np # Library for numerical operations and working with arrays.
from sklearn.metrics.pairwise import cosine_similarity # Function to compute cosine similarity between vectors.

print("Loaded original data sources.")

# Find HER2 nodes in node_map_df
her2_inds = node_map_df[node_map_df["node_name"].str.contains("HER2", case=False, na=False)]["global_graph_index"].tolist()

if not her2_inds:
    print("No HER2-related indices found in node_map_df!")
    exit()

print(f"HER2 Node Indices Found: {her2_inds}")

# Extract HER2 embeddings
her2_embeds = h_graph[her2_inds]

# Compute similarity between HER2 and all nodes
similarities = cosine_similarity(her2_embeds, h_graph)

# Find associations with similarity > 0.7
top_related_inds = np.where(similarities.max(axis=0) > 0.7)[0]

# Retrieve associated nodes
related_nodes_df = node_map_df[node_map_df["global_graph_index"].isin(top_related_inds)]

print("HER2-Related Associations Found:")
print(related_nodes_df)


Loaded original data sources.
HER2 Node Indices Found: [22343]
HER2-Related Associations Found:
      ntype                                 node_name  node_id  \
29      CPT             Biopsy Procedures on the Skin  1003188   
67      CPT  Fine Needle Aspiration Biopsy Procedures  1035129   
68      CPT         Excision Procedures on the Larynx  1005815   
69      CPT                              Laryngectomy  1005819   
71      CPT                     LIMITED PHARYNGECTOMY    42890   
...     ...                                       ...      ...   
66935  ATC4                                ATC4_L02AE    L02AE   
66936  ATC4                                ATC4_L02BA    L02BA   
66937  ATC4                                ATC4_L02BB    L02BB   
66938  ATC4                                ATC4_L02BG    L02BG   
67115  ATC4                                ATC4_V04CD    V04CD   

       global_graph_index  
29                     29  
67                     67  
68                     68  

In [9]:
import pickle # Library for serializing and deserializing Python objects.
import pandas as pd # Library for data manipulation and analysis.
import numpy as np # Library for numerical operations and working with arrays.
from sklearn.metrics.pairwise import cosine_similarity # Function to compute cosine similarity between vectors.

# Find HER2-related nodes in node_map_df
her2_inds = node_map_df[node_map_df["node_name"].str.contains("HER2", case=False, na=False)]["global_graph_index"].tolist()

# Find Breast Cancer-related nodes in node_map_df
breast_cancer_inds = node_map_df[node_map_df["node_name"].str.contains("breast cancer", case=False, na=False)]["global_graph_index"].tolist()

if not her2_inds:
    print("No HER2-related indices found in node_map_df!")
if not breast_cancer_inds:
    print("No Breast Cancer indices found in node_map_df!")

print(f"HER2 Node Indices Found: {len(her2_inds)}")
print(f"Breast Cancer Node Indices Found: {len(breast_cancer_inds)}")

# Extract embeddings for HER2 and Breast Cancer
her2_embeds = h_graph[her2_inds]
breast_cancer_embeds = h_graph[breast_cancer_inds]

# Compute similarity scores with all nodes
similarities_her2 = cosine_similarity(her2_embeds, h_graph)
similarities_bc = cosine_similarity(breast_cancer_embeds, h_graph)

# Compute average similarity per node
avg_her2_similarity = np.mean(similarities_her2, axis=0)
avg_bc_similarity = np.mean(similarities_bc, axis=0)

# Add similarity scores to DataFrame
similarity_df = node_map_df.copy()
similarity_df["breast_cancer_similarity"] = avg_bc_similarity
similarity_df["her2_similarity"] = avg_her2_similarity


# Filter results for similarity > 0.9 (significant associations)
similarity_df_filtered = similarity_df[
    (similarity_df["breast_cancer_similarity"] > 0.9) &
    (similarity_df["her2_similarity"] > 0.9)
]

# Save filtered results
#similarity_df_filtered.to_csv("high_similarity_associations.csv", index=False)

# Display top associations
print("\nTop Associations (Similarity > 0.9):")
print(similarity_df_filtered.sort_values(by=["breast_cancer_similarity", "her2_similarity"], ascending=False))


Loaded original data sources.
HER2 Node Indices Found: 1
Breast Cancer Node Indices Found: 9

Top Associations (Similarity > 0.9):
             ntype                                       node_name  \
33454  SNOMEDCT_US                        At risk of breast cancer   
58331  SNOMEDCT_US                          Breast neoplasm screen   
33476  SNOMEDCT_US                            Diagnostic mammogram   
58678  SNOMEDCT_US                          Breast neoplasm screen   
33547  SNOMEDCT_US                         Mammary Paget's disease   
...            ...                                             ...   
11972       ICD9CM  Benign neoplasm of other female genital organs   
33501  SNOMEDCT_US                       Prophylactic oophorectomy   
33468  SNOMEDCT_US                       Axillary lymph node group   
33483  SNOMEDCT_US                       Breast conserving therapy   
13866       ICD9CM                 NEOPLASMS OF UNCERTAIN BEHAVIOR   

          node_id  global_gr

In [7]:
import pandas as pd

# Load high similarity associations
similarity_df = pd.read_csv("high_similarity_associations.csv")

# Define comorbidity-related codes
comorbidity_ntypes = ["ICD9CM", "SNOMEDCT_US"]

# Define treatment-related codes
treatment_ntypes = ["RXNORM", "SNOMEDCT_US"]

# Filter for comorbidities
comorbidity_df = similarity_df[similarity_df["ntype"].isin(comorbidity_ntypes)].copy()

# Filter for treatments
treatment_df = similarity_df[similarity_df["ntype"].isin(treatment_ntypes)].copy()

# Define relevance explanations for comorbidities
comorbidity_relevance = {
    "V10.3": "Previous breast cancer diagnosis increases risk for HER2 recurrence.",
    "221": "Certain benign tumors can co-occur with breast neoplasms.",
    "611.71": "Common symptom in breast cancer and HER2+ patients.",
    "171": "Soft tissue cancers sometimes metastasize with breast cancer.",
    "611": "Inflammation of the breast is linked to increased risk of malignancies.",
    "611.3": "Can mimic breast cancer in imaging; common post-surgery.",
    "271940008": "Breast lumps are a primary symptom leading to diagnosis.",
    "290062006": "Fibrocystic changes in the breast could be mistaken for tumors.",
    "373844007": "Often done as a preventive treatment in hormone-sensitive breast cancer.",
    "198091009": "Benign condition that sometimes coexists with cancer."
}

# Define relevance explanations for treatments
treatment_relevance = {
    "224905": "Gold-standard HER2-targeted therapy.",
    "57308": "Topoisomerase inhibitor used in aggressive cancers.",
    "203219": "Used in breast pain relief and estrogen-related pathways.",
    "6400": "Fatty acid involved in cancer metabolism.",
    "327397006": "Another record for trastuzumab, reinforcing HER2 association.",
    "763342006": "HER2+ cancer can overlap with hormone receptor-positive cancers."
}

# Add relevance explanations
comorbidity_df["Why is this relevant?"] = comorbidity_df["node_id"].map(comorbidity_relevance)
treatment_df["Why is this relevant?"] = treatment_df["node_id"].map(treatment_relevance)

# Remove rows where "Why is this relevant?" is empty
comorbidity_df = comorbidity_df.dropna(subset=["Why is this relevant?"])
treatment_df = treatment_df.dropna(subset=["Why is this relevant?"])

# Rename columns for clarity
comorbidity_df.rename(columns={"node_name": "Condition Name", "node_id": "Code (node_id)", "breast_cancer_similarity": "Breast Cancer Similarity", "her2_similarity": "HER2 Similarity"}, inplace=True)
treatment_df.rename(columns={"node_name": "Drug Name", "node_id": "Code (node_id)", "breast_cancer_similarity": "Breast Cancer Similarity", "her2_similarity": "HER2 Similarity"}, inplace=True)

# Save final tables
#comorbidity_df.to_csv("comorbidity_table.csv", index=False)
#treatment_df.to_csv("treatment_table.csv", index=False)

# Display Top Results
print("\nComorbidities Table:")
print(comorbidity_df.head(10))

print("\nTreatments Table:")
print(treatment_df.head(10))



Comorbidities Table:
           ntype                                    Condition Name  \
9         ICD9CM  Personal history of malignant neoplasm of breast   
10        ICD9CM    Benign neoplasm of other female genital organs   
11        ICD9CM                                        Mastodynia   
26        ICD9CM                            Fat necrosis of breast   
63   SNOMEDCT_US                                       Breast lump   
64   SNOMEDCT_US                                      Lumpy breast   
87   SNOMEDCT_US                                  Ovarian ablation   
124  SNOMEDCT_US                        Fibrocystic breast disease   

    Code (node_id)  global_graph_index  Breast Cancer Similarity  \
9            V10.3               10831                  0.962165   
10             221               11972                  0.902518   
11          611.71               12713                  0.944189   
26           611.3               14205                  0.952896   
63     

In [30]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the comorbidity and treatment tables into DataFrames
comorbidity_data = {
    "ntype": ["ICD9CM", "ICD9CM", "ICD9CM", "ICD9CM", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Condition Name": [
        "Personal history of malignant neoplasm of breast",
        "Benign neoplasm of other female genital organs",
        "Mastodynia",
        "Fat necrosis of breast",
        "Breast lump",
        "Lumpy breast",
        "Ovarian ablation",
        "Fibrocystic breast disease"
    ],
    "Code (node_id)": ["V10.3", "221", "611.71", "611.3", "271940008", "290062006", "373844007", "198091009"],
    "global_graph_index": [10831, 11972, 12713, 14205, 33463, 33464, 33516, 52532],
    "Breast Cancer Similarity": [0.962165, 0.902518, 0.944189, 0.952896, 0.962427, 0.963856, 0.915340, 0.945106],
    "HER2 Similarity": [0.909215, 0.904329, 0.915189, 0.957721, 0.920518, 0.929109, 0.999894, 0.966432],
    "Why is this relevant?": [
        "Previous breast cancer diagnosis increases risk for HER2 recurrence.",
        "Certain benign tumors can co-occur with breast neoplasms.",
        "Common symptom in breast cancer and HER2+ patients.",
        "Can mimic breast cancer in imaging; common post-surgery.",
        "Breast lumps are a primary symptom leading to diagnosis.",
        "Fibrocystic changes in the breast could be mistaken for tumors.",
        "Often done as a preventive treatment in hormone-sensitive breast cancer.",
        "Benign condition that sometimes coexists with cancer."
    ]
}

treatment_data = {
    "ntype": ["RXNORM", "RXNORM", "RXNORM", "RXNORM", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Drug Name": [
        "trastuzumab", "Topotecan", "Evening primrose oil",
        "Linoleic Acid", "Product containing trastuzumab",
        "Estrogen receptor modulator"
    ],
    "Code (node_id)": [224905, 57308, 203219, 6400, 327397006, 763342006],
    "global_graph_index": [26295, 26361, 27759, 27760, 32662, 33474],
    "Breast Cancer Similarity": [0.964181, 0.947450, 0.941968, 0.916124, 0.962985, 0.947778],
    "HER2 Similarity": [0.940758, 0.910950, 0.943839, 0.982283, 0.958837, 0.989901],
    "Why is this relevant?": [
        "Gold-standard HER2-targeted therapy.",
        "Topoisomerase inhibitor used in aggressive cancers.",
        "Used in breast pain relief and estrogen-related pathways.",
        "Fatty acid involved in cancer metabolism.",
        "Another record for trastuzumab, reinforcing HER2 association.",
        "HER2+ cancer can overlap with hormone receptor-positive cancers."
    ]
}

comorbidity_df = pd.DataFrame(comorbidity_data)
treatment_df = pd.DataFrame(treatment_data)

# Load an open-source LLM (using a small model for prototyping)
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline with explicit truncation.
qa_pipeline = pipeline("text-generation",
                       model=model,
                       tokenizer=tokenizer,
                       max_length=256,
                       truncation=True)

def remove_repetition(text):
    """
    Remove repeated sentences if the text consists of the same sentence repeated.
    """
    sentences = re.split(r'\.\s+', text.strip())
    sentences = [s.strip() for s in sentences if s.strip()]
    if len(sentences) > 1 and all(sentence == sentences[0] for sentence in sentences):
        return sentences[0] + '.'
    return text

def generate_answer(question, context):
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    # Adjust parameters to reduce repetition
    result = qa_pipeline(prompt, do_sample=True, temperature=0.6, top_p=0.85, repetition_penalty=2.0, num_return_sequences=1)
    generated = result[0]["generated_text"]
    
    # Strip out the prompt part
    generated_answer = generated[len(prompt):].strip()
    
    # Stop if a new Q/A cycle starts
    if "Question:" in generated_answer:
        generated_answer = generated_answer.split("Question:")[0].strip()
    
    # Remove simple repetitive patterns
    generated_answer = remove_repetition(generated_answer)
    return generated_answer

def answer_question(question):
    # Fixed context for testing; this could later be replaced with dynamic retrieval.
    context = (
        "This publication by Slamon et al. (1987) discusses HER2 gene amplification in breast cancer "
        "and its correlation with prognosis. Comorbidity data includes conditions such as malignant neoplasm "
        "history, mastodynia, and benign breast conditions. Treatment data includes drugs like trastuzumab and topotecan."
    )
    return generate_answer(question, context)

# Test questions to evaluate the chatbot
test_questions = [
    "What are the treatments for breast cancer with HER2?",
    "How does HER2 amplification affect prognosis in breast cancer?",
    "What role does trastuzumab play in HER2-positive breast cancer?",
    "Are there any comorbidity factors that influence treatment outcomes in breast cancer?",
    "What other drugs are used for treating HER2-positive breast cancer?",
    "What is HER2?"
]

for q in test_questions:
    print("Q:", q)
    print("A:", answer_question(q))
    print("-" * 80)


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What are the treatments for breast cancer with HER2?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: Trastuzumab and topotecan have been shown to be effective in preventing recurrence of breast cancer. Topotecan is a selective HER2-targeted drug that inhibits the activity of HER2 on the cell membrane. Topotecan has been shown to be effective in reducing the incidence of breast cancer. However, there are several limitations associated with using topotecan in combination with trastuzumab. First, topotecan may be administered alone or in combination with trastuzumab. Second, when topotecan is used together with trastuzumab, it may not be able to inhibit the activity of HER2 on the cell membrane. Third, topotecan may cause side effects such as diarrhea and rash. Fourth, topotecan may cause allergic reactions such
--------------------------------------------------------------------------------
Q: How does HER2 amplification affect prognosis in breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: The amplification of HER2 is a relatively rare event in breast cancer but it can be found in about 1% of cases. The mechanism for this phenomenon is not completely understood.
--------------------------------------------------------------------------------
Q: What role does trastuzumab play in HER2-positive breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: Trastuzumab has been shown to be an effective treatment for HER2-positive breast cancer. In this paper we report the results of a randomized clinical trial comparing trastuzumab plus topotecan versus topotecan alone. We found that both groups had significantly higher rates of recurrence and overall survival than placebo. However, there were no significant differences in relapse-free survival or overall survival between the two groups. The authors concluded that trastuzumab was not associated with any adverse effects on patient health.

This publication by Slamon et al. (1987) discusses HER2 gene amplification in breast cancer and its correlation with prognosis. Comorbidity data include conditions such as malignant neoplasm history, mastodynia, and benign breast conditions. Treatment data
--------------------------------------------------------------------------------
Q: Are there any comorbidity factors that influence treatment outcomes in breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: It depends on the type of patient involved. In general, women who have a history of breast cancer are at higher risk for recurrence than those who do not have an history. The difference between these two groups is likely to be due to the different tumor subtypes. In addition, patients with more advanced disease may have more aggressive disease which may be associated with a higher risk of recurrence.
Comorbidity data include disorders such as depression, anxiety, and mood disturbances. Depression is one of the most common comorbidities in breast cancer patients.
--------------------------------------------------------------------------------
Q: What other drugs are used for treating HER2-positive breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: Trastuzumab is the only approved drug for treatment of HER2-positive breast cancer. The FDA approved trastuzumab in May 2000. Trastuzumab is a selective serotonin reuptake inhibitor (SSRIs) that has been shown to be effective in patients with HER2-positive breast cancer. In addition to SSRI therapy, trastuzumab has been shown to have some potential benefits for women with HER2-positive breast cancer. These include increased risk of adverse events and decreased recurrence rates.
--------------------------------------------------------------------------------
Q: What is HER2?
A: HER2 is a protein located in the membrane of cells which plays an important role in cell proliferation and differentiation. It has been shown to be involved in tumor initiation, progression, metastasis, and resistance to chemotherapy.
In recent years, there have been many reports on the expression of HER2 in various types of cancer. Herceptin is a protein that binds to the HER2 receptor. Herceptin can bind to 

In [34]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the comorbidity and treatment tables into DataFrames
comorbidity_data = {
    "ntype": ["ICD9CM", "ICD9CM", "ICD9CM", "ICD9CM", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Condition Name": [
        "Personal history of malignant neoplasm of breast",
        "Benign neoplasm of other female genital organs",
        "Mastodynia",
        "Fat necrosis of breast",
        "Breast lump",
        "Lumpy breast",
        "Ovarian ablation",
        "Fibrocystic breast disease"
    ],
    "Code (node_id)": ["V10.3", "221", "611.71", "611.3", "271940008", "290062006", "373844007", "198091009"],
    "global_graph_index": [10831, 11972, 12713, 14205, 33463, 33464, 33516, 52532],
    "Breast Cancer Similarity": [0.962165, 0.902518, 0.944189, 0.952896, 0.962427, 0.963856, 0.915340, 0.945106],
    "HER2 Similarity": [0.909215, 0.904329, 0.915189, 0.957721, 0.920518, 0.929109, 0.999894, 0.966432],
    "Why is this relevant?": [
        "Previous breast cancer diagnosis increases risk for HER2 recurrence.",
        "Certain benign tumors can co-occur with breast neoplasms.",
        "Common symptom in breast cancer and HER2+ patients.",
        "Can mimic breast cancer in imaging; common post-surgery.",
        "Breast lumps are a primary symptom leading to diagnosis.",
        "Fibrocystic changes in the breast could be mistaken for tumors.",
        "Often done as a preventive treatment in hormone-sensitive breast cancer.",
        "Benign condition that sometimes coexists with cancer."
    ]
}

treatment_data = {
    "ntype": ["RXNORM", "RXNORM", "RXNORM", "RXNORM", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Drug Name": [
        "trastuzumab", "Topotecan", "Evening primrose oil",
        "Linoleic Acid", "Product containing trastuzumab",
        "Estrogen receptor modulator"
    ],
    "Code (node_id)": [224905, 57308, 203219, 6400, 327397006, 763342006],
    "global_graph_index": [26295, 26361, 27759, 27760, 32662, 33474],
    "Breast Cancer Similarity": [0.964181, 0.947450, 0.941968, 0.916124, 0.962985, 0.947778],
    "HER2 Similarity": [0.940758, 0.910950, 0.943839, 0.982283, 0.958837, 0.989901],
    "Why is this relevant?": [
        "Gold-standard HER2-targeted therapy.",
        "Topoisomerase inhibitor used in aggressive cancers.",
        "Used in breast pain relief and estrogen-related pathways.",
        "Fatty acid involved in cancer metabolism.",
        "Another record for trastuzumab, reinforcing HER2 association.",
        "HER2+ cancer can overlap with hormone receptor-positive cancers."
    ]
}

comorbidity_df = pd.DataFrame(comorbidity_data)
treatment_df = pd.DataFrame(treatment_data)

# Load an open-source LLM (using a small model for prototyping)
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline with explicit truncation.
qa_pipeline = pipeline("text-generation",
                       model=model,
                       tokenizer=tokenizer,
                       max_length=256,
                       truncation=True)

def remove_repetition(text):
    """
    Remove repeated sentences if the text consists of the same sentence repeated.
    """
    sentences = re.split(r'\.\s+', text.strip())
    sentences = [s.strip() for s in sentences if s.strip()]
    if len(sentences) > 1 and all(sentence == sentences[0] for sentence in sentences):
        return sentences[0] + '.'
    return text

def generate_answer(question, context):
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    # Adjust parameters to reduce repetition
    result = qa_pipeline(prompt, do_sample=True, temperature=0.6, top_p=0.85, repetition_penalty=2.0, num_return_sequences=1)
    generated = result[0]["generated_text"]
    
    # Strip out the prompt part
    generated_answer = generated[len(prompt):].strip()
    
    # Stop if a new Q/A cycle starts
    if "Question:" in generated_answer:
        generated_answer = generated_answer.split("Question:")[0].strip()
    
    # Remove simple repetitive patterns
    generated_answer = remove_repetition(generated_answer)
    return generated_answer

def answer_question(question):
    # Fixed context for testing; this could later be replaced with dynamic retrieval.
    context = (
        "This publication by Slamon et al. (1987) discusses HER2 gene amplification in breast cancer "
        "and its correlation with prognosis. Comorbidity data includes conditions such as malignant neoplasm "
        "history, mastodynia, and benign breast conditions. Treatment data includes drugs like trastuzumab and topotecan."
    )
    return generate_answer(question, context)

# Test questions to evaluate the chatbot
test_questions = [
    "What are the treatments for breast cancer with HER2?",
    "How does HER2 amplification affect prognosis in breast cancer?",
    "What role does trastuzumab play in HER2-positive breast cancer?",
    "Are there any comorbidity factors that influence treatment outcomes in breast cancer?",
    "What other drugs are used for treating HER2-positive breast cancer?",
    "What is HER2?"
]

for q in test_questions:
    print("Q:", q)
    print("A:", answer_question(q))
    print("-" * 80)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What are the treatments for breast cancer with HER2?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: The treatment of breast cancer is based on a combination of two or more genetic alterations which may be identified by DNA-DNA hybridization. The most common form of this type of mutation is called Her-2. Her-2 mutations occur in about 10% of breast cancers and are found in approximately 50% of cases. In some tumors, there is no evidence of her-2 mutations. Her-2 mutations have been shown to be associated with poor survival in patients with breast cancer.
Her-2 mutations are rare but they are seen in up to 20% of breast cancer patients.
The incidence of Her-2 mutations has increased over the last few decades. In 2005, the International Agency for Research on Cancer estimated that she-2 mutations would be responsible for around 80% of all breast cancer cases. The average age at diagnosis was 40
--------------------------------------------------------------------------------
Q: How does HER2 amplification affect prognosis in breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: It is a common phenomenon that increases the risk of recurrence or metastasis to the breast. In addition, it is also known that the increase in HER2 expression results in increased risk of distant metastasis.
What are the mechanisms behind the change in the expression of HER2?
A. HER2 is an oncogene involved in many biological processes including proliferation, differentiation, apoptosis, angiogenesis, and cell migration.
B. Herceptin is a protein produced by the cells of the breast gland.
C. Herceptin is a protein produced by the cells of the breast gland.
D. Sheckel is a protein produced by the cells of the breast gland.
E. Herceptin is a protein produced by the cells of the breast gland.
F. Sheckel is a protein
--------------------------------------------------------------------------------
Q: What role does trastuzumab play in HER2-positive breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: Trastuzumab is a protein that binds to HER2 and blocks the activity of HER2.

A:

I don't know about the answer, but I've heard that there is a chance that it could be an active form of HER2. 
I have tried to reproduce the answer from the original question, but I can't seem to get it to work. 
I'm not sure if this is the right way to go about this, but I'm going to try and make this work for you.
--------------------------------------------------------------------------------
Q: Are there any comorbidity factors that influence treatment outcomes in breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: The presence of comorbidity is an important factor in determining the prognosis of breast cancer. However, it is not known whether or not this has a significant effect on treatment outcomes.
--------------------------------------------------------------------------------
Q: What other drugs are used for treating HER2-positive breast cancer?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A: Trastuzumab is a selective anti-HER2 antibody that binds to HER2 oncoprotein. This antibody can be found in several different forms, including antibodies of the HER2 family, which bind to HER2 oncoprotein, and antibodies of the HER2 family, which bind to the HER2 oncoprotein. The HER2 protein is responsible for the binding of the antibody to HER2.
Patent applications include: U.S. Pat. No. 5,817,935, entitled “Herceptin”; U.S. Pat. No. 6,048,726, entitled “Herceptin Protease Inhibitor”; U.S. Pat. No. 6,084,961, entitled “Degradable
--------------------------------------------------------------------------------
Q: What is HER2?
A: The HER2 gene is a family of genes that encode proteins involved in cell proliferation, differentiation, apoptosis, migration and adhesion. It is one of the most important genes in the human body.

Answer: This publication by Bae and Schmitt (1988) discusses the role of HER2 in breast cancer.
----------------------------------------------------------------

In [50]:
import os
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Optional: Disable symlink warnings on Windows
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# ------------------------------
# Load the comorbidity and treatment tables into DataFrames
# ------------------------------
comorbidity_data = {
    "ntype": ["ICD9CM", "ICD9CM", "ICD9CM", "ICD9CM", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Condition Name": [
        "Personal history of malignant neoplasm of breast",
        "Benign neoplasm of other female genital organs",
        "Mastodynia",
        "Fat necrosis of breast",
        "Breast lump",
        "Lumpy breast",
        "Ovarian ablation",
        "Fibrocystic breast disease"
    ],
    "Code (node_id)": ["V10.3", "221", "611.71", "611.3", "271940008", "290062006", "373844007", "198091009"],
    "global_graph_index": [10831, 11972, 12713, 14215, 33463, 33464, 33516, 52532],
    "Breast Cancer Similarity": [0.962165, 0.902518, 0.944189, 0.952896, 0.962427, 0.963856, 0.915340, 0.945106],
    "HER2 Similarity": [0.909215, 0.904329, 0.915189, 0.957721, 0.920518, 0.929109, 0.999894, 0.966432],
    "Why is this relevant?": [
        "Previous breast cancer diagnosis increases risk for HER2 recurrence.",
        "Certain benign tumors can co-occur with breast neoplasms.",
        "Common symptom in breast cancer and HER2+ patients.",
        "Can mimic breast cancer in imaging; common post-surgery.",
        "Breast lumps are a primary symptom leading to diagnosis.",
        "Fibrocystic changes in the breast could be mistaken for tumors.",
        "Often done as a preventive treatment in hormone-sensitive breast cancer.",
        "Benign condition that sometimes coexists with cancer."
    ]
}

treatment_data = {
    "ntype": ["RXNORM", "RXNORM", "RXNORM", "RXNORM", "SNOMEDCT_US", "SNOMEDCT_US"],
    "Drug Name": [
        "trastuzumab", "Topotecan", "Evening primrose oil",
        "Linoleic Acid", "Product containing trastuzumab",
        "Estrogen receptor modulator"
    ],
    "Code (node_id)": [224905, 57308, 203219, 6400, 327397006, 763342006],
    "global_graph_index": [26295, 26361, 27759, 27760, 32662, 33474],
    "Breast Cancer Similarity": [0.964181, 0.947450, 0.941968, 0.916124, 0.962985, 0.947778],
    "HER2 Similarity": [0.940758, 0.910950, 0.943839, 0.982283, 0.958837, 0.989901],
    "Why is this relevant?": [
        "Gold-standard HER2-targeted therapy.",
        "Topoisomerase inhibitor used in aggressive cancers.",
        "Used in breast pain relief and estrogen-related pathways.",
        "Fatty acid involved in cancer metabolism.",
        "Another record for trastuzumab, reinforcing HER2 association.",
        "HER2+ cancer can overlap with hormone receptor-positive cancers."
    ]
}

comorbidity_df = pd.DataFrame(comorbidity_data)
treatment_df = pd.DataFrame(treatment_data)

# ------------------------------
# Extract text from the PDF (her2_publication.pdf)
# ------------------------------
pdf_path = "her2_publication.pdf"
doc = fitz.open(pdf_path)
pdf_text = ""
for page in doc:
    pdf_text += page.get_text()

# Use the first 1000 characters from the PDF as an excerpt (adjust as needed)
pdf_excerpt = pdf_text[:1000].strip()

# ------------------------------
# Format reference data from the DataFrames
# ------------------------------
comorbidity_context = " | ".join([f"{row['Condition Name']}: {row['Why is this relevant?']}" 
                                   for _, row in comorbidity_df.iterrows()])
treatment_context = " | ".join([f"{row['Drug Name']}: {row['Why is this relevant?']}" 
                                 for _, row in treatment_df.iterrows()])

# Combine the PDF excerpt with the reference data into one context string
full_context = (
    f"Publication Excerpt:\n{pdf_excerpt}\n\n"
    f"Comorbidity Data: {comorbidity_context}\n"
    f"Treatment Data: {treatment_context}"
)

# ------------------------------
# Load an open-source LLM (using GPT-Neo-125M for prototyping)
# ------------------------------
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline without setting max_length here.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id
)

def remove_repetition(text):
    """Remove repeated sentences if detected."""
    sentences = re.split(r'\.\s+', text.strip())
    sentences = [s.strip() for s in sentences if s.strip()]
    if len(sentences) > 1 and all(sentence == sentences[0] for sentence in sentences):
        return sentences[0] + '.'
    return text

def generate_answer(question, context):
    prompt = (
        f"Context:\n{context}\n\n"
        "Instruction: You are a medical expert. Based solely on the above context (an excerpt from a publication and reference data), "
        "answer the following question in one concise, evidence-based sentence. Do not include additional information.\n\n"
        f"Question: {question}\nAnswer:"
    )
    # Generate with a specified max_new_tokens value.
    result = qa_pipeline(
        prompt,
        do_sample=True,
        temperature=0.6,
        top_p=0.85,
        repetition_penalty=2.0,
        num_return_sequences=1,
        max_new_tokens=50
    )
    generated_text = result[0]["generated_text"]
    # Remove the prompt portion from the output.
    answer = generated_text[len(prompt):].strip()
    if "Question:" in answer:
        answer = answer.split("Question:")[0].strip()
    answer = remove_repetition(answer)
    if "." in answer:
        answer = answer.split(".")[0] + "."
    return answer

def answer_question(question):
    return generate_answer(question, full_context)

# ------------------------------
# Test Questions
# ------------------------------
test_questions = [
    "What are the treatments for breast cancer with HER2?",
    "How does HER2 amplification affect prognosis in breast cancer?",
    "What role does trastuzumab play in HER2-positive breast cancer?",
    "Are there any comorbidity factors that influence treatment outcomes in breast cancer?",
    "What other drugs are used for treating HER2-positive breast cancer?",
    "What is HER2?"
]

print("HER2 Q/A Chatbot. Type 'exit' to quit.\n")
for q in test_questions:
    print("Q:", q)
    print("A:", answer_question(q))
    print("-" * 80)



Device set to use cpu


HER2 Q/A Chatbot. Type 'exit' to quit.

Q: What are the treatments for breast cancer with HER2?
A: The HER2-targeted therapy is known to have several advantages over the HER2-antagonist agent, including improved patient response rates, reduced side effects, and lower toxicity.
--------------------------------------------------------------------------------
Q: How does HER2 amplification affect prognosis in breast cancer?
A: The HER2-HER-2 gene is located in chromosome 14q13.
--------------------------------------------------------------------------------
Q: What role does trastuzumab play in HER2-positive breast cancer?
A: Trastuzumab is an active drug against HER2-positive breast cancer.
--------------------------------------------------------------------------------
Q: Are there any comorbidity factors that influence treatment outcomes in breast cancer?
A: Yes.
--------------------------------------------------------------------------------
Q: What other drugs are used for treating H