In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings

# === Step 1: Load and Clean RxNorm CSV ===
# Rows missing any of these columns cannot be chunked or labelled with
# metadata in the later steps, so they are discarded right after loading.
required_columns = ["Text_Chunk", "STR", "RXCUI", "TTY"]
df = pd.read_csv("rxnorm_enriched_chunks.csv").dropna(subset=required_columns)

# === Step 2: Chunk Texts (optional overlap) ===
def chunk_text(text, chunk_size=300, overlap=50, min_length=50):
    """Split ``text`` into overlapping, whitespace-trimmed character windows.

    A window starts every ``chunk_size - overlap`` characters and spans at
    most ``chunk_size`` characters. Each window is stripped, and only windows
    that are still longer than ``min_length`` characters are kept. As a
    consequence, a text no longer than ``min_length`` yields no chunks at all.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per window; must be positive.
        overlap: Number of characters shared by consecutive windows; must
            satisfy ``0 <= overlap < chunk_size``.
        min_length: Windows whose stripped length is not greater than this
            value are dropped.

    Returns:
        The retained chunks, in order of appearance in ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size gives range() a zero step (an opaque ValueError),
    # a larger overlap gives a negative step (silently no chunks), and a
    # negative overlap leaves gaps between windows. Reject all three up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    windows = (
        text[start:start + chunk_size].strip()
        for start in range(0, len(text), step)
    )
    return [window for window in windows if len(window) > min_length]

# Emit one Document per chunk of every cleaned row. The row's identifying
# columns travel with each chunk as metadata, together with the chunk's
# position inside the row's text.
chunked_docs = []
for _, record in df.iterrows():
    chunked_docs.extend(
        Document(
            page_content=piece,
            metadata={
                "drug_name": record["STR"],
                "rxcui": record["RXCUI"],
                "term_type": record["TTY"],
                # CODE and SAB are optional columns; fall back to "" if absent.
                "code": record.get("CODE", ""),
                "source": record.get("SAB", ""),
                "chunk_index": position,
            },
        )
        for position, piece in enumerate(chunk_text(record["Text_Chunk"]))
    )

print(f"Total chunked documents: {len(chunked_docs)}")

# === Step 3: Create embeddings ===
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
faiss_db = FAISS.from_documents(documents=chunked_docs, embedding=embedding_model)

# === Step 4: Save FAISS index and metadata ===
faiss_db.save_local("rxnorm_faiss_index")
print("FAISS index saved to ./rxnorm_faiss_index")


Total chunked documents: 116


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


FAISS index saved to ./rxnorm_faiss_index


In [11]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Reload the saved index using the same embedding model name it was built with.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# NOTE: despite its name, `retriever` holds the FAISS vector store returned by
# load_local, which is why similarity_search is called on it directly.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored index files - only load an index directory you created or trust.
retriever = FAISS.load_local(
    "rxnorm_faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

# Retrieval example: print the text and metadata of the top 3 matches.
docs = retriever.similarity_search("what is atorvastatin", k=3)
for hit in docs:
    print(hit.page_content, hit.metadata, sep="\n")

sebetralstat 300 MG Oral Tablet [Ekterly] (SBD) - yellow.
{'drug_name': 'sebetralstat 300 MG Oral Tablet [Ekterly]', 'rxcui': 2717955, 'term_type': 'SBD', 'code': '2717955', 'source': 'RXNORM', 'chunk_index': 0}
Sebetralstat 300 mg ORAL TABLET [EKTERLY] (DP) - yellow.
{'drug_name': 'Sebetralstat 300 mg ORAL TABLET [EKTERLY]', 'rxcui': 2717955, 'term_type': 'DP', 'code': '82928-300', 'source': 'MTHSPL', 'chunk_index': 0}
afluria 2025-2026 vaccine 0.5 ML Prefilled Syringe (PSN) - .
{'drug_name': 'afluria 2025-2026 vaccine 0.5 ML Prefilled Syringe', 'rxcui': 2718400, 'term_type': 'PSN', 'code': '2718400', 'source': 'RXNORM', 'chunk_index': 0}


In [None]:
import os
import time
import pandas as pd
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from tqdm import tqdm

# --- Step 0: Setup ---
# Pull variables from a local .env file, then insist on an OpenAI key before
# any model object is created.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
else:
    raise ValueError("Missing OPENAI_API_KEY")

# Load FAISS retriever (returns the 5 closest chunks per query)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored index files - only load an index directory you created or trust.
faiss_db = FAISS.load_local(
    "rxnorm_faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)
retriever = faiss_db.as_retriever(search_kwargs=dict(k=5))

# Create LLM chain (temperature 0 on gpt-4, wired to the retriever above)
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# --- Medication + Problem helpers ---
def extract_medications(med_str: str) -> list:
    """Return the capitalized first word of each comma-separated medication.

    Only the first whitespace-delimited token of every entry is kept (dose,
    route and any further words are discarded), and it is normalized with
    ``str.capitalize``. Blank entries are skipped.

    Args:
        med_str: Comma-separated medication text. ``None`` and float NaN
            (how pandas represents an empty CSV cell) are treated as empty.

    Returns:
        A list of medication names, in input order; empty when there are none.
    """
    # Without this guard str() turns a missing cell into the literal text
    # "nan"/"None", which would be reported as a medication called "Nan".
    # NaN is the only float that is not equal to itself.
    if med_str is None or (isinstance(med_str, float) and med_str != med_str):
        return []

    entries = (entry.strip() for entry in str(med_str).split(","))
    return [entry.split()[0].capitalize() for entry in entries if entry]

def extract_problems(problem_str: str) -> list:
    """Split a comma-separated problem list into trimmed, non-empty entries.

    Args:
        problem_str: Comma-separated problem text. ``None`` and float NaN
            (how pandas represents an empty CSV cell) are treated as empty.

    Returns:
        A list of problem strings, in input order; empty when there are none.
    """
    # Without this guard str() turns a missing cell into the literal text
    # "nan"/"None", which would be reported as a patient problem.
    # NaN is the only float that is not equal to itself.
    if problem_str is None or (
        isinstance(problem_str, float) and problem_str != problem_str
    ):
        return []

    entries = (entry.strip() for entry in str(problem_str).split(","))
    return [entry for entry in entries if entry]

def build_prompt(med: str, context: str, problems: list) -> str:
    """Assemble the question sent to the LLM for a single medication.

    The prompt consists of fixed instructions, the medication name, the
    supplied context text, the problem list (rendered with its default
    string form) and a closing question.

    Args:
        med: Medication name to ask about.
        context: Background text about the medication.
        problems: The patient's problems.

    Returns:
        The complete prompt as one string.
    """
    instructions = (
        "You are a clinical decision support assistant.\n"
        "Use the medication information and patient's problem list to identify which problem(s) the medication treats.\n"
        "If the medication is not in the knowledge base, reply 'I don’t know'.\n\n"
    )
    case_details = "".join(
        [
            f"Medication: {med}\n",
            f"Info: {context}\n",
            f"Patient Problems: {problems}\n",
        ]
    )
    question = "Which problem(s) from the list does this medication treat?"
    return instructions + case_details + question

def process_patient(row, chain) -> dict:
    """Ask the chain, medication by medication, which problems each one treats.

    Medications come from the row's "Outpatient_Medications" field and
    problems from "Past_Medical_History". For every medication the
    module-level ``retriever`` is queried, the hits are joined into a context
    string, and the resulting prompt is run through ``chain``. A problem
    counts as treated when its text appears, case-insensitively, in the reply.

    Args:
        row: A patient record supporting ``row["Patient_ID"]`` and ``row.get``.
        chain: Object whose ``run(prompt)`` returns the model's text reply.

    Returns:
        A dict with "Patient_ID", "Medications" and
        "Treated_Problems_by_Medication". The last maps each medication to a
        list of matched problems, or to an "Error: ..." string if the lookup
        or the model call raised.
    """
    patient_id = row["Patient_ID"]
    medications = extract_medications(row.get("Outpatient_Medications", ""))
    problem_list = extract_problems(row.get("Past_Medical_History", ""))

    treated_by_med = {}
    summary = {
        "Patient_ID": patient_id,
        "Medications": medications,
        "Treated_Problems_by_Medication": treated_by_med,
    }

    for med in medications:
        try:
            # NOTE(review): main() passes a RetrievalQA chain, which presumably
            # does its own retrieval on the prompt - confirm this extra lookup
            # is intended.
            hits = retriever.get_relevant_documents(med)
            if hits:
                context = "\n".join(hit.page_content for hit in hits)
            else:
                context = "No relevant documents found."
            answer = chain.run(build_prompt(med, context, problem_list))
            outcome = [
                problem
                for problem in problem_list
                if problem.lower() in answer.lower()
            ]
        except Exception as err:
            # Best effort: record the failure for this medication and move on.
            outcome = f"Error: {str(err)}"
        treated_by_med[med] = outcome

        # Short pause between medications.
        time.sleep(0.2)

    return summary

# --- Main ---
def main():
    """Run the medication-to-problem mapping for every patient and save it.

    Reads "chest_pain_patients.csv", uses the row index as the patient id,
    processes each row with ``process_patient`` and the module-level
    ``qa_chain``, then writes the collected results to
    "medication_problem_mapping_summary.csv".
    """
    patients = pd.read_csv("chest_pain_patients.csv")
    patients["Patient_ID"] = patients.index

    progress = tqdm(
        patients.iterrows(), total=len(patients), desc="Processing patients"
    )
    results = [process_patient(row, qa_chain) for _, row in progress]

    pd.DataFrame(results).to_csv(
        "medication_problem_mapping_summary.csv", index=False
    )
    print("Output saved to medication_problem_mapping_summary.csv")


if __name__ == "__main__":
    main()


Processing patients:   3%|▎         | 3/100 [02:32<1:22:20, 50.94s/it]

Output saved to medication_problem_mapping_summary.csv





In [None]:
# Smoke test: send one trivial prompt to confirm the OpenAI credentials work.
import os

from langchain.chat_models import ChatOpenAI

# OPENAI_API_KEY must already be set in the environment. Fail early with a
# clear message when it is absent; the previous self-assignment
# (os.environ[K] = os.environ[K]) was a no-op that only surfaced a missing key
# as a bare KeyError.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("Missing OPENAI_API_KEY")

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
response = llm.invoke("What is the capital of France?")
print(response)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}