In [17]:
# === Standard Library ===
import json
import logging
import os
import sys
import warnings
from difflib import SequenceMatcher
from pathlib import Path
# === Ontology Mapping ===
#from ontoma import OnToma
# === Web Requests and Parsing ===
import requests
from bs4 import BeautifulSoup
# === Data Handling ===
import pandas as pd
# === AI & Language Detection ===
import openai
from langdetect import detect

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Silence pronto OWL parser warnings and root logging
warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("pronto").setLevel(logging.CRITICAL)
logging.getLogger("rdflib").setLevel(logging.CRITICAL)
logging.getLogger().setLevel(logging.ERROR)

# Redirect stderr to suppress parser warnings completely
class NullWriter:
    def write(self, s): pass
    def flush(self): pass
sys.stderr = NullWriter()

# Set OnToma cache directory
os.environ["ONTOLOGY_INDEX_CACHE_DIR"] = os.path.join(Path.home(), ".ontoma_cache")

def match_disease_to_phenotype_llm(disease_term: str, tsv_path="db/clinicalAnnotations/clinical_annotations.tsv"):
    df = pd.read_csv(tsv_path, sep="\t")
    df.dropna(subset=["Phenotype(s)"], inplace=True)
    phenotypes = list(df["Phenotype(s)"].unique())

    joined_phenos = "\n".join(f"- {p}" for p in phenotypes)
    prompt = (
        f"You are a biomedical assistant. A user is searching for a disease: '{disease_term}'.\n"
        f"From the list below, identify the phenotype that most closely matches this disease:\n\n"
        f"{joined_phenos}\n\n"
        f"Only return the matching phenotype string exactly as it appears above. Do not invent or modify anything."
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        print("\n🧠 Full LLM raw response:")
        print(response)

        matched = response.choices[0].message.content.strip()

        # ✅ Exact match
        if matched in phenotypes:
            drugs = df[df["Phenotype(s)"] == matched]["Drug(s)"].unique().tolist()
            print(f"✅ LLM-selected phenotype match: '{matched}'")
            return matched, drugs

        # 🔁 Fuzzy match fallback
        similarity_scores = [(p, SequenceMatcher(None, matched, p).ratio()) for p in phenotypes]
        best_match, score = max(similarity_scores, key=lambda x: x[1])
        if score > 0.8:
            drugs = df[df["Phenotype(s)"] == best_match]["Drug(s)"].unique().tolist()
            print(f"⚠️ LLM match. Best fuzzy match: '{best_match}' (similarity: {score:.2f})")
            return best_match, drugs
        else:
            print("❌ LLM returned an unrecognized phenotype, and no fuzzy match exceeded threshold.")
            return None, []

    except Exception as e:
        print(f"❌ LLM matching failed: {e}")
        return None, []



if __name__ == "__main__":
    disease_term = input("\U0001f9e0 Enter a disease name (e.g., 'asthma'): ").strip()
    match_label, drugs = match_disease_to_phenotype_llm(disease_term)
    if drugs:
        print(f"\n🧬 Drugs matched for phenotype '{match_label}':")
        for d in drugs:
            print("-", d)
    else:
        print("No drugs found for input.")

🧠 Enter a disease name (e.g., 'asthma'):  asthma



🧠 Full LLM raw response:
ChatCompletion(id='chatcmpl-BbTuSaxfGZccp8M9LIeIru4t0ztuR', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='- Asthma', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1748272808, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_07871e2ad8', usage=CompletionUsage(completion_tokens=3, prompt_tokens=8210, total_tokens=8213, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))
⚠️ LLM match. Best fuzzy match: 'Asthma' (similarity: 0.86)

🧬 Drugs matched for phenotype 'Asthma':
- selective beta-2-adrenoreceptor agonists
- salmeterol
- salbutamol;selective beta-2-adrenoreceptor agonists
- montelukast
- corticosteroids
- salbutamol
- fluni