In [None]:
!pip install rouge_score
!pip install bert-score
!pip install git+https://github.com/AIPHES/emnlp19-moverscore.git
!sed -i 's/np\.float\b/float/g' /usr/local/lib/python3.11/dist-packages/moverscore_v2.py

In [None]:
import torch
import numpy as np
import re
import json
import requests
from fuzzywuzzy import process
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.cross_encoder import CrossEncoder
import networkx as nx

In [None]:
from umls import UMLS_API
from cross_encoder import UMLS_CrossEncoder
from fol import FOLReasoner
import ranking

import os

# Shared service objects used by the retrieval cells of this notebook.
# The UMLS key is taken from the UMLS_API_KEY environment variable when it is
# set, so the real credential does not have to be stored in the notebook; the
# placeholder remains as a fallback so the cell still runs unconfigured.
umls_api = UMLS_API(os.environ.get("UMLS_API_KEY") or "YOUR_UMLS_API_KEY_HERE")
cross_encoder = UMLS_CrossEncoder()  # scores (query, relation text) pairs in rerank()
fol_reasoner = FOLReasoner()  # applies rules to merged relations in get_umls_keys()

In [4]:
# Offline fallback lexicon: lowercase term -> (CUI, preferred name, short definition).
# process_term() consults it only when the live UMLS search returns nothing.
# NOTE(review): the CUIs below were not checked against UMLS in this notebook; at
# least one identifier is reused for unrelated entries (C0021309: etanercept and
# antiphospholipid syndrome) — verify before relying on them.
MEDICAL_TERMS = {
  # Drugs (generic and brand)
  "amoxicillin": ("C0002637", "Amoxicillin", "A broad-spectrum penicillin antibiotic used to treat various infections."),
  "ceftaroline": ("C0564627", "Ceftaroline", "A cephalosporin antibiotic effective against MRSA, used for skin and respiratory infections."),
  "doxycycline": ("C0004057", "Doxycycline", "A tetracycline antibiotic used for infections including acne, malaria prevention, and Lyme disease."),
  "vancomycin": ("C0040506", "Vancomycin", "A glycopeptide antibiotic used to treat serious gram-positive infections."),
  "azithromycin": ("C0003535", "Azithromycin", "A macrolide antibiotic used for respiratory, skin, and sexually transmitted infections."),
  "levofloxacin": ("C0149689", "Levofloxacin", "A fluoroquinolone antibiotic for various bacterial infections."),
  "fidaxomicin": ("C2821202", "Fidaxomicin", "A macrolide antibiotic primarily used to treat Clostridioides difficile colitis."),
  "dalbavancin": ("C2820913", "Dalbavancin", "A long-acting lipoglycopeptide antibiotic for acute bacterial skin infections."),
  "oritavancin": ("C2821380", "Oritavancin", "A lipoglycopeptide antibiotic used to treat Gram-positive skin infections."),
  "delafloxacin": ("C1284983", "Delafloxacin", "A fluoroquinolone antibiotic for acute bacterial skin and lung infections."),
  "tigecycline": ("C1308565", "Tigecycline", "A glycylcycline antibiotic effective against multi-drug-resistant organisms."),
  "ceftolozane": ("C0790322", "Ceftolozane", "A cephalosporin antibiotic often combined with tazobactam for resistant infections."),
  "ceftolozane/tazobactam": ("C1533118", "Ceftolozane-Tazobactam", "A combination antibiotic for complicated intra-abdominal and urinary tract infections."),

  # Antivirals and antimalarials
  "remdesivir": ("C1474153", "Remdesivir", "An antiviral drug approved for COVID-19, targeting viral RNA polymerase."),
  "molnupiravir": ("C3263273", "Molnupiravir", "An oral antiviral for COVID-19 that introduces copying errors during viral replication."),
  "nirmatrelvir": ("C3545470", "Nirmatrelvir", "A SARS-CoV-2 protease inhibitor used in combination therapy for COVID-19 (Paxlovid)."),
  "baloxavir": ("C4062026", "Baloxavir", "An antiviral approved for influenza, targeting the viral cap-dependent endonuclease."),
  "ivermectin": ("C0021485", "Ivermectin", "An antiparasitic medication used for onchocerciasis and strongyloidiasis."),
  "artesunate": ("C3462231", "Artesunate", "An artemisinin derivative used to treat severe malaria."),
  "tafenoquine": ("C3271764", "Tafenoquine", "An antimalarial drug used for relapse prevention in vivax malaria."),

  # Cancer therapies
  "ibritumomab tiuxetan": ("C0279390", "Ibritumomab Tiuxetan", "A radioimmunotherapy targeting CD20 on B-cells (Zevalin) for certain lymphomas."),
  "nivolumab": ("C1851170", "Nivolumab", "A PD-1 immune checkpoint inhibitor antibody used to treat various advanced cancers."),
  "pembrolizumab": ("C1855280", "Pembrolizumab", "A PD-1 immune checkpoint inhibitor antibody used for melanoma, lung cancer, and others."),
  "atezolizumab": ("C1622437", "Atezolizumab", "A PD-L1 immune checkpoint inhibitor antibody used for urothelial and lung cancers."),
  "durvalumab": ("C1604100", "Durvalumab", "A PD-L1 immune checkpoint inhibitor antibody used for bladder and lung cancers."),
  "avelumab": ("C1622366", "Avelumab", "A PD-L1 immune checkpoint inhibitor antibody used for Merkel cell carcinoma and others."),
  "ipilimumab": ("C1620927", "Ipilimumab", "A CTLA-4 immune checkpoint inhibitor antibody used for melanoma and renal carcinoma."),
  "ibrutinib": ("C1574738", "Ibrutinib", "A Bruton's tyrosine kinase inhibitor used in chronic lymphocytic leukemia and lymphoma."),
  "acalabrutinib": ("C1575185", "Acalabrutinib", "A second-generation BTK inhibitor for CLL and mantle cell lymphoma."),
  "lenalidomide": ("C1547141", "Lenalidomide", "An immunomodulatory drug used in multiple myeloma and myelodysplastic syndromes."),
  "pomalidomide": ("C1566210", "Pomalidomide", "An immunomodulatory drug for relapsed or refractory multiple myeloma."),
  "carfilzomib": ("C1543420", "Carfilzomib", "A proteasome inhibitor used in relapsed multiple myeloma."),
  "bortezomib": ("C0013448", "Bortezomib", "A proteasome inhibitor used in multiple myeloma and mantle cell lymphoma."),
  "blinatumomab": ("C1567949", "Blinatumomab", "A bispecific T-cell engager (CD19) used for acute lymphoblastic leukemia."),
  "tisagenlecleucel": ("C1706443", "Tisagenlecleucel", "A CD19-directed CAR T-cell therapy for refractory leukemia/lymphoma."),
  "axicabtagene ciloleucel": ("C1706475", "Axicabtagene Ciloleucel", "A CAR T-cell therapy for large B-cell lymphoma."),
  "palbociclib": ("C1567321", "Palbociclib", "A CDK4/6 inhibitor used in ER-positive, HER2-negative breast cancer."),
  "ribociclib": ("C1579765", "Ribociclib", "A CDK4/6 inhibitor for hormone receptor-positive breast cancer."),
  "abemaciclib": ("C1556827", "Abemaciclib", "A CDK4/6 inhibitor used in metastatic breast cancer."),
  "olaparib": ("C1566249", "Olaparib", "A PARP inhibitor for ovarian and breast cancer with BRCA mutation."),
  "niraparib": ("C1557761", "Niraparib", "A PARP inhibitor for ovarian cancer maintenance therapy."),
  "rucaparib": ("C1554562", "Rucaparib", "A PARP inhibitor for BRCA-mutated ovarian cancer."),
  "lenvatinib": ("C1576431", "Lenvatinib", "A multikinase inhibitor used in thyroid carcinoma and hepatocellular carcinoma."),
  "sorafenib": ("C1349073", "Sorafenib", "A multikinase inhibitor used for renal and liver cancers."),
  "sunitinib": ("C1337139", "Sunitinib", "A tyrosine kinase inhibitor used for renal cell carcinoma and GIST."),
  "regorafenib": ("C1578946", "Regorafenib", "A multi-kinase inhibitor used in colorectal cancer and GIST."),
  "cabozantinib": ("C1569040", "Cabozantinib", "A tyrosine kinase inhibitor for medullary thyroid and renal carcinomas."),

  # Biologics / Immunomodulators
  "adalimumab": ("C0021203", "Adalimumab", "A TNF-alpha inhibitor monoclonal antibody used in rheumatoid arthritis and psoriasis."),
  "infliximab": ("C0021217", "Infliximab", "A TNF-alpha inhibitor monoclonal antibody for Crohn's disease and arthritis."),
  "etanercept": ("C0021309", "Etanercept", "A TNF receptor fusion protein used in autoimmune inflammatory diseases."),
  "ustekinumab": ("C1578473", "Ustekinumab", "A monoclonal antibody against IL-12/23 (p40) used for psoriasis and psoriatic arthritis."),
  "secukinumab": ("C1576773", "Secukinumab", "An IL-17A inhibitor monoclonal antibody for psoriasis and ankylosing spondylitis."),
  "ixekizumab": ("C1555670", "Ixekizumab", "An IL-17A inhibitor monoclonal antibody used in psoriasis and psoriatic arthritis."),
  "brodalumab": ("C1540648", "Brodalumab", "An IL-17 receptor A inhibitor for severe plaque psoriasis."),
  "dupilumab": ("C1558429", "Dupilumab", "An IL-4 receptor alpha antagonist monoclonal antibody for asthma and eczema."),
  "tildrakizumab": ("C1701713", "Tildrakizumab", "An IL-23 inhibitor monoclonal antibody for psoriasis."),
  "risankizumab": ("C1701715", "Risankizumab", "An IL-23 inhibitor monoclonal antibody for plaque psoriasis."),
  "tocilizumab": ("C0034061", "Tocilizumab", "An IL-6 receptor antagonist monoclonal antibody used for rheumatoid arthritis."),
  "sarilumab": ("C1576879", "Sarilumab", "An IL-6 receptor antagonist monoclonal antibody for rheumatoid arthritis."),
  "anakinra": ("C0024085", "Anakinra", "An IL-1 receptor antagonist used for rheumatoid arthritis and autoinflammatory syndromes."),
  "belimumab": ("C1551497", "Belimumab", "A monoclonal antibody targeting B-lymphocyte stimulator (BLyS) for systemic lupus erythematosus."),
  "rituximab": ("C0035813", "Rituximab", "An anti-CD20 monoclonal antibody used in B-cell lymphomas and autoimmune diseases."),
  "vedolizumab": ("C1548296", "Vedolizumab", "An integrin inhibitor monoclonal antibody for inflammatory bowel disease."),
  "tofacitinib": ("C3554708", "Tofacitinib", "A JAK inhibitor used to treat rheumatoid arthritis and other autoimmune conditions."),
  "baricitinib": ("C3557265", "Baricitinib", "A JAK inhibitor for rheumatoid arthritis and recently for COVID-19."),
  "upadacitinib": ("C3558577", "Upadacitinib", "A selective JAK1 inhibitor used for rheumatoid arthritis."),
  "tepoxalin": ("C1544755", "Tepoxalin", "A dual COX/5-LOX inhibitor used in veterinary medicine (for context of rare NSAIDs)."),

  # Psychiatric medications
  "risperidone": ("C0005584", "Risperidone", "An atypical antipsychotic used in schizophrenia and bipolar disorder."),
  "brexpiprazole": ("C1557767", "Brexpiprazole", "A serotonin–dopamine activity modulator for schizophrenia and major depression."),
  "lurasidone": ("C1572389", "Lurasidone", "An atypical antipsychotic for schizophrenia and bipolar depression."),
  "asenapine": ("C1558209", "Asenapine", "An atypical antipsychotic administered sublingually, used for schizophrenia and bipolar disorder."),
  "esketamine": ("C1701807", "Esketamine", "An NMDA receptor antagonist nasal spray for treatment-resistant depression."),
  "cariprazine": ("C1541378", "Cariprazine", "An atypical antipsychotic for schizophrenia and bipolar mania."),

  # Endocrine and metabolic drugs
  "dapagliflozin": ("C1577772", "Dapagliflozin", "An SGLT2 inhibitor for type 2 diabetes and heart failure."),
  "empagliflozin": ("C1577771", "Empagliflozin", "An SGLT2 inhibitor for type 2 diabetes and heart failure."),
  "canagliflozin": ("C1577769", "Canagliflozin", "An SGLT2 inhibitor for type 2 diabetes, reduces cardiovascular risk."),
  "liraglutide": ("C1557395", "Liraglutide", "A GLP-1 receptor agonist for type 2 diabetes and obesity."),
  "semaglutide": ("C1557404", "Semaglutide", "A GLP-1 receptor agonist for type 2 diabetes and obesity."),
  "exenatide": ("C1546440", "Exenatide", "A GLP-1 receptor agonist for type 2 diabetes."),
  "levothyroxine": ("C0027356", "Levothyroxine", "A synthetic thyroid hormone used to treat hypothyroidism."),

  # Cardiovascular drugs
  "alirocumab": ("C1578504", "Alirocumab", "A PCSK9 inhibitor monoclonal antibody for lowering LDL cholesterol."),
  "evolocumab": ("C1578506", "Evolocumab", "A PCSK9 inhibitor monoclonal antibody to reduce cholesterol levels."),
  "apixaban": ("C1558944", "Apixaban", "An oral factor Xa inhibitor anticoagulant for atrial fibrillation and VTE."),
  "rivaroxaban": ("C1558936", "Rivaroxaban", "An oral factor Xa inhibitor anticoagulant for thrombosis and stroke prevention."),
  "dabigatran": ("C1559003", "Dabigatran", "An oral direct thrombin inhibitor anticoagulant."),
  "bempedoic acid": ("C1557861", "Bempedoic Acid", "An ATP-citrate lyase inhibitor to lower LDL cholesterol."),

  # Other notable drugs
  "linezolid": ("C0024174", "Linezolid", "An oxazolidinone antibiotic for Gram-positive infections."),
  "metronidazole": ("C0026184", "Metronidazole", "An antibiotic effective against anaerobic bacteria and protozoa."),
  "trimethoprim/sulfamethoxazole": ("C0024749", "Trimethoprim-Sulfamethoxazole", "A combination antibiotic for UTIs, pneumonia (Pneumocystis) and other infections."),
  "isoniazid": ("C0020839", "Isoniazid", "First-line antitubercular medication."),
  "rifampin": ("C0036051", "Rifampin", "A rifamycin antibiotic used in tuberculosis treatment."),
  "ethambutol": ("C0002963", "Ethambutol", "An antimycobacterial agent used in tuberculosis therapy."),
  "pyrazinamide": ("C0034149", "Pyrazinamide", "An antitubercular drug used with isoniazid and rifampin."),

  # Diseases and conditions
  "systemic lupus erythematosus": ("C0024141", "Systemic Lupus Erythematosus", "An autoimmune disease in which the immune system attacks multiple organs."),
  "rheumatoid arthritis": ("C0003873", "Rheumatoid Arthritis", "A chronic autoimmune disorder causing inflammation of joints."),
  "multiple sclerosis": ("C0026769", "Multiple Sclerosis", "A demyelinating disease of the central nervous system."),
  "amyotrophic lateral sclerosis": ("C0002736", "Amyotrophic Lateral Sclerosis", "A progressive neurodegenerative disease affecting motor neurons."),
  "guillain-barre syndrome": ("C0019321", "Guillain-Barre Syndrome", "An acute autoimmune neuropathy causing rapid muscle weakness."),
  "myasthenia gravis": ("C0027051", "Myasthenia Gravis", "A chronic autoimmune neuromuscular disease characterized by muscle weakness."),
  "huntington disease": ("C0020179", "Huntington Disease", "A genetic neurodegenerative disorder causing movement, cognitive, and psychiatric disturbances."),
  "parkinson disease": ("C0030567", "Parkinson Disease", "A neurodegenerative disorder characterized by tremor, rigidity, and bradykinesia."),
  "alzheimer disease": ("C0002395", "Alzheimer Disease", "A neurodegenerative disease causing dementia and cognitive decline."),
  "sarcoidosis": ("C0033866", "Sarcoidosis", "An inflammatory disease characterized by granulomas in multiple organs, especially lungs."),
  "systemic sclerosis": ("C0036161", "Systemic Sclerosis", "An autoimmune connective tissue disease causing skin and organ fibrosis."),
  "hashimoto thyroiditis": ("C0021642", "Hashimoto Thyroiditis", "An autoimmune thyroid disorder causing hypothyroidism."),
  "graves disease": ("C0017725", "Graves Disease", "An autoimmune disorder causing hyperthyroidism and goiter."),
  "addison disease": ("C0002651", "Addison Disease", "Primary adrenal insufficiency, often autoimmune, causing cortisol deficiency."),
  "cushing syndrome": ("C0007873", "Cushing Syndrome", "A condition caused by chronic high cortisol, often from steroids or adrenal tumor."),
  # Type 1 previously reused the type 2 identifier; C0011854 is the insulin-dependent concept.
  "diabetes mellitus type 1": ("C0011854", "Diabetes Mellitus Type 1", "An autoimmune destruction of pancreatic beta cells leading to insulin deficiency."),
  "diabetes mellitus type 2": ("C0011860", "Diabetes Mellitus Type 2", "A metabolic disorder characterized by insulin resistance and relative insulin deficiency."),
  "polycystic ovary syndrome": ("C0030305", "Polycystic Ovary Syndrome", "An endocrine disorder causing ovulatory dysfunction and hyperandrogenism."),
  "cystic fibrosis": ("C0010674", "Cystic Fibrosis", "A genetic disorder affecting chloride channels, causing thick mucus in lungs and GI tract."),
  "sickle cell anemia": ("C0023433", "Sickle Cell Anemia", "A hereditary hemoglobinopathy causing sickle-shaped red blood cells and vaso-occlusion."),
  "beta thalassemia": ("C0005842", "Beta Thalassemia", "A genetic disorder causing reduced beta-globin production and anemia."),
  "gauchers disease": ("C0027817", "Gaucher Disease", "A lysosomal storage disorder caused by glucocerebrosidase deficiency."),
  "tay sachs disease": ("C0032451", "Tay-Sachs Disease", "A lysosomal storage disorder caused by hexosaminidase A deficiency, leading to neurodegeneration."),
  "pompe disease": ("C0029363", "Pompe Disease", "A glycogen storage disorder (acid maltase deficiency) affecting heart and muscles."),
  "fabry disease": ("C0016167", "Fabry Disease", "A lysosomal storage disorder caused by alpha-galactosidase A deficiency."),
  "metachromatic leukodystrophy": ("C0025595", "Metachromatic Leukodystrophy", "A genetic disorder causing myelin sheath degeneration in nerves."),
  "amyloidosis": ("C0004491", "Amyloidosis", "A group of conditions where misfolded proteins deposit as amyloid in organs."),
  "hemochromatosis": ("C1386814", "Hemochromatosis", "An iron overload disorder that can damage liver, heart, and pancreas."),
  "wilsons disease": ("C0042377", "Wilson Disease", "A genetic disorder causing copper accumulation, leading to liver and neurological disease."),
  "phenylketonuria": ("C0037356", "Phenylketonuria", "An inherited metabolic disorder causing phenylalanine accumulation."),
  "maple syrup urine disease": ("C0521488", "Maple Syrup Urine Disease", "An inherited disorder causing branched-chain amino acid accumulation."),
  "mucopolysaccharidosis": ("C0430027", "Mucopolysaccharidosis", "A group of inherited metabolic disorders affecting glycosaminoglycan breakdown."),

  # Infectious diseases
  "tuberculosis": ("C0041296", "Tuberculosis", "A chronic infectious disease caused by Mycobacterium tuberculosis, usually affecting lungs."),
  "leprosy": ("C0024109", "Leprosy", "A chronic infection by Mycobacterium leprae affecting skin and nerves."),
  "dengue fever": ("C0019221", "Dengue Fever", "A mosquito-borne viral infection causing fever, rash, and severe joint pain."),
  "ebola hemorrhagic fever": ("C0019041", "Ebola Virus Disease", "A severe viral hemorrhagic fever with high mortality."),
  "sars": ("C0036790", "Severe Acute Respiratory Syndrome", "A viral respiratory illness caused by a coronavirus, first identified in 2003."),
  "mers": ("C3534218", "Middle East Respiratory Syndrome", "A viral respiratory disease caused by the MERS coronavirus."),
  "lyme disease": ("C0024651", "Lyme Disease", "An infectious disease caused by Borrelia burgdorferi, transmitted by ticks."),
  "zika virus infection": ("C3547502", "Zika Virus Infection", "A mosquito-borne viral disease that can cause birth defects."),
  "chikungunya": ("C0008034", "Chikungunya", "A mosquito-borne viral disease causing fever and joint pain."),
  "malaria": ("C0025289", "Malaria", "A mosquito-borne parasitic infection causing cyclical fevers and anemia."),
  "influenza": ("C0021400", "Influenza", "An acute respiratory viral infection caused by influenza viruses."),

  # Cancers (disease names)
  "acute lymphoblastic leukemia": ("C0005612", "Acute Lymphoblastic Leukemia", "A rapidly progressing cancer of lymphoid lineage, common in children."),
  "acute myeloid leukemia": ("C0022694", "Acute Myeloid Leukemia", "A rapidly progressing cancer of myeloid blood cells."),
  "chronic lymphocytic leukemia": ("C0007797", "Chronic Lymphocytic Leukemia", "A slow-growing cancer of B lymphocytes in adults."),
  "chronic myeloid leukemia": ("C0007786", "Chronic Myeloid Leukemia", "A myeloproliferative neoplasm associated with the BCR-ABL fusion gene."),
  "multiple myeloma": ("C0002645", "Multiple Myeloma", "A malignant proliferation of plasma cells in the bone marrow."),
  "hodgkin lymphoma": ("C0019204", "Hodgkin Lymphoma", "A cancer of the lymphatic system characterized by Reed-Sternberg cells."),
  "non-hodgkin lymphoma": ("C0024017", "Non-Hodgkin Lymphoma", "A diverse group of lymphoid cancers without Reed-Sternberg cells."),
  "melanoma": ("C0025202", "Melanoma", "A malignant skin tumor arising from melanocytes."),
  "breast cancer": ("C0006142", "Breast Cancer", "A malignant tumor of breast tissue, often adenocarcinoma."),
  "prostate cancer": ("C0033578", "Prostate Cancer", "A malignant tumor of the prostate gland."),
  "lung cancer": ("C0024121", "Lung Cancer", "A malignant lung tumor, commonly small cell or non-small cell carcinoma."),
  "colon cancer": ("C0007102", "Colorectal Carcinoma", "A malignant tumor of the colon or rectum."),
  "pancreatic cancer": ("C0039731", "Pancreatic Carcinoma", "A malignant tumor arising from the pancreatic exocrine cells."),
  "hepatocellular carcinoma": ("C0007109", "Hepatocellular Carcinoma", "A primary liver cancer arising from hepatocytes."),
  "glioblastoma": ("C0242339", "Glioblastoma", "A highly malignant primary brain tumor (astrocytoma)."),
  "melanoma, uveal": ("C0041431", "Uveal Melanoma", "A malignant melanoma of the eye's uveal tract."),
  "carcinoid tumor": ("C0007091", "Carcinoid Tumor", "A slow-growing neuroendocrine tumor often of the gastrointestinal tract or lungs."),

  # Other conditions
  "acute respiratory distress syndrome": ("C0003392", "Acute Respiratory Distress Syndrome", "A severe form of lung injury causing respiratory failure."),
  "acute kidney injury": ("C0002907", "Acute Kidney Injury", "A sudden decline in renal function, previously called acute renal failure."),
  "chronic kidney disease": ("C0010970", "Chronic Kidney Disease", "Long-term loss of kidney function leading to renal failure."),
  "metabolic syndrome": ("C0457199", "Metabolic Syndrome", "A cluster of conditions (hypertension, dyslipidemia, etc.) increasing cardiovascular risk."),
  "polymyalgia rheumatica": ("C0022810", "Polymyalgia Rheumatica", "An inflammatory syndrome causing muscle pain and stiffness in older adults."),
  "temporal arteritis": ("C0041832", "Temporal Arteritis", "An inflammatory disease of large blood vessels (giant cell arteritis) often causing headache."),
  "factor v leiden thrombophilia": ("C1866765", "Factor V Leiden Thrombophilia", "A genetic mutation causing hypercoagulability due to Factor V resistance."),
  "paroxysmal nocturnal hemoglobinuria": ("C0205383", "Paroxysmal Nocturnal Hemoglobinuria", "An acquired hematopoietic stem cell disorder causing hemolysis."),
  "antiphospholipid syndrome": ("C0021309", "Antiphospholipid Syndrome", "An autoimmune disorder causing thrombosis due to antibodies against phospholipids."),
  "hemophagocytic lymphohistiocytosis": ("C0027653", "Hemophagocytic Lymphohistiocytosis", "An aggressive immune activation syndrome causing fever and cytopenias."),
}


In [5]:
from concurrent.futures import ThreadPoolExecutor
import json
import re
import redis

# Initialise the Redis client used by the cached_* UMLS lookup helpers.
# NOTE(review): host, port and db are hard-coded to a local instance — confirm for other environments.
r = redis.Redis(host='localhost', port=6379, db=0)

# Lifetime of a cached UMLS response in Redis: 12 hours.
_CACHE_TTL_SECONDS = 43200


def _cached_umls_call(prefix, arg, fetch):
    """Return ``fetch(arg)``, memoised in Redis under the key ``"{prefix}:{arg}"``.

    The cache is strictly best-effort: a failed read is treated as a miss and a
    failed write is reported and ignored. ``fetch`` is invoked at most once per
    call, and any exception it raises propagates to the caller.

    Args:
        prefix: Namespace part of the Redis key.
        arg: Lookup argument; also the second part of the Redis key.
        fetch: One-argument callable that performs the real lookup.

    Returns:
        The cached value (JSON-decoded, so tuples come back as lists) or the
        fresh result of ``fetch(arg)``.
    """
    key = f"{prefix}:{arg}"

    try:
        cached = r.get(key)
        if cached:
            return json.loads(cached)
    except Exception as e:
        # Unreachable Redis or an undecodable entry: fall through to the API.
        print(f"Cache read failed for {key}: {e}")

    result = fetch(arg)

    try:
        r.set(key, json.dumps(result), ex=_CACHE_TTL_SECONDS)
    except Exception as e:
        # Keep the result we already have instead of querying the API again.
        print(f"Cache write failed for {key}: {e}")

    return result


def cached_search_cui(term):
    """Return ``umls_api.search_cui(term)``, cached in Redis for 12 hours."""
    return _cached_umls_call("search_cui", term, umls_api.search_cui)


def cached_get_definitions(cui):
    """Return ``umls_api.get_definitions(cui)``, cached in Redis for 12 hours."""
    return _cached_umls_call("definitions", cui, umls_api.get_definitions)


def cached_get_relations(cui):
    """Return ``umls_api.get_relations(cui)``, cached in Redis for 12 hours."""
    return _cached_umls_call("relations", cui, umls_api.get_relations)

In [6]:
from ner import MedicalNERLLM

# Expose the NER model's spelling corrector as a plain function; process_term() calls it on every extracted term.
# NOTE(review): this builds a MedicalNERLLM instance only to borrow one method, and a separate instance is created later for prediction — consider sharing a single instance.
correct_spelling = MedicalNERLLM().correct_spelling

In [7]:
from itertools import combinations

def process_term(key, query):
    """Resolve one extracted medical term to a UMLS concept and its relations.

    The term is spell-corrected and looked up through the cached UMLS search.
    When the search yields nothing, the local ``MEDICAL_TERMS`` lexicon is
    tried (case-insensitively) as a fallback.

    Args:
        key: Raw term text produced by the term-extraction step.
        query: The user question; forwarded to the relation ranker.

    Returns:
        ``None`` if the term cannot be resolved, otherwise the 5-tuple
        ``(cui, name, definition, ranked_relations, relations)``. Both
        relation lists are empty for lexicon fallbacks and for concepts that
        have no relations, so the caller can always unpack five values.
    """
    corrected_key = correct_spelling(key)
    cuis = cached_search_cui(corrected_key)

    if not cuis:
        # MEDICAL_TERMS keys are lowercase, so normalise before the lookup.
        lookup = corrected_key.strip().lower() if isinstance(corrected_key, str) else corrected_key
        fallback = MEDICAL_TERMS.get(lookup)
        if fallback is None:
            return None
        cui, name, defi = fallback
        return cui, name, defi, [], []

    cui, name = cuis[0][0], cuis[0][1]

    # Take the first definition that comes from one of the preferred vocabularies.
    preferred_sources = ("MSH", "NCI", "ICF", "CSP", "HPO")
    defi = ""
    for definition in cached_get_definitions(cui) or []:
        if definition.get("rootSource") in preferred_sources:
            defi = definition.get("value") or ""
            break

    relations = cached_get_relations(cui) or []

    # Personalised-PageRank ranking; ranking.similarity_score and
    # ranking.MMR_reranking are the alternatives that were tried here.
    rank_rels = (
        ranking.ppr_ranking(query, relations, corrected_key, top_k=15)
        if relations
        else []
    )

    return cui, name, defi, rank_rels, relations

def rerank(query, relations, top_k=10):
    """Rank relation triples against the query with the cross-encoder.

    Args:
        query: The user question.
        relations: Iterable of relation dicts with the keys
            ``relatedFromIdName``, ``additionalRelationLabel`` and
            ``relatedIdName``. Missing or ``None`` values are treated as "".
        top_k: Maximum number of triples to return (default 10).

    Returns:
        Up to ``top_k`` ``(head, relation label, tail)`` tuples, best score
        first, with underscores in the label replaced by spaces. An empty list
        is returned for empty input or when scoring fails.
    """
    if not relations:
        return []

    # Build each triple once; it is used both as scoring text and as output.
    triples = [
        (
            rel.get("relatedFromIdName") or "",
            (rel.get("additionalRelationLabel") or "").replace("_", " "),
            rel.get("relatedIdName") or "",
        )
        for rel in relations
    ]
    texts = [f"{head} {label} {tail}" for head, label, tail in triples]

    try:
        scores = cross_encoder.score(query, texts)
        ranked = sorted(zip(scores, triples), key=lambda pair: pair[0], reverse=True)
    except Exception as e:
        print(f"Error during reranking: {e}")
        return []

    return [triple for _, triple in ranked[:top_k]]

#### Check Medical NER
1. Aspirin should be taken twice daily to prevent blood clots
2. Paracetamol is commonly used to reduce fever in children
3. She complained of severe abdominal pain and nausea.

In [None]:
# Smoke-test the NER model on the third example sentence from the note above.
# `medical_ner_llm` is also the model object handed to get_umls_keys() and
# generate_answer() in later cells.
medical_ner_llm = MedicalNERLLM()
question = 'She complained of severe abdominal pain and nausea.'
medical_ner_llm.predict(question)

In [9]:
def get_umls_keys(query, prompt, llm):
    """Build the UMLS context block for a question.

    Steps: extract medical terms with ``llm``, resolve each term through
    ``process_term`` in a small thread pool, derive extra triples by applying
    the FOL rules to every pair of concepts, rerank all relations against the
    question, and format concept definitions plus the top triples as text.

    Args:
        query: The user question.
        prompt: Prompt template; every ``{question}`` is replaced by ``query``.
        llm: Object whose ``predict(prompt)`` returns the extracted terms,
            either as an iterable of strings or as one comma/newline separated
            string.

    Returns:
        The context text, or ``""`` when extraction fails or no term resolves.
    """
    prompt = prompt.replace("{question}", query)

    try:
        keys_text = llm.predict(prompt)
        print("Medical terms: ", keys_text)
    except Exception as e:
        print(f"Error during model processing: {e}")
        return ""

    # A bare string would otherwise be iterated character by character.
    if isinstance(keys_text, str):
        keys = [part.strip() for part in keys_text.replace("\n", ",").split(",")]
        keys = [part for part in keys if part]
    else:
        keys = list(keys_text or [])

    def _resolve(key):
        # One unresolvable term must not abort the lookup of the others.
        try:
            return process_term(key, query)
        except Exception as e:
            print(f"Error while processing term {key!r}: {e}")
            return None

    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(_resolve, keys))

    umls_res = {}
    for result in results:
        if not result:
            continue
        if len(result) == 2:
            # Fallback shape ``(cui, {"name", "definition", "rels"})``.
            cui, info = result
            name = info.get("name", "")
            defi = info.get("definition", "")
            rank_rels, relations = info.get("rels") or [], []
        else:
            cui, name, defi, rank_rels, relations = result
        umls_res[cui] = {"name": name,
                         "defi": defi,
                         "rank-rels": rank_rels,
                         "relations": list(relations or [])}

    if not umls_res:
        return ""

    per_concept_relations = [entry["relations"] for entry in umls_res.values()]

    # Apply the FOL rules to the merged relations of every pair of concepts,
    # keeping at most ten derived triples per pair.
    inferred = []
    for left, right in combinations(per_concept_relations, 2):
        derived = fol_reasoner.apply_rules_to_kg(left + right)
        inferred.extend((derived or [])[:10])

    input_reranks = [rel for rels in per_concept_relations for rel in rels]
    input_reranks.extend(
        {"relatedFromIdName": triple[0],
         "additionalRelationLabel": triple[1],
         "relatedIdName": triple[2]}
        for triple in inferred
    )

    print("Before rerank: ", input_reranks[:20])

    result_rerank = rerank(query, input_reranks)

    context_lines = [
        f"Name: {entry.get('name', '')}\nDefinition: {entry.get('defi') or 'No definition available.'}\n"
        for entry in umls_res.values()
    ]
    context_lines.extend(
        f"({triple[0] or ''},{triple[1] or ''},{triple[2] or ''})"
        for triple in result_rerank
        if triple
    )

    return "\n".join(context_lines)


In [None]:
# End-to-end check of the retrieval step on an English question.
# The raw question doubles as the prompt: it contains no "{question}"
# placeholder, so get_umls_keys() hands it to the model unchanged.
query = "How does obesity contribute to type 2 diabetes in individuals with a sedentary lifestyle"
prompt = query

result = get_umls_keys(query, prompt, medical_ner_llm)
print(result)

In [11]:
# Test translation
# Vietnamese clinical sentences used as inputs when trying out the translator.
translation_test_1 = "Mặc dù bệnh nhân đã được điều trị bằng phác đồ kháng sinh phổ rộng kết hợp với corticosteroid liều cao, tình trạng suy hô hấp cấp vẫn tiến triển nặng, buộc phải đặt nội khí quản và thở máy xâm nhập trong vòng 24 giờ kể từ khi nhập viện."
translation_test_2 = "Sau khi xuất hiện các triệu chứng như ho kéo dài, sụt cân và đau ngực, bệnh nhân được bác sĩ chỉ định chụp X-quang phổi và xét nghiệm đờm để kiểm tra lao phổi."
translation_test_3 = "Người đàn ông 58 tuổi có tiền sử tăng huyết áp và đái tháo đường được chuyển đến bệnh viện trong tình trạng khó thở, nhịp tim nhanh và huyết áp tụt đột ngột."

In [None]:
from translation import EnViT5Translator

# Translator instance; later cells call translator.translate(...) on Vietnamese text.
translator = EnViT5Translator()

In [None]:
# Translate one of the Vietnamese test sentences and show the result.
# Swap in translation_test_2 or translation_test_3 to try the other inputs.
query = translation_test_1
# NOTE(review): [4:] drops the first four characters of the translator output —
# presumably a language prefix such as "en: "; confirm against EnViT5Translator.
query = translator.translate(query)[4:]
print(query)

In [14]:
import os

# OpenAI API key passed to generate_answer() / chat_with_openai().
# It is read from the OPENAI_API_KEY environment variable when that is set, so
# the secret does not have to be stored in the notebook; the placeholder is
# kept as the fallback value.
token = os.environ.get("OPENAI_API_KEY") or 'YOUR_OPENAI_API_KEY_HERE'

In [15]:
import openai

def chat_with_openai(api_key, prompt, model="gpt-4o-mini-2024-07-18", temperature=0.7):
    """Send a single-turn user prompt to an OpenAI chat model.

    Args:
        api_key: OpenAI API key used to create the client.
        prompt: Text sent as the only user message.
        model: Chat model identifier.
        temperature: Sampling temperature forwarded to the API (default 0.7).

    Returns:
        The text of the first choice. ``""`` is returned when the client
        cannot be created, the request fails, or the reply has no text, so
        callers always receive a string.
    """
    try:
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        # `content` can be None; normalise to the documented string result.
        return response.choices[0].message.content or ""
    except Exception as e:
        print(f"Error during OpenAI request: {e}")
        return ""


def generate_answer(query, llm, context, token):
    """Answer a patient question with GPT, optionally grounded in UMLS context.

    Args:
        query: The patient question.
        llm: Term-extraction model forwarded to ``get_umls_keys``.
        context: Truthy to retrieve UMLS context for the question; falsy to
            answer without retrieval.
        token: OpenAI API key.

    Returns:
        The model's answer as returned by ``chat_with_openai``.
    """
    umls_context = get_umls_keys(query, "{question}", llm) if context else ""

    # Use the same marker as Example 2 of the prompt when nothing was
    # retrieved, rather than leaving the Context section blank.
    context_block = umls_context or "- No specific medical context provided."

    prompt = f"""
            You are a physician answering a patient's question based on the provided medical context. Follow these instructions:
            1. Use the context (including definitions and relations) to answer the question accurately.
            2. If the context lacks sufficient information, state that clearly and provide a general answer based on standard medical knowledge, but avoid speculation.
            3. Return the answer in Vietnamese.

            ### Example 1
            Context: 
                - CUI: C0020538  
                - Definition: Hypertension is a medical condition characterized by persistently high blood pressure in the arteries.  
                - Relation: Hypertension -> treated_by -> Metoprolol.  
            Patient: 
                Thuốc nào điều trị tăng huyết áp?  
            Physician: 
                Thuốc Metoprolol được sử dụng để điều trị tăng huyết áp dựa trên thông tin y khoa cung cấp.

            ### Example 2
            Context: 
                - No specific medical context provided.  
            Patient: 
                Thuốc nào điều trị đau đầu?  
            Physician: 
                Thông tin y khoa cung cấp không đủ để xác định thuốc điều trị đau đầu. Tuy nhiên, theo kiến thức y khoa chung, các loại thuốc như paracetamol hoặc ibuprofen thường được sử dụng để giảm đau đầu, nhưng bạn nên tham khảo ý kiến bác sĩ để được tư vấn phù hợp.

            ### Actual Task
            Context: 
                {context_block} 
            Patient: 
                {query} 
            Physician:
        """

    print("Prompt: ", prompt)

    answer = chat_with_openai(token, prompt)
    return answer

In [16]:
# Test generate by LLMs
# Vietnamese patient questions used as inputs for the answer-generation check.
generate_test_1 = "Bệnh cao huyết áp có thể dẫn đến những biến chứng nguy hiểm nào nếu không điều trị?"
generate_test_2 = "Những dấu hiệu nào cho thấy người bệnh có thể đang bị suy thận mãn tính?"
generate_test_3 = "Phương pháp chẩn đoán nào thường được sử dụng để phát hiện ung thư gan giai đoạn đầu?"
generate_test_4 = "Chế độ ăn uống như thế nào là phù hợp cho người bị mỡ máu cao?"
generate_test_5 = "Thuốc Paracetamol có thể dùng để hạ sốt ở trẻ em bao nhiêu lần một ngày?"
generate_test_6 = "Việc tiêm vaccine HPV có tác dụng phòng ngừa loại ung thư nào?"

In [None]:
# Full pipeline on one Vietnamese question: translate it, retrieve UMLS context
# (context=True) and ask the OpenAI model for an answer.
query = generate_test_5
# NOTE(review): [4:] drops the first four characters of the translator output — presumably a language prefix; confirm against EnViT5Translator.
query = translator.translate(query)[4:]
print(query)
answer = generate_answer(query, medical_ner_llm, context=True, token=token)
print("Answer by GPT-4o-mini:", answer, "\n\n")

In [None]:
import os
from pathlib import Path

import pandas as pd

# The project root can be overridden with the KG_RANK_ROOT environment variable
# so the notebook is not tied to one machine; the default is the original
# hard-coded location, so existing setups keep working unchanged.
KG_RANK_ROOT = Path(os.environ.get("KG_RANK_ROOT", r"E:\Projects\HealthcareChatbot\KG-Rank"))

# Load the evaluation CSV into a DataFrame.
data_vi = pd.read_csv(KG_RANK_ROOT / "data" / "200_good_vimedaqa.csv")

# Keep the first row as a single worked example and display it.
sample = data_vi.iloc[0]
sample

In [None]:
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from moverscore_v2 import word_mover_score, get_idf_dict
import transformers
# Restrict transformers logging to the error level.
transformers.logging.set_verbosity_error()

# Build the ROUGE scorer (ROUGE-L only, stemming enabled).
# Note: this rebinds `rouge_scorer` from the imported module to the scorer instance.
rouge_scorer = rouge_scorer.RougeScorer(
    rouge_types=['rougeL'],
    use_stemmer=True,
)

In [None]:
from transformers import pipeline

# Build the NER pipeline; the same checkpoint id is used for model and tokenizer,
# and aggregation_strategy="simple" asks the pipeline to return grouped entities
# (each result exposes an "entity_group" key, read below).
# NOTE(review): this checkpoint looks like an entity-normalisation encoder rather
# than a fine-tuned token classifier; if so the classification head is randomly
# initialised and the labels are not meaningful — confirm on the model card.
ner_pipeline = pipeline(
    task="ner",
    model="dmis-lab/biosyn-biobert-bc5cdr-disease",
    tokenizer="dmis-lab/biosyn-biobert-bc5cdr-disease",
    aggregation_strategy="simple",
)

# Question (already translated with EnViT5).
text = "How is follicular lymphoma treated?"

# Run NER and print (surface form, label) pairs.
entities = ner_pipeline(text)
word_label_pairs = [(found["word"], found["entity_group"]) for found in entities]
print(word_label_pairs)
# Output recorded by the author: [('follicular lymphoma', 'DISEASE')]

In [None]:
# Single-sample evaluation: translate the question, generate an answer with the
# LLM, and score it against the translated reference answer with ROUGE-L,
# BERTScore and MoverScore.

# NOTE(review): assumes the CSV row stores the question under a 'question'
# column (only 'answer' is referenced in this notebook) — confirm against the CSV.
# The [4:] slices drop the first four characters of the translator output,
# presumably a language tag such as "en: " — confirm.
query = translator.translate(sample['question'])[4:]
print("Translated query: ", query)

# `sample` is the row selected from data_vi (there is no `row` variable in this cell).
ground_truth = translate_vi_to_en(sample['answer'])[4:]

predicted_answer = generate_answer(query, medical_ner_llm, context=True, token=token)
answer = predicted_answer  # keep the `answer` name bound as well

# The score/answer accumulators are not created in this cell; create any that are
# missing so the cell runs on its own, without discarding values collected earlier.
for _name in ("rouge_l_scores", "bert_scores", "mover_scores", "predicted_answers"):
    globals().setdefault(_name, [])

if (ground_truth is None or predicted_answer is None or
        not ground_truth.strip() or not predicted_answer.strip()):
    # Unusable sample: record zeros for every metric instead of scoring.
    print("Warning: Skipping sample due to None or empty value.")
    rouge_l_scores.append(0.0)
    bert_scores.append(0.0)
    mover_scores.append(0.0)
    print("ROUGE-L F1: 0.0000 | BERTScore F1: 0.0000 | MoverScore: 0.0000\n")
else:
    predicted_answers.append(predicted_answer)

    # ROUGE-L (F-measure); the scorer takes (target, prediction).
    rouge_result = rouge_scorer.score(ground_truth, predicted_answer)
    rouge_l_f1 = rouge_result['rougeL'].fmeasure
    rouge_l_scores.append(rouge_l_f1)

    # BERTScore (F1); candidates first, references second.
    _, _, F1 = bert_score(
        [predicted_answer],
        [ground_truth],
        lang="en",
        model_type="roberta-large"
    )
    bert_f1 = F1.item()
    bert_scores.append(bert_f1)

    # MoverScore; references first, hypotheses second.
    # NOTE(review): get_idf_dict is given a single string here, while the
    # MoverScore README passes a list of sentences — confirm this is intended.
    idf_hyp = get_idf_dict(predicted_answer)
    idf_ref = get_idf_dict(ground_truth)
    mover = word_mover_score(
        [ground_truth], [predicted_answer],
        idf_ref, idf_hyp,
        stop_words=[], n_gram=1,
        remove_subwords=True,
        batch_size=1,
        device='cuda'  # run on the GPU
    )
    mover_value = mover[0]
    mover_scores.append(mover_value)

    # Report the three metrics for this sample.
    print(f"ROUGE-L F1: {rouge_l_f1:.4f} | BERTScore F1: {bert_f1:.4f} | MoverScore: {mover_value:.4f}\n")


#### Đánh giá chất lượng mô hình EnViT5 qua MoverScore

In [None]:
import os
from pathlib import Path

import pandas as pd

# The project root can be overridden with the KG_RANK_ROOT environment variable
# so the notebook is not tied to one machine; the default is the original
# hard-coded location, so existing setups keep working unchanged.
KG_RANK_ROOT = Path(os.environ.get("KG_RANK_ROOT", r"E:\Projects\HealthcareChatbot\KG-Rank"))
ENVIT5_DATA_DIR = KG_RANK_ROOT / "EnViT5-fine-tuned" / "data"

# Read each file line by line, stripping the trailing newline and surrounding whitespace.
with open(ENVIT5_DATA_DIR / "test.vi.new.txt", encoding="utf-8") as f:
    vi_sentences = [line.strip() for line in f]

with open(ENVIT5_DATA_DIR / "test.en.new.txt", encoding="utf-8") as f:
    en_sentences = [line.strip() for line in f]

# The two files are paired by line number, so they must have the same length;
# fail with an explicit message rather than pandas' generic length error.
if len(vi_sentences) != len(en_sentences):
    raise ValueError(
        f"Parallel files differ in length: {len(vi_sentences)} vi lines vs "
        f"{len(en_sentences)} en lines"
    )

# Build the parallel DataFrame (one row per sentence pair).
df = pd.DataFrame({"vi": vi_sentences, "en": en_sentences})

df.head(2)

In [None]:
# Translate the first Vietnamese sentence and show source, translation and reference.
# NOTE(review): unlike other cells, the translator output is not sliced with [4:]
# here — confirm whether a language-tag prefix should be removed before scoring.
first_pair = df.iloc[0]
predicted_answer = translator.translate(first_pair['vi'])
ground_truth = first_pair['en']

print(first_pair['vi'], predicted_answer, ground_truth, sep="\n")

In [None]:
# MoverScore for the pair held in `predicted_answer` / `ground_truth`.
# The IDF dictionaries are built from the same two sentences that are scored;
# previously they were built from a different row (index 1) than the scored pair.
# NOTE(review): get_idf_dict is given a single string here, while the MoverScore
# README passes a list of sentences — confirm this is intended.
idf_hyp = get_idf_dict(predicted_answer)
idf_ref = get_idf_dict(ground_truth)
mover = word_mover_score(
    [ground_truth], [predicted_answer],
    idf_ref, idf_hyp,
    stop_words=[], n_gram=1,
    remove_subwords=True,
    batch_size=1,
    device='cuda'  # run on the GPU
)
mover_value = mover[0]
print(mover_value)

In [None]:
def average_moverscore(df, translator, get_idf_dict, word_mover_score, device='cuda'):
    """Return the mean MoverScore of translated 'vi' sentences against 'en' references.

    Args:
        df: DataFrame with a 'vi' column (source sentences) and an 'en' column
            (reference sentences), paired row by row.
        translator: object whose ``translate(sentence)`` returns the hypothesis text.
        get_idf_dict: callable building an IDF dictionary; called once for the
            hypothesis and once for the reference of every row.
        word_mover_score: callable with the MoverScore signature
            ``(refs, hyps, idf_ref, idf_hyp, stop_words, n_gram, remove_subwords,
            batch_size, device)`` returning a sequence of scores.
        device: device string forwarded to ``word_mover_score``.

    Returns:
        The arithmetic mean of the per-row scores, or NaN when ``df`` has no
        rows (instead of raising ZeroDivisionError).
    """
    n = len(df)
    if n == 0:
        # Nothing to average; NaN signals "undefined" without crashing the notebook.
        return float("nan")

    total_score = 0
    # Iterate the two columns directly rather than materialising each row with iloc.
    for vi_sentence, en_sentence in zip(df['vi'], df['en']):
        # Translate the Vietnamese sentence; the translation is the hypothesis.
        # NOTE(review): other cells strip the first 4 characters of the translator
        # output; here it is scored as returned — confirm which is intended.
        translated_vi = translator.translate(vi_sentence)

        # NOTE(review): each IDF dict is built from a single string; the MoverScore
        # README passes a list of sentences — confirm this is intended.
        idf_hyp = get_idf_dict(translated_vi)
        idf_ref = get_idf_dict(en_sentence)

        mover = word_mover_score(
            [en_sentence], [translated_vi],
            idf_ref, idf_hyp,
            stop_words=[],
            n_gram=1,
            remove_subwords=True,
            batch_size=1,
            device=device
        )

        # One reference/hypothesis pair per call, so the first score is the only one.
        total_score += mover[0]

    return total_score / n


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the tokenizer and a token-classification model from the same checkpoint.
# NOTE(review): this checkpoint appears to be a pretrained BlueBERT encoder; if it
# ships no fine-tuned token-classification head, the head is randomly initialised
# and the printed labels/scores are not meaningful — confirm on the model card.
# NOTE(review): this rebinds `tokenizer`, `model`, `ner_pipeline` and `text`;
# verify no other cell relies on their previous values.
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Build the NER pipeline from the loaded model and tokenizer.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Sample sentence for the experiment.
text = "The patient was diagnosed with diabetes and prescribed metformin."

# Run NER and print one line per detected entity.
results = ner_pipeline(text)
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")