In [None]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import re
import faiss
import openai

In [2]:
def get_service_ticket(api_key, service="http://umlsks.nlm.nih.gov", timeout=10):
    """Request a service ticket (ST) from the NLM UTS login endpoint.

    Two requests are made. The API key is posted to the ``api-key`` endpoint,
    whose HTML reply carries the ticket-granting-ticket (TGT) URL in a form
    ``action`` attribute. ``service`` is then posted to that URL and the body
    of the reply is the service ticket.

    Args:
        api_key: UMLS API key.
        service: Service identifier the ticket is requested for.
        timeout: Seconds to wait on each HTTP request.

    Returns:
        The service ticket with surrounding whitespace removed, so it can be
        placed in a query string as-is.

    Raises:
        ValueError: If ``api_key`` is empty, if no TGT URL is found in the
            first reply, or if the second request does not return a ticket.
    """
    if not api_key:
        raise ValueError("api_key is required to request a service ticket")

    tgt_resp = requests.post(
        "https://utslogin.nlm.nih.gov/cas/v1/api-key",
        data={"apikey": api_key},
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        timeout=timeout,
    )
    tgt_match = re.search(r'action="(.+?)"', tgt_resp.text)
    if not tgt_match:
        raise ValueError("TGT 获取失败")

    st_resp = requests.post(
        tgt_match.group(1),
        data={"service": service},
        timeout=timeout,
    )
    ticket = st_resp.text.strip()
    # An error reply must not be handed back to the caller as if it were a ticket.
    if st_resp.status_code != 200 or not ticket:
        raise ValueError(f"service ticket request failed (HTTP {st_resp.status_code})")
    return ticket

In [3]:
from dotenv import load_dotenv
import os
# Load variables from the local .env file; the UMLS key is read from NX_API_KEY.
load_dotenv()
API_KEY = os.getenv("NX_API_KEY")
if not API_KEY:
    raise ValueError("NX_API_KEY is not set; add it to the environment or the .env file")

# Request a ticket and look up concept C0004057.
# The ticket is stripped and passed through `params` so it is URL-encoded
# instead of being pasted raw into the query string.
ticket = get_service_ticket(API_KEY).strip()
url = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C0004057"
response = requests.get(url, params={"ticket": ticket}, timeout=10)
# Fail with the HTTP status instead of a JSON decoding error on an error page.
response.raise_for_status()
print(response.json())

{'pageSize': 25, 'pageNumber': 1, 'pageCount': 1, 'result': {'ui': 'C0004057', 'name': 'aspirin', 'dateAdded': '09-30-1990', 'majorRevisionDate': '12-03-2024', 'classType': 'Concept', 'suppressible': False, 'status': 'R', 'semanticTypes': [{'name': 'Organic Chemical', 'uri': 'https://uts-ws.nlm.nih.gov/rest/semantic-network/2025AA/TUI/T109'}, {'name': 'Pharmacologic Substance', 'uri': 'https://uts-ws.nlm.nih.gov/rest/semantic-network/2025AA/TUI/T121'}], 'atoms': 'https://uts-ws.nlm.nih.gov/rest/content/2025AA/CUI/C0004057/atoms', 'definitions': 'https://uts-ws.nlm.nih.gov/rest/content/2025AA/CUI/C0004057/definitions', 'relations': 'https://uts-ws.nlm.nih.gov/rest/content/2025AA/CUI/C0004057/relations', 'defaultPreferredAtom': 'https://uts-ws.nlm.nih.gov/rest/content/2025AA/CUI/C0004057/atoms/preferred', 'atomCount': 168, 'cvMemberCount': 0, 'attributeCount': 0, 'relationCount': 300}}


In [4]:
import re

# First half of the ticket flow, shown step by step: post the API key to the
# login endpoint and read the TGT URL out of the reply.
tgt_url = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
tgt_response = requests.post(
    tgt_url,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    data={"apikey": API_KEY},
)

# The TGT URL is the value of the form's action attribute in the returned HTML.
match = re.search(r'action="(.+?)"', tgt_response.text)
if match:
    TGT = match.group(1)
else:
    TGT = None

# NOTE(review): posting to this URL yields service tickets, so the printed value
# presumably acts as a credential; consider clearing this output before sharing.
print("TGT:", TGT)


TGT: https://utslogin.nlm.nih.gov/cas/v1/api-key/TGT-7593-44caln4045mdbh3txe-cas


In [5]:
import requests
import re

# Step 1: obtain the TGT. The HTML reply embeds the TGT URL in a form action.
api_key = API_KEY
tgt_url = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
tgt_response = requests.post(
    tgt_url,
    data={"apikey": api_key},
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    timeout=10,
)
tgt_match = re.search(r'action="(.+?)"', tgt_response.text)
if tgt_match is None:
    # Without this guard a reply that has no form action fails below with an
    # AttributeError on None instead of a readable message.
    raise ValueError("TGT 获取失败")
tgt_location = tgt_match.group(1)

# Step 2: exchange the TGT for a service ticket (ST).
service = "http://umlsks.nlm.nih.gov"
st_response = requests.post(tgt_location, data={"service": service}, timeout=10)
service_ticket = st_response.text.strip()

print("Service Ticket:", service_ticket)


Service Ticket: ST-32658-qlrn40cgmdbh3vtg-cas


In [6]:
import pandas as pd
import requests
import re
import time

### Step 1: define the column names (per the official definition)

# NOTE(review): "EXTRA" presumably absorbs the empty field produced by each
# line's trailing "|" - confirm against the RRF files.
conso_cols = [
    "RXCUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "RXAUI", "SAUI", "SCUI", "SDUI",
    "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS", "CVF", "EXTRA"
]
sat_cols = [
    "RXCUI", "LUI", "SUI", "RXAUI", "STYPE", "CODE", "ATUI", "SATUI",
    "ATN", "SAB", "ATV", "SUPPRESS", "CVF", "EXTRA"
]

### Step 2: load the RRF files

# The original absolute path stays as the default; set RXNORM_RRF_DIR to point
# the notebook at another copy of the release without editing this cell.
rrf_dir = os.environ.get(
    "RXNORM_RRF_DIR",
    "/Users/dylanli/Downloads/RxNorm_weekly_prescribe_07162025/rrf",
)
conso = pd.read_csv(os.path.join(rrf_dir, "RXNCONSO.RRF"), sep="|", header=None, names=conso_cols, usecols=range(19), dtype=str, engine="python")
sat = pd.read_csv(os.path.join(rrf_dir, "RXNSAT.RRF"), sep="|", header=None, names=sat_cols, usecols=range(14), dtype=str, engine="python")

### Step 3: keep English rows only (no preferred-name filter is applied here)

conso = conso[(conso["LAT"] == "ENG")]

print(conso.head())

conso = conso[["RXCUI", "STR", "TTY", "CODE", "SAB"]].drop_duplicates()

### Step 4: keep descriptive attributes (ATN contains description/display/text)

sat_filtered = sat[sat["ATN"].str.lower().str.contains("description|display|text", na=False)]
sat_filtered = sat_filtered[["RXCUI", "ATV"]].drop_duplicates()

### Step 5: merge into the base knowledge table

# Left join: every concept row is kept, with ATV missing when there is no match.
kb = pd.merge(conso, sat_filtered, on="RXCUI", how="left")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("Database", exist_ok=True)
kb.to_csv("Database/rxnorm_knowledge_base.csv", index=False)

print("✅ 步骤一完成：已生成 rxnorm_knowledge_base.csv")

### Step 6: call the RxNav API to fetch extra properties (description, synonym, ...) for each RXCUI

def fetch_rxcui_properties(rxcui):
    """Fetch selected RxNav properties for one RXCUI as a single text string.

    Queries the ``allProperties`` endpoint and keeps only the properties whose
    name, compared case-insensitively, is one of: synonym, display name,
    description, consumer friendly text, va class.

    Args:
        rxcui: Concept identifier, interpolated into the request URL.

    Returns:
        ``"name: value"`` pairs joined by ``" | "``. An empty string is
        returned when the request fails, the status is not 200, the body is
        not valid JSON, or no property matches.
    """
    wanted = ("synonym", "display name", "description", "consumer friendly text", "va class")
    url = f"https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/allProperties.json?prop=all"
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return ""
        data = resp.json()
    except (requests.RequestException, ValueError):
        # Best-effort enrichment: network and JSON-decoding failures yield no
        # text. Only these are caught, so KeyboardInterrupt and programming
        # errors are no longer swallowed.
        return ""

    # Tolerate a missing or null group / concept list instead of relying on a
    # catch-all handler.
    group = data.get("propConceptGroup") if isinstance(data, dict) else None
    concepts = group.get("propConcept") if isinstance(group, dict) else None
    if not isinstance(concepts, list):
        return ""

    chunks = []
    for prop in concepts:
        if not isinstance(prop, dict):
            continue
        name = prop.get("propName", "")
        value = prop.get("propValue", "")
        if isinstance(name, str) and name and value and name.lower() in wanted:
            chunks.append(f"{name}: {value}")
    return " | ".join(chunks)

# Fetch the supplemental info once per distinct RXCUI, pausing after each
# request. kb holds many rows per RXCUI, so mapping row by row would repeat the
# same request, and a single sleep after the whole map would throttle nothing.
supplemental_by_rxcui = {}
for rxcui in kb["RXCUI"].dropna().unique():
    supplemental_by_rxcui[rxcui] = fetch_rxcui_properties(rxcui)
    time.sleep(0.2)  # throttle consecutive RxNav requests

# Rows whose RXCUI is missing get NaN, which the chunk builder renders as "".
kb["Supplemental_Info"] = kb["RXCUI"].map(supplemental_by_rxcui)

### Step 7: build the final text chunk used for embedding


def _build_text_chunk(row):
    """Compose one row's embedding text as "STR (TTY) - ATV. Supplemental_Info"."""
    atv = row["ATV"] if pd.notna(row["ATV"]) else ""
    supplemental = row["Supplemental_Info"] if pd.notna(row["Supplemental_Info"]) else ""
    return f"{row['STR']} ({row['TTY']}) - {atv}. {supplemental}"


kb["Text_Chunk"] = kb.apply(_build_text_chunk, axis=1)

### Step 8: export the final enriched file

kb.to_csv("rxnorm_enriched_chunks.csv", index=False)
print("✅ 步骤二完成：最终知识库已生成 rxnorm_enriched_chunks.csv")


     RXCUI  LAT   TS  LUI  STT  SUI ISPREF     RXAUI      SAUI     SCUI SDUI  \
0  2641622  ENG  NaN  NaN  NaN  NaN    NaN  12985883  12985883  2641622  NaN   
1  2641622  ENG  NaN  NaN  NaN  NaN    NaN  12985885  12985885  2641622  NaN   
2  2708995  ENG  NaN  NaN  NaN  NaN    NaN  12984979  12984979  2708995  NaN   
3  2708995  ENG  NaN  NaN  NaN  NaN    NaN  12984980  12984980  2708995  NaN   
4  2708995  ENG  NaN  NaN  NaN  NaN    NaN  12985880  12985880  2708995  NaN   

      SAB TTY        CODE                                                STR  \
0  MTHSPL  SU  A5PTS27URJ  INFLUENZA A VIRUS A/VICTORIA/4897/2022 IVR-238...   
1  MTHSPL  SU  9HA5C4XL36  INFLUENZA A VIRUS A/VICTORIA/4897/2022 IVR-238...   
2  MTHSPL  SU  L38QVJ42SY  INFLUENZA A VIRUS A/Croatia/10136RV/2023 X-425...   
3  MTHSPL  SU  R3KQM5Q4QF  INFLUENZA A VIRUS A/Croatia/10136RV/2023 X-425...   
4  MTHSPL  SU  98PQP6K6TA  INFLUENZA A VIRUS A/Croatia/10136RV/2023 X-425...   

   SRL SUPPRESS   CVF EXTRA  
0  NaN  

In [15]:
import numpy as np
import faiss
from openai import OpenAI
from dotenv import load_dotenv

# Step 1: read configuration from the environment (the .env file is loaded first)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("❌ 请设置环境变量 OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Step 2: read the knowledge base, discarding rows that have no text chunk
df = pd.read_csv("rxnorm_enriched_chunks.csv").dropna(subset=["Text_Chunk"])
chunks = df["Text_Chunk"].tolist()

# Step 3: generate embeddings through the OpenAI API
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    ``text`` is sent as a one-element ``input`` list through the module-level
    ``client``; the ``embedding`` of the first item in the reply is returned.
    """
    reply = client.embeddings.create(model=model, input=[text])
    items = reply.data
    return items[0].embedding

# Embed the chunks in batches. The embeddings endpoint accepts a list of
# inputs, so one request per batch replaces one request (and one sleep) per
# chunk.
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_BATCH_SIZE = 100

if not chunks:
    # With nothing to embed there is no vector to read the dimension from.
    raise ValueError("no text chunks to embed: rxnorm_enriched_chunks.csv yielded no rows")

embeddings = []
for start in range(0, len(chunks), EMBEDDING_BATCH_SIZE):
    batch = chunks[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
    # Order by each item's index so vectors stay aligned with the rows of df.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)
    print(f"✅ 已处理 {len(embeddings)} 条")
    time.sleep(0.3)  # stay under the rate limit

# The FAISS row ids are positional, so a count mismatch would silently misalign
# the index with the metadata written below.
assert len(embeddings) == len(chunks), "embedding count does not match chunk count"

# Step 4: build the FAISS index
embedding_dim = len(embeddings[0])
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(embeddings).astype("float32"))

# Step 5: save the vector index and its metadata
faiss.write_index(index, "rxnorm_faiss.index")
df[["RXCUI", "STR", "Text_Chunk"]].to_csv("rxnorm_faiss_metadata.csv", index=False)

print("✅ 向量化完成，已保存 rxnorm_faiss.index 和 rxnorm_faiss_metadata.csv")


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
import openai
# Configure the module-level openai settings from the environment.
ORG_ID = os.getenv("ORGANIZATION_ID")
# API_KEY holds the NX_API_KEY value used for the UMLS requests, so the OpenAI
# key is read from its own variable instead of sending the UMLS key to OpenAI.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = ORG_ID
# Prints None when ORGANIZATION_ID is not set in the environment.
print(openai.organization)

None
