In [1]:
import pandas as pd

In [2]:
# Data source: RXNORM ontology on NCBO BioPortal - https://bioportal.bioontology.org/ontologies/RXNORM

In [3]:
import warnings

# Install a blanket "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn; consider a narrower, category-specific filter.
warnings.filterwarnings("ignore")

In [4]:
import re
from typing import Optional

def parse_rxnorm_identifier(cls_id: str) -> Optional[str]:
    """Extract the trailing identifier from an RXNORM class URI.

    Parameters
    ----------
    cls_id:
        Class URI such as
        ``http://purl.bioontology.org/ontology/RXNORM/1156445``.
        Non-string values (for example the float ``NaN`` that pandas hands to
        ``Series.map`` for a missing cell) are accepted and treated as
        "no identifier" instead of raising ``TypeError``.

    Returns
    -------
    Optional[str]
        The run of word characters that follows the final ``/`` and ends the
        string, or ``None`` when the input is not a string or has no such
        trailing segment.
    """
    # re.search only accepts str/bytes; bail out early on NaN / None.
    if not isinstance(cls_id, str):
        return None

    # The lookbehind keeps the slash out of the match; `$` anchors to the end.
    match = re.search(r'(?<=\/)(\w+)$', cls_id)
    if match:
        return match.group(0)

    return None

# Read the RXNORM export and derive the two working columns in one chain:
#   Label - lower-cased preferred label
#   ID    - identifier parsed from the tail of the class URI
df = pd.read_csv('./RXNORM.csv').assign(
    Label=lambda frame: frame['Preferred Label'].str.lower(),
    ID=lambda frame: frame['Class ID'].map(parse_rxnorm_identifier),
)

df.loc[:, ['Label', 'ID']].tail(5)

Unnamed: 0,Label,ID
107249,roflumilast oral product,1156445
107250,wal-zan,643043
107251,carya laciniosa pollen extract injectable solu...,2203044
107252,thyroshield,723798
107253,velvet grass pollen extract injectable solution,852516


In [5]:
# Keep only the columns used downstream, identifier first.
df = df.loc[:, ['ID', 'Label', 'Class ID']]
df.tail(5)

Unnamed: 0,ID,Label,Class ID
107249,1156445,roflumilast oral product,http://purl.bioontology.org/ontology/RXNORM/11...
107250,643043,wal-zan,http://purl.bioontology.org/ontology/RXNORM/64...
107251,2203044,carya laciniosa pollen extract injectable solu...,http://purl.bioontology.org/ontology/RXNORM/22...
107252,723798,thyroshield,http://purl.bioontology.org/ontology/RXNORM/72...
107253,852516,velvet grass pollen extract injectable solution,http://purl.bioontology.org/ontology/RXNORM/85...


In [7]:
# Distinct lower-cased labels in order of first appearance; this list is the
# corpus the TF-IDF vectorizer is fitted on.
X = pd.unique(df['Label']).tolist()

In [8]:
# Key each record by its row position in the TF-IDF matrix. That matrix is
# fitted on df['Label'].unique(), so duplicate labels must be dropped here too
# (first occurrence kept, original order preserved). Keying by the raw
# DataFrame index would point at the wrong record for every row that follows
# the first duplicated label.
Medications = dict(enumerate(
    df.drop_duplicates(subset='Label').to_dict('records')
))

In [9]:
# Spot-check a single record by its integer key.
Medications[600]

{'ID': '1792489',
 'Label': 'methoxy polyethylene glycol-epoetin beta 0.4 mg/ml [mircera]',
 'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/1792489'}

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Featurize on character n-grams of length 1 to 5 built inside word
# boundaries ('char_wb') rather than on whole-word tokens.
# NOTE(review): presumably chosen so misspelled or partial drug names still
# share features with the catalogue labels - confirm.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1,5)
)

In [12]:
# Fit the vectorizer on the labels in X and encode them; row i of `vectors`
# corresponds to X[i].
vectors = tfidf_vectorizer.fit_transform(X)

In [13]:
import numpy as np
from sklearn.metrics import pairwise
from typing import List, Tuple


def query_medications(query_term: str, n=1) -> List[Tuple[dict, float]]:
    """Return the ``n`` medication records closest to ``query_term``.

    The query is stripped and lower-cased, encoded with the already fitted
    ``tfidf_vectorizer``, and compared with every row of ``vectors`` using
    cosine distance.

    Parameters
    ----------
    query_term:
        Free-text medication name to look up.
    n:
        Number of results to return (default 1).

    Returns
    -------
    List[Tuple[dict, float]]
        ``(record, distance)`` pairs ordered by ascending cosine distance,
        where ``record`` is the entry of ``Medications`` for the matched row.
        Smaller distances mean closer matches.
    """
    normalized_query = query_term.strip().lower()
    query_vector = tfidf_vectorizer.transform([normalized_query])

    # One cosine *distance* per row of `vectors` (not a similarity).
    distances = pairwise.pairwise_distances(
        vectors, query_vector, metric='cosine'
    ).flatten()

    # Ascending argsort puts the closest rows first.
    nearest_rows = np.argsort(distances)[:n]

    results = []
    for row_id in nearest_rows:
        results.append((Medications[row_id], distances[row_id]))
    return results


query_medications('tylenol', n=5)

[({'ID': '202433',
   'Label': 'tylenol',
   'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/202433'},
  0.0),
 ({'ID': '1187315',
   'Label': 'tylenol pill',
   'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/1187315'},
  0.1033285962675553),
 ({'ID': '220581',
   'Label': 'tylenol pm',
   'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/220581'},
  0.13853376553646313),
 ({'ID': '1187311',
   'Label': 'tylenol oral product',
   'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/1187311'},
  0.1474276481504717),
 ({'ID': '1187314',
   'Label': 'tylenol pm pill',
   'Class ID': 'http://purl.bioontology.org/ontology/RXNORM/1187314'},
  0.21023377341341676)]

In [14]:
def link_medication(query_term: str) -> str:
    """Resolve free text to the ``ID`` of its single closest medication.

    Delegates to ``query_medications`` with ``n=1`` and discards the distance
    that comes back with the record.
    """
    best_match = query_medications(query_term, n=1)[0]
    record = best_match[0]
    return record['ID']


link_medication('tylenol pm')

KeyError: 'RxNorm ID'