In [None]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# Function to fetch CUI name with caching
def get_cui_name(cui, api_key, cache, timeout=30):
    """Return the name of a CUI from the UTS REST API, memoised in ``cache``.

    Args:
        cui: Concept identifier appended to the ``/content/current/CUI/`` URL.
        api_key: API key sent as the ``apiKey`` query parameter.
        cache: Mutable mapping of CUI -> name (or ``None``). It is consulted
            before any request is made and updated with the outcome.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        The value of ``result.name`` from the JSON response, or ``None`` when
        the lookup fails. HTTP errors and malformed responses are cached as
        ``None``; connection-level failures (timeouts, dropped connections)
        are NOT cached, so a later call can retry the same CUI.
    """
    if cui in cache:
        return cache[cui]  # Return cached value if it exists

    url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}"
    try:
        # Passing the key through `params` lets requests URL-encode it.
        response = requests.get(url, params={"apiKey": api_key}, timeout=timeout)
        response.raise_for_status()
        name = response.json()['result']['name']
    except requests.exceptions.HTTPError:
        print(f"HTTP error occurred for CUI {cui}")
        name = None
    except (KeyError, TypeError, ValueError):
        # KeyError: missing key; TypeError: e.g. "result" is null;
        # ValueError: body is not valid JSON. This clause must precede the
        # RequestException clause because requests' JSON decode error derives
        # from both ValueError and RequestException.
        print(f"Unexpected response format for CUI {cui}")
        name = None
    except requests.exceptions.RequestException as exc:
        # Timeout / connection problem: report it but leave the cache alone.
        print(f"Request failed for CUI {cui}: {exc}")
        return None

    cache[cui] = name  # Cache the fetched name (or the failure as None)
    return name

In [None]:
# Parallelized function to convert SUBJECT_CUI and OBJECT_CUI to their names with caching
def convert_cuis_to_names(df, api_key, max_workers=12):
    """Add 'drug' and 'disease' name columns resolved from the CUI columns.

    Each distinct non-null CUI found in ``df['SUBJECT_CUI']`` or
    ``df['OBJECT_CUI']`` is looked up exactly once via ``get_cui_name`` on a
    thread pool. The names are then written to ``df['drug']`` (from
    SUBJECT_CUI) and ``df['disease']`` (from OBJECT_CUI).

    Args:
        df: DataFrame with 'SUBJECT_CUI' and 'OBJECT_CUI' columns. It is
            modified in place.
        api_key: API key forwarded to ``get_cui_name``.
        max_workers: Size of the thread pool used for the lookups.

    Returns:
        The same ``df`` object, with the two new columns. Rows whose CUI is
        missing or whose lookup failed hold a missing value.
    """
    cache = {}

    # De-duplicate up front (order-preserving) so no CUI is requested twice
    # and null CUIs never trigger a request.
    unique_cuis = list(dict.fromkeys(
        [*df['SUBJECT_CUI'].dropna(), *df['OBJECT_CUI'].dropna()]
    ))

    names = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(get_cui_name, cui, api_key, cache): cui
            for cui in unique_cuis
        }

        with tqdm(total=len(futures), desc="Fetching CUIs", unit="CUI") as pbar:
            for future in as_completed(futures):
                names[futures[future]] = future.result()
                pbar.update(1)

    # Fill both columns in one pass each instead of re-scanning the frame
    # for every completed lookup.
    df['drug'] = [names.get(cui) for cui in df['SUBJECT_CUI']]
    df['disease'] = [names.get(cui) for cui in df['OBJECT_CUI']]

    return df

In [None]:
# Load the input table. It must provide the SUBJECT_CUI and OBJECT_CUI columns
# that convert_cuis_to_names reads.
df = pd.read_csv("../data/intended_data.csv")
# Display (rows, columns) as a quick sanity check.
df.shape

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Read the API key from the UTS_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook.
api_key = os.environ.get("UTS_API_KEY", "")
if not api_key:
    print("Warning: UTS_API_KEY is not set; CUI lookups will likely be rejected.")

# Convert CUIs to names
df_with_names = convert_cuis_to_names(df, api_key, max_workers=12)

In [None]:
# Preview the result, which now carries the 'drug' and 'disease' name columns.
# Note: convert_cuis_to_names returns the frame it was given, so df_with_names
# and df are the same object.
df_with_names.head()

In [None]:
# Number of distinct disease names (missing values are not counted by default).
df_with_names["disease"].nunique()

In [None]:
# The 20 most frequent disease names with their row counts.
df_with_names["disease"].value_counts().head(20)

In [None]:
# Drop rows containing a missing value, in place.
# NOTE(review): this removes rows with a missing value in ANY column, not only
# 'drug'/'disease' -- confirm that is intended (otherwise pass subset=[...]).
# Because df_with_names is the same object as df, df is modified as well.
df_with_names.dropna(inplace=True)

In [None]:
# Write the remaining rows to CSV without the index column.
df_with_names.to_csv("../data/intended_data_names.csv", index=False)