In [None]:
#medgen_file = "hierarchies/MedGenIDMappings.txt"
#medgen_df = pd.read_csv(medgen_file, sep="|", dtype=str)
#medgen_df.columns = ["CUI", "pref_name", "source_id", "source", "?"]
#medgen_df = medgen_df.map(lambda x: x.strip() if isinstance(x, str) else x)

#disgenet_file = "DisGDis.csv"
#df_full = pd.read_csv(disgenet_file, dtype=str)
#df_full = df_full.rename(columns={"diseaseUMLSCUI": "CUI"})
#merged_df = df_full.merge(medgen_df, on="CUI", how="left")
#merged_df = merged_df[merged_df['source'] == 'MedGen']
#merged_df = merged_df.drop(columns=["?","source"])
#merged_df.to_csv("hierarchies/MedGen_mapping.csv", index=False)

In [None]:
# Old method of doing CUI-to-MeSH UID mapping via MedGen

#medgen_df = pd.read_csv("MedGenIDMappings.txt", sep="|", dtype=str,
#                        names=["CUI", "pref_name", "source_id", "source", "?"], skiprows=1).map(lambda x: x.strip() if isinstance(x, str) else x)

#mesh_df = pd.read_csv("DisGDis.csv", dtype=str).rename(columns={"diseaseUMLSCUI": "CUI"})
#mesh_df = mesh_df.merge(medgen_df, on="CUI", how="left")\
#          .query("source == 'MeSH'")\
#          .drop(columns=["?", "source"], errors="ignore")\
#          .rename(columns={"source_id": "MSH"}) 
#mesh_df.to_csv("hierarchies/MeSH_mapping.csv", index=False)

#for i, gene in enumerate(top_genes, start=1):
#    mesh_df[mesh_df["symbolOfGene"] == gene].to_csv(f"hierarchies/ranked_genes/{gene}_{i}.csv", index=False)

In [None]:
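# Fetches the normalised MeSH name for each MSH ID, then de-duplicates on MSH keeping the FIRST row per ID
# (that is the highest score only if the input is already sorted by score, descending)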

#API_KEY = os.environ["UMLS_API_KEY"]  # key redacted: never store credentials in the notebook
#BASE_URL = "https://uts-ws.nlm.nih.gov/rest/content/current/source/MSH/"
#file_path = "MeSH_CID_hierarchical_class/DRD2_2.csv"
#df = pd.read_csv(file_path)

#def fetch_mesh_name(mesh_id):
#    """Fetches the normalised MSH disease name from the UMLS API."""
#    url = f"{BASE_URL}{mesh_id}?apiKey={API_KEY}"
#    response = requests.get(url)
#    if response.status_code == 200:
#        data = response.json()
#        if "result" in data and "name" in data["result"]:
#            return data["result"]["name"]
#    print(f"Error fetching data for {mesh_id}: {response.status_code}")
#    return None

#df["MSH_normalised_name"] = None
#for index, row in tqdm(df.iterrows(), total=len(df), desc="Fetching MSH names"):
#    mesh_id = row["MSH"]
#    if isinstance(mesh_id, str):
#        df.at[index, "MSH_normalised_name"] = fetch_mesh_name(mesh_id)
#    time.sleep(0.5)  # Avoid hitting API rate limits

#df = df[["MSH_normalised_name", "pref_name", "score", "MSH"]]
#df = df.drop_duplicates(subset=["MSH"])
#output_file = "MeSH_CID_hierarchical_class/DRD_2_input.csv"
#df.to_csv(output_file, index=False)

In [None]:
import os
from collections import defaultdict

# UMLS API configuration.
# The API key is a credential, so it is read from the environment instead of being
# stored in the notebook (where it would leak through version control and sharing).
# NOTE(review): any key that was previously committed in this notebook should be revoked.
API_KEY = os.environ.get("UMLS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "UMLS_API_KEY is not set; export your UMLS Terminology Services API key "
        "before running this cell."
    )
BASE_URL_MSH = "https://uts-ws.nlm.nih.gov/rest/content/current/source/MSH/"

# Load input file containing scored diseases.
# dtype=str keeps every column as text; blank cells are still read as NaN.
file_path = "hierarchies/input_files/MAPT_1_input.csv"
df = pd.read_csv(file_path, dtype=str)

# MSH ID -> score / normalised name lookups.
# If an MSH ID occurs more than once, the last row wins (dict construction order).
score_dict = dict(zip(df["MSH"], df["score"]))
mesh_name_dict = dict(zip(df["MSH"], df["MSH_normalised_name"]))

# MSH ID -> {"name", "parents", "score"} for every node visited while building trees.
hierarchy = defaultdict(lambda: {"name": None, "parents": [], "score": None})

# Function to fetch the MeSH name using mesh ID
def fetch_mesh_name(mesh_id, timeout=30):
    """Fetch the normalised MeSH name for a MeSH ID from the UMLS REST API.

    Args:
        mesh_id: MeSH identifier appended to ``BASE_URL_MSH``. Anything that is
            not a non-empty string (``None``, pandas NaN, ...) and any ID
            containing ``"V-MSH"`` is treated as invalid.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``result.name`` string from the response, or ``None`` when the ID is
        invalid, the request fails, the status is not 200, or the payload has no
        name.
    """
    # Blank CSV cells arrive as float NaN, which is truthy and does not support
    # `in`; check the type first so such values are skipped instead of raising.
    if not isinstance(mesh_id, str) or not mesh_id or "V-MSH" in mesh_id:
        return None

    try:
        response = requests.get(
            f"{BASE_URL_MSH}{mesh_id}",
            params={"apiKey": API_KEY},
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
    except requests.RequestException as exc:
        print(f"⚠️ Error fetching data for {mesh_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"⚠️ Error fetching data for {mesh_id}: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:  # body was not valid JSON
        data = None

    result = data.get("result") if isinstance(data, dict) else None
    if isinstance(result, dict) and "name" in result:
        return result["name"]

    # A 200 response without a name is a payload problem, not an HTTP error.
    print(f"⚠️ No name in response for {mesh_id}")
    return None  # Return None instead of defaulting to "Unknown"

# Function to fetch parent relationships
def fetch_parents(mesh_id, timeout=30):
    """Fetch the direct parents of a MeSH ID from the UMLS REST API.

    Args:
        mesh_id: MeSH identifier appended to ``BASE_URL_MSH``. Anything that is
            not a non-empty string (``None``, pandas NaN, ...) and any ID
            containing ``"V-MSH"`` is treated as invalid.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(ui, name)`` tuples, one per entry in the response's
        ``result`` list. The list is empty when the ID is invalid, the request
        fails, the status is not 200, or the payload has no usable entries.
    """
    # Blank CSV cells arrive as float NaN, which is truthy and does not support
    # `in`; check the type first so such values are skipped instead of raising.
    if not isinstance(mesh_id, str) or not mesh_id or "V-MSH" in mesh_id:
        return []

    try:
        response = requests.get(
            f"{BASE_URL_MSH}{mesh_id}/parents",
            params={"apiKey": API_KEY},
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
    except requests.RequestException as exc:
        print(f"⚠️ Error fetching parents for {mesh_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"⚠️ Error fetching parents for {mesh_id}: {response.status_code}")
        return []  # Return an empty list if no parents found

    try:
        data = response.json()
    except ValueError:  # body was not valid JSON
        return []

    entries = data.get("result", []) if isinstance(data, dict) else []
    if not isinstance(entries, list):
        return []

    # Skip entries lacking either field rather than aborting the whole run on a KeyError.
    return [
        (item["ui"], item["name"])
        for item in entries
        if isinstance(item, dict) and "ui" in item and "name" in item
    ]

def build_tree(mesh_id, lineage=None, level=0):
    """Build every ancestor path from ``mesh_id`` upwards.

    Each visited node is recorded in the module-level ``hierarchy`` dict, which
    also serves as a cache so an ancestor shared by several diseases is only
    fetched once.

    Args:
        mesh_id: MeSH ID to start from.
        lineage: IDs already on the current path (descendants of ``mesh_id``);
            used to stop if an ID repeats. Defaults to an empty path.
        level: Depth of ``mesh_id`` on the current path (0 for the start node).

    Returns:
        A list of paths. Each path is a list of
        ``(level, score, name, mesh_id)`` tuples beginning at ``mesh_id`` and
        following ONE chain of parents. A node with several parents (at any
        depth) yields one path per chain. The list is empty when ``mesh_id`` is
        not a non-empty string, is already in ``lineage``, or has no name.
    """
    if lineage is None:
        lineage = []

    # Prevent infinite loops & invalid IDs (blank CSV cells arrive as float NaN).
    if not isinstance(mesh_id, str) or not mesh_id or mesh_id in lineage:
        return []

    # .get() does not trigger the defaultdict factory, so a miss creates no entry.
    node = hierarchy.get(mesh_id)
    if node is None or node["name"] is None:
        disease_name = fetch_mesh_name(mesh_id)
        if disease_name is None:
            return []  # Skip if no valid name

        node = hierarchy[mesh_id]
        node["name"] = disease_name
        node["parents"] = fetch_parents(mesh_id)
        node["score"] = score_dict.get(mesh_id, "NaN")  # Use existing score if found

    current = (level, node["score"], node["name"], mesh_id)
    parents = node["parents"]

    # No parents: the path consists of this node only.
    if not parents:
        return [[current]]

    paths = []
    next_lineage = lineage + [mesh_id]
    for parent_id, _parent_name in parents:
        parent_paths = build_tree(parent_id, next_lineage, level + 1)

        if not parent_paths:
            # Parent could not be resolved (invalid ID, no name, or a cycle):
            # the path ends at the current node.
            paths.append([current])
            continue

        # One output path per parent lineage. Concatenating all of a parent's
        # lineages into a single list would repeat the parent and mix branches.
        paths.extend([current] + parent_path for parent_path in parent_paths)

    return paths  # Always a list of lists

# Process each disease separately, ensuring distinct tree branches
output_file = "MAPT_1_test_fixed.txt"

# Explicit UTF-8 so the emitted names do not depend on the platform's default encoding.
with open(output_file, "w", encoding="utf-8") as f:
    f.write("MAPT\n\n")

    # Blank MSH cells are read as NaN, not strings; they are not valid IDs, so skip them.
    for mesh_id in df["MSH"].dropna():
        # Each tree is one list of (level, score, name, mesh_id) tuples.
        for tree in build_tree(mesh_id):
            for level, score, name, mesh in tree:
                indent = "\t" * level
                # A missing score is already the string "NaN", so it prints as "(NaN)".
                f.write(f"({score}){indent}-{name}; {mesh}\n")
            f.write("\n")  # Separate distinct trees with a blank line

print(f"✅ Hierarchy saved to {output_file}")