In [2]:
import os
import re
import numpy as np
import pandas as pd
# from IPython.display import disp

In [5]:
GEM = "/Users/sofsen/Desktop/SMILES 2025/mimic-4/2018_I9gem.txt"

In [6]:
dataset_dir = '../../mimic-4/physionet.org/files/mimiciv/3.1/'
df = pd.read_csv(os.path.join(dataset_dir, "hosp/diagnoses_icd.csv"))
admissions = pd.read_csv(os.path.join(dataset_dir, "hosp/admissions.csv"))
df = pd.merge(
    df,
    admissions[['hadm_id', 'admittime', 'dischtime', 'deathtime']],
    on='hadm_id'
)

In [7]:
def read_gem(path):
    with open(path, "r") as f:
        lines = f.readlines()

        def processor(s):
            s = [x for x in s.strip().split(" ") if x != ""]
            return {
                "icd9": s[0],
                "icd10": s[1],
                "flags": s[2]
            }

    lines = [processor(line) for line in lines]
    gem = pd.DataFrame.from_records(lines)
    return gem

In [8]:
gem = read_gem(os.path.join(GEM))

In [9]:
def norm(code: str) -> str:
    if pd.isna(code):
        return None
    code = re.sub(r'[\s\.]', '', str(code).upper())
    if code.isdigit() and len(code) < 3:
        code = code.zfill(3)
    return code
    
def icd9_to_icd10_fuzzy(icd9: str, mapping: dict) -> str:
    if icd9 in mapping:
        return mapping[icd9]

    code = icd9
    while len(code) < 5:
        code += "0"
        if code in mapping:
            return mapping[code]

    desc = {k: v for k, v in mapping.items() if k.startswith(icd9)}
    if desc:
        for k in sorted(desc):
            if k.endswith("0"):
                return desc[k]
        return next(iter(desc.values()))
    return None

def icd9_to_icd10(
    df: pd.DataFrame,
    gem: pd.DataFrame,
    df_icd_code_col:str="icd_code",
    df_icd_version_col:str="icd_version", 
    gem_icd9_col="icd9", 
    gem_icd10_col="icd10"
) -> pd.DataFrame:
    '''
    converts dataframe with 
    '''
    print("Normalization...")
    gem["icd9_norm"]  = gem[gem_icd9_col].apply(norm)
    gem["icd10_norm"] = gem[gem_icd10_col].apply(norm)
    gem["flags"]      = gem["flags"].astype(str)
    df["icd_code_norm"] = df[df_icd_code_col].apply(norm)
    print("ICD codes are normalized")

    print("Number of bad mappings:", gem["flags"].str[0].eq("7").sum(), "out of", len(gem), "let's through out them")
    gem_ok = gem[~gem["flags"].str[0].eq("7")]  
    gem_ok["rank"] = gem_ok["flags"].str[0].astype(int)   
    gem_ok = gem_ok.sort_values(["icd9_norm", "rank"])
    MAP_9to10 = gem_ok.groupby("icd9_norm")["icd10_norm"].first().to_dict()
    print("Mapping 9to10 is created")
    
    def map_row(row):
        if row[df_icd_version_col] == 10:
            return row["icd_code_norm"]
        return icd9_to_icd10_fuzzy(row["icd_code_norm"], MAP_9to10)
    
    print("Mapping started...")
    df["icd10_code"] = df.apply(map_row, axis=1)
    df.drop(columns="icd_code_norm", inplace=True)
    print("Mapping finished")

    n9  = (df[df_icd_version_col] == 9).sum()
    nun = df["icd10_code"].isna() & (df[df_icd_version_col] == 9)
    print(f"Не смаппилось ICD-9 кодов: {nun.sum()} из {n9}")

    return df

In [10]:
df = icd9_to_icd10(df, gem)


Normalization...
ICD codes are normalized
Number of bad mappings: 0 out of 24860 let's through out them
Mapping 9to10 is created
Mapping started...
Mapping finished
Не смаппилось ICD-9 кодов: 0 из 2908741


In [11]:
df["icd10_category"] = df["icd10_code"].str[:3]


In [12]:
df.to_csv("diagnoses_icd10.csv")