In [None]:
import numpy as np
import pandas as pd, re, requests
from itertools import combinations
from scipy.sparse import dok_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
# short preprocess
dataset_dir = '../../mimic-4/physionet.org/files/mimiciv/3.1/'
df = pd.read_csv(os.path.join(dataset_dir, "hosp/diagnoses_icd.csv"))
admissions = pd.read_csv(os.path.join(dataset_dir, "hosp/admissions.csv"))
df = pd.merge(
    df,
    admissions[['hadm_id', 'admittime', 'dischtime', 'deathtime']],
    on='hadm_id'
)

df[df['icd_version']==9].head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,admittime,dischtime,deathtime
0,10000032,22595853,1,5723,9,2180-05-06 22:23:00,2180-05-07 17:15:00,
1,10000032,22595853,2,78959,9,2180-05-06 22:23:00,2180-05-07 17:15:00,
2,10000032,22595853,3,5715,9,2180-05-06 22:23:00,2180-05-07 17:15:00,
3,10000032,22595853,4,7070,9,2180-05-06 22:23:00,2180-05-07 17:15:00,
4,10000032,22595853,5,496,9,2180-05-06 22:23:00,2180-05-07 17:15:00,


In [6]:
df[df['icd_version']==9][['icd_code', 'icd_version']][:30]

Unnamed: 0,icd_code,icd_version
0,5723,9
1,78959,9
2,5715,9
3,07070,9
4,496,9
5,29680,9
6,30981,9
7,V1582,9
8,07071,9
9,78959,9


In [None]:
URL_GEM = "https://data.nber.org/gem/icd9toicd10cmgem.csv"

gem = pd.read_csv(URL_GEM, dtype=str).rename(columns=str.lower)

def norm(code):
    if pd.isna(code):
        return None
    code = re.sub(r'[\s\.]', '', str(code).upper())
    if code.isdigit() and len(code) < 3:
        code = code.zfill(3)
    return code

gem["icd9_norm"]  = gem["icd9cm"].apply(norm)
gem["icd10_norm"] = gem["icd10cm"].apply(norm)
gem["flags"]      = gem["flags"].astype(str)

gem_ok = gem[~gem["flags"].str[0].eq("7")]     
gem_ok["rank"] = gem_ok["flags"].str[0].astype(int)   

gem_ok = gem_ok.sort_values(["icd9_norm", "rank"])

MAP_9to10 = (
    gem_ok.groupby("icd9_norm")["icd10_norm"]
          .first()
          .to_dict()
)

def icd9_to_icd10_fuzzy(icd9, mapping):
    if icd9 in mapping:
        return mapping[icd9]

    code = icd9
    while len(code) < 5:
        code += "0"
        if code in mapping:
            return mapping[code]

    desc = {k: v for k, v in mapping.items() if k.startswith(icd9)}
    if desc:
        for k in sorted(desc):
            if k.endswith("0"):
                return desc[k]
        return next(iter(desc.values()))
    return None


df["icd_code_norm"] = df["icd_code"].apply(norm)

def map_row(row):
    if row["icd_version"] == 10:
        return row["icd_code_norm"]
    code = row["icd_code_norm"]
    return (
        icd9_to_icd10_fuzzy(code, MAP_9to10)
    )

df["icd10_code"] = df.apply(map_row, axis=1)
df.drop(columns="icd_code_norm", inplace=True)

n9  = (df.icd_version == 9).sum()
nun = df["icd10_code"].isna() & (df.icd_version == 9)
print(f"Не смаппилось ICD-9 кодов: {nun.sum()} из {n9}")


Не смаппилось ICD-9 кодов: 0 из 2908741


In [29]:
mapped = df["icd10_code"].notnull() & (df["icd_version"] == 9)
unmapped = df["icd10_code"].isna() & (df["icd_version"] == 9)

print(mapped.info())
print(unmapped.info())
print(df.info())

<class 'pandas.core.series.Series'>
RangeIndex: 6364488 entries, 0 to 6364487
Series name: None
Non-Null Count    Dtype
--------------    -----
6364488 non-null  bool 
dtypes: bool(1)
memory usage: 6.1 MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 6364488 entries, 0 to 6364487
Series name: None
Non-Null Count    Dtype
--------------    -----
6364488 non-null  bool 
dtypes: bool(1)
memory usage: 6.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6364488 entries, 0 to 6364487
Data columns (total 10 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   subject_id    int64 
 1   hadm_id       int64 
 2   seq_num       int64 
 3   icd_code      object
 4   icd_version   int64 
 5   admittime     object
 6   dischtime     object
 7   deathtime     object
 8   icd10_code    object
 9   icd10_mapped  object
dtypes: int64(4), object(6)
memory usage: 485.6+ MB
None


In [30]:
def is_onco(code: str) -> int:
    """
    Malignant neoplasms  C00–C97
    In-situ + uncertain   D00–D09, D37–D48
    Берём три ведущих символа и проверяем диапазоны.
    """
    c = code.upper()
    if c[0] == "C":
        return 1
    if re.match(r"D0[0-9]|D[3-4][7-8]", c):
        return 1
    return 0

df["is_oncology"] = df["icd10_code"].map(is_onco).astype("int8")
df["category"] = df["icd10_code"].str[:3]
df.head(5)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,admittime,dischtime,deathtime,icd10_code,icd10_mapped,is_oncology,category
0,10000032,22595853,1,5723,9,2180-05-06 22:23:00,2180-05-07 17:15:00,,K766,K766,0,K76
1,10000032,22595853,2,78959,9,2180-05-06 22:23:00,2180-05-07 17:15:00,,R188,R188,0,R18
2,10000032,22595853,3,5715,9,2180-05-06 22:23:00,2180-05-07 17:15:00,,K740,K740,0,K74
3,10000032,22595853,4,7070,9,2180-05-06 22:23:00,2180-05-07 17:15:00,,B1920,B1920,0,B19
4,10000032,22595853,5,496,9,2180-05-06 22:23:00,2180-05-07 17:15:00,,J449,J449,0,J44


In [31]:
df['is_oncology'].value_counts()

is_oncology
0    6229916
1     134572
Name: count, dtype: int64

In [32]:
df.to_csv("prepared_data.csv")  

In [33]:
# чистим, где один и тот же диагноз в одной госп-ии
df_unique = df[['subject_id', 'icd10_code']].drop_duplicates()

unique_icds = df_unique['icd10_code'].dropna().unique()
icd2idx = {icd: i for i, icd in enumerate(unique_icds)}
idx2icd = {i: icd for icd, i in icd2idx.items()}
n = len(unique_icds)

# сколько пациентов имели одновременно оба кода
co_occurrence = np.zeros((n, n), dtype=int)


for subject_id, group in df_unique.groupby('subject_id'):
    codes = group['icd10_code'].unique()
    indices = [icd2idx[code] for code in codes if code in icd2idx]
    for i in indices:
        for j in indices:
            co_occurrence[i, j] += 1

total_patients = df['subject_id'].nunique()

# доля пациентов с обоими диагнозами от общего числа пациентов
co_occurrence_fraction = co_occurrence / total_patients

# условная вероятность 
patients_per_code = co_occurrence.diagonal().copy()
patients_per_code[patients_per_code == 0] = 1  # чтобы не делить на 0
conditional_prob = co_occurrence / patients_per_code[:, None]

co_df = pd.DataFrame(co_occurrence, index=unique_icds, columns=unique_icds)
fraction_df = pd.DataFrame(co_occurrence_fraction, index=unique_icds, columns=unique_icds)
cond_df = pd.DataFrame(conditional_prob, index=unique_icds, columns=unique_icds)

co_df.to_csv("data/co_matrix_counts.csv") # сколько уникальных пациентов имели одновременно диагнозы i и j
fraction_df.to_csv("data/co_matrix_fraction.csv") # доля пациентов из всей выборки у которых одновременно были диагнозы i и j
cond_df.to_csv("data/co_matrix_conditional_prob.csv") # усл вероятность

In [34]:
co_df = pd.read_csv("data/co_matrix_counts.csv")
fraction_df = pd.read_csv("data/co_matrix_fraction.csv") 
cond_df = pd.read_csv("data/co_matrix_conditional_prob.csv")

In [36]:
N = 20
diag_values = np.diag(fraction_df)
top_20_indices = np.argsort(diag_values)[::-1][:N]
top_icds = [idx2icd[i] for i in top_20_indices]


top_cond_df = cond_df.loc[top_icds, top_icds]

plt.figure(figsize=(10, 8))
sns.heatmap(top_cond_df, annot=False, cmap="viridis", xticklabels=True, yticklabels=True)
plt.title("Co-occurance probability of ICD codes (top 20)")
plt.xticks(rotation=90, fontsize=12)
plt.yticks(rotation=0, fontsize=12)
plt.tight_layout()
plt.show()


: 

TBA

In [None]:
co_long = co_df.stack().reset_index()
co_long.columns = ['icd_i', 'icd_j', 'count']

co_long = co_long[co_long['icd_i'] != co_long['icd_j']]

co_long = co_long[pd.to_numeric(co_long['count'], errors='coerce').notnull()]
co_long['count'] = co_long['count'].astype(int)

top20_co = co_long.sort_values('count', ascending=False).head(20)

NameError: name 'co_df' is not defined

In [None]:
# по условной вероятности 
cond_long = cond_df.stack().reset_index()
cond_long.columns = ['icd_i', 'icd_j', 'p_j_given_i']
cond_long = cond_long[cond_long['icd_i'] != cond_long['icd_j']]
top20_cond = cond_long.sort_values('p_j_given_i', ascending=False).head(20)

In [None]:
top20_co.to_csv("data/top20_co.csv")
top20_cond.to_csv("data/top20_cond.csv")

In [None]:
def plot_heatmap(dataframe, value_col, title):
    pivot = dataframe.pivot(index='icd_i', columns='icd_j', values=value_col)
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, fmt='.2f', cmap='viridis')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


plot_heatmap(top20_co, 'count', 'Top 20 Co-occurrence Counts (Patients with both ICDs)')

plot_heatmap(top20_cond, 'p_j_given_i', 'Top 20 Conditional Probabilities P(j | i)')

