In [38]:
from collections import defaultdict
import itertools as it
import json

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names; pick whichever this install provides
# so the cell runs on both old and new matplotlib.
plt.style.use(
    "seaborn-v0_8-colorblind"
    if "seaborn-v0_8-colorblind" in plt.style.available
    else "seaborn-colorblind"
)
%matplotlib inline

In [2]:
# The dtype specification is stored as JSON next to the CSV and is passed
# straight through to read_csv as its `dtype` argument.
with open("/Volumes/thesis-data/dtype_dict.json") as dtype_file:
    dtypes = json.load(dtype_file)

# Columns that read_csv should parse as dates.
date_columns = ["PERIOD", "ADMDATE", "DISCDATE"]
main = pd.read_csv("/Volumes/thesis-data/main.csv", dtype=dtypes, parse_dates=date_columns)

In [11]:
# The codes live in thirteen columns, ICD_1 ... ICD_13; gather them into a
# single list-valued column so each row carries all of its codes together.
icd_cols = ["ICD_{}".format(position) for position in range(1, 14)]
main["icd_codes"] = main.loc[:, icd_cols].to_numpy().tolist()

In [24]:
# Preview the combined per-row code lists; unused slots are still NaN here.
main["icd_codes"]

0          [R410, F102, M702, L031, M549, J459, Z866, nan...
1          [R53X, F03X, E119, I451, nan, nan, nan, nan, n...
2          [N764, L739, nan, nan, nan, nan, nan, nan, nan...
3          [I639, M8885, nan, nan, nan, nan, nan, nan, na...
4          [R560, nan, nan, nan, nan, nan, nan, nan, nan,...
                                 ...                        
2440376    [M201, M214, E039, nan, nan, nan, nan, nan, na...
2440377    [C445, nan, nan, nan, nan, nan, nan, nan, nan,...
2440378    [G459, I489, Z921, I10X, nan, nan, nan, nan, n...
2440379    [ZZ99, nan, nan, nan, nan, nan, nan, nan, nan,...
2440380    [L409, nan, nan, nan, nan, nan, nan, nan, nan,...
Name: icd_codes, Length: 2440381, dtype: object

In [26]:
def strip_codes(codes):
    """Return the non-missing entries of ``codes`` with whitespace stripped.

    Parameters
    ----------
    codes : iterable
        Codes for a single record. Entries may be strings or missing-value
        markers (``None``, ``NaN``, ``pd.NA``).

    Returns
    -------
    list of str
        The stripped codes in their original order, with missing entries
        removed.
    """
    # `pd.isna` instead of `code is not np.nan`: the identity check only
    # recognises the np.nan singleton, so any other missing marker (a fresh
    # float NaN, None, pd.NA) would fall through to `.strip()` and raise.
    return [code.strip() for code in codes if not pd.isna(code)]

In [27]:
# Long format: one row per (episode, code). The cleaned code lists are spread
# into a wide frame indexed by episode, stacked back down (which drops the
# padding for shorter lists), and the positional level is discarded.
main_icds = (
    pd.DataFrame(
        main["icd_codes"].apply(strip_codes).tolist(),
        index=main["EPISODE_ID"],
    )
    .stack()
    .reset_index()
    .drop(columns="level_1")
    .rename(columns={"EPISODE_ID": "episode_id", 0: "icd_code"})
)

main_icds.head(15)

Unnamed: 0,episode_id,icd_code
0,M2147765-1,R410
1,M2147765-1,F102
2,M2147765-1,M702
3,M2147765-1,L031
4,M2147765-1,M549
5,M2147765-1,J459
6,M2147765-1,Z866
7,M500865472-3,R53X
8,M500865472-3,F03X
9,M500865472-3,E119


In [29]:
# Category name -> (first stem, last stem) of the three-character ICD code
# stems that belong to it. Insertion order is kept as written.
icd_ranges = dict(
    infectious=("A00", "B99"),
    neoplasms=("C00", "D48"),
    blood=("D50", "D89"),
    endocrine=("E00", "E90"),
    mental=("F00", "F99"),
    nervous=("G00", "G99"),
    eye=("H00", "H59"),
    ear=("H60", "H95"),
    circulatory=("I00", "I99"),
    respiratory=("J00", "J99"),
    digestive=("K00", "K93"),
    skin=("L00", "L99"),
    muscoloskeletal=("M00", "M99"),
    genitourinary=("N00", "N99"),
    pregnancy=("O00", "O99"),
    perinatal=("P00", "P99"),
    congenital=("Q00", "Q99"),
    abnormal_findings=("R00", "R99"),
    injury=("S00", "T98"),
    external_causes=("V01", "Y98"),
    contact_factors=("Z00", "Z99"),
    special_use=("U00", "U89"),
)

In [30]:
# The first three characters of a code are its stem, which is what the
# category ranges are expressed in.
main_icds["icd_code_stem"] = main_icds["icd_code"].str[:3]

# Start from the stem itself so that any code falling outside every range
# keeps its stem as its label, then overwrite with the category name wherever
# the stem sits (inclusively, by string comparison) inside a range.
category_labels = main_icds["icd_code_stem"].copy()
for category, (lower, upper) in icd_ranges.items():
    in_range = main_icds["icd_code_stem"].between(lower, upper)
    category_labels = category_labels.mask(in_range, category)
main_icds["category"] = category_labels

main_icds.head()

Unnamed: 0,episode_id,icd_code,icd_code_stem,category
0,M2147765-1,R410,R41,abnormal_findings
1,M2147765-1,F102,F10,mental
2,M2147765-1,M702,M70,muscoloskeletal
3,M2147765-1,L031,L03,skin
4,M2147765-1,M549,M54,muscoloskeletal


In [31]:
# Collect each episode's category labels (duplicates kept) into one list.
categories_by_episode = main_icds.groupby("episode_id")["category"]
episode_icds_list = categories_by_episode.apply(list).reset_index()
episode_icds_list.head()

Unnamed: 0,episode_id,category
0,1873465000-1,"[endocrine, abnormal_findings, endocrine, endo..."
1,Acute:479179:1,[skin]
2,Acute:479180:1,[skin]
3,Acute:479181:1,[skin]
4,Acute:479182:1,[skin]


In [32]:
# One row per (episode row, category label). `explode` keeps the positional
# index of `episode_icds_list`, repeated once per list element, and avoids
# building a separate Series for every episode as `.apply(pd.Series)` does.
stacked_episode_icds = episode_icds_list["category"].explode()

# Indicator column per label, summed within each episode row to give counts.
# `groupby(level=0).sum()` replaces `sum(level=0)`, which pandas 2.0 removed.
episode_icds = pd.get_dummies(stacked_episode_icds).groupby(level=0).sum()

# Total number of codes recorded for the episode, across all labels.
episode_icds["n_icds"] = episode_icds.sum(axis=1)

# Re-attach the episode identifier by positional index.
episode_icds = episode_icds.join(episode_icds_list["episode_id"], how="outer")

episode_icds.head()

Unnamed: 0,PRI,ZZ9,abnormal_findings,blood,circulatory,congenital,contact_factors,digestive,ear,endocrine,...,n01,neoplasms,nervous,perinatal,pregnancy,respiratory,skin,special_use,n_icds,episode_id
0,0,0,1,2,0,0,0,0,0,3,...,0,0,0,0,0,0,1,0,7,1873465000-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,Acute:479179:1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,Acute:479180:1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,Acute:479181:1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,Acute:479182:1


In [33]:
# Match the key column name used in `main`. A single rename (instead of
# copying the column and dropping the original) leaves the column in the same
# position and is a no-op if this cell is run again.
episode_icds = episode_icds.rename(columns={"episode_id": "EPISODE_ID"})

In [34]:
# Only merge while `main` is still missing at least one of the count columns,
# so running this cell a second time does not merge the columns in again.
columns_not_in_main = set(episode_icds.columns).difference(main.columns)
if columns_not_in_main:
    main = main.merge(episode_icds, on="EPISODE_ID", how="inner")

In [35]:
categories = list(icd_ranges.keys())

# Co-occurrence counts: adjacency[c1, c2] is the number of rows of `main` in
# which both categories have a positive count. With a 0/1 presence matrix P
# (rows x categories) this is exactly P.T @ P, which replaces a Python-level
# loop over every row and every pair of present categories.
presence = (main[categories].to_numpy() > 0).astype(np.int64)
cooccurrence = presence.T @ presence

# The diagonal of P.T @ P counts each category with itself; the graph has no
# self-pairs, so zero it out.
diagonal = list(range(len(categories)))
cooccurrence[diagonal, diagonal] = 0

adjacency = pd.DataFrame(cooccurrence, columns=categories, index=categories)

In [36]:
# Show the matrix with categories ranked by their total co-occurrence count,
# largest first; ties stay in their existing column order.
column_totals = adjacency.sum()
order = sorted(adjacency.columns, key=column_totals.get, reverse=True)
adjacency.loc[order, order]

Unnamed: 0,contact_factors,circulatory,endocrine,respiratory,abnormal_findings,genitourinary,mental,digestive,muscoloskeletal,infectious,...,injury,external_causes,blood,skin,eye,pregnancy,special_use,ear,congenital,perinatal
contact_factors,0,494072,304144,281445,268434,223699,193679,188812,177958,87264,...,78594,65225,68321,52805,60532,86085,13967,15754,11681,7712
circulatory,494072,0,359321,280102,253795,224433,173700,183171,182089,86437,...,71115,62687,71891,55492,61897,954,15701,15808,6282,147
endocrine,304144,359321,0,176168,167702,156078,121817,119306,115331,64282,...,44307,41460,49541,40861,43466,7707,10780,9673,4676,275
respiratory,281445,280102,176168,0,145993,117020,133680,106156,103178,62777,...,39831,42401,41055,33420,26548,11389,9640,10212,6574,700
abnormal_findings,268434,253795,167702,145993,0,119213,134187,103348,98189,65962,...,43894,41771,39841,31567,25061,24521,8628,10317,6686,1450
genitourinary,223699,224433,156078,117020,119213,0,78511,73760,72639,74478,...,29904,28739,37478,27423,21271,8223,18078,6516,5152,69
mental,193679,173700,121817,133680,134187,78511,0,87807,69594,40612,...,52030,42314,26098,25385,16836,20147,6394,7000,4797,92
digestive,188812,183171,119306,106156,103348,73760,87807,0,67128,42941,...,21415,22226,40511,17704,13371,3659,5724,5103,4683,1020
muscoloskeletal,177958,182089,115331,103178,98189,72639,69594,67128,0,32554,...,31846,28365,27085,24644,21143,4003,5819,6699,3013,82
infectious,87264,86437,64282,62777,65962,74478,40612,42941,32554,0,...,18681,16990,19514,24045,7826,6814,23108,3289,3054,869


In [37]:
# Build a networkx graph from the category co-occurrence matrix.
G = nx.from_pandas_adjacency(adjacency)

# Write the graph as GML (relative path, so it lands in the working directory).
nx.write_gml(G, "categories.gml")

Create the graph image and tables with Gephi.