In [1]:
import bisect
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Locations of the three input tables: admissions, icd9, d_patients.
# All of them live in the same directory next to the notebook.
mimic_dir = os.path.join('.', 'page 19 major MIMIC 2 clinical database component')
admissions_path = os.path.join(mimic_dir, 'admissions.txt')
icd9_path = os.path.join(mimic_dir, 'icd9.txt')
d_patients_path = os.path.join(mimic_dir, 'd_patients.txt')

In [3]:
# Load the three pipe-delimited tables. read_csv already returns a DataFrame,
# so wrapping its result in pd.DataFrame(...) is unnecessary.
admissions_df = pd.read_csv(admissions_path, sep='|')
# Force the ICD-9 `code` column to text: the codes are compared as strings
# when they are classified, and numeric type inference would turn e.g.
# "038.9" into 38.9 (dropping the leading zero) or yield mixed str/float values.
icd9_df = pd.read_csv(icd9_path, sep='|', dtype={'code': str})
d_patients_df = pd.read_csv(d_patients_path, sep='|')

FileNotFoundError: File b'./page 19 major MIMIC 2 clinical database component/admissions.txt' does not exist

In [None]:
# ICD-9 code ranges -> class description.
# Each key is a (lower bound, upper bound) tuple, or a 1-tuple for a single
# three-digit category. Only the lower bound (bounds[0]) is used for lookups.
icd9_class = {
    ("001", "139"): "infectious and parasitic diseases",
    ("140", "239"): "neoplasms",
    ("240", "279"): "metabolic diseases",
    ("280", "289"): "diseases of the blood and blood-forming organs",
    ("290", "319"): "mental disorders",
    ("320", "389"): "neurologic disease",
    ("390", "392"): "acute rheumatic fever",
    ("393", "398"): "chronic rheumatic heart disease",
    ("401", "405"): "hypertensive disease",
    ("410", "414"): "ischemic heart disease",
    ("415", "417"): "diseases of pulmonary circulation",
    ("428", "428"): "heart failure",
    ("420", "429"): "other forms of heart disease",
    ("430", "438"): "cerebrovascular disease",
    ("440", "459"): "arteries and veins",
    ("460", "519"): "pulmonary disease",
    ("520", "579"): "digestive disease",
    ("580", "629"): "renal insufficiency",
    ("630", "677"): "Complications of pregnancy, childbirth, and the puerperium",
    ("680", "709"): "diseases of the skin and subcutaneous tissue",
    ("710", "739"): "diseases of the musculoskeletal system & connective tissue",
    ("740", "759"): "congenital anomalies",
    ("780", "799"): "symptoms, signs, and ill-defined conditions",
    ("800", "959"): "trauma",
    ("960", "989"): "poisoning",
    ("990", "995"): "other and unspecified effects of external causes",
    ("996",): "complications peculiar to certain specified procedures",
    ("997",): "complications affecting specified body systems, not elsewhere classified",
    ("998",): "other complications of procedures, NEC",
    ("999",): "complications of medical care, not elsewhere classified",
    ("E800", "E999"): "supplementary classification of external causes of injury and poisoning",
    ("V81", "V86"): "supplementary classification of factors influencing health status and contact with health services",
}

# Lower bounds in declaration order, and lower bound -> description.
# `lbs` deliberately keeps the declaration order ("428" comes before "420"),
# because a later cell numbers the classes by their position in this list.
lbs = [bounds[0] for bounds in icd9_class]
lb_desc = {bounds[0]: description for bounds, description in icd9_class.items()}


def find_largest_lb(code, sorted_lbs=sorted(lbs)):
    """Return the largest lower bound that is less than or equal to ``code``.

    Parameters
    ----------
    code : str
        ICD-9 code, e.g. ``"428.0"``. It is compared with the lower bounds
        using ordinary string ordering.
    sorted_lbs : sequence of str, optional
        Lower bounds in ascending order. Defaults to a sorted snapshot of
        ``lbs`` taken when the function is defined; ``lbs`` itself is not
        sorted, and a binary search over unsorted data gives wrong answers
        (previously "428.0" resolved to "420" instead of "428").

    Returns
    -------
    str or None
        The matching lower bound. A code equal to a lower bound matches that
        bound. ``None`` is returned when ``code`` sorts before every lower
        bound (or ``sorted_lbs`` is empty) instead of silently wrapping
        around to the last entry.

    Notes
    -----
    Upper bounds are not consulted: a code that falls in a gap between two
    declared ranges is assigned to the closest range below it.
    """
    # bisect_right gives the insertion point after any entry equal to `code`,
    # so the element just before it is the largest bound <= code.
    position = bisect.bisect_right(sorted_lbs, code)
    if position == 0:
        return None
    return sorted_lbs[position - 1]

In [None]:
# Peek at the first five lower bounds.
lbs[:5]

In [None]:
# Preview the first rows of d_patients_df.
d_patients_df.head()

In [None]:
# Preview the first rows of admissions_df.
admissions_df.head()

In [None]:
# Select repeated admissions: keep every row whose subject_id appears
# more than once in admissions_df (keep=False marks all occurrences).
has_multiple_admissions = admissions_df.duplicated(subset=["subject_id"], keep=False)
ad_dup = admissions_df[has_multiple_admissions]
ad_dup.head()

In [None]:
# Preview the first rows of icd9_df.
icd9_df.head()

In [None]:
# Disease classification: map every ICD-9 code to the lower bound of its class.
main_codes = icd9_df["code"].map(find_largest_lb)
icd9_df["main_code"] = main_codes

In [None]:
# Preview icd9_df again, now with the main_code column.
icd9_df.head()

In [None]:
# Primary condition of each admission: the most frequent main_code among the
# rows sharing a hadm_id, written back onto every row of that admission.
codes_by_admission = icd9_df.groupby(['hadm_id'])["main_code"]
icd9_df["primary"] = codes_by_admission.transform(
    lambda codes: codes.value_counts().index[0]
)

In [None]:
# Preview the first ten rows of icd9_df, now with the primary column.
icd9_df.head(10)

In [None]:
# Collapse the diagnosis rows to distinct (subject_id, hadm_id, primary) combinations.
primary_columns = ["subject_id", "hadm_id", "primary"]
icd9_primary = icd9_df[primary_columns].drop_duplicates()
icd9_primary.head()

In [None]:
# Attach the primary class to each repeated admission
# (inner join on patient and admission ids, pandas' default merge type).
df = ad_dup.merge(icd9_primary, on=['subject_id', 'hadm_id'])
df.head()

In [None]:
# Add the per-patient columns from d_patients_df.
# NOTE: this cell rebinds `df` from its own previous value, so running it a
# second time merges d_patients_df in again; re-run the previous cell first.
df = df.merge(d_patients_df, on=['subject_id'])
df.head()

In [None]:
# Age helpers: turn timestamp strings into fractional years so that two of
# them can be subtracted to obtain an age.
# `time` is no longer used by float_year; the import is kept in case other
# cells rely on it.
import time

DT_FORMAT = '%Y-%m-%d %H:%M:%S'

# Fixed reference instant and year length for float_year. Using 365.25 days
# accounts for leap years, so differences spanning decades are not overstated.
_EPOCH = datetime(1970, 1, 1)
_SECONDS_PER_YEAR = 365.25 * 86400


def float_year(time_: str) -> float:
    """Convert a timestamp string to fractional years since 1970-01-01 00:00:00.

    Parameters
    ----------
    time_ : str
        Timestamp formatted according to ``DT_FORMAT``.

    Returns
    -------
    float
        Elapsed time in years of 365.25 days; negative for earlier timestamps.
        Only differences between two results are meaningful as durations.

    Raises
    ------
    ValueError
        If ``time_`` does not match ``DT_FORMAT``.

    Notes
    -----
    The arithmetic uses naive ``datetime`` values rather than ``time.mktime``,
    so the result does not depend on the local time zone or daylight saving
    time, and it is not limited to the range the platform's C library accepts.
    """
    moment = datetime.strptime(time_, DT_FORMAT)
    return (moment - _EPOCH).total_seconds() / _SECONDS_PER_YEAR

In [None]:
# Keep only the columns needed to derive age and diagnosis class.
# .copy() makes df_selected an independent frame, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning.
df_selected = df[['subject_id', 'admit_dt', 'primary', 'sex', 'dob']].copy()
df_selected.head()

In [None]:
# Age at admission, in years: admission time minus date of birth.
admit_years = df_selected['admit_dt'].map(float_year)
birth_years = df_selected['dob'].map(float_year)
df_selected['age'] = admit_years - birth_years

In [None]:
# Drop the raw timestamp columns now that age has been derived.
# .copy() makes df_selected2 an independent frame, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning.
df_selected2 = df_selected[['subject_id', 'sex', 'age', 'primary']].copy()
df_selected2.head()

In [None]:
# Give each class an integer id: the position of its lower bound in `lbs`.
primary_id = {bound: position for position, bound in enumerate(lbs)}

In [None]:
# Translate each primary lower bound into its integer class id.
primary_ids = df_selected2['primary'].map(primary_id)
df_selected2['primary_id'] = primary_ids
df_selected2.head()

In [None]:
# Final export layout: the class id replaces the lower-bound string.
export_columns = ['subject_id', 'sex', 'age', 'primary_id']
df_selected3 = df_selected2[export_columns]
df_selected3.head()

In [None]:
# Write the table to tpp.csv without the row index.
df_selected3.to_csv('tpp.csv', index=False)

In [None]:
# Show the lower bound -> class id mapping.
primary_id

In [None]:
# Show the lower bound -> description mapping.
lb_desc

In [None]:
# Flatten primary_id into three parallel lists (same order as the mapping):
# class id, lower bound, and description.
lower_bound = list(primary_id)
index = [primary_id[bound] for bound in lower_bound]
desc = [lb_desc[bound] for bound in lower_bound]

In [None]:
# Lookup table with one row per class: id, lower bound, description.
df_dict = {'id': index, 'lb': lower_bound, 'desc': desc}
disease = pd.DataFrame(df_dict)
disease.head()

In [None]:
# Write the class lookup table to disease.csv without the row index.
disease.to_csv("disease.csv", index=False)

In [None]:
# Largest class id in the lookup table.
disease['id'].max()