In [1]:
%load_ext autoreload 
%autoreload 2
import pandas as pd
import os

from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_helper_fs import get_coocurring_symptoms_codes, get_coocurring_symptoms_kws

from ipv_codes import NHAS_IPV_CODES, OREGON_IPV_CODES, USED_IPV_CODES, ICD10_IPV_CODES
from ipv_codes import KW_SETS, CODE_SETS

# Notebook display settings.
# Option names are spelled out in full: pandas treats the name as a pattern,
# and a bare 'max_rows' is ambiguous on pandas versions that also define
# 'styler.render.max_rows' (it raises OptionError there).
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)

# Comparison of IPV coding strategies

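Looks at three ICD-9 code sets, one ICD-10 code set, and several keyword sets for identifying IPV patients. Estimates the prevalence of each and compares the co-occurring diagnoses for each code set. The CPT code strategy is covered in a different notebook.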

In [2]:
def column_names_to_lower(df):
    """Lowercase all column labels of ``df`` and return the frame.

    The frame is modified in place (its ``columns`` index is reassigned) and
    the very same object is returned, so the result can be re-assigned by the
    caller without creating a copy.
    """
    lowercase_labels = df.columns.str.lower()
    df.columns = lowercase_labels
    return df

# Directory holding the MIMIC-IV `hosp` tables. The default is the original
# author's local path; set the MIMIC_HOSP_PATH environment variable to point
# at your own copy instead of editing this cell.
# os.path.join(..., '') guarantees a trailing separator, so any code that does
# `data_path + filename` keeps working even if the override omits it.
data_path = os.path.join(
    os.environ.get('MIMIC_HOSP_PATH',
                   '/home/t-dshanmugam/physionet.org/files/mimiciv/1.0/hosp/'),
    '')

# Convert all filenames to lowercase for compatibility.
# NOTE: this renames files on disk. Names that are already lowercase are
# skipped, so re-running the cell does no further filesystem work.
files = os.listdir(data_path)
for file in files:
    lowered_name = file.lower()
    if lowered_name != file:
        os.rename(os.path.join(data_path, file),
                  os.path.join(data_path, lowered_name))

# ICD code dictionary (code -> long title) and the per-row diagnoses table.
english_names = pd.read_csv(os.path.join(data_path, 'd_icd_diagnoses.csv.gz'))
diagnoses = pd.read_csv(os.path.join(data_path, 'diagnoses_icd.csv.gz'))

english_names = column_names_to_lower(english_names)
diagnoses = column_names_to_lower(diagnoses)

# Attach the dictionary columns to every diagnosis row. The inner join drops
# diagnoses whose (icd_code, icd_version) is missing from the dictionary, and
# validate='many_to_one' makes pandas raise if the dictionary has duplicate keys.
old_len = len(diagnoses)
diagnoses = pd.merge(diagnoses,
                     english_names,
                     how='inner',
                     on=['icd_code', 'icd_version'],
                     validate='many_to_one')
# Report how many rows the inner join discarded, if any.
if old_len != len(diagnoses):
    print("Warning: not all diagnoses are in ICD table: old len %i, new len %i (%2.3f%%)" %
        (old_len, len(diagnoses), 100*len(diagnoses)/old_len))

# Estimate p(y) for a code set

In [3]:
id_type = 'subject_id'

# Denominator of p(y): number of distinct `id_type` values in the diagnoses
# table. It does not depend on the code/keyword set, so compute it once
# instead of rebuilding the set on every loop iteration.
n_total_patients = len(set(diagnoses[id_type]))

# Prevalence for each ICD code set; CODE_SETS yields (name, codes) pairs.
for ipv_code_name, ipv_codes in CODE_SETS:
    n_patients = len(get_ids_with_icd_codes(diagnoses, id_type, ipv_codes))
    p_y = n_patients / n_total_patients
    print("# IPV patients using " +  ipv_code_name + " codes: ", n_patients, '\t p(y): ', p_y)

# Prevalence for each keyword set; each kw_set is a collection of keyword strings.
for kw_set in KW_SETS:
    n_patients = len(get_ids_with_kws(diagnoses, id_type, kw_set))
    p_y = n_patients / n_total_patients
    print("# IPV patients w/ kwds [" + ".".join(kw_set) + "]: ", n_patients, '\t p(y): ', p_y)

# 5 patients were admitted twice for IPV under NHAS Study codes
# 13 patients were admitted twice for IPV under Oregon Study codes

# IPV patients using NHAS Study codes:  918 	 p(y):  0.0035985041512155732
# IPV patients using Oregon Study codes:  1369 	 p(y):  0.005366396713523006
# IPV patients using US ED Study codes:  122 	 p(y):  0.0004782325778303921
# IPV patients using ICD10 Codes codes:  95 	 p(y):  0.00037239422044169876
# IPV patients w/ kwds [adult physical abuse.adult abuse]:  328 	 p(y):  0.0012857400453144968
# IPV patients w/ kwds [adult physical abuse.adult abuse.assault]:  1810 	 p(y):  0.007095089884204997
# IPV patients w/ kwds [adult physical abuse.adult abuse.maltreatment]:  361 	 p(y):  0.0014150980376784552
# IPV patients w/ kwds [partner]:  179 	 p(y):  0.0007016691100954113
# IPV patients w/ kwds [abuse]:  19968 	 p(y):  0.07827334519768253


# Co-occurring symptoms for each code set

In [4]:
# Co-occurring diagnoses for the NHAS_IPV_CODES code set. The helper prints the
# summary table shown in the cell output and returns two values, unpacked here
# as ids / sub_d (presumably the matched ids and their diagnosis rows; verify
# in mimic_helper_fs).
ids, sub_d = get_coocurring_symptoms_codes(diagnoses, id_type=id_type, codes=NHAS_IPV_CODES)

# Codes: 11, subject_id. Total IDs: 918; total diagnoses: 70776
                                                                                  # rows  # rows/# IDs
Depressive disorder, not elsewhere classified                                       1452      1.581699
Suicidal ideation                                                                   1356      1.477124
Unspecified essential hypertension                                                  1313      1.430283
Posttraumatic stress disorder                                                       1232      1.342048
Tobacco use disorder                                                                 952      1.037037
Anxiety state, unspecified                                                           934      1.017429
Esophageal reflux                                                                    855      0.931373
History of physical abuse                                                            830      0.904139
Asthma, u

In [5]:
# Co-occurring diagnoses for the OREGON_IPV_CODES code set.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. it
# was not run to completion; re-run before relying on ids / sub_d from here.
ids, sub_d = get_coocurring_symptoms_codes(diagnoses, id_type=id_type, codes=OREGON_IPV_CODES)

KeyboardInterrupt: 

In [None]:
# Co-occurring diagnoses for the USED_IPV_CODES code set (overwrites ids / sub_d).
ids, sub_d = get_coocurring_symptoms_codes(diagnoses, id_type=id_type, codes=USED_IPV_CODES)

In [None]:
# Co-occurring diagnoses for the ICD10_IPV_CODES code set (overwrites ids / sub_d).
ids, sub_d = get_coocurring_symptoms_codes(diagnoses, id_type=id_type, codes=ICD10_IPV_CODES)

In [None]:
# Keyword-based variant: co-occurring diagnoses for the given keyword query
# instead of an explicit ICD code list (overwrites ids / sub_d).
ids, sub_d = get_coocurring_symptoms_kws(diagnoses, id_type=id_type, query=['adult physical abuse', 'adult abuse'])

In [None]:
# Same keyword-based analysis with 'assault' added to the query
# (overwrites ids / sub_d).
ids, sub_d = get_coocurring_symptoms_kws(diagnoses, id_type=id_type, query=['adult physical abuse', 'adult abuse', 'assault'])

#  Individual prevalence of all IPV-related codes 

(any code, across all related work)

In [6]:
from ipv_codes import IPV_RELATED_CODES, IPV_RELATED_KWS

In [7]:
from tqdm.notebook import tqdm
from mimic_helper_fs import get_icd_code_long_title
# Denominator shared by every prevalence below: number of distinct `id_type`
# values in the diagnoses table. Computed once rather than once per
# keyword/code, since it does not change inside the loops.
n_total_patients = len(set(diagnoses[id_type]))

# One record per keyword / code; the DataFrame is built once from the list.
prevalence_dicts = []

# Single-keyword prevalence: each keyword is queried on its own.
for kw in tqdm(IPV_RELATED_KWS):
    descr = "Contains: " + kw
    n_patients = len(get_ids_with_kws(diagnoses, id_type, [kw]))
    p_y = n_patients / n_total_patients
    prevalence_dicts.append({'code': kw, 'long_title': descr, 'prevalence': p_y,
                             'count': n_patients})

# Single-code prevalence: each ICD code is queried on its own, labelled with
# the long title looked up in the ICD dictionary table.
for code in tqdm(IPV_RELATED_CODES):
    descr = get_icd_code_long_title(english_names, code)
    n_patients = len(get_ids_with_icd_codes(diagnoses, id_type, [code]))
    p_y = n_patients / n_total_patients
    prevalence_dicts.append({'code': code, 'long_title': descr, 'prevalence': p_y,
                             'count': n_patients})

prevalence_df = pd.DataFrame(prevalence_dicts)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

In [8]:
# Display the keywords/codes ordered from most to least prevalent
# (returns a sorted copy; prevalence_df itself is left unchanged).
prevalence_df.sort_values('prevalence', ascending=False)

Unnamed: 0,code,long_title,prevalence,count
5,abuse,Contains: abuse,0.078273,19968
4,assault,Contains: assault,0.006084,1552
10,V1541,History of physical abuse,0.002571,656
23,E9600,Unarmed fight or brawl,0.001541,393
3,adult abuse,Contains: adult abuse,0.001121,286
6,E9689,Assault by unspecified means,0.001058,270
28,E966,Assault by cutting and piercing instrument,0.000851,217
27,E9688,Assault by other specified means,0.000847,216
0,partner,Contains: partner,0.000702,179
18,E9682,Assault by striking by blunt or thrown object,0.00051,130


## Counts of subtypes of ICD-10  codes

In [None]:
# Count diagnosis rows per long title among codes that begin with 'T74'.
is_t74_code = diagnoses['icd_code'].map(lambda code: code.startswith('T74'))
diagnoses.loc[is_t74_code, 'long_title'].value_counts()

In [None]:
# Count diagnosis rows per long title among codes that begin with 'T76'.
is_t76_code = diagnoses['icd_code'].map(lambda code: code.startswith('T76'))
diagnoses.loc[is_t76_code, 'long_title'].value_counts()