In [146]:
import sys
# Add the parent directory (relative to the working directory) to the module
# search path. Presumably for project-local imports; none are used in the
# cells visible here -- TODO confirm this is still needed.
sys.path.append('../')
import pandas as pd

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole kernel session. NOTE(review): this hides the warning globally.
pd.options.mode.chained_assignment = None

In [147]:
# Root directory that the data files below are resolved against.
GLOBAL_DATA_PATH = './Data'

# Pickle (.pkl) files stored under the data directory.
MULTI_VISIT_DATA_PATH = f'{GLOBAL_DATA_PATH}/multi_visit_temporal.pkl'
SINGLE_VISIT_DIAG_DATA_PATH = f'{GLOBAL_DATA_PATH}/single_visit_diag.pkl'
SINGLE_VISIT_MED_DATA_PATH = f'{GLOBAL_DATA_PATH}/single_visit_med.pkl'

# Bare file names: unlike the other paths these are NOT prefixed with
# GLOBAL_DATA_PATH.
UNIQUE_ATC_CSV = 'unique-atc4.csv'
UNIQUE_ICD_CSV = 'unique-icd.csv'

# CSV tables stored under the data directory.
ADMISSIONS_CSV = f'{GLOBAL_DATA_PATH}/ADMISSIONS.csv'
DIAGNOSES_CSV = f'{GLOBAL_DATA_PATH}/DIAGNOSES_ICD.csv'
MEDICATIONS_CSV = f'{GLOBAL_DATA_PATH}/PRESCRIPTIONS.csv'

# Column names used to index the data frames.
SUBJECT_ID = 'SUBJECT_ID'
HADM_ID = 'HADM_ID'
ICD9 = 'ICD9_CODE'
NDC = 'NDC'
ATC4 = 'ATC4'

In [148]:
def get_average_max(data: pd.DataFrame):
    """Compute per-patient diagnosis, medication and visit statistics.

    Rows are grouped by the ``SUBJECT_ID`` column. For every patient the
    distinct diagnosis codes (``'ICD9_CODE'`` column) and distinct medication
    codes (``ATC4`` column) over all of that patient's rows are collected, and
    each row is counted as one visit.

    Each cell of the two code columns must be an iterable of iterables of
    codes (for example a list of code lists). A flat list of strings would be
    split into single characters, because every inner element is itself
    iterated.

    Args:
        data: Frame containing the ``SUBJECT_ID``, ``'ICD9_CODE'`` and
            ``ATC4`` columns.

    Returns:
        tuple: ``(avg_diag, avg_meds, avg_visits, max_diag, max_meds,
        max_visits)`` where

        * ``avg_diag`` / ``avg_meds`` -- sum of the per-patient distinct code
          counts divided by the total number of rows,
        * ``avg_visits`` -- total number of rows divided by the number of
          distinct subject ids,
        * ``max_diag`` / ``max_meds`` -- largest per-patient distinct code
          count,
        * ``max_visits`` -- largest number of rows of any single patient.

        When there is nothing to divide by (no rows / no patients) the
        affected averages are ``0.0`` instead of raising ``ZeroDivisionError``.

    NOTE(review): the code averages divide a per-patient total by the number
    of rows, not by the number of patients -- confirm this is intended.
    """
    # Same string as the module-level ICD9 constant; kept as a literal so this
    # function depends on no additional global name.
    diag_col = 'ICD9_CODE'

    # unique() keeps a missing subject id as one entry, while groupby() below
    # skips rows with a missing key -- the same rows the equality filter
    # `data[col] == subject_id` could never match.
    num_patients = len(data[SUBJECT_ID].unique())

    num_diag = num_meds = num_visits = 0
    max_diag = max_meds = max_visits = 0

    # A single grouping pass replaces re-filtering the whole frame once per
    # patient, which scaled with patients x rows.
    for _, patient_rows in data.groupby(SUBJECT_ID, sort=False):
        diag_codes = set()
        med_codes = set()

        for code_groups in patient_rows[diag_col]:
            for codes in code_groups:
                diag_codes.update(codes)

        for code_groups in patient_rows[ATC4]:
            for codes in code_groups:
                med_codes.update(codes)

        visit_cnt = len(patient_rows)

        num_diag += len(diag_codes)
        num_meds += len(med_codes)
        num_visits += visit_cnt

        max_diag = max(max_diag, len(diag_codes))
        max_meds = max(max_meds, len(med_codes))
        max_visits = max(max_visits, visit_cnt)

    # Guard the divisions so an empty frame yields zeros instead of crashing.
    avg_diag = num_diag / num_visits if num_visits else 0.0
    avg_meds = num_meds / num_visits if num_visits else 0.0
    avg_visits = num_visits / num_patients if num_patients else 0.0

    return avg_diag, avg_meds, avg_visits, max_diag, max_meds, max_visits

In [163]:
def get_unique_values(column):
    """Return the set of distinct items nested two levels deep in ``column``.

    ``column`` is iterated, every element of it is iterated in turn, and each
    of those inner elements is expanded item by item into the result.

    Note that an inner element that is a plain string is expanded into its
    individual characters.

    Args:
        column: Iterable of iterables of iterables of hashable items.

    Returns:
        set: All distinct innermost items.
    """
    return {
        value
        for row in column
        for group in row
        for value in group
    }

In [166]:
def statistics(data: pd.DataFrame):
    """Print summary statistics of a patient visit table to stdout.

    Reports the number of distinct subject ids, the number of rows, the number
    of distinct diagnosis and medication codes, and the averages / maxima
    computed by ``get_average_max``.

    Args:
        data: Frame containing the ``SUBJECT_ID``, ``ICD9`` and ``ATC4``
            columns in the layout expected by ``get_average_max`` and
            ``get_unique_values``.

    Returns:
        None. The results are only printed.
    """
    avg_diag, avg_meds, avg_visits, max_diag, max_meds, max_visits = get_average_max(data)

    # Print the count itself; `.unique().shape` rendered as a tuple, e.g. "(6290,)".
    print('Number of Patients: ', len(data[SUBJECT_ID].unique()))
    print('Number of Clinical Events ', len(data))

    diagnoses = data[ICD9].values
    meds = data[ATC4].values

    unique_diagnoses = get_unique_values(diagnoses)
    unique_meds = get_unique_values(meds)

    print('Number of Unique Diagnoses: ', len(unique_diagnoses))
    print('Number of Unique Medications ', len(unique_meds))

    print('Average Diagnoses: ', avg_diag)
    print('Average Medications: ', avg_meds)
    print('Average Visits: ', avg_visits)

    print('Max Diagnoses: ', max_diag)
    print('Max Medications: ', max_meds)
    print('Max Visits: ', max_visits)




In [167]:
# Load the pickled table at MULTI_VISIT_DATA_PATH and print its summary.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
data_pd = pd.read_pickle(MULTI_VISIT_DATA_PATH)
statistics(data_pd)

Number of Patients:  (6290,)
Number of Clinical Events  10380
Number of Unique Diagnoses:  4162
Number of Unique Medications  374
Average Diagnoses:  10.9368978805395
Average Medications:  17.745472061657033
Average Visits:  1.6502384737678855
Max Diagnoses:  140
Max Medications:  134
Max Visits:  40
