In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils_1_1

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
from os.path import join
import math

from constants_1_1 import SITE_FILE_TYPES
from utils_1_1 import (
    read_loinc_df,
    get_site_file_paths,
    get_site_file_info,
    get_site_ids,
    read_full_lab_df,
    get_visualization_subtitle,
    get_country_color_map,
    get_siteid_color_maps,
    apply_theme,
)
from web import for_website

alt.data_transformers.disable_max_rows() # Allow using rows more than 5000

In [None]:
# Load the full lab data frame through the project helper.
df = read_full_lab_df()

# List the site IDs present in the data. (A bare `df` used to sit above this
# line, but only a cell's last expression is rendered, so it displayed nothing.)
df['siteid'].unique().tolist()

In [None]:
# Build the LOINC reference table and a code -> display-label lookup.
loinc_df = read_loinc_df()
loinc_df['loinc_name'] = loinc_df['LOINC']

loinc_df = loinc_df.set_index('LOINC')

LOINC_NAMES = loinc_df["Name"].unique().tolist()
LOINC_IDS = loinc_df.index.values.tolist()
LOINC_UNITS = loinc_df['Units'].values.tolist()

# Pair each code with the name and unit from the SAME row. Indexing the
# de-duplicated LOINC_NAMES list by row position (as was done before) shifts
# labels, or raises IndexError, as soon as two codes share a name.
LOINC_MAP = {
    loinc_id: f'{loinc_id}, {name} ({units})'
    for loinc_id, name, units in zip(LOINC_IDS, loinc_df['Name'].tolist(), LOINC_UNITS)
}

# Replace the raw code stored in `loinc_name` with its display label.
loinc_df['loinc_name'] = loinc_df['loinc_name'].map(LOINC_MAP)

loinc_df

In [None]:
# loinc_df = loinc_df.reset_index()

# # Use consistent loinc names
# loinc_df['loinc_name'] = loinc_df['LOINC'].apply(lambda x: LOINC_MAP[x])

# # Add 'days_since_admission' 0~30
# loinc_df['days_since_admission'] = 0
# reference_df = None
# for i in range(31):
#     for severity in ["Ever Severe", "Never Severe"]:
#         next_df = loinc_df.copy()
#         next_df['days_since_admission'] = i
#         next_df['severity'] = severity
#         if reference_df is None:
#             reference_df = next_df
#         else:
#             reference_df = pd.concat([reference_df, next_df])

# reference_df

In [None]:
def process_labs_df(df, loinc_map=None):
    """Reshape the wide per-site lab table into a long table keyed by severity.

    The input carries statistics for all patients (``*_all``) and for the
    ever-severe subgroup (``*_ever_severe``). This function:

    1. keeps rows with ``0 <= days_since_admission <= 30``;
    2. replaces negative statistics and negative patient counts with NaN;
    3. derives the never-severe count, mean and standard deviation from the
       "all" and "ever severe" figures;
    4. melts the two count columns into ``severity`` / ``num_patients`` rows and
       picks the matching statistics for each row.

    Only ``mean_value`` and ``stdev_value`` are re-derived for the never-severe
    group; the ``*_log_*`` columns of "Never Severe" rows still hold the
    all-patient values, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``siteid``, ``loinc``, ``days_since_admission``, ``units``,
        ``country``, ``color``, the eight ``mean/stdev[_log]_value_{all,ever_severe}``
        columns and ``num_patients_{all,ever_severe}``. It is not modified.
    loinc_map : dict, optional
        Maps a LOINC code to its display label. Defaults to the notebook-level
        ``LOINC_MAP``.

    Returns
    -------
    pandas.DataFrame
        Two rows ("Never Severe", "Ever Severe") per retained input row, with
        columns ``mean_value``, ``stdev_value``, ``mean_log_value``,
        ``stdev_log_value``, ``severity`` and ``num_patients`` plus the
        identifying columns.

    Raises
    ------
    KeyError
        If a LOINC code (after removing spaces) is missing from ``loinc_map``.
    """
    if loinc_map is None:
        loinc_map = LOINC_MAP

    stat_names = ['mean_value', 'stdev_value', 'mean_log_value', 'stdev_log_value']
    all_cols = [f'{name}_all' for name in stat_names]
    severe_cols = [f'{name}_ever_severe' for name in stat_names]
    count_cols = ['num_patients_all', 'num_patients_ever_severe']

    # trim x domain; copy so the assignments below never write into a view of `df`.
    # The index is discarded by pd.melt further down, so resetting it is harmless.
    in_window = df['days_since_admission'].between(0, 30)
    df_lb = df[in_window].copy().reset_index(drop=True)

    # Negative values are treated as invalid and become NaN. The counts are
    # masked here as well: previously they were only masked after the pooled
    # arithmetic, so an invalid negative count leaked into the derived numbers.
    masked_cols = all_cols + severe_cols + count_cols
    df_lb[masked_cols] = df_lb[masked_cols].mask(df_lb[masked_cols] < 0)

    # Remove white space which shouldn't be provided
    df_lb['loinc'] = df_lb['loinc'].str.replace(' ', '', regex=False)

    # Add readable names for LOINC and units
    df_lb['loinc_name'] = df_lb['loinc'].map(lambda code: loinc_map[code])

    # Calculate never severe group and apply to '*_all' columns
    mu_all = df_lb['mean_value_all']
    sd_all = df_lb['stdev_value_all']
    n_all = df_lb['num_patients_all']
    mu_severe = df_lb['mean_value_ever_severe']
    sd_severe = df_lb['stdev_value_ever_severe']
    n_severe = df_lb['num_patients_ever_severe']

    n_never = n_all - n_severe

    # Mean of the remainder group; undefined when nobody is left in it.
    mu_never = ((mu_all * n_all - mu_severe * n_severe) / n_never).where(n_never > 0)

    # Sum of squares of a group is sd^2 * (n - 1) + n * mu^2. The never-severe
    # sum of squares is the difference of the two group totals; each total is
    # computed separately so the whole severe term is subtracted.
    sum_sq_all = sd_all**2 * (n_all - 1) + n_all * mu_all**2
    sum_sq_severe = sd_severe**2 * (n_severe - 1) + n_severe * mu_severe**2
    var_never = (
        (sum_sq_all - sum_sq_severe - n_never * mu_never**2) / (n_never - 1)
    ).where(n_never > 1)  # sample variance needs at least two patients

    # A negative variance can only come from inconsistent inputs -> NaN.
    sd_never = np.sqrt(var_never.where(var_never >= 0))

    df_lb['mean_value_all'] = mu_never.mask(mu_never < 0)
    df_lb['stdev_value_all'] = sd_never
    df_lb['num_patients_all'] = n_never.mask(n_never < 0)

    # We want to put a category for severity, instead of using wide columns
    df_lb = pd.melt(
        df_lb,
        id_vars=[
            'siteid', 'loinc', 'loinc_name', 'days_since_admission', 'units',
            *all_cols,
            *severe_cols,
            'country', 'color'
        ],
        value_vars=count_cols,
        var_name='severity',
        value_name='num_patients'
    )
    df_lb['severity'] = df_lb['severity'].map({
        'num_patients_all': 'Never Severe',
        'num_patients_ever_severe': 'Ever Severe',
    })

    # Use certain values based on the 'severity' column
    ever = (df_lb['severity'] == 'Ever Severe')
    for all_col, severe_col in zip(all_cols, severe_cols):
        df_lb[all_col] = df_lb[all_col].where(~ever, df_lb[severe_col])

    df_lb = (
        df_lb
        .rename(columns=dict(zip(all_cols, stat_names)))
        .drop(columns=severe_cols)
    )

    # TODO: Data filter for better performance

    return df_lb

In [None]:
# Reshape the raw lab frame into the long, one-row-per-severity layout.
pdf = process_labs_df(df)
# Leftover spot-check for a single site / day / lab, kept disabled:
# temp = pdf[pdf['siteid'] == 'UCLA']
# temp = temp[temp['days_since_admission'] == 0]
# temp[temp['loinc'] == '1742-6']
pdf.head()

In [None]:
# List the distinct lab display labels present after processing.
pdf['loinc_name'].unique().tolist()

In [None]:
# Read the meta-analysis table; comma-separated with a header row, which are
# the read_csv defaults.
cdf = pd.read_csv("../data/meta_lab.csv")
cdf

In [None]:
# Re-label the meta-analysis table so its columns and categories line up with
# the per-site frame; `cdf` itself is left untouched.
pcdf = (
    cdf.copy()
    # change column names
    .rename(columns={
        'Lab': 'loinc_name',
        'days_since_positive': 'days_since_admission',
        'mean': 'mean_value',
        'se': 'stdev_value',
        'total_n': 'num_patients',
    })
    # Mark as 'All Country'
    .assign(siteid='All Country', country='All Country')
)

# Use consistent category names: anything other than 'ever' counts as never severe.
pcdf['severity'] = pcdf['severity'].map(
    lambda label: 'Ever Severe' if label == 'ever' else 'Never Severe'
)

# Plain lab name -> "<LOINC code>, <name> (<unit>)" display label.
loinc_map = {
    'alanine aminotransferase (ALT)': '1742-6, alanine aminotransferase (ALT) (U/L)',
    'albumin': '1751-7, albumin (g/dL)',
    'aspartate aminotransferase (AST)': '1920-8, aspartate aminotransferase (AST) (U/L)',
    'C-reactive protein (CRP) (Normal Sensitivity)': '1988-5, C-reactive protein (CRP) (Normal Sensitivity) (mg/L)',
    'cardiac troponin (High Sensitivity)': '49563-0, cardiac troponin (High Sensitivity) (ng/mL)',
    'cardiac troponin (Normal Sensitivity)': '6598-7, cardiac troponin (Normal Sensitivity) (ug/L)',
    'creatinine': '2160-0, creatinine (mg/dL)',
    'D-dimer (DDU)': '48066-5, D-dimer (DDU) (ng/mL{DDU})',
    'D-dimer (FEU)': '48065-7, D-dimer (FEU) (ng/mL{FEU})',
    'Ferritin': '2276-4, Ferritin (ng/mL)',
    'Fibrinogen': '3255-7, Fibrinogen (mg/dL)',
    'lactate dehydrogenase (LDH)': '2532-0, lactate dehydrogenase (LDH) (U/L)',
    'lymphocyte count': '731-0, lymphocyte count (10*3/uL)',
    'neutrophil count': '751-8, neutrophil count (10*3/uL)',
    'procalcitonin': '33959-8, procalcitonin (ng/mL)',
    'prothrombin time (PT)': '5902-2, prothrombin time (PT) (s)',
    'total bilirubin': '1975-2, total bilirubin (mg/dL)',
    'white blood cell count (Leukocytes)': '6690-2, white blood cell count (Leukocytes) (10*3/uL)',
}
# A lab name missing from the table raises KeyError rather than passing through.
pcdf['loinc_name'] = pcdf['loinc_name'].map(lambda lab: loinc_map[lab])


pcdf

In [None]:
# Stack the meta-analysis rows (`pcdf`) on top of the per-site rows (`pdf`).
# NOTE: this rebinds `df`, which until now held the raw frame that was passed to
# process_labs_df; re-running that earlier cell after this one would feed it
# the combined frame instead.
df = pd.concat([pcdf, pdf])
df

In [None]:
# Add references
df = df.set_index('loinc_name').join(loinc_df.set_index('loinc_name')).reset_index()

df

In [None]:
# Country palette from the project helper, with the 'All Country' entry
# prepended and drawn in black.
COUNTRY_NAMES = ['All Country', *get_country_color_map().keys()]
COUNTRY_COLORS = ['#000000', *get_country_color_map().values()]
COUNTRY_COLOR_MAP = dict(zip(COUNTRY_NAMES, COUNTRY_COLORS))

COUNTRY_COLOR_MAP

In [None]:
# Site palette from the project helper, with the 'All Country' entry prepended
# and drawn in black.
SITE_NAMES = ['All Country', *get_siteid_color_maps().keys()]
SITE_COLORS = ['#000000', *get_siteid_color_maps().values()]
SITE_COLOR_MAP = dict(zip(SITE_NAMES, SITE_COLORS))

SITE_COLOR_MAP

In [None]:
def lab_plot(
    df=None,
    loinc=list(LOINC_MAP.values())[0], # ALT
    width=700, 
    height=400,
    bar_size=14,
    point_size=30,
    no_axis_title=False,
    no_legend=False, 
    legend_columns=None,
    y_domain_top=None, 
    y_domain_bottom=None
):
    """Build the three-panel "Lab Values By Site" Altair chart.

    The panels are stacked vertically and each is faceted into columns by
    ``severity``:

    * top    - rows whose ``country`` is 'All Country': points, lines and
      ``ci_95L``..``ci_95U`` error bars on a linear y scale;
    * middle - all other rows: per-site mean values as points and lines on a
      log y scale;
    * bottom - all other rows: stacked bars of summed ``num_patients``.

    Rows with ``mean_value`` <= 0 are filtered out of every panel (needed for
    the log scale). A dropdown bound to ``loinc_name`` selects the lab, the
    legend toggles sites, and a red rule follows the nearest
    ``days_since_admission`` under the mouse.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with at least ``siteid``, ``country``, ``severity``,
        ``loinc_name``, ``days_since_admission``, ``mean_value``,
        ``num_patients``, ``ci_95L`` and ``ci_95U``.
    loinc : str
        Display label selected initially in the lab dropdown.
    width : number
        Width of each facet column.
    height : number
        Height of the middle panel; the top panel uses half of it.
    bar_size, point_size : number
        Mark sizes for the bars and the circles.
    no_axis_title : bool
        Hide the y-axis titles of the middle and bottom panels.
    no_legend, legend_columns
        Accepted for interface compatibility; currently not used.
    y_domain_top : list, optional
        Fixed y domain of the top (meta-analysis) panel. Previously this
        argument was ignored and the top panel read ``y_domain_bottom``.
    y_domain_bottom : list, optional
        Fixed y domain of the middle (per-site) panel.

    Returns
    -------
    The concatenated Altair chart, or ``None`` (after printing a message) when
    ``df`` is ``None``.
    """
    if df is None:
        print('No Data Frame Suggested.')
        return

    # data field and title definition
    x_field = 'days_since_admission'
    mean_field = 'mean_value'
    num_pat_field = 'num_patients'

    x_field_title = 'Days since positive'
    mean_field_title = 'Mean value'

    site_color = alt.Color(
        "siteid:N",
        title='Site ID',
        scale=alt.Scale(domain=SITE_NAMES, range=SITE_COLORS)
    )

    # data selection
    base = (
        alt.Chart(df)
            # for log scale calculation
            .transform_filter(alt.datum[mean_field] > 0)
    )

    # Clicking legend entries filters the whole figure by site.
    legend_selection = alt.selection_multi(fields=['siteid'], bind="legend")

    # Dropdown that picks which lab is shown.
    input_dropdown = alt.binding_select(options=list(LOINC_MAP.values()))
    selection = alt.selection_single(
        fields=['loinc_name'], 
        bind=input_dropdown, 
        name='Lab ', 
        init={'loinc_name': loinc}
    )

    # Independent y zoom/pan for the top and the middle panel.
    y_zoom_top = alt.selection(type="interval", bind='scales', encodings=['y'])
    y_zoom_bottom = alt.selection(type="interval", bind='scales', encodings=['y'])

    # Vertical red rule at the day nearest to the mouse pointer.
    nearest = alt.selection(
        type="single", nearest=True, on="mouseover", fields=[x_field],
        empty='none', clear="mouseout", name="nearest_selector"
    )
    nearest_rule = alt.Chart(df).mark_rule(color="red").encode(
        x=f"{x_field}:Q",
        size=alt.value(0.5)
    ).transform_filter(
        nearest
    )

    tooltip = [
        alt.Tooltip('siteid', title="Site ID"),
        alt.Tooltip('country', title="Country"),
        alt.Tooltip(mean_field, title="Lab value", format=".2f"),
        alt.Tooltip(num_pat_field, title="Number of patients"),
        alt.Tooltip(x_field, title="Days since positive")
    ]

    # Same x axis for the top and the middle panel.
    x_axis = alt.Axis(grid=True, labels=True, ticks=True, domain=True, tickMinStep=1)

    """
    Meta-analysis
    """
    meta_y_scale = (
        alt.Scale(zero=False, domain=y_domain_top, type="linear")
            if y_domain_top is not None
            else alt.Scale(zero=False, type="linear")
    )

    meta_circle = (
        base
            .mark_circle(size=point_size, opacity=0.7)
            .encode(
                x=alt.X(
                    f"{x_field}:Q",
                    title=None,
                    scale=alt.Scale(zero=False, nice=False, padding=10),
                    axis=x_axis,
                ),
                y=alt.Y(
                    f"{mean_field}:Q",
                    title='Weighted mean (CI)',
                    scale=meta_y_scale,
                    axis=alt.Axis(format='r')
                ),
                color=site_color,
                tooltip=tooltip
            )
    )

    meta_line = meta_circle.mark_line(size=2, opacity=0.5)

    errorline = meta_circle.mark_errorbar().encode(
        y=alt.Y("ci_95L:Q", title=""),
        y2="ci_95U:Q",
        size=alt.value(1),
        opacity=alt.value(1)
    )

    meta_plot = (
        alt.layer(meta_circle, meta_line, errorline, nearest_rule)
            .properties(height=height/2.0, width=width)
            .add_selection(y_zoom_top)
    ).facet(
        column=alt.Column("severity:N", title=None), bounds="flush"
    ).transform_filter(alt.datum['country'] == 'All Country')

    """
    Lab mean values in line/dot plot
    """
    y_title = None if no_axis_title else mean_field_title
    lab_y_scale = (
        alt.Scale(zero=False, domain=y_domain_bottom, type="log")
            if y_domain_bottom is not None
            else alt.Scale(zero=False, type="log")
    )

    circle = (
        base
            .mark_circle(size=point_size, opacity=0.7)
            .encode(
                x=alt.X(
                    f"{x_field}:Q",
                    title=None,
                    scale=alt.Scale(
                        zero=False, nice=False, padding=10,
                        domain=[0,30]
                    ),
                    axis=x_axis,
                ),
                y=alt.Y(
                    f"{mean_field}:Q",
                    title=y_title,
                    scale=lab_y_scale,
                    axis=alt.Axis(format='r')
                ),
                color=site_color,
                tooltip=tooltip
            )
    )

    line = circle.mark_line(size=2, opacity=0.5)

    lab_value_plot = (
        alt.layer(circle, line, nearest_rule)
            .properties(height=height, width=width)
            .add_selection(y_zoom_bottom)
    ).facet(
        column=alt.Column("severity:N", title=None), bounds="flush"
    ).transform_filter(alt.datum['country'] != 'All Country')

    """
    # of patients in bar charts
    """
    num_pat_field_title = None if no_axis_title else "# of patients"

    bar = base.mark_bar(size=bar_size).encode(
        y=alt.Y(
            f"sum({num_pat_field}):Q", 
            title=num_pat_field_title,
            axis=alt.Axis(
                format='r',
                tickMinStep=1
            ),
            scale=alt.Scale(zero=True, padding=0, nice=True)
        ),
        x=alt.X(
            f"{x_field}:Q",
            title=x_field_title,
            scale=alt.Scale(zero=False, nice=False, padding=10),
            axis=alt.Axis(
                grid=True,
                labels=True,
                ticks=True,
                domain=True
            )
        ),
        color=site_color,
        # Stack the bar segments in country order.
        order=alt.Order(
            'country',
            sort='ascending'
        ),
        tooltip=tooltip
    )

    middle_chart = (
        (bar + nearest_rule)
            .properties(height=100, width=width)
    ).facet(
        column=alt.Column("severity:N", title=None, header=alt.Header(labels=False)), bounds="flush"
    ).transform_filter(alt.datum['country'] != 'All Country')

    result_vis = (
        alt.vconcat(
            meta_plot,
            lab_value_plot, 
            middle_chart,
            spacing=5
        ).properties(
            title={
                "text": "Lab Values By Site",
                'subtitle': get_visualization_subtitle(data_release='2020-06-19'),
                'subtitleColor': 'gray'
            }
        ).resolve_scale(
            y="independent", 
            x="shared",
            color="shared"
        )
        .add_selection(selection)
        .add_selection(nearest)
        .transform_filter(selection)
        .add_selection(legend_selection)
        .transform_filter(legend_selection)
    )

    return result_vis

In [None]:
# Build the chart at a narrower facet width and apply the project theme.
final = apply_theme(
    lab_plot(df=df, width=500),
    header_label_font_size=16,
    legend_orient='left',
)

# Pass the themed chart to the project's website helper, then render it inline.
for_website(final, "lab1.1", "lab1.1")

final