In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils_1_1

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
import datetime
import dateutil.parser
from os.path import join

from constants_1_1 import SITE_FILE_TYPES
from utils_1_1 import (
    get_site_file_paths,
    get_site_file_info,
    get_site_ids,
    get_visualization_subtitle,
    get_country_color_map,
)
from theme import apply_theme
from web import for_website

alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

In [None]:
# Data release tag passed to get_visualization_subtitle() by the plotting cells.
data_release = '2021-04-27'

# Lab distribution table. The CSV carries an unnamed first column, which is
# dropped, and the lab column is renamed from "nm.lab" to "lab".
df = pd.read_csv(join("..", "data", "Phase2.1SurvivalRSummariesPublic", "ToShare", "table.labdist.toShare.csv"))
df = df.drop(columns=['Unnamed: 0']).rename(columns={"nm.lab": "lab"})

# Scale named in the chart titles of the plotting cells below.
scale = 'log'

# NOTE(review): not read anywhere in this cell; the Violin cell reassigns it.
LOG_LABS = ['alanine aminotransferase (ALT)', 'aspartate aminotransferase (AST)', 'C-reactive protein (CRP) (Normal Sensitivity)', 'D-dimer', 'Ferritin', 'lactate dehydrogenase (LDH)']

# Lab identifier as it appears in the CSV -> display name with unit.
consistent_loinc = {
    "C_reactive_protein_CRP_Normal_Sensitivity": "C-reactive protein (Normal Sensitivity) (mg/dL)",
    "creatinine": "Creatinine (mg/dL)",
    "Ferritin": "Ferritin (ng/mL)",
    "D_dimer": "D-dimer (ng/mL)",
    "albumin": "Albumin (g/dL)",

    "Fibrinogen": "Fibrinogen (mg/dL)",
    "alanine_aminotransferase_ALT": "Alanine aminotransferase (U/L)",
    "aspartate_aminotransferase_AST": "Aspartate aminotransferase (U/L)",
    "total_bilirubin": "Total bilirubin (mg/dL)",
    "lactate_dehydrogenase_LDH": "Lactate dehydrogenase (U/L)",
    "cardiac_troponin_High_Sensitivity": "Cardiac troponin High Sensitivity (ng/mL)",
    "cardiac_troponin_Normal_Sensitivity": "Cardiac troponin Normal Sensitivity (ng/mL)",
    "prothrombin_time_PT": "Prothrombin time (s)",
    "white_blood_cell_count_Leukocytes": "White blood cell count (10*3/uL)",
    "lymphocyte_count": "Lymphocyte count (10*3/uL)",
    "neutrophil_count": "Neutrophil count (10*3/uL)",
    "procalcitonin": "Procalcitonin (ng/mL)",
}

colors = ['#E79F00', '#0072B2', '#D45E00', '#CB7AA7', '#029F73', '#57B4E9']

# Facet order used by the plotting cells (META first), with one color per site.
# An earlier, shorter pair of lists that was immediately overwritten has been removed.
sites = ['META', 'APHP', 'FRBDX', 'ICSM', 'UKFR', 'NWU', 'BIDMC', 'MGB', 'UCLA', 'UMICH', 'UPENN', 'UPITT', 'VA1', 'VA2', 'VA3', 'VA4', 'VA5']
site_colors = ['black', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#CB7AA7', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00','#D45E00','#D45E00']

# Normalize site ids to upper case and order rows alphabetically by site ...
df["siteid"] = df["siteid"].str.upper()
df = df.sort_values(by=['siteid'])

# ... then move the META rows to the front. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same row order and keeps the original index.
meta = df[df.siteid == 'META']
nometa = df[df.siteid != 'META']
df = pd.concat([meta, nometa])

# Direct dict indexing (instead of .map(dict)) so that an unmapped lab raises
# KeyError rather than silently turning into NaN.
df["lab"] = df["lab"].apply(lambda lab_id: consistent_loinc[lab_id])

# Rows with y == 0 are excluded from every chart.
df = df[df.y != 0]

unique_labs = df.lab.unique().tolist()
print(unique_labs)
print(df.siteid.unique().tolist())

df

# All Sites

In [None]:
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)'):
    """
    Build a histogram of one lab's distribution, faceted into one column per site.

    The 'early' and 'late' periods are drawn as overlaid (unstacked) bars at
    50% opacity. Reads the module-level `scale`, `sites` and `data_release`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `lab`, `scale`, `siteid`, `period`, `x` and `y` columns.
    lab : str
        Value of the `lab` column to plot.

    Returns
    -------
    The faceted Altair chart, titled with the lab name and the current scale.

    Raises
    ------
    ValueError
        If `df` is not provided.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame passed as `df`")

    # Keep only this lab, on the scale announced in the title. Without the
    # scale filter, rows from every scale would be binned into one histogram.
    d = df[(df.lab == lab) & (df.scale == scale)]

    plot = alt.Chart(
        d
    ).mark_bar(
        opacity=0.5
    ).encode(
        x=alt.X("x:Q", title=None, bin=alt.Bin(maxbins=20), axis=alt.Axis(labelAngle=0, tickCount=3), scale=alt.Scale(clamp=True)),
        # stack=None overlays the two periods instead of stacking them.
        y=alt.Y("sum(y):Q", title=None, scale=alt.Scale(zero=True), stack=None),
        color=alt.Color("period:N", title='Wave', scale=alt.Scale(domain=['early', 'late'], range=['#D45E00', '#0072B2'])),
    ).properties(
        width=150,
        height=150
    )

    # One column per site, in the order given by the module-level `sites` list.
    plot = plot.facet(
        column=alt.Column("siteid:N", header=alt.Header(title=None), sort=sites)
    ).resolve_scale(color='shared')

    plot = plot.properties(
        title={
            "text": [
                f"{lab} ({scale.capitalize()} Scale)"
            ],
            "dx": 30,
            "subtitle": [
                get_visualization_subtitle(data_release=data_release, with_num_sites=False)
            ],
            "subtitleColor": "gray",
        }
    )

    return plot

# Stack one per-site facet row for every lab, top to bottom.
plot = alt.vconcat(
    *[plot_lab(df=df, lab=lab_name) for lab_name in unique_labs],
    spacing=30,
)
plot = plot.resolve_scale(color='independent')

# Apply the shared project theme to the combined figure.
plot = apply_theme(
    plot,
    title_anchor='start',
    legend_orient='bottom',
    legend_title_orient='left',
    axis_y_title_font_size=16,
    axis_label_font_size=14,
    header_label_font_size=16,
    point_size=100,
)

plot

# Meta

In [None]:
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)'):
    """
    Build a histogram of one lab's distribution for the META site only.

    The 'early' and 'late' periods are drawn as overlaid (unstacked) bars at
    50% opacity. Reads the module-level `scale` and `sites`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `lab`, `scale`, `siteid`, `period`, `x` and `y` columns.
    lab : str
        Value of the `lab` column to plot; also used as the chart title.

    Returns
    -------
    The faceted Altair chart (facet header labels hidden).

    Raises
    ------
    ValueError
        If `df` is not provided.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame passed as `df`")

    # META rows for this lab, restricted to the current scale so that values
    # from different scales are not binned into the same histogram.
    d = df[(df.lab == lab) & (df.siteid == 'META') & (df.scale == scale)]

    plot = alt.Chart(
        d
    ).mark_bar(
        stroke='black',
        strokeWidth=0,
        opacity=0.5
    ).encode(
        x=alt.X("x:Q", title=None, bin=alt.Bin(maxbins=20), axis=alt.Axis(labelAngle=0, tickCount=0), scale=alt.Scale(clamp=True)),
        # stack=None overlays the two periods instead of stacking them.
        y=alt.Y("sum(y):Q", title=None, scale=alt.Scale(zero=True), stack=None),
        color=alt.Color("period:N", title='Wave', scale=alt.Scale(domain=['early', 'late'], range=['#D45E00', '#0072B2'])),
    ).properties(
        width=200,
        height=200
    )

    # Only META rows remain, so this facet has a single column; its header
    # label is hidden and the lab name is shown as the title instead.
    plot = plot.facet(
        column=alt.Column("siteid:N", header=alt.Header(title=None, labels=False),  sort=sites)
    ).resolve_scale(color='shared')

    plot = plot.properties(
        title={
            "text": f"{lab}",
            "fontSize": 13,
            "anchor": "middle"
        }
    )

    return plot

# Lay the per-lab META charts out on a grid with LABS_PER_ROW charts per row.
# Rows are derived from len(unique_labs), so every lab is included and no
# empty row is created, whatever the number of labs in the data.
LABS_PER_ROW = 6

rows = [
    alt.hconcat(
        *(plot_lab(df=df, lab=lab) for lab in unique_labs[start:start + LABS_PER_ROW]),
        spacing=30,
    ).resolve_scale(color='independent')
    for start in range(0, len(unique_labs), LABS_PER_ROW)
]

plot = alt.vconcat(*rows, spacing=30)

plot = plot.properties(title={
        "text": [
            f"Meta Analysis on Lab Distribution ({scale.capitalize()} Scale)"
        ],
        "dx": 30,
        "subtitle": [
            get_visualization_subtitle(data_release=data_release, with_num_sites=False)
        ],
        "subtitleColor": "gray",
    }
)


plot = apply_theme(
    plot,
    axis_y_title_font_size=16,
    title_anchor='start',
    legend_orient='bottom',
    legend_title_orient='left',
    axis_label_font_size=14,
    header_label_font_size=16,
    point_size=100
)

plot

# Violin

In [None]:
# Lab display names, for reference:
# 'Lymphocyte count (10*3/uL)', 'Creatinine (mg/dL)', 'D-dimer (ng/mL)', 'Prothrombin time (s)', 'Cardiac troponin Normal Sensitivity (ng/mL)', 'White blood cell count (10*3/uL)', 'Ferritin (ng/mL)', 'Cardiac troponin High Sensitivity (ng/mL)', 'Fibrinogen (mg/dL)', 'Aspartate aminotransferase (U/L)', 'Alanine aminotransferase (U/L)', 'Neutrophil count (10*3/uL)', 'Total bilirubin (mg/dL)', 'Lactate dehydrogenase (U/L)', 'Albumin (g/dL)', 'C-reactive protein (Normal Sensitivity) (mg/dL)'

# NOTE(review): LOG_LABS is not read by any live code in the visible cells.
LOG_LABS = [
    'Alanine aminotransferase (U/L)',
    'Aspartate aminotransferase (U/L)',
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'D-dimer (ng/mL)',
    'Ferritin (ng/mL)',
    'Lactate dehydrogenase (U/L)',
]

# Value of the `scale` column selected for the violin figure; also shown in its title.
scale = 'log'
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)'):
    """
    Build a violin-style chart of one lab's META distribution, one column per wave.

    The binned lab value runs along the y axis and the summed `y` is drawn as a
    centered area, giving a symmetric shape for each period. Reads the
    module-level `scale`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `lab`, `scale`, `siteid`, `period`, `x` and `y` columns;
        `period` is expected to hold 'early' / 'late'.
    lab : str
        Value of the `lab` column to plot. The ' (Normal Sensitivity) ' part
        is left out of the chart title.

    Returns
    -------
    The Altair chart faceted by period ('First' / 'Second').

    Raises
    ------
    ValueError
        If `df` is not provided.
    KeyError
        If a selected row has a `period` other than 'early' or 'late'.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame passed as `df`")

    period_labels = {'early': 'First', 'late': 'Second'}

    # META rows for this lab on the current scale.
    d = df[(df.lab == lab) & (df.siteid == 'META') & (df.scale == scale)]

    # assign() builds a new frame instead of writing into the filtered selection.
    d = d.assign(period=d.period.apply(lambda p: period_labels[p]))

    plot = alt.Chart(
        d
    ).mark_area(
        stroke='black',
        strokeWidth=1,
        opacity=1
    ).encode(
        y=alt.Y("x:Q", title=None, scale=alt.Scale(clamp=True), bin=alt.Bin(maxbins=30)),
        # stack='center' mirrors the area around the axis, which produces the violin shape.
        x=alt.X("sum(y):Q", stack='center', title=None, impute=None, scale=alt.Scale(zero=True), axis=alt.Axis(labels=False, values=[0],grid=False, ticks=True)),
        color=alt.Color("period:N", title='Wave', scale=alt.Scale(domain=['First', 'Second'], range=['#D45E00', '#0072B2']), legend=None),
    ).properties(
        width=70,
        height=200
    )

    # One column per period, placed side by side with no gap.
    plot = plot.facet(
        column=alt.Column("period:N", header=alt.Header(title=None, labels=True)),
        spacing=0
    ).resolve_scale(color='shared')

    plot = plot.properties(
        title={
            # Replace with a single space: replacing with '' also removed the
            # separators and produced e.g. "C-reactive protein(mg/dL)".
            "text": f"{lab.replace(' (Normal Sensitivity) ', ' ')}",
            "dx": 15,
            "fontSize": 13,
            "anchor": "middle"
        }
    )

    return plot

# Labs shown in the violin figure, in left-to-right order.
SELECTED_LABS = [
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'Albumin (g/dL)',
    'Total bilirubin (mg/dL)',
    'Creatinine (mg/dL)',
    'Ferritin (ng/mL)',
    'D-dimer (ng/mL)',
]

# One violin pair per selected lab, placed side by side.
plot = alt.hconcat(
    *[plot_lab(df=df, lab=selected_lab) for selected_lab in SELECTED_LABS],
    spacing=20,
)
plot = plot.resolve_scale(color='independent')

# Figure-level title, with the data-release subtitle underneath.
plot = plot.properties(
    title=dict(
        text=[f"Meta Analysis Of Lab Distribution ({scale.capitalize()} Scale)"],
        dx=33,
        subtitle=[get_visualization_subtitle(data_release=data_release, with_num_sites=False)],
        subtitleColor="gray",
    )
)

# Apply the shared project theme to the combined figure.
plot = apply_theme(
    plot,
    title_anchor='start',
    legend_orient='bottom',
    legend_title_orient='left',
    axis_y_title_font_size=16,
    axis_label_font_size=14,
    header_label_font_size=16,
)

plot