In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils_1_1

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
import datetime
import dateutil.parser
from os.path import join

from constants_1_1 import SITE_FILE_TYPES
from utils_1_1 import (
    get_site_file_paths,
    get_site_file_info,
    get_site_ids,
    get_visualization_subtitle,
    get_country_color_map,
)
from theme import apply_theme
from web import for_website

alt.data_transformers.disable_max_rows(); # Lift Altair's default 5000-row limit so larger tables can be passed to charts

In [None]:
# Data release date; passed to get_visualization_subtitle() when the charts are titled.
data_release='2021-05-12'
# Maps the raw lab identifiers found in the CSVs' `nm.lab` column to the
# display labels (name + unit) used everywhere else in this notebook.
consistent_loinc = {
    "C_reactive_protein_CRP_Normal_Sensitivity": "C-reactive protein (Normal Sensitivity) (mg/dL)",
    "creatinine": "Creatinine (mg/dL)",
    "Ferritin": "Ferritin (ng/mL)",
    "D_dimer": "D-dimer (ng/mL)",
    "albumin": "Albumin (g/dL)",        

    "Fibrinogen": "Fibrinogen (mg/dL)",
    "alanine_aminotransferase_ALT": "Alanine aminotransferase (U/L)",
    "aspartate_aminotransferase_AST": "Aspartate aminotransferase (U/L)",
    "total_bilirubin": "Total bilirubin (mg/dL)",
    "lactate_dehydrogenase_LDH": "Lactate dehydrogenase (U/L)",
    "cardiac_troponin_High_Sensitivity": "Cardiac troponin High Sensitivity (ng/mL)",
    "cardiac_troponin_Normal_Sensitivity": "Cardiac troponin Normal Sensitivity (ng/mL)",
    "prothrombin_time_PT": "Prothrombin time (s)",
    "white_blood_cell_count_Leukocytes": "White blood cell count (10*3/uL)",
    "lymphocyte_count": "Lymphocyte count (10*3/uL)",
    "neutrophil_count": "Neutrophil count (10*3/uL)",
    "procalcitonin": "Procalcitonin (ng/mL)",
}

# Each name list is paired index-by-index with the color list that follows it.
# NOTE(review): no visible cell reads these two module-level continent lists —
# confirm whether they are still needed.
continents = ['USA', 'EUROPE']
continent_colors = ['#D45E00', '#57B4E9']

# Color-scale domain/range for the series whose siteid contains 'META'
# (used by the trajectory plot when is_country=True).
countries = ['USA', 'EUROPE', 'FRANCE', 'GERMANY', 'ITALY']
country_colors = ['#D45E00', '#57B4E9', '#0072B2', '#E79F00', '#029F73']

# Color-scale domain for the per-site series (used by the trajectory plot when is_country=False).
sites = ['APHP', 'FRBDX', 'UKFR', 'BIDMC', 'MGB', 'NWU', 'UCLA', 'UMICH', 'UPENN', 'UPITT', 'VA1', 'VA2', 'VA4', 'VA5']
# ['black', '#0072B2', '#0072B2', '#0072B2', '#0072B2', '#CB7AA7', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00', '#D45E00','#D45E00','#D45E00']

# Lab Trajectory

In [None]:
# Read the phase-1 trajectory table and reshape it to long form:
# one row per (lab, siteid, day) with the measurement in `value`.
df = (
    pd.read_csv(
        join("..", "data", "Phase2.1SurvivalRSummariesPublic", "ToShare", "table.R1.classification.phase1.csv")
    )
    .drop(columns=['Unnamed: 0'])
    .rename(columns={"nm.lab": "lab"})
    .melt(id_vars=['lab', 'siteid'], var_name='day', value_name='value')
)

# Upper-case the site names (North America is relabelled USA), translate the raw
# lab keys to display labels, and remove 'day' from the melted column labels.
df = df.assign(
    siteid=df['siteid'].map(lambda site: site.upper().replace('NORTH AMERICA', 'USA')),
    lab=df['lab'].map(lambda raw_key: consistent_loinc[raw_key]),
    day=df['day'].map(lambda column_label: column_label.replace('day', '')),
)

unique_labs = df['lab'].unique().tolist()
print(unique_labs)

unique_sites = df['siteid'].unique().tolist()
print(unique_sites)

df

In [None]:
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)', is_country=True):
    """Plot the day-by-day trajectory of one lab as lines with white-filled point markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form table with `lab`, `siteid`, `day` and `value` columns.
        The frame is not modified.
    lab : str
        Display label of the lab to plot; rows are selected with `df.lab == lab`.
    is_country : bool
        If True, keep only rows whose `siteid` contains 'META', drop the
        'META-' prefix and color the series with the module-level
        `countries` / `country_colors`. If False, keep the remaining rows and
        color them over the module-level `sites` domain with the
        'category20' scheme.

    Returns
    -------
    alt.LayerChart
        Line layer plus point layer, titled with `lab` and the data-release
        subtitle.

    Raises
    ------
    ValueError
        If `df` is not provided.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame via the `df` argument")

    # Boolean indexing already returns a new frame, so no explicit copy is needed
    # to protect the caller's data.
    d = df[df.lab == lab]
    is_meta = d.siteid.str.contains('META')

    if is_country:
        # .assign() builds a fresh frame; writing to a column of a filtered slice
        # would raise pandas' SettingWithCopyWarning.
        d = d[is_meta].assign(
            siteid=lambda meta: meta.siteid.str.replace('META-', '', regex=False)
        )
        color_scale = alt.Scale(domain=countries, range=country_colors)
    else:
        d = d[~is_meta]
        color_scale = alt.Scale(domain=sites, scheme='category20')

    plot = alt.Chart(
        d
    ).mark_line(
        size=2.5,
        opacity=1
    ).encode(
        x=alt.X("day:Q", title='Days Since Admission', axis=alt.Axis(labelAngle=0, tickCount=10), scale=alt.Scale(clamp=True, nice=False, padding=10)),
        y=alt.Y("value:Q", title=None, scale=alt.Scale(zero=False, nice=False, padding=10)),
        color=alt.Color("siteid:N", title=None, scale=color_scale),
    ).properties(
        width=450,
        height=250
    )

    # Same encoding as the lines, drawn as white-filled markers outlined in the series color.
    point = plot.mark_point(
        filled=True,
        opacity=1
    ).encode(
        color=alt.value("white"),
        stroke=alt.Color("siteid:N", title=None, scale=color_scale)
    )

    plot = plot + point

    plot = plot.properties(
        title={
            "text": [
                f"{lab}",
            ],
            "dx": 30,
            "subtitle": [
                get_visualization_subtitle(data_release=data_release, with_num_sites=False)
            ],
            "subtitleColor": "gray",
        }
    )

    return plot

# One row per lab: the 'META' (country-level) chart on the left, the per-site chart on the right.
lab_rows = [
    alt.hconcat(
        plot_lab(df=df, lab=lab, is_country=True),
        plot_lab(df=df, lab=lab, is_country=False),
        spacing=30,
    ).resolve_scale(color='independent', stroke='independent')
    for lab in unique_labs
]

plot = alt.vconcat(*lab_rows, spacing=30).resolve_scale(
    color='independent',
    stroke='independent',
)

# Project-wide styling; the option names are those accepted by theme.apply_theme.
theme_options = dict(
    axis_y_title_font_size=16,
    title_anchor='start',
    legend_orient='right',
    legend_title_orient='top',
    axis_label_font_size=14,
    header_label_font_size=16,
    subtitle_font_size=18,
    point_size=100,
)
plot = apply_theme(plot, **theme_options)

plot

# Compare

In [None]:
# Read the phase comparison table and reshape it to long form:
# one row per (lab, siteid, phase, day) with the measurement in `value`.
df = (
    pd.read_csv(
        join("..", "data", "Phase2.1SurvivalRSummariesPublic", "ToShare", "table.R1.classification.comparison.csv")
    )
    .drop(columns=['Unnamed: 0'])
    .rename(columns={"nm.lab": "lab"})
    .melt(id_vars=['lab', 'siteid', 'phase'], var_name='day', value_name='value')
)

# Repair the 'Eurpoe' misspelling found in the source file, upper-case the site
# names (North America is relabelled USA), translate the raw lab keys to display
# labels, and remove 'day' from the melted column labels.
df = df.assign(
    siteid=df['siteid'].map(
        lambda site: site.replace('Eurpoe', 'Europe').upper().replace('NORTH AMERICA', 'USA')
    ),
    lab=df['lab'].map(lambda raw_key: consistent_loinc[raw_key]),
    day=df['day'].map(lambda column_label: column_label.replace('day', '')),
)

# Combined key (e.g. site name followed by the phase) so each site/phase pair is its own series.
df['siteid-phase'] = df['siteid'] + df['phase'].astype(str)

unique_labs = df['lab'].unique().tolist()
print(unique_labs)

unique_sites = df['siteid'].unique().tolist()
print(unique_sites)

df

In [None]:
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)', is_country=True):
    """Plot one lab's trajectory per site/phase series, with marker shape encoding the phase.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form table with `lab`, `siteid`, `phase`, `siteid-phase`, `day`
        and `value` columns. The frame is not modified.
    lab : str
        Display label of the lab to plot; rows are selected with `df.lab == lab`.
    is_country : bool
        Not used by this variant; kept so the signature matches the other
        `plot_lab` definitions in this notebook.

    Returns
    -------
    alt.LayerChart
        Line layer plus point layer, titled with `lab` and the data-release
        subtitle.

    Raises
    ------
    ValueError
        If `df` is not provided.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame via the `df` argument")

    # Filter first (boolean indexing returns a new frame), then use .assign() so the
    # 'META-' prefix is stripped without writing into a slice (SettingWithCopyWarning).
    d = df[df.lab == lab].assign(
        siteid=lambda rows: rows.siteid.str.replace('META-', '', regex=False)
    )

    # Line colors are keyed on the combined `siteid-phase` value; both phases of a
    # region share one color. Local names avoid shadowing the module-level
    # `continents` / `continent_colors` lists.
    series_domain = ['META-USA1', 'META-USA2', 'META-EUROPE1', 'META-EUROPE2']
    series_colors = ['#D45E00', '#D45E00', '#57B4E9', '#57B4E9']
    color_scale = alt.Scale(domain=series_domain, range=series_colors)

    # Shared by the line and point layers so the phase-to-shape mapping stays in one place.
    phase_shape_scale = alt.Scale(domain=['1', '2'], range=['circle', 'diamond'])

    plot = alt.Chart(
        d
    ).mark_line(
        size=2.5,
        opacity=1
    ).encode(
        x=alt.X("day:Q", title='Days Since Admission', axis=alt.Axis(labelAngle=0, tickCount=10), scale=alt.Scale(clamp=True, nice=False, padding=10)),
        y=alt.Y("value:Q", title=None, scale=alt.Scale(zero=False, nice=False, padding=10)),
        color=alt.Color("siteid-phase:N", title='Country', scale=color_scale, legend=None),
        shape=alt.Shape("phase:N", title='Phase', scale=phase_shape_scale, legend=alt.Legend(symbolStrokeWidth=2, symbolFillColor='white', symbolStrokeColor='gray'))
    ).properties(
        width=450,
        height=250
    )

    # White-filled markers outlined by region; the stroke legend is the one that
    # carries the 'Country' title because the line color legend is disabled.
    point = plot.mark_point(
        filled=True,
        opacity=1
    ).encode(
        color=alt.value("white"),
        stroke=alt.Color("siteid:N", title='Country', scale=alt.Scale(domain=['USA', 'EUROPE'], range=['#D45E00', '#57B4E9'])),
        shape=alt.Shape("phase:N", title='Phase', scale=phase_shape_scale, legend=None)
    )

    plot = plot + point

    plot = plot.properties(
        title={
            "text": [
                f"{lab}",
            ],
            "dx": 30,
            "subtitle": [
                get_visualization_subtitle(data_release=data_release, with_num_sites=False)
            ],
            "subtitleColor": "gray",
        }
    )

    return plot

# Stack one comparison chart per lab; each chart keeps its own color, stroke and shape scales.
comparison_charts = [
    plot_lab(df=df, lab=lab, is_country=False)
    for lab in unique_labs
]

plot = alt.vconcat(*comparison_charts, spacing=30).resolve_scale(
    color='independent',
    stroke='independent',
    shape='independent',
)

# Project-wide styling; the option names are those accepted by theme.apply_theme.
theme_options = dict(
    axis_y_title_font_size=16,
    title_anchor='start',
    legend_orient='right',
    legend_title_orient='top',
    axis_label_font_size=14,
    header_label_font_size=16,
    subtitle_font_size=18,
    point_size=150,
)
plot = apply_theme(plot, **theme_options)

plot

# Violin

In [None]:
# Lab labels noted by the original author (presumably those present in the
# distribution data — TODO confirm):
#   'Lymphocyte count (10*3/uL)', 'Creatinine (mg/dL)', 'D-dimer (ng/mL)', 'Prothrombin time (s)',
#   'Cardiac troponin Normal Sensitivity (ng/mL)', 'White blood cell count (10*3/uL)', 'Ferritin (ng/mL)',
#   'Cardiac troponin High Sensitivity (ng/mL)', 'Fibrinogen (mg/dL)', 'Aspartate aminotransferase (U/L)',
#   'Alanine aminotransferase (U/L)', 'Neutrophil count (10*3/uL)', 'Total bilirubin (mg/dL)',
#   'Lactate dehydrogenase (U/L)', 'Albumin (g/dL)', 'C-reactive protein (Normal Sensitivity) (mg/dL)'
#
# NOTE(review): neither LOG_LABS nor `scale` is read by the visible plotting code.
LOG_LABS = [
    'Alanine aminotransferase (U/L)',
    'Aspartate aminotransferase (U/L)',
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'D-dimer (ng/mL)',
    'Ferritin (ng/mL)',
    'Lactate dehydrogenase (U/L)',
]

scale = 'log'

# Debug aid: list the labs that have rows for the 'META_GERMANY' site id.
# NOTE(review): the plotting code that follows reads `x`, `y`, `scale` and `period`
# columns, but no visible cell loads such a table into `df` — confirm its source.
print(df.loc[df['siteid'] == 'META_GERMANY', 'lab'].unique().tolist())
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)', country='META'):
    """Draw violin-style (center-stacked area) distributions of one lab, one facet per wave.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `lab`, `siteid`, `scale`, `period`, `x` and `y` columns.
        Only rows with `scale == 'log-exp'` are plotted. The frame is not
        modified.
    lab : str
        Display label of the lab to plot; rows are selected with `df.lab == lab`.
    country : str
        Exact `siteid` value to plot (for example 'META' or 'META_FRANCE').

    Returns
    -------
    alt.FacetChart
        One column per wave ('First' / 'Second'), titled with the lab label.

    Raises
    ------
    ValueError
        If `df` is not provided or lacks one of the required columns.
    KeyError
        If `period` holds a value other than 'early' or 'late'.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame via the `df` argument")
    missing = [col for col in ('lab', 'siteid', 'scale', 'period', 'x', 'y') if col not in df.columns]
    if missing:
        raise ValueError(f"plot_lab() needs columns missing from `df`: {missing}")

    # Upper x limits beyond which rows are dropped for specific labs.
    crop_above = {'Creatinine (mg/dL)': 10, 'D-dimer (ng/mL)': 7000}
    wave_labels = {'early': 'First', 'late': 'Second'}

    # A single boolean selection returns a new frame, so the caller's data is untouched.
    d = df[(df.lab == lab) & (df.siteid == country) & (df.scale == 'log-exp')]

    if lab in crop_above:
        # `~(x > limit)` rather than `x <= limit` so rows with a missing x are kept.
        d = d[~(d.x > crop_above[lab])]

    # .assign() avoids writing into a filtered slice (SettingWithCopyWarning);
    # the explicit lookup keeps the KeyError for unexpected period values.
    d = d.assign(period=d.period.map(lambda period: wave_labels[period]))

    plot = alt.Chart(
        d
    ).mark_area(
        stroke='black',
        strokeWidth=1,
        opacity=1
    ).encode(
        y=alt.Y("x:Q", title=None, scale=alt.Scale(clamp=True), bin=alt.Bin(maxbins=50)), 
        x=alt.X("sum(y):Q", stack='center', title=None, impute=None, scale=alt.Scale(zero=True), axis=alt.Axis(labels=False, values=[0],grid=False, ticks=True)),
        color=alt.Color("period:N", title='Wave', scale=alt.Scale(domain=['First', 'Second'], range=['#D45E00', '#0072B2']), legend=None),
    ).properties(
        width=70,
        height=200
    )

    plot = plot.facet(
        column=alt.Column("period:N", header=alt.Header(title=None, labels=True)),
        spacing=0
    ).resolve_scale(color='shared')

    plot = plot.properties(
        title={
            # Replace with a single space: substituting '' glued the lab name to its
            # unit ("C-reactive protein(mg/dL)").
            "text": lab.replace(' (Normal Sensitivity) ', ' '),
            "dx": 15,
            "fontSize": 13,
            "anchor": "middle"
        }
    )

    return plot

SELECTED_LABS = [
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'Albumin (g/dL)',
    'Creatinine (mg/dL)',
    'D-dimer (ng/mL)',
]

# Hand-picked site ids to show, one grid row each.
meta_groups = ['META', 'META_FRANCE', 'META_ITALY', 'META_USA']

# Each row holds one chart per selected lab and is titled with the site id minus its 'META_' prefix.
group_rows = [
    alt.hconcat(
        *[plot_lab(df=df, lab=lab, country=c) for lab in SELECTED_LABS],
        spacing=20,
    )
    .resolve_scale(color='independent')
    .properties(title={
        "text": c.replace('META_', ''),
        "dx": 33,
    })
    for c in meta_groups
]

plot = alt.vconcat(*group_rows, spacing=20).resolve_scale(
    color='independent',
    x='independent',
    y='independent',
)

figure_title = {
    "text": ["Meta Analysis Of Lab Distribution"],
    "dx": 33,
    "subtitle": [
        get_visualization_subtitle(data_release=data_release, with_num_sites=False)
    ],
    "subtitleColor": "gray",
}
plot = plot.properties(title=figure_title)

# Project-wide styling; the option names are those accepted by theme.apply_theme.
theme_options = dict(
    axis_y_title_font_size=16,
    title_anchor='start',
    legend_orient='bottom',
    legend_title_orient='left',
    axis_label_font_size=14,
    header_label_font_size=16,
)
plot = apply_theme(plot, **theme_options)

plot

In [None]:
# Lab labels noted by the original author (presumably those present in the
# distribution data — TODO confirm):
#   'Lymphocyte count (10*3/uL)', 'Creatinine (mg/dL)', 'D-dimer (ng/mL)', 'Prothrombin time (s)',
#   'Cardiac troponin Normal Sensitivity (ng/mL)', 'White blood cell count (10*3/uL)', 'Ferritin (ng/mL)',
#   'Cardiac troponin High Sensitivity (ng/mL)', 'Fibrinogen (mg/dL)', 'Aspartate aminotransferase (U/L)',
#   'Alanine aminotransferase (U/L)', 'Neutrophil count (10*3/uL)', 'Total bilirubin (mg/dL)',
#   'Lactate dehydrogenase (U/L)', 'Albumin (g/dL)', 'C-reactive protein (Normal Sensitivity) (mg/dL)'
#
# NOTE(review): neither LOG_LABS nor `scale` is read by the visible plotting code.
LOG_LABS = [
    'Alanine aminotransferase (U/L)',
    'Aspartate aminotransferase (U/L)',
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'D-dimer (ng/mL)',
    'Ferritin (ng/mL)',
    'Lactate dehydrogenase (U/L)',
]

scale = 'log'

# Debug aid: list the labs that have rows for the 'META_GERMANY' site id.
# NOTE(review): the plotting code that follows reads `x`, `y`, `scale` and `period`
# columns, but no visible cell loads such a table into `df` — confirm its source.
print(df.loc[df['siteid'] == 'META_GERMANY', 'lab'].unique().tolist())
def plot_lab(df=None, lab='Neutrophil count (10*3/uL)', country='META'):
    """Draw overlaid, semi-transparent histograms of one lab, one color per wave.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `lab`, `siteid`, `scale`, `period`, `x` and `y` columns.
        Only rows with `scale == 'log-exp'` are plotted. The frame is not
        modified.
    lab : str
        Display label of the lab to plot; rows are selected with `df.lab == lab`.
    country : str
        Exact `siteid` value to plot (for example 'META' or 'META_FRANCE').

    Returns
    -------
    alt.Chart
        Bar chart of `sum(y)` over binned `x`, titled with the lab label.

    Raises
    ------
    ValueError
        If `df` is not provided or lacks one of the required columns.
    KeyError
        If `period` holds a value other than 'early' or 'late'.
    """
    if df is None:
        raise ValueError("plot_lab() requires a DataFrame via the `df` argument")
    missing = [col for col in ('lab', 'siteid', 'scale', 'period', 'x', 'y') if col not in df.columns]
    if missing:
        raise ValueError(f"plot_lab() needs columns missing from `df`: {missing}")

    # Upper x limits beyond which rows are dropped for specific labs.
    crop_above = {'Creatinine (mg/dL)': 10, 'D-dimer (ng/mL)': 7000}
    wave_labels = {'early': 'First', 'late': 'Second'}

    # A single boolean selection returns a new frame, so the caller's data is untouched.
    d = df[(df.lab == lab) & (df.siteid == country) & (df.scale == 'log-exp')]

    if lab in crop_above:
        # `~(x > limit)` rather than `x <= limit` so rows with a missing x are kept.
        d = d[~(d.x > crop_above[lab])]

    # .assign() avoids writing into a filtered slice (SettingWithCopyWarning);
    # the explicit lookup keeps the KeyError for unexpected period values.
    d = d.assign(period=d.period.map(lambda period: wave_labels[period]))

    plot = alt.Chart(
        d
    ).mark_bar(
        stroke='black',
        strokeWidth=1,
        opacity=0.4
    ).encode(
        # stack=None on both axes so the two waves overlap instead of stacking.
        x=alt.X("x:Q", title=None, scale=alt.Scale(clamp=True), bin=alt.Bin(maxbins=30), stack=None), 
        y=alt.Y("sum(y):Q", title=None, scale=alt.Scale(zero=True),stack=None, axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True)),
        color=alt.Color("period:N", title='Wave', scale=alt.Scale(domain=['First', 'Second'], range=['#D45E00', '#0072B2']), legend=None),
    ).properties(
        width=170,
        height=200
    )

    plot = plot.properties(
        title={
            # Replace with a single space: substituting '' glued the lab name to its
            # unit ("C-reactive protein(mg/dL)").
            "text": lab.replace(' (Normal Sensitivity) ', ' '),
            "dx": 15,
            "fontSize": 13,
            "anchor": "middle"
        }
    )

    return plot

SELECTED_LABS = [
    'C-reactive protein (Normal Sensitivity) (mg/dL)',
    'Albumin (g/dL)',
    'Creatinine (mg/dL)',
    'D-dimer (ng/mL)',
]

# Hand-picked site ids to show, one grid row each.
meta_groups = ['META', 'META_FRANCE', 'META_ITALY', 'META_USA']

# Each row holds one chart per selected lab and is titled with the site id minus its 'META_' prefix.
group_rows = [
    alt.hconcat(
        *[plot_lab(df=df, lab=lab, country=c) for lab in SELECTED_LABS],
        spacing=20,
    )
    .resolve_scale(color='independent')
    .properties(title={
        "text": c.replace('META_', ''),
        "dx": 33,
    })
    for c in meta_groups
]

plot = alt.vconcat(*group_rows, spacing=20).resolve_scale(
    color='independent',
    x='independent',
    y='independent',
)

figure_title = {
    "text": ["Meta Analysis Of Lab Distribution"],
    "dx": 33,
    "subtitle": [
        get_visualization_subtitle(data_release=data_release, with_num_sites=False)
    ],
    "subtitleColor": "gray",
}
plot = plot.properties(title=figure_title)

# Project-wide styling; the option names are those accepted by theme.apply_theme.
theme_options = dict(
    axis_y_title_font_size=16,
    title_anchor='start',
    legend_orient='bottom',
    legend_title_orient='left',
    axis_label_font_size=14,
    header_label_font_size=16,
)
plot = apply_theme(plot, **theme_options)

plot