In [1]:
import pandas as pd
import numpy as np
from os.path import join

from constants import COLUMNS, SITE_DATA_DIR
from utils import get_site_ids, read_loinc_df

# Seed NumPy's global RNG so every random draw below is reproducible on a fresh run.
np.random.seed(seed=1)

In [2]:
# Site identifiers; one output file is generated per site further down.
site_ids = get_site_ids()

# LOINC lookup table indexed by code, with the lab-test label exposed as "name"
# and a "change" column initialised to 0 (overwritten with random slopes later).
loinc_df = (
    read_loinc_df()
    .set_index('loinc')
    .rename(columns={'labTest': 'name'})
    .assign(change=0)
)

# Plain Python list of the LOINC codes, used when sampling labs per site.
loinc_codes = loinc_df.index.tolist()
loinc_df

Unnamed: 0_level_0,name,change
loinc,Unnamed: 1_level_1,Unnamed: 2_level_1
6690-2,white blood cell count (Leukocytes),0
751-8,neutrophil count,0
731-0,lymphocyte count,0
1751-7,albumin,0
2532-0,lactate dehydrogenase (LDH),0
1742-6,alanine aminotransferase (ALT),0
1920-8,aspartate aminotransferase (AST),0
1975-2,total bilirubin,0
2160-0,creatinine,0
49563-0,cardiac troponin,0


In [3]:
# Draw one per-day slope in [-1, 1) for each LOINC code; the generation loop
# multiplies it by days_since_positive to give each lab a linear trend.
loinc_df["change"] = np.random.uniform(-1.0, 1.0, len(loinc_df))
loinc_df

Unnamed: 0_level_0,name,change
loinc,Unnamed: 1_level_1,Unnamed: 2_level_1
6690-2,white blood cell count (Leukocytes),-0.165956
751-8,neutrophil count,0.440649
731-0,lymphocyte count,-0.999771
1751-7,albumin,-0.395335
2532-0,lactate dehydrogenase (LDH),-0.706488
1742-6,alanine aminotransferase (ALT),-0.815323
1920-8,aspartate aminotransferase (AST),-0.62748
1975-2,total bilirubin,-0.308879
2160-0,creatinine,-0.206465
49563-0,cardiac troponin,0.077633


Fields: siteid, loinc, days_since_positive, num_patients, mean_value, stdev_value<br>
Notes:
1. One row per loinc and days_since_positive
2. days_since_positive = 1 on the date the patient has a positive COVID test result
3. Start the table at days_since_positive = -6 (seven days before the positive test)
4. Go for as many days as you have data: days_since_positive = 5, 6, 7, ...
5. Map your local loinc codes to the loinc code in Gabe's list
6. Only use Gabe's loinc codes in this list, not your local codes
7. Obfuscate small counts with "-1" as required by your institution

Examples: (Labs-BIDMC.csv)<br>
BIDMC, 1920-8, -2, -1, 27.0, -1<br>
BIDMC, 1920-8, -1, 4, 41.2, 23.567<br>
BIDMC, 1920-8, 0, 4, 47.1, 17.876<br>
BIDMC, 1920-8, 1, 38, 59.2, 40.152<br>
BIDMC, 1920-8, 2, 35, 52.2, 35.432<br>
BIDMC, 1920-8, 3, 12, 40.4, 25.678<br>
BIDMC, 1920-8, 4, -1, 52.8, -1<br>
BIDMC, 1975-2, 1, 37, 39.5, 20.100

In [4]:
def make_labs_row(site_id, loinc, dsp, num_p, mean, stdev):
    """Build a single labs record as a dict keyed by the COLUMNS field names.

    Args:
        site_id: Value stored under COLUMNS.SITE_ID.
        loinc: Value stored under COLUMNS.LOINC.
        dsp: Value stored under COLUMNS.DAYS_SINCE_POSITIVE.
        num_p: Value stored under COLUMNS.NUM_PATIENTS.
        mean: Value stored under COLUMNS.MEAN_VALUE.
        stdev: Value stored under COLUMNS.STDEV_VALUE.

    Returns:
        dict: The six fields, inserted in the order listed above.
    """
    # Keep keys and values in matching order; the insertion order of the
    # resulting dict is the same as in the field tuple.
    field_names = (
        COLUMNS.SITE_ID,
        COLUMNS.LOINC,
        COLUMNS.DAYS_SINCE_POSITIVE,
        COLUMNS.NUM_PATIENTS,
        COLUMNS.MEAN_VALUE,
        COLUMNS.STDEV_VALUE,
    )
    field_values = (site_id, loinc, dsp, num_p, mean, stdev)
    return dict(zip(field_names, field_values))

In [5]:
# Generate one synthetic labs CSV per site: a random subset of the LOINC codes,
# with per-day summary statistics that drift linearly by each lab's "change" slope.
for site_id in site_ids:
    output_filepath = join(SITE_DATA_DIR, site_id, f"2020-03-29_Labs-{site_id}.csv")
    # Sample up to 8 distinct labs for this site; capping at the number of
    # available codes avoids a ValueError from sampling without replacement
    # when fewer than 8 codes are loaded.
    labs_at_site = np.random.choice(loinc_codes, min(8, len(loinc_codes)), replace=False)
    data = []
    # Days -6 through 19 inclusive (the spec starts the table at -6).
    for days_since_positive in range(-6, 20):
        for loinc in labs_at_site:
            # Per-day slope for this lab, looked up once per row.
            daily_change = loinc_df.at[loinc, "change"]
            num_patients = np.random.randint(100, 1000)
            # NOTE(review): mean_value can also become negative for labs with a
            # steep negative slope at late days; left as-is to preserve the trend.
            mean_value = np.random.uniform(2, 10) + daily_change * days_since_positive
            # A standard deviation cannot be negative. The linear trend term can
            # push the raw draw below zero, so clamp it at 0.
            stdev_value = max(0.0, np.random.normal(4, 1) + (daily_change / 2) * days_since_positive)
            data.append(make_labs_row(site_id, loinc, days_since_positive, num_patients, mean_value, stdev_value))

    df = pd.DataFrame(data=data)
    # Written without index or header row.
    df.to_csv(output_filepath, index=False, header=False)