# Make dataset from originals


In [83]:
import json
import pandas as pd

next_bloods = pd.read_csv("../../mnt/next_bloods.csv")

for col in ["DischargeDate", "creatinineDate"]:
    if "Date" in col:
        next_bloods[col] = pd.to_datetime(next_bloods[col], 
            dayfirst=True,
            format="mixed",
            errors="coerce",
        )
        next_bloods = next_bloods[next_bloods[col].notnull()]

next_bloods["discharge_creatinine_interval"] = next_bloods["creatinineDate"] - next_bloods["DischargeDate"]

print()
print(f"{len(next_bloods)} U+E blood tests")

  next_bloods = pd.read_csv("../../mnt/next_bloods.csv")



98473 U+E blood tests


Now load the extracted blood tests from the free text:

In [84]:
extracted_bloods = pd.read_csv("../../mnt/extracted_bloods.csv") # extracted using another script
extracted_bloods["Bloods"] = extracted_bloods["Bloods"].apply(json.loads)
extracted_bloods = extracted_bloods[extracted_bloods["Bloods"].apply(len) > 0]

test_names_to_filter = [
    "U&E",
    "U+E",
    "UE",
    "renal",
    "Renal",
    "potassium",
    "Potassium",
    "Electrolytes",
    "electrolytes",
    "Na",
    "Sodium",
    "sodium",
    "eGFR",
    "K",
    "Urea",
]

def extract_ue(data):
    timeframe_str = ""
    for test in data:
        if not isinstance(test, dict):
            continue

        for tn in test_names_to_filter:
            if tn in test.get("text", ""):
                timeframe_str = test.get("timeframe", "")
                break
    if len(timeframe_str) == 0:
        return None
    return timeframe_str

extracted_bloods["UE_timeframe"] = extracted_bloods["Bloods"].apply(extract_ue)
extracted_bloods = extracted_bloods.dropna(subset=["UE_timeframe"])
nb = next_bloods[["SpellSerial", "discharge_creatinine_interval"]]
df = extracted_bloods.merge(nb, on="SpellSerial", how="inner")

# Add notes text back in
notes = pd.read_csv("../../mnt/discharge-summaries-deid.csv")
notes = notes[["SpellSerial", "c_Action_required_deid_removed"]]
notes = notes.drop_duplicates(subset=["SpellSerial"])
df = df.merge(notes, on="SpellSerial", how="inner")

print(f"{len(df)} of which match with an extracted U+E test")

3299 of which match with an extracted U+E test


Filter for Salford CCG

In [85]:
df_ccg = pd.read_csv("../../mnt/CCGs.csv")
df = df.merge(df_ccg, on="SpellSerial", how="inner")
df = df[df["CCG"].isin(["NHS SALFORD CCG", "NHS SALFORD GM - ICB", "NHS SALFORD GM ICB"])]
print(f"{len(df)} of which are in Salford CCGs")

2233 of which are in Salford CCGs


## Calculate adherence
Convert intervals to adherence/non-adherence

First, map timeframes extracted from the free-text into number of days:

In [86]:
desc_to_days = json.load(open("desc_to_days.json"))
df["requested_days"] = df["UE_timeframe"].map(desc_to_days)

And extract the number of days it actually took to do the test:

In [87]:
df = df.dropna(subset=['discharge_creatinine_interval', 'requested_days'])

def fix_interval(row):
    #interval = row["discharge_creatinine_interval"].split(" days")[0]
    interval = row["discharge_creatinine_interval"].days
    return int(interval)
    
df['discharge_creatinine_interval'] = df.apply(fix_interval, axis=1)

Calculate the binary adherence/non-adherence label:

In [88]:
df['min_days'] = df.apply(lambda row: int(row["requested_days"].split("-")[0]), axis=1)
df['max_days'] = df.apply(lambda row: int(row["requested_days"].split("-")[-1]), axis=1)
df['adherence'] = df.apply(lambda row: row["discharge_creatinine_interval"] < row["max_days"] + 7, axis=1)

columns_to_keep = ["SpellSerial", "discharge_creatinine_interval", "min_days", "max_days", "adherence", "c_Action_required_deid_removed"]
df = df[columns_to_keep]

print("Number of adherent patients:")
print(df["adherence"].value_counts())

Number of adherent patients:
adherence
True     1534
False     626
Name: count, dtype: int64


# Save to disk

In [90]:
from datasets import Dataset, Value

df_out = df.copy()
df_out = df_out[["c_Action_required_deid_removed", "adherence"]]

ds = Dataset.from_pandas(df_out)
ds = ds.train_test_split(seed=42, test_size=0.1)
ds = ds.rename_column("c_Action_required_deid_removed", "text")
ds = ds.rename_column("adherence", "labels")

# Map label to int
new_features = ds["train"].features.copy()
new_features["labels"] = Value('int64')
ds = ds.cast(new_features)

ds["train"].to_csv("train.csv")
ds["test"].to_csv("test.csv")
ds

Casting the dataset:   0%|          | 0/1944 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/216 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 1944
    })
    test: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 216
    })
})