In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup as bsoup
from thefuzz import fuzz

from typing import Any, List, Dict, Tuple, Optional

import constants

In [2]:
# Toggle between the small development extract and the full UK Biobank export.
DEV_MODE = True

if DEV_MODE:
    ukbb_data_file = "small_ukbiobank.csv"
else:
    ukbb_data_file = "ukbiobank.csv"
ukbb_data_path = os.path.join(constants.UK_BIOBANK_DATA_PATH, ukbb_data_file)

# The HTML index page and the CSV it is cached to live in the same data directory.
ukbb_index_html_path, ukbb_index_csv_path = (
    os.path.join(constants.UK_BIOBANK_DATA_PATH, file_name)
    for file_name in ("ukbiobank.html", "ukbiobank_index.csv")
)

In [3]:
# Plot helpers
def titleize(label: str) -> str:
    """Turn a snake_case identifier into a space-separated, title-cased label."""
    words = label.split("_")
    return " ".join(words).title()


def add_plt_labels(ax, x: str, y: str, title: Optional[str] = None, **kwargs) -> None:
    """Label the axes of ``ax`` (and optionally title it) with titleized text.

    Args:
        ax: Object exposing ``set_xlabel``, ``set_ylabel`` and ``set_title``.
        x: Text for the x-axis label; passed through ``titleize``.
        y: Text for the y-axis label; passed through ``titleize``.
        title: Optional title; nothing is set when it is ``None`` or empty.
        **kwargs: Accepted for call-site convenience but not used.
    """
    x_text = titleize(x)
    ax.set_xlabel(x_text)
    y_text = titleize(y)
    ax.set_ylabel(y_text)

    if not title:
        return
    ax.set_title(titleize(title))

# Load data index table

In [4]:
# Display the resolved path of the cached index CSV.
ukbb_index_csv_path

'/Users/cole/Documents/_research/rabadan_lab/data/uk_biobank/ukbiobank_index.csv'

In [5]:
# Load the UK Biobank field index, preferring the cached CSV over re-parsing the HTML page.
if os.path.exists(ukbb_index_csv_path):
    ukbb_index = pd.read_csv(ukbb_index_csv_path)
else:
    # Context manager so the file handle is closed even if reading or parsing fails.
    with open(ukbb_index_html_path, "r") as ukbb_html_file:
        ukbb_html = bsoup(ukbb_html_file.read())
    # Index [1]: the second <table> on the page is the one parsed into the index.
    ukbb_index_html = ukbb_html.find_all("table")[1]
    ukbb_index = pd.read_html(str(ukbb_index_html))[0]
    ukbb_index.columns = [col.lower() for col in ukbb_index.columns]
    # Cache the parsed table (before the description is split below).
    ukbb_index.to_csv(ukbb_index_csv_path, index=False)

# Split each description once on the literal "Uses": the text before it stays the
# description, the text after it becomes data_coding ("" when there is no "Uses").
# Rows with a missing description keep a missing description instead of raising.
description_parts = ukbb_index["description"].str.split("Uses")
ukbb_index["data_coding"] = description_parts.str.get(1).fillna("")
ukbb_index["description"] = description_parts.str.get(0)



In [6]:
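# NOTE: one-off export, kept commented out; `partial_ukbb_udi_lookup_table` is not defined in any cell above.
# partial_ukbb_udi_lookup_table.to_csv("resources/core_udi_lookup.csv", sep=",", index=False)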

In [12]:
outlier_udi_lookup = pd.read_csv("resources/outlier_udi_lookup.csv", sep=",")
core_udi_lookup = pd.read_csv("resources/core_udi_lookup.csv", sep=",")
partial_udi_lookup = pd.concat([core_udi_lookup, outlier_udi_lookup])

# Rows whose name is the placeholder "_" are excluded from the lookup.
partial_labeled_udis = partial_udi_lookup.loc[partial_udi_lookup["name"] != "_"]
partial_udi_to_name_map = dict(zip(partial_labeled_udis["udi"], partial_labeled_udis["name"]))


def _name_for_udi(udi, labeled_names):
    """Derive the readable name for one UDI from the partial UDI -> name lookup.

    A UDI without "-" or ending in "-0.0" is looked up directly. Any other UDI
    borrows the name of its "<stem>-0.0" entry (falling back to "<stem>-0.1")
    and appends "_<modifier>", where the modifier is the text after the dash.

    Args:
        udi: UDI string from the index table.
        labeled_names: Mapping from labeled UDI to name.

    Returns:
        The derived name, or ``None`` when no labeled base entry exists.
    """
    if "-" not in udi or udi.endswith("-0.0"):
        return labeled_names.get(udi, None)

    # maxsplit=1 so an unexpected extra "-" cannot break the two-way unpacking.
    udi_stem, udi_modifier = udi.split("-", 1)
    for base_suffix in ("-0.0", "-0.1"):
        base_udi = udi_stem + base_suffix
        if base_udi in labeled_names:
            return f"{labeled_names[base_udi]}_{udi_modifier}"
    return None


names = [_name_for_udi(udi, partial_udi_to_name_map) for udi in ukbb_index["udi"]]

ukbb_index["name"] = names
# Lookups in both directions between UDIs and readable names.
udi_to_name_map = dict(zip(ukbb_index["udi"], ukbb_index["name"]))
name_to_udi_map = dict(zip(ukbb_index["name"], ukbb_index["udi"]))

In [13]:
def get_udi(name):
    """Translate a feature name, or a (possibly nested) iterable of names, to UDIs.

    A string is looked up in ``name_to_udi_map`` and returned unchanged when it
    is not a key there. Anything else is iterated and translated item by item,
    yielding a list.
    """
    if not isinstance(name, str):
        return [get_udi(item) for item in name]

    return name_to_udi_map.get(name, name)


def get_name_from_udi(udi):
    """Translate a UDI, or a (possibly nested) iterable of UDIs, to feature names.

    A string is looked up in ``udi_to_name_map`` and returned unchanged when it
    is not a key there. Anything else is iterated and translated item by item,
    yielding a list.
    """
    if not isinstance(udi, str):
        return [get_name_from_udi(item) for item in udi]

    return udi_to_name_map.get(udi, udi)


def udi_wrapper(function, *args, **kwargs) -> Any:
    """Call ``function`` after translating feature names in its arguments to UDIs.

    Every positional or keyword argument that is a ``str`` is passed through
    ``get_udi``; all other arguments are forwarded untouched.

    Args:
        function: Callable to invoke.
        *args: Positional arguments for ``function``; order and duplicates are preserved.
        **kwargs: Keyword arguments for ``function``.

    Returns:
        Whatever ``function`` returns.
    """
    # Build a tuple, not a set: positional arguments must keep their order and
    # duplicates, and may be unhashable.
    translated_args = tuple(get_udi(arg) if isinstance(arg, str) else arg for arg in args)
    translated_kwargs = {key: get_udi(value) if isinstance(value, str) else value
                         for key, value in kwargs.items()}
    return function(*translated_args, **translated_kwargs)


def relevant_feature_search(ukbb_index: pd.DataFrame, term: str, threshold: int = 95) -> pd.DataFrame:
    """Return the index rows whose name or description fuzzily contains ``term``.

    Each row's searchable text is its name (underscores replaced by spaces)
    followed by its description. A row matches when
    ``fuzz.partial_ratio(text.lower(), term.lower())`` is strictly greater than
    ``threshold``.

    Args:
        ukbb_index: Table with ``name`` and ``description`` columns.
        term: Search term; compared case-insensitively.
        threshold: Score a row must exceed to be returned (default 95).

    Returns:
        The matching rows of ``ukbb_index``, in their original order.
    """
    # Names may be None or NaN for unlabeled features; only real strings have .replace().
    readable_names = ukbb_index["name"].apply(
        lambda s: s.replace("_", " ") + " " if isinstance(s, str) and s else "")
    # A missing description contributes nothing instead of breaking .lower().
    descriptions = ukbb_index["description"].fillna("").astype(str)

    needle = term.lower()  # hoisted: identical for every row
    found_indices = [i for (i, text) in enumerate(readable_names + descriptions)
                     if fuzz.partial_ratio(text.lower(), needle) > threshold]
    return ukbb_index.iloc[found_indices]

In [14]:
# Preview the first rows of the index, now including the data_coding and name columns.
ukbb_index.head()

Unnamed: 0,column,udi,count,type,description,data_coding,name
0,0,eid,502543,Sequence,Encoded anonymised participant ID,,eid
1,1,23-0.0,456606,Categorical (single),Spirometry method,data-coding 100270 comprises 5 Integer-valued...,spirometry_method
2,2,23-1.0,18135,Categorical (single),Spirometry method,data-coding 100270 comprises 5 Integer-valued...,spirometry_method_1.0
3,3,23-2.0,26790,Categorical (single),Spirometry method,data-coding 100270 comprises 5 Integer-valued...,spirometry_method_2.0
4,4,31-0.0,502543,Categorical (single),Sex,data-coding 9 comprises 2 Integer-valued memb...,sex


In [15]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):

# Sanity check: list index rows that still have no readable name, one per UDI stem.
df = ukbb_index[ukbb_index["name"].isna()].copy()
df["primary_udi"] = [udi.split("-")[0] for udi in df["udi"]]

unnamed_per_stem = df.drop_duplicates(["primary_udi"])
print(unnamed_per_stem.column.tolist())
unnamed_per_stem

[]


Unnamed: 0,column,udi,count,type,description,data_coding,name,primary_udi


In [16]:
# Fuzzy-search the combined feature names and descriptions for the term "HTN".
relevant_feature_search(ukbb_index, "HTN")

Unnamed: 0,column,udi,count,type,description,data_coding,name
760,760,2966-0.0,134649,Integer,Age high blood pressure diagnosed,data-coding 100291 comprises 2 Integer-valued...,HTN_dx_age
761,761,2966-1.0,5856,Integer,Age high blood pressure diagnosed,data-coding 100291 comprises 2 Integer-valued...,HTN_dx_age_1.0
762,762,2966-2.0,7559,Integer,Age high blood pressure diagnosed,data-coding 100291 comprises 2 Integer-valued...,HTN_dx_age_2.0
1309,1309,6153-0.0,270871,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT
1310,1310,6153-0.1,24598,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_0.1
1311,1311,6153-0.2,2414,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_0.2
1312,1312,6153-0.3,93,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_0.3
1313,1313,6153-1.0,10402,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_1.0
1314,1314,6153-1.1,1076,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_1.1
1315,1315,6153-1.2,92,Categorical (multiple),"Medication for cholesterol, blood pressure, di...",data-coding 100626 comprises 8 Integer-valued...,medications_HLD_HTN_diabetes_HRT_1.2


# Load BioBank Data

In [None]:
# Read the data file selected by DEV_MODE. low_memory=False has pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type column inference.
ukbb_data = pd.read_csv(ukbb_data_path, low_memory=False)
# Preview the first rows.
ukbb_data.head()

In [None]:
# Non-null observations per data column, attached to the index by position.
# NOTE(review): assumes ukbb_index rows are in the same order as ukbb_data columns — confirm.
# This "counts" column is separate from the "count" column already present in the index.
ukbb_index["counts"] = ukbb_data.count().to_numpy()
ukbb_index["frequency"] = ukbb_index["counts"] / len(ukbb_data)
display(ukbb_index.head(10))

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(ukbb_index["frequency"], ax=ax)
ax.set_yscale("log")
add_plt_labels(ax, "Data Frequency", "Count", "Feature Frequency Distributions")

# Lay out and save BEFORE showing, so the displayed figure and the saved file
# both get the tight layout.
fig.tight_layout()
os.makedirs("cover_plots", exist_ok=True)  # savefig does not create missing directories
fig.savefig("cover_plots/feature_frequency_distribution.png")
plt.show()

# Features observed for more than 99% of participants.
display(ukbb_index.query("frequency > .99"))

# Exploratory data analysis (EDA)

In [None]:
# 2-D KDE of a start/diagnosis age against recruitment age; the second plot is split by sex.
for x, y, extra_kwargs in (
    ("recruitment_age", "tobacco_smoking_current_start_age", {}),
    ("recruitment_age", "diabetes_dx_age", {"hue": "sex"}),
):
    fig, ax = plt.subplots(figsize=(12, 6))
    udi_wrapper(sns.kdeplot, data=ukbb_data, x=x, y=y, ax=ax, fill=True, **extra_kwargs)
    add_plt_labels(ax, x, y)

In [None]:
# Histogram of the age at which current smokers started, split by sex.
x = "tobacco_smoking_current_start_age"
fig, ax = plt.subplots(figsize=(12, 6))
udi_wrapper(sns.histplot, data=ukbb_data, x=x, hue="sex", ax=ax)
# Label with the readable feature name, as the other plots do; otherwise the
# axis shows whatever column key udi_wrapper resolved the name to.
add_plt_labels(ax, x, y="Count")

In [None]:

# One histogram per feature, each split by sex.
hue = "sex"
for x in ("birth_year", "recruitment_age"):
    fig, ax = plt.subplots(figsize=(12, 6))
    udi_wrapper(sns.histplot, data=ukbb_data, x=x, hue=hue, ax=ax)
    add_plt_labels(ax, x, y="Count")