# Initial preprocessing and EDA pipeline

## Set up libraries & load dataset

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math, boto3, tempfile
import scipy.stats as sp
from utils import *
from label_utils import *

In [None]:
# Base S3 prefix; the label files, summary tables and the cleaned dataset
# produced further down are all written beneath it.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

In [None]:
# Key of the pre-processed HDF5 dataset inside the dataset-generator bucket.
fdir = 'processed/8079926a-4598-4d0c-ac9a-353cf8e638b1/v1/'
fname = 'data.h5'

# Read the whole S3 object into memory.
s3_client = boto3.client('s3', region_name='eu-west-2')
s3_object = s3_client.get_object(Bucket='dataset-generator-bucket', Key=fdir + fname)
raw_bytes = s3_object['Body'].read()

# HDFStore is opened from a path, so the bytes are staged in a temporary
# .h5 file that lives only for the duration of the outer `with`.
with tempfile.NamedTemporaryFile(suffix='.h5') as staging_file:
    local_h5 = staging_file.name
    with open(local_h5, 'wb') as sink:
        sink.write(raw_bytes)

    with pd.HDFStore(local_h5, 'r') as store:
        df = store.get('dataframe')
        # Metadata is stored as an attribute on the 'dataframe' node.
        metadata = store.get_storer('dataframe').attrs.metadata

# Discard whatever index was stored and use 0..n-1 instead.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Show (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Number of non-missing 'pulse' values within each HealthLabel group.
df.groupby("HealthLabel")['pulse'].count()

## Create other cancer labels

In [None]:
# Load the raw export and run it through the two consent helpers
# (star-imported above; presumably they drop non-consenting and withdrawn
# participants - confirm in their source).
file_path = 's3://ukb-colorectal-cancer/ukb52347.csv'
df = remove_consent_withdrawals(
    remove_not_consented_participants(read_csv(file_path))
)

In [None]:
# Field ids handed to the aggregation helper for visit 0.
# NOTE(review): presumably this yields the '102-0', '4079-0' and '4080-0'
# columns selected later in the notebook - confirm in the helper.
fields_to_aggregate = [102, 4079, 4080]
df = aggregate_repeat_measurements(df, field_ids=fields_to_aggregate, visit_id=0)

In [None]:
# Generate one label series per cancer type from the cancer registry, save
# each to its own CSV and gather them all into a single table.
cancer_labels_df = pd.DataFrame()

# Code groups passed to the label generator (presumably ICD-10 prefixes);
# position i pairs with labels[i].
cancer_codes = [
    ['C18', 'C19', 'C20'], ['C50'], ['C61'], ['C33', 'C34'], ['C67'], ['C64'],
    ['C70', 'C71'], ['C82', 'C83', 'C84', 'C85', 'C86'],
    ['C73'], ['C16'], ['C22'], ['C25'], ['C53'],
]
labels = [
    'colorectal-ca', 'breast-ca', 'prostate-ca', 'lung-ca', 'bladder-ca',
    'kidney-ca', 'brain-ca', 'non-hodgkins-lymphoma', 'thyroid-ca',
    'stomach-ca', 'liver-ca', 'pancreas-ca', 'cervical-ca',
]

count = []   # rows with label_class == 1, one entry per cancer type
other = [0]  # rows with label_class == 2; slot 0 is a placeholder filled in by the next cell
for site_name, code_group in zip(labels, cancer_codes):
    ca_label = generate_labels_from_cancer_registry(df, code_group)
    label_class = ca_label["label_class"]
    count.append(len(ca_label[label_class == 1]))
    other.append(len(ca_label[label_class == 2]))
    ca_label.to_csv('./labels/' + site_name + '_labels.csv')
    cancer_labels_df[site_name] = label_class
print(count)
print(other)
cancer_labels_df.to_csv(s3_path+'labels/all_cancer_labels.csv')

In [None]:
# Put the number of controls (label_class == 0 in the most recently generated
# label frame) at the front of both tallies.
n_controls = len(ca_label[ca_label["label_class"] == 0])
count.insert(0, n_controls)
other[0] = n_controls

In [None]:
# Summary table: absolute counts and percentage of all participants for the
# class-1 ("Current") and class-2 ("Future") tallies of each label.
labels = [
    'Healthy', 'Colorectal ca', 'Breast ca', 'Prostate ca', 'Lung ca',
    'Bladder ca', 'Kidney ca', 'Brain ca', 'Non-Hodgkins lymphoma', 'Thyroid ca',
    'Stomach ca', 'Liver ca', 'Pancreas ca', 'Cervical ca',
]

n_participants = len(df)
cancer_df = pd.DataFrame({
    'Cancer_type': labels,
    'Current_N': count,
    'Current_%': np.array(count) / n_participants * 100,
    'Future_N': other,
    'Future_%': np.array(other) / n_participants * 100,
})
cancer_df.to_csv(s3_path+'tables/cancer_count_raw_data.csv', index=False)

# Displayed with the largest "Current" share first (the saved file keeps the original order).
cancer_df.sort_values(by=['Current_%'], ascending=False)

In [None]:
# Plot changes in cancer cases since baseline visit

df_lab = read_csv(s3_path+'labels/all_cancer_labels.csv')

# Legend names, one per cancer column of the label table.
labels = [
    'colorectal-ca', 'breast-ca', 'prostate-ca', 'lung-ca', 'bladder-ca',
    'kidney-ca', 'brain-ca', 'nh-lymphoma', 'thyroid-ca',
    'stomach-ca', 'liver-ca', 'pancreas-ca', 'cervical-ca',
]

# For every column of the label table, tally the rows equal to 1 and to 2.
count = [len(df_lab[df_lab[column] == 1]) for column in df_lab]
other = [len(df_lab[df_lab[column] == 2]) for column in df_lab]
# Discard the tally of the first column - presumably the index column that
# to_csv wrote alongside the labels (verify against read_csv's behaviour).
count.pop(0)
other.pop(0)

In [None]:
# One line per cancer type, joining its class-1 tally (x=1) to its class-2 tally (x=2).
x_positions = [1, 2]
for idx, n_class1 in enumerate(count):
    plt.plot(x_positions, [n_class1, other[idx]], label=labels[idx])

plt.title('Change in total number of cancer cases since baseline')
plt.legend(loc='upper left', prop={'size':7})
plt.xticks(x_positions)
plt.savefig('./figures/change_in_tot_cancer_cases.jpg', dpi=150)
plt.show()

In [None]:
# Put the number of rows with colorectal-ca == 0 at the front of both tallies,
# so index 0 holds that count and the cancer types follow from index 1.
n_healthy = len(df_lab[df_lab['colorectal-ca'] == 0])
count.insert(0, n_healthy)
other.insert(0, n_healthy)

In [None]:
# Display the class-2 tallies (index 0 holds the healthy count inserted above).
other

In [None]:
# Donut charts of the label distribution: healthy vs. any listed cancer (left)
# and the split across cancer types (right).
# `count` is laid out as [healthy, colorectal, breast, ..., cervical] here.
labels = ['Cancer-any', 'Healthy']
# Add up every cancer entry. The earlier slice count[1:13] stopped one short
# and left the last entry (cervical, index 13) out of the "any cancer" total.
# NOTE(review): a participant flagged for several cancers contributes once per
# cancer to this sum - confirm that is acceptable for this figure.
ns = [np.sum(count[1:]), count[0]]

fig1, (ax1, ax2) = plt.subplots(1,2,figsize=(12,8))
theme = plt.get_cmap('jet')

ax1 = plt.subplot(121)
plt.pie(ns, labels=labels, colors=['tomato','navy'], autopct='%1.1f%%')
ax1.title.set_text('Distribution of health labels')
# A white disc over the centre turns the pie into a donut.
circle = plt.Circle((0,0), 0.75, color='white')
p=plt.gcf()
p.gca().add_artist(circle)

ax2 = plt.subplot(122)
labels = ['Colorectal ca', 'Breast ca', 'Prostate ca', 'Lung ca', 'Bladder ca', 'Kidney ca', 'Brain ca', 'Non-Hodgkins lymphoma', 'Thyroid ca',
          'Stomach ca', 'Cervical ca']
# Entries 1-10 (colorectal ... stomach) plus cervical (13); liver (11) and
# pancreas (12) are not plotted here.
ns = count[1:11]+[count[13]]
ax2.title.set_text('Distribution of cancer subtypes')
theme = plt.get_cmap('jet')
# One evenly spaced colour from the colormap per wedge.
ax2.set_prop_cycle("color", [theme(1. * i / len(labels))
                             for i in range(len(labels))])
plt.pie(ns, labels=labels, autopct='%1.1f%%')
circle = plt.Circle((0,0), 0.75, color='white')
p=plt.gcf()
p.gca().add_artist(circle)

plt.savefig('./figures/label_distributions.jpg', dpi=150)
plt.show()

In [None]:
# Do people with colorectal cancer, have other types of cancer?

concurrent_cancer = []
for key in df_lab:
    concurrent_cancer.append(len(df_lab.loc[(df_lab['colorectal-ca']==1) & (df_lab[key]==1)]))
concurrent_cancer.pop(0)

In [None]:
# Total CRC cases that also carry another cancer label. Entry 0 is colorectal
# matched against itself (every CRC case), so it is left out of the sum.
np.sum(concurrent_cancer[1:])

In [None]:
# Donut charts: healthy / CRC only / CRC with another cancer (left) and
# healthy / current CRC / future CRC (right).
fig1, (ax1, ax2) = plt.subplots(1,2,figsize=(12,8))
theme = plt.get_cmap('jet')

labels = ['Healthy', 'CRC', 'Concurrent']
# concurrent_cancer[0] is colorectal matched against itself (i.e. all CRC
# cases), so only the remaining entries describe *other* cancers. Including
# it made the "CRC" wedge zero or negative and inflated "Concurrent".
# NOTE(review): a CRC case with two other cancers is counted twice here.
n_concurrent = np.sum(concurrent_cancer[1:])
ns = [count[0], count[1] - n_concurrent, n_concurrent]

ax1 = plt.subplot(121)
ax1.title.set_text('Distribution of health labels')
theme = plt.get_cmap('jet')
# One evenly spaced colour from the colormap per wedge.
ax1.set_prop_cycle("color", [theme(1. * i / len(labels))
                             for i in range(len(labels))])
plt.pie(ns, labels=labels, autopct='%1.1f%%')
# A white disc over the centre turns the pie into a donut.
circle = plt.Circle((0,0), 0.75, color='white')
p=plt.gcf()
p.gca().add_artist(circle)

labels = ['Healthy', 'Current CRC', 'Future CRC']
# count[0] = healthy, count[1] = class-1 CRC, other[1] = class-2 CRC.
ns = count[:2]+[other[1]]
ax2 = plt.subplot(122)
ax2.title.set_text('Distribution of CRC')
theme = plt.get_cmap('jet')
ax2.set_prop_cycle("color", [theme(1. * i / len(labels))
                             for i in range(len(labels))])
plt.pie(ns, labels=labels, autopct='%1.1f%%')
circle = plt.Circle((0,0), 0.75, color='white')
p=plt.gcf()
p.gca().add_artist(circle)

plt.savefig('./figures/CRC_distibutions.jpg', dpi=150)
plt.show()

In [None]:
# Donut chart of which other cancers co-occur with colorectal cancer.
fig1, ax1 = plt.subplots(figsize=(6,8))

labels = [
    'Breast ca', 'Prostate ca', 'Lung ca', 'Bladder ca', 'Kidney ca', 'Brain ca',
    'NH lymphoma', 'Thyroid ca', 'Stomach ca', 'Liver ca', 'Pancreas ca', 'Cervical ca',
]
# Entry 0 (colorectal with itself) is skipped.
ns = concurrent_cancer[1:]

ax1.title.set_text('Other cancer concurrent with CRC')
theme = plt.get_cmap('jet')
n_wedges = len(labels)
# One evenly spaced colour from the colormap per wedge.
ax1.set_prop_cycle("color", [theme(1. * step / n_wedges) for step in range(n_wedges)])
ax1.pie(ns, labels=labels, autopct='%1.1f%%')

# A white disc over the centre turns the pie into a donut.
ax1.add_artist(plt.Circle((0,0), 0.75, color='white'))

plt.savefig('./figures/CRC_concurrent_cases.jpg', dpi=150)
plt.show()

## Remove missing values, withdrawals, duplicate rows 

In [None]:
# Load the raw export and left-join the additional participant fields on 'eid'
# (the extra table names that key 'Participant ID', so it is renamed first).
file_path = 's3://ukb-colorectal-cancer/ukb52347.csv'
df = pd.read_csv(file_path)
df_add = pd.read_csv('s3://ukb-colorectal-cancer/Participant_table_additional_fields.csv')
df_add = df_add.rename(columns={'Participant ID': 'eid'})
df = df.merge(df_add, how='left', on='eid')

In [None]:
# Run both consent helpers, then aggregate the repeat measurements of the
# listed fields for visit 0.
df = remove_consent_withdrawals(remove_not_consented_participants(df))

fields_to_aggregate = [102, 4079, 4080]
df = aggregate_repeat_measurements(
    df,
    field_ids=fields_to_aggregate,
    visit_id=0,
)
df.shape

In [None]:
# Reload the combined cancer-label table saved earlier in this notebook.
df_lab = read_csv(s3_path+'labels/all_cancer_labels.csv')

In [None]:
# Keep only the columns used in the analysis. The order matters: the short
# names assigned below are matched to these columns purely by position
# (84 source columns, 84 new names).
df_sm = df[['31-0.0','21022-0.0','21000-0.0','189-0.0', \
            '21001-0.0','102-0','4079-0','4080-0',\
            '30000-0.0','30010-0.0','30020-0.0','30030-0.0','30080-0.0','30180-0.0','30500-0.0','30510-0.0','30520-0.0','30530-0.0',\
            '30630-0.0','30640-0.0','30670-0.0','30690-0.0','30710-0.0','30720-0.0','30760-0.0','30770-0.0','30780-0.0','30830-0.0',\
            '30850-0.0','30860-0.0','30870-0.0','30890-0.0',\
            '40008-0.0','134-0.0','40009-0.0','40007-0.0','40011-0.0','40012-0.0','190-0.0', \
            'Basophill percentage | Instance 0', 'Eosinophill percentage | Instance 0', 'Mean corpuscular haemoglobin concentration | Instance 0', 'Mean corpuscular volume | Instance 0', 'Monocyte percentage | Instance 0', \
            'Neutrophill percentage | Instance 0', 'Reticulocyte percentage | Instance 0', 'Platelet distribution width | Instance 0', 'Platelet crit | Instance 0', 'Alanine aminotransferase | Instance 0', 'Albumin | Instance 0', \
            'Alkaline phosphatase | Instance 0', 'Aspartate aminotransferase | Instance 0', 'Calcium | Instance 0', 'Creatinine | Instance 0', 'Direct bilirubin | Instance 0', 'Gamma glutamyltransferase | Instance 0', \
            'Glucose | Instance 0', 'Glycated haemoglobin (HbA1c) | Instance 0', 'Oestradiol | Instance 0', 'Phosphate | Instance 0', 'Rheumatoid factor | Instance 0', 'Total bilirubin | Instance 0', '3-Hydroxybutyrate | Instance 0', \
            'Citrate | Instance 0', 'Glutamine | Instance 0', 'Glycine | Instance 0', 'Histidine | Instance 0', 'Isoleucine | Instance 0', 'Lactate | Instance 0', 'Leucine | Instance 0', 'Monounsaturated Fatty Acids to Total Fatty Acids percentage | Instance 0', \
            'Phenylalanine | Instance 0', 'Sphingomyelins | Instance 0', 'Tyrosine | Instance 0', 'Valine | Instance 0', 'Docosahexaenoic Acid to Total Fatty Acids percentage | Instance 0', 'Hand grip strength (left) | Instance 0', 'Hand grip strength (right) | Instance 0', \
            'Trunk fat percentage | Instance 0', 'Basal metabolic rate | Instance 0', 'Adjusted T/S ratio | Instance 0', 'T/S ratio for regression dilution bias | Instance 0', 'Unadjusted T/S ratio | Instance 0', 'Z-adjusted T/S log | Instance 0']]
# Short analysis names, positionally aligned with the selection above; any
# column added or removed there needs the matching change here.
df_sm.columns = ['sex','age','ethnicity','townsend',\
                 'bmi','pulse','dbp','sbp',\
                 'wbc','rbc','hgb','hct','plt','lym','ualb','cr_urine','potas','sodium',\
                 'apoa','apob','urea','chol','crp','cysc','hdl','igf1','ldl','shbg',\
                 'tst','tprotein','tgly','vitd',\
                 'age_at_diagnosis','n_cancer_dx','n_cancer_occs','age_at_death','tumour_hist','tumour_beh','reason_lost', \
                 'baso', 'eos', 'mchc', 'mcv', 'mono', \
                 'np', 'ret', 'pdw', 'pct', 'alt', 'alb', \
                 'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt', \
                 'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv', \
                 'cit', 'gln', 'gly', 'his', 'ile', 'bla', 'leu', 'mufa',\
                 'phe', 'sph', 'tyr', 'val', 'dha_tfa', 'grip_l', 'grip_r', \
                 'trunk_fat', 'bmr', 'adj_ts_ratio', 'ts_ratio_reg', 'unadj_ts_ratio', 'z_ts_ratio']


In [None]:
# Display the renamed columns as a quick check.
df_sm.columns

In [None]:
# Attach the colorectal-cancer label as a new 'label_crc' column.
# `assign` builds a new frame instead of writing a column into the slice taken
# from the raw export, which is what triggers pandas' SettingWithCopyWarning.
# NOTE(review): the label is matched on index; this assumes the rows of df_lab
# line up with df_sm's index - confirm (a join on participant id would be safer).
df_sm = df_sm.assign(label_crc=df_lab['colorectal-ca'])
df = df_sm
df.columns

In [None]:
# Show (rows, columns) after the column selection and label attachment.
df.shape

In [None]:
# Calculate how many missing values each column has

# Per-column missing-value counts and percentages, most incomplete first.
nan_df = df.isna().sum().reset_index()
nan_df.columns = ['Column', 'NaN_Count']
nan_df['NaN_Count'] = nan_df['NaN_Count'].astype('int')
nan_df['NaN%'] = (nan_df['NaN_Count'] / len(df) * 100).round(1)
nan_df['Type'] = 'Missingness'
nan_df = nan_df.sort_values('NaN%', ascending=False)

nan_df.to_csv(s3_path+'tables/missing_value_count.csv', index=False)
nan_df

In [None]:
# Find and remove participants who withdrew consent

# Rows whose reason_lost equals 5 are the ones this cell treats as consent
# withdrawals; they are removed in place.
withdrew = df['reason_lost'].isin([5])
idx = df.index[withdrew]
print(len(idx),'people withdrew consent')
df.drop(index=idx, inplace=True)
print(len(df),'rows left after removing withdrawals')

In [None]:
# reason_lost has served its purpose in the withdrawal filter; remove the column.
del df['reason_lost']

In [None]:
# Remove duplicate rows if any

# Rows identical in every remaining column are collapsed to their first occurrence.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Renumber rows 0..n-1 after the removals above (the old index is discarded).
df.reset_index(inplace=True, drop=True)
df.head()

In [None]:
# Remove participants with other or future cancer

# Rows with label_crc equal to 2 or 3 are excluded from the dataset.
# They are dropped by index label directly: the previous df.index[other_idx]
# re-used the labels as positions, which is only correct while the index is
# exactly 0..n-1.
other_idx = list(df.index[df['label_crc'].isin([2, 3])])
print(len(other_idx), 'participants have other types of cancer, or will develop cancer')
df.drop(other_idx, inplace=True)
print(len(df), 'rows left after removing these participants')

In [None]:
# Persist the cleaned dataset; the index is not written.
df.to_csv(s3_path+'crc_dataset.csv', index=False)

## Visualise label and demographic information

In [None]:
# Reload the cleaned dataset so the sections below can run from this point.
df = read_csv(s3_path+'crc_dataset.csv')

In [None]:
# Count controls and CRC cases for each ethnicity code. The order of this
# list matters: the grouping in the next cell slices it by position.
# NOTE(review): codes 2, 3, 4 and 4003 are not listed - check the field's
# data-coding to confirm no participants carry them.
eth_codes = [1, 1001, 1002, 1003, 2001, 2002, 2003, 2004, 3001, 3002, 3003, 3004, 4001, 4002, 5, 6]
is_control = df['label_crc'] == False
is_case = df['label_crc'] == True
c_ns = [len(df.loc[(df['ethnicity'] == code) & is_control]) for code in eth_codes]
p_ns = [len(df.loc[(df['ethnicity'] == code) & is_case]) for code in eth_codes]

In [None]:
def _collapse_ethnicity(per_code):
    """Fold the 16 per-code counts into five groups by position in eth_codes."""
    return [
        np.sum(per_code[:4]),                   # positions 0-3
        np.sum(per_code[4:8]),                  # positions 4-7
        np.sum(per_code[8:12]) + per_code[14],  # positions 8-11 plus 14 (code 5)
        np.sum(per_code[12:14]),                # positions 12-13
        per_code[15],                           # position 15 (code 6)
    ]


# Group totals for controls and CRC cases; the following plot labels them
# White, Mixed, Asian, Black and Other in this order.
c_n = _collapse_ethnicity(c_ns)
p_n = _collapse_ethnicity(p_ns)

In [None]:
# Display the grouped ethnicity counts for controls.
c_n

In [None]:
# Visualise ethnicity

# Side-by-side donut charts of grouped ethnicity: controls (left), CRC (right).
labels = ['White', 'Mixed', 'Asian', 'Black', 'Other']

fig1, (ax1, ax2) = plt.subplots(1,2,figsize=(12,6))
theme = plt.get_cmap('jet')
n_wedges = len(labels)
# One evenly spaced colour from the colormap per wedge.
wedge_colours = [theme(1. * step / n_wedges) for step in range(n_wedges)]

ax1 = plt.subplot(121)
ax1.title.set_text('Ethnicity in Healthy controls')
ax1.set_prop_cycle("color", wedge_colours)
ax1.pie(c_n, labels=labels, autopct='%1.1f%%')
# A white disc over the centre turns the pie into a donut.
ax1.add_artist(plt.Circle((0,0), 0.75, color='white'))

ax2 = plt.subplot(122)
ax2.title.set_text('Ethnicity in Colorectal ca')
ax2.set_prop_cycle("color", wedge_colours)
ax2.pie(p_n, labels=labels, autopct='%1.1f%%')
ax2.add_artist(plt.Circle((0,0), 0.75, color='white'))

plt.savefig('./figures/ethnicity_distributions.jpg', dpi=150)
plt.show()

In [None]:
# Side-by-side donut charts of sex: controls (left), CRC cases (right).
sex_codes = [0, 1]
labels = ['Female', 'Male']
sex_colours = ['tomato','navy']

fig1, (ax1, ax2) = plt.subplots(1,2,figsize=(10,5))

# Controls.
ns = [len(df.loc[(df['sex'] == code) & (df['label_crc'] == False)]) for code in sex_codes]
ax1 = plt.subplot(121)
ax1.title.set_text('Sex in Healthy controls')
ax1.pie(ns, labels=labels, colors=sex_colours, autopct='%1.1f%%')
# A white disc over the centre turns the pie into a donut.
ax1.add_artist(plt.Circle((0,0), 0.75, color='white'))

# CRC cases.
ns = [len(df.loc[(df['sex'] == code) & (df['label_crc'] == True)]) for code in sex_codes]
ax2 = plt.subplot(122)
ax2.title.set_text('Sex in Colorectal ca')
ax2.pie(ns, labels=labels, colors=sex_colours, autopct='%1.1f%%')
ax2.add_artist(plt.Circle((0,0), 0.75, color='white'))

plt.savefig('./figures/sex_distributions.jpg', dpi=150)
plt.show()

In [None]:
# Visualise socioeconomic status

# Row masks for controls and CRC cases; later cells reuse both.
c_idx = df['label_crc'] == False
p_idx = df['label_crc'] == True

# Townsend deprivation index histograms with the median marked, one panel per group.
fig1, (ax1, ax2) = plt.subplots(1,2,figsize=(10,5))

ax1 = plt.subplot(121)
townsend_controls = df['townsend'][c_idx]
median_controls = townsend_controls.median()
ax1.hist(townsend_controls, bins=50, color='midnightblue', edgecolor='none')
ax1.title.set_text('Townsend DI in Healthy Controls')
ax1.axvline(median_controls, color='gainsboro', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = ax1.get_ylim()
# Median annotation sits at 90% of the panel height.
ax1.text(median_controls, max_ylim*0.9, 'Med: {:.2f}'.format(median_controls))

ax2 = plt.subplot(122)
townsend_cases = df['townsend'][p_idx]
median_cases = townsend_cases.median()
ax2.hist(townsend_cases, bins=50, color='tomato', edgecolor='none')
ax2.title.set_text('Townsend DI in Colorectal ca')
ax2.axvline(median_cases, color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = ax2.get_ylim()
ax2.text(median_cases, max_ylim*0.9, 'Med: {:.2f}'.format(median_cases))

plt.savefig('./figures/townsend_distributions.jpg', dpi=150)
plt.show()

In [None]:
# Box plots of six covariates, split by the CRC label, on a 2x3 grid.
df2 = df[['age','townsend','bmi','pulse','sbp','dbp']]
group = df['label_crc']
plot_names = ['Age', 'Townsend DI', 'BMI', 'Pulse',' Systolic BP', 'Diastolic BP']

sns.set_style('whitegrid')
plt.subplots(2,3,figsize=(10,6))
for k, col in enumerate(df2, start=1):
    ax = plt.subplot(2, 3, k)
    sns.set_palette(['mediumblue','r'])
    box = sns.boxplot(
        data=df, x=group, y=col, width=0.4,
        boxprops=dict(alpha=.85), medianprops=dict(color="w"),
        fliersize=1, flierprops=dict(marker='o', markeredgecolor='grey'),
        linewidth=0.9,
    )
    # Axis labels are blanked; the panel title names the variable instead.
    box.set(xlabel='', ylabel='')
    ax.title.set_text(plot_names[k-1])

plt.savefig('./figures/covariate_boxplots.jpg', dpi=150)
plt.show()

## Visualisations of the cancer-related variables

In [None]:
# Cancer-related variables, restricted to the CRC rows (p_idx mask).
cancer_columns = ['age_at_death','n_cancer_dx','age_at_diagnosis','n_cancer_occs','tumour_hist','tumour_beh']
df2 = df.loc[p_idx, cancer_columns]
df2.head()

In [None]:
# Distributions of age

# 2x2 figure for the CRC rows: age-at-diagnosis and age-at-death histograms
# (top) and donut charts of diagnosis / occurrence counts (bottom).
plt.subplots(2,2,figsize=(9,9))

# (subplot position, column, title, factor that shifts the median label left of the line)
for position, column, title, text_shift in (
    (221, 'age_at_diagnosis', 'Age at diagnosis', 0.65),
    (222, 'age_at_death', 'Age at death', 0.80),
):
    ax = plt.subplot(position)
    values = df2[column]
    median_value = values.median()
    plt.hist(values, bins=50, color='tomato', edgecolor='none', density=True)
    ax.title.set_text(title)
    plt.axvline(median_value, color='k', linestyle='dashed', linewidth=1)
    min_ylim, max_ylim = plt.ylim()
    plt.text(median_value*text_shift, max_ylim*0.9, 'Med: {:.2f}'.format(median_value))

theme = plt.get_cmap('jet')

# Number of cancer diagnoses: exactly 1, exactly 2, and 3 or more.
labels = ['1', '2', '>=3']
ns = [len(df2.loc[(df2['n_cancer_dx'] == n)]) for n in range(1,3)]
ns.append(len(df2.loc[(df2['n_cancer_dx'] >= 3)]))

ax1 = plt.subplot(223)
ax1.title.set_text('Number of cancer diagnoses')
ax1.set_prop_cycle("color", [theme(1. * step / len(labels)) for step in range(len(labels))])
ax1.pie(ns, labels=labels, autopct='%1.1f%%')
# A white disc over the centre turns the pie into a donut.
ax1.add_artist(plt.Circle((0,0), 0.78, color='white'))

# Number of cancer occurrences: exactly 1, 2, 3, and 4 or more.
labels = ['1', '2', '3', '>=4']
ns = [len(df2.loc[(df2['n_cancer_occs'] == n)]) for n in range(1,4)]
ns.append(len(df2.loc[(df2['n_cancer_occs'] >= 4)]))

ax1 = plt.subplot(224)
ax1.title.set_text('Number of cancer occurrences')
ax1.set_prop_cycle("color", [theme(1. * step / len(labels)) for step in range(len(labels))])
ax1.pie(ns, labels=labels, autopct='%1.1f%%')
ax1.add_artist(plt.Circle((0,0), 0.78, color='white'))

plt.savefig('./figures/cancer_variable_distributions.jpg', dpi=150)
plt.show()

In [None]:
# Visualise tumour behaviour

# Count CRC cases per tumour-behaviour code; labels[i] names beh_codes[i].
beh_codes = [3, 2, 6, 0, 1]
labels = ['Malignant-primary site', 'Carcinoma in situ', 'Malignant-metastatic', 'Benign', 'Uncertain']
is_crc = df['label_crc'] == True
p_ns = [len(df.loc[(df['tumour_beh'] == code) & is_crc]) for code in beh_codes]
p_ns


In [None]:
# Donut chart of tumour behaviour among CRC cases.
fig1, ax1 = plt.subplots(figsize=(6,8))

ax1.title.set_text('Tumour behaviour')
theme = plt.get_cmap('jet')
n_wedges = len(labels)
# One evenly spaced colour from the colormap per wedge.
ax1.set_prop_cycle("color", [theme(1. * step / n_wedges) for step in range(n_wedges)])
ax1.pie(p_ns, labels=labels, autopct='%1.1f%%')

# A white disc over the centre turns the pie into a donut.
ax1.add_artist(plt.Circle((0,0), 0.75, color='white'))

plt.savefig('./figures/tumour_behaviour.jpg', dpi=150)
plt.show()

## Histograms

In [None]:
# Columns to histogram, in plotting order (one subplot each in the next cell).
hist_columns = [
    'age', 'townsend',
    'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas', 'sodium',
    'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shbg',
    'tst', 'tprotein', 'tgly', 'vitd',
    'age_at_diagnosis', 'n_cancer_dx', 'n_cancer_occs', 'age_at_death',
    'baso', 'eos', 'mchc', 'mcv', 'mono',
    'np', 'ret', 'pdw', 'pct', 'alt', 'alb',
    'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv',
    'cit', 'gln', 'gly', 'his', 'ile', 'bla', 'leu', 'mufa',
    'phe', 'sph', 'tyr', 'val', 'dha_tfa', 'grip_l', 'grip_r',
    'trunk_fat', 'bmr', 'adj_ts_ratio', 'ts_ratio_reg', 'unadj_ts_ratio', 'z_ts_ratio',
]
df2 = df[hist_columns]

In [None]:
# Show (rows, columns) of the histogram subset; the grid below has 20x4 = 80 slots.
df2.shape

In [None]:
# Overlaid, density-normalised histograms of every selected column for
# controls (HC) and CRC cases, one subplot per column.
plt.subplots(20,4,figsize=(13,70))

for k, col in enumerate(df2, start=1):
    ax = plt.subplot(20,4,k)
    values = df2[col]
    # Both groups share one x-range, clipped to the 0.01st-99.5th percentile
    # of the whole column.
    shared_range = [values.quantile(0.0001), values.quantile(0.995)]
    for mask, group_label in ((c_idx, 'HC'), (p_idx, 'CRC')):
        plt.hist(values[mask], bins=100, alpha=0.5, edgecolor='none', range=shared_range, density=True, label=group_label)
    ax.title.set_text(col)
    min_ylim, max_ylim = plt.ylim()
    # Dashed lines mark each group's median.
    for mask in (c_idx, p_idx):
        plt.axvline(values[mask].median(), color='whitesmoke', linestyle='dashed', linewidth=1.5)
    plt.legend()

plt.savefig('./figures/biomarker_distributions.jpg', dpi=150)
plt.show()