# Stats and visualisation for cancer data

In [None]:
pip install matplotlib==3.4.2

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math, boto3, tempfile
import scipy.stats as sp
from sklearn import manifold
from utils import *
from label_utils import *

In [None]:
# Base S3 location used for every input and output table in this notebook.
s3_path = 's3://ukb-colorectal-cancer/analysis/'
# read_csv is not pandas' function here; presumably it comes from the utils star-import — verify.
df = read_csv(f'{s3_path}crc_dataset.csv')

In [None]:
# List the column names of the loaded dataset.
df.columns

In [None]:
# Count the rows for each value of the `label_crc` column.
df['label_crc'].value_counts()

# Chi-squared (χ²) tests

In [None]:
# Chi-squared tests of independence between a categorical variable and CRC status.
#
# Each test is run on the contingency table of raw counts (group x category) with
# scipy.stats.chi2_contingency. Counts are gathered per explicit category value, so
# the control and patient rows are always aligned on the same categories.

chi_res = []
pval = []

c_idx = df['label_crc'] == False
p_idx = df['label_crc'] == True
# Group sizes, taken from the masks so they do not depend on value_counts() ordering.
c = int(c_idx.sum())
p = int(p_idx.sum())


def _chi2_independence(values, categories, control_mask, patient_mask):
    """Chi-squared test of independence between group membership and `values`.

    Parameters
    ----------
    values : pandas.Series
        Categorical variable, one entry per row of the dataset.
    categories : iterable
        Category values to tabulate; any other values are ignored.
    control_mask, patient_mask : pandas.Series of bool
        Row masks selecting the two groups.

    Returns
    -------
    (control_counts, patient_counts, chi2, p_value)
        Counts per category for each group, then the test statistic and p-value.
        chi2 and p_value are NaN when the test is undefined (fewer than two
        populated categories, or an empty group).
    """
    control_counts = np.array([(values[control_mask] == cat).sum() for cat in categories])
    patient_counts = np.array([(values[patient_mask] == cat).sum() for cat in categories])
    observed = np.vstack([control_counts, patient_counts])
    # Categories nobody falls into would give zero expected frequencies.
    observed = observed[:, observed.sum(axis=0) > 0]
    if observed.shape[1] < 2 or (observed.sum(axis=1) == 0).any():
        return control_counts, patient_counts, np.nan, np.nan
    # For a 2x2 table scipy applies Yates' continuity correction by default.
    chi2, p_value, _dof, _expected = sp.chi2_contingency(observed)
    return control_counts, patient_counts, chi2, p_value


# Sex: tabulate every value that actually occurs in the data.
sex_codes = np.sort(df['sex'].dropna().unique())
c_fr, p_fr, chi, ps = _chi2_independence(df['sex'], sex_codes, c_idx, p_idx)
chi_res.append(chi); pval.append(ps)

# Ethnicity: restricted to this explicit list of codes.
eth_codes =[1, 1001, 1002, 1003, 2001, 2002, 2003, 2004, 3001, 3002, 3003, 3004, 4001, 4002, 5]
c_fr, p_fr, chi, ps = _chi2_independence(df['ethnicity'], eth_codes, c_idx, p_idx)
chi_res.append(chi); pval.append(ps)

stats_df = pd.DataFrame({'biomarker': ['sex', 'ethnicity'], 'chi2': chi_res, 'p': pval})
stats_df.to_csv(s3_path+'tables/chisquare_results.csv', index=False)
stats_df

# Intercorrelations

In [None]:
# Columns used for the intercorrelation analysis; the order here is the order of
# the rows/columns in the correlation heatmaps.
# .copy() makes df2 an independent frame so that later row filtering does not act
# on a slice of df (which triggers pandas' SettingWithCopyWarning).
df2 = df[[
    'sex', 'age', 'ethnicity', 'townsend',
    'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'plt', 'lym', 'ualb', 'cr_urine', 'potas', 'sodium',
    'apob', 'urea', 'chol', 'crp', 'cysc', 'igf1', 'ldl', 'sph', 'apoa', 'hdl', 'shbg',
    'e2', 'phos', 'gly', 'dha_tfa', 'pct', 'trunk_fat', 'cit', 'calc', 'tprotein', 'vitd',
    'tst', 'grip_l', 'grip_r', 'bmr', 'rbc', 'hgb', 'hct', 'cr_blood', 'mufa', 'tgly',
    'ile', 'bla', 'leu', 'val', 'phe', 'tyr', 'alt', 'gln', 'his', 'alb', 'alp',
    'ast', 'dbi', 'ggt',
    'glu', 'hgba1c', 'tbil',
    'baso', 'eos', 'mchc', 'mcv', 'mono',
    'np', 'ret', 'pdw',
    'rf', 'phbv',
    'adj_ts_ratio', 'unadj_ts_ratio', 'z_ts_ratio', 'label_crc',
]].copy()

In [None]:
# Keep only rows with a known `label_crc`. dropna returns a new frame instead of
# mutating a slice of df in place, so the cell can be re-run safely.
df2 = df2.dropna(subset=['label_crc'])
df2.shape

In [None]:
# Calculate intercorrelations

def _pearson_pvalue(a, b):
    """Return only the p-value of the Pearson correlation between a and b."""
    return sp.pearsonr(a, b)[1]


corrs = df2.corr()
# pandas puts 1 on the diagonal for callable methods; subtracting the identity zeroes it.
pval = df2.corr(method=_pearson_pvalue) - np.eye(*corrs.shape)

# Hide the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corrs, dtype=bool))

heatmap_kwargs = dict(
    annot=False, annot_kws={'size': 9}, fmt='.2f',
    xticklabels=1, yticklabels=1, square=True,
    cbar_kws={"shrink": 0.5}, cmap='bwr', vmin=-0.5, vmax=0.5,
)
plt.figure(figsize=(20, 20))
heat_ax = sns.heatmap(corrs, mask=mask, **heatmap_kwargs)
heat_ax.set(title='Intercorrelations - rmap')

plt.savefig('./figures/biomarker_rmap_everyone.jpg', dpi=150)
plt.show()

In [None]:

# Same correlation map, but hiding every cell whose p-value is above 0.05.
mask = pval > 0.05

masked_heatmap_kwargs = dict(
    annot=False, annot_kws={'size': 9}, fmt='.2f',
    xticklabels=1, yticklabels=1, square=True,
    cbar_kws={"shrink": 0.5}, cmap='bwr', vmin=-0.5, vmax=0.5,
)
plt.figure(figsize=(20, 20))
masked_ax = sns.heatmap(corrs, mask=mask, **masked_heatmap_kwargs)
masked_ax.set(title='Intercorrelations - rmap p<0.05')

plt.savefig('./figures/biomarker_rmap_everyone_masked.jpg', dpi=150)
plt.show()

In [None]:
# Plot MDS

# Dissimilarity between two variables: 0 when |r| = 1, 1 when r = 0.
dis = 1 - corrs.abs()

mds = manifold.MDS(
    n_components=5,
    max_iter=3000,
    eps=1e-9,
    dissimilarity='precomputed',
    random_state=42,
)
results = mds.fit(dis)
names = df2.columns
coors = results.embedding_

# Only the first two of the five embedding dimensions are drawn.
fig = plt.figure(figsize=(10, 10))
plt.subplots_adjust(bottom=0.1)
plt.scatter(coors[:, 0], coors[:, 1])

for label, point in zip(names, coors[:, :2]):
    plt.annotate(label, xy=(point[0], point[1]), xytext=(-15, 15), textcoords='offset points')
plt.savefig('./figures/mds_dissimilarity.jpg', dpi=150)
plt.show()

In [None]:
# One column per cancer label; the column names double as the row labels of the result arrays.
df_lab = read_csv(f'{s3_path}labels/all_cancer_labels.csv')
labels = list(df_lab.columns)

# Uninitialised (labels x columns) result arrays.
# NOTE(review): the width is taken from whatever df2 holds when this cell runs — confirm it
# matches the column list used by the correlation loop.
result_shape = (len(labels), len(df2.columns))
rho_arr = np.empty(result_shape, float)
p_arr = np.empty(result_shape, float)

In [None]:
# Spearman correlation between every biomarker and every cancer label.
# rho_arr[i, j] / p_arr[i, j] hold the coefficient / p-value for labels[i] vs biomarker j.

# Columns tested against each label (built once; the order defines the j axis).
biomarker_cols = [
    'age', 'townsend',
    'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas', 'sodium',
    'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shbg',
    'tst', 'tprotein', 'tgly', 'vitd',
    'age_at_diagnosis', 'n_cancer_dx', 'n_cancer_occs', 'age_at_death',
    'baso', 'eos', 'mchc', 'mcv', 'mono',
    'np', 'ret', 'pdw', 'pct', 'alt', 'alb',
    'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv',
    'cit', 'gln', 'gly', 'his', 'ile', 'bla', 'leu', 'mufa',
    'phe', 'sph', 'tyr', 'val', 'dha_tfa', 'grip_l', 'grip_r',
    'trunk_fat', 'bmr', 'adj_ts_ratio', 'unadj_ts_ratio', 'z_ts_ratio',
]
features = df[biomarker_cols]

# Allocate the result arrays here so their width always equals the number of
# columns tested; NaN marks any cell that is never filled.
rho_arr = np.full((len(labels), len(biomarker_cols)), np.nan)
p_arr = np.full((len(labels), len(biomarker_cols)), np.nan)

for i, cancer in enumerate(labels):
    # Read (do not pop) the label column so df_lab keeps its columns and the cell
    # can be re-run; reindex aligns the labels with the feature rows.
    lb = df_lab[cancer].reindex(features.index)

    # Rows whose label value is 2 or 3 are excluded from the correlation.
    keep = ~lb.isin([2, 3])
    df2 = features.loc[keep]
    lb = lb.loc[keep]

    for j, col in enumerate(df2.columns):
        # nan_policy='omit' skips rows where the biomarker or the label is missing.
        rho, p = sp.spearmanr(df2[col], lb, nan_policy='omit')
        rho_arr[i, j] = rho
        p_arr[i, j] = p

In [None]:
# Three stacked heatmaps of the label-vs-biomarker Spearman coefficients:
# all values, values with p <= 0.05, and values passing a corrected threshold.

# Set the font scale before anything is drawn so all three panels share it.
sns.set(font_scale=1.2)

# NOTE(review): the corrected threshold divides by the number of biomarkers only,
# not by the total number of (label, biomarker) tests — confirm this is intended.
panels = [
    ('r-map', None),
    ('r-map at p<0.05 uncor', p_arr > 0.05),
    ('r-map at p<0.05 cor', p_arr > (0.05 / len(df2.columns))),
]

fig1, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(25, 20))

for ax, (title, mask) in zip((ax1, ax2, ax3), panels):
    # Row labels come from `labels` (the list of label names) rather than
    # df_lab.columns, which may have been emptied by earlier processing.
    sns.heatmap(rho_arr, mask=mask, ax=ax, annot=False,
                xticklabels=df2.columns, yticklabels=labels,
                square=True, cbar_kws={"shrink": 0.5}, cmap='bwr', vmin=-0.04, vmax=0.04)
    ax.set(title=title)

fig1.savefig('./figures/biomarker_cancer_label_rmap.jpg', dpi=150)
plt.show()

# K-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, silhouette_score

# Biomarkers used for clustering, restricted to rows with no missing values.
# dropna() returns a new frame, avoiding an in-place edit on a slice of df.
df2 = df[['age', 'townsend', 'bmi', 'pulse', 'dbp', 'sbp',
          'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas',
          'sodium', 'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1',
          'ldl', 'shbg', 'tst', 'tprotein', 'tgly', 'vitd']].dropna()

In [None]:
# Fit k-means for k = 1..10 and record quality metrics for each k.
# The frame is transposed, so the points being clustered are the biomarkers (columns of df2).
# NOTE(review): values are clustered unscaled — confirm this is intended.
search_range = range(1, 11)
report = {}
biomarker_points = df2.T  # hoisted: identical for every k

for k in search_range:
    temp_dict = {}
    # `algorithm` is left at the library default ('auto' is no longer accepted by
    # recent scikit-learn) and n_init is pinned so results do not change with the
    # library's default.
    kmeans = KMeans(init='k-means++',
                    n_clusters=k,
                    n_init=10,
                    max_iter=1000,
                    random_state=1,
                    verbose=0).fit(biomarker_points)
    temp_dict['Sum of squared error'] = kmeans.inertia_
    try:
        cluster = kmeans.predict(biomarker_points)
        chs = calinski_harabasz_score(biomarker_points, cluster)
        ss = silhouette_score(biomarker_points, cluster)
        temp_dict['Calinski Harabasz Score'] = chs
        temp_dict['Silhouette Score'] = ss
    except ValueError:
        # Both scores are undefined for a single cluster (k = 1); keep inertia only.
        pass
    report[k] = temp_dict

In [None]:
# One row per k, one column per metric; each metric gets its own subplot.
report_df = pd.DataFrame(report).T

plot_options = dict(
    figsize=(10, 7),
    xticks=search_range,
    grid=True,
    title='Selecting optimal "K"',
    subplots=True,
    marker='o',
    sharex=True,
)
report_df.plot(**plot_options)
plt.tight_layout()
plt.show()

In [None]:
# Final model with 4 clusters, fitted on the transposed frame (one point per biomarker).
k_model = KMeans(n_clusters=4, random_state=42).fit(df2.T)
k_model.cluster_centers_.shape

In [None]:
# Table pairing each biomarker with its assigned cluster, shown sorted by cluster.
segmented_data = pd.DataFrame({
    'Biomarkers': df2.columns,
    'Cluster': k_model.labels_,
})
segmented_data.sort_values('Cluster')

# Stats

In [None]:
from sklearn.preprocessing import minmax_scale

In [None]:
# Columns compared between the two `label_crc` groups.
df2 = df.loc[:, [
    'age', 'townsend',
    'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas', 'sodium',
    'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shbg',
    'tst', 'tprotein', 'tgly', 'vitd',
    'age_at_diagnosis', 'n_cancer_dx', 'n_cancer_occs', 'age_at_death',
    'baso', 'eos', 'mchc', 'mcv', 'mono',
    'np', 'ret', 'pdw', 'pct', 'alt', 'alb',
    'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv',
    'cit', 'gln', 'gly', 'his', 'ile', 'bla', 'leu', 'mufa',
    'phe', 'sph', 'tyr', 'val', 'dha_tfa', 'grip_l', 'grip_r',
    'trunk_fat', 'bmr', 'adj_ts_ratio', 'unadj_ts_ratio', 'z_ts_ratio',
]]

# Boolean row masks: c_idx where label_crc is False, p_idx where it is True.
c_idx = df['label_crc'].eq(False)
p_idx = df['label_crc'].eq(True)

In [None]:
# Per-biomarker descriptive statistics and group comparisons.
# Suffix _c is the c_idx group, _p the p_idx group; _cn/_pn are means of min-max scaled values.
mean_c = []; mean_p = []; mean_cn = []; mean_pn = []
sd_c = []; sd_p = []
count_c = []; count_p = []

norm_test = []
tval = []
t_pval = []
uval = []
u_pval = []

for col in df2:
    values = df2[col]
    controls = values[c_idx]
    patients = values[p_idx]

    # Series.count() counts non-missing entries only.
    count_c.append(controls.count())
    count_p.append(patients.count())
    mean_c.append(np.nanmean(controls))
    mean_p.append(np.nanmean(patients))

    # Scale the whole column to [0, 1] so group means are comparable across biomarkers.
    d = minmax_scale(values, feature_range=(0,1), axis=0)
    mean_cn.append(np.nanmean(d[c_idx]))
    mean_pn.append(np.nanmean(d[p_idx]))

    sd_c.append(np.nanstd(controls))
    sd_p.append(np.nanstd(patients))

    # Normality test on the full column; only the p-value is stored.
    nm = sp.normaltest(values, nan_policy='omit')
    norm_test.append(nm[1])

    # Between-samples t-test, ignoring missing values.
    res = sp.ttest_ind(controls, patients, nan_policy='omit')
    tval.append(res[0]); t_pval.append(round(res[1],3))

    # Mann-Whitney U test. Missing values are dropped explicitly so this test uses
    # the same observations as the t-test, and the alternative is stated so the
    # p-value does not depend on the SciPy version's default.
    controls_valid = controls.dropna()
    patients_valid = patients.dropna()
    if len(controls_valid) > 0 and len(patients_valid) > 0:
        res = sp.mannwhitneyu(controls_valid, patients_valid, alternative='two-sided')
        uval.append(round(res[0],2)); u_pval.append(round(res[1],3))
    else:
        # The test is undefined when a group has no observations for this column.
        uval.append(np.nan); u_pval.append(np.nan)

In [None]:
# Show every row when the table is displayed.
pd.set_option('display.max_rows', None)

# Assemble the per-biomarker results; the key order is the CSV column order.
stats_df = pd.DataFrame({
    'biomarker': df2.columns,
    'count_c': count_c,
    'count_p': count_p,
    'mean_c': mean_c,
    'mean_cn': mean_cn,
    'sd_c': sd_c,
    'mean_p': mean_p,
    'mean_pn': mean_pn,
    'sd_p': sd_p,
    'normality': norm_test,
    'tval': tval,
    't_pval': t_pval,
    'uval': uval,
    'u_pval': u_pval,
})
# The file is written in biomarker order; sorting only affects the displayed table.
stats_df.to_csv(s3_path+'tables/group_stats.csv', index=False)
stats_df = stats_df.sort_values('u_pval', ignore_index=True)
stats_df

In [None]:
# Grouped bar chart of the normalised group means, one pair of bars per biomarker.
# Rows with u_pval > 0.05 are found by index, which equals bar position after the re-indexed sort.
idx = stats_df.index[stats_df['u_pval'].gt(0.05)].to_numpy()
x = np.arange(len(stats_df))
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 5))
rects1 = ax.bar(x - width / 2, stats_df['mean_cn'], width, label='HC', color='navy')
rects2 = ax.bar(x + width / 2, stats_df['mean_pn'], width, label='CRC', color='tomato')
ax.set_ylabel('au')
ax.set_title('Normalised means by group')
ax.set_xticks(x)
ax.set_xticklabels(stats_df['biomarker'], rotation=80)
ax.legend()

# Grey out the bar pairs selected above.
for j in idx:
    for bars, shade in ((rects1, 'grey'), (rects2, 'gainsboro')):
        bars[j].set_color(shade)

fig.tight_layout()
plt.savefig('./figures/biomarker_group_effects.jpg', dpi=150)
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer

In [None]:
# Reload the dataset and select the columns that feed the PCA.
s3_path = 's3://ukb-colorectal-cancer/analysis/'
df = read_csv(f'{s3_path}crc_dataset.csv')

df2 = df.loc[:, [
    'age', 'townsend',
    'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas', 'sodium',
    'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shbg',
    'tst', 'tprotein', 'tgly', 'vitd',
    'baso', 'eos', 'mchc', 'mcv', 'mono',
    'np', 'ret', 'pdw', 'pct', 'alt', 'alb',
    'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv',
    'cit', 'gln', 'gly', 'his', 'ile', 'bla', 'leu', 'mufa',
    'phe', 'sph', 'tyr', 'val', 'dha_tfa', 'grip_l', 'grip_r',
    'trunk_fat', 'bmr', 'adj_ts_ratio', 'unadj_ts_ratio', 'z_ts_ratio',
]]

df2.head()

In [None]:
# Drop rows with more than 30% missing biomarker values, then attach the label.
MAX_MISSING_PCT = 30

# Measure missingness on the biomarker columns only, so re-running the cell (when
# label_crc is already attached) gives the same answer.
biomarker_part = df2.drop(columns='label_crc', errors='ignore')
null_perc = np.array(biomarker_part.isna().sum(axis=1) / len(biomarker_part.columns) * 100)
too_sparse = null_perc > MAX_MISSING_PCT
idx = np.where(too_sparse)[0]  # positions of the rows being dropped

# Boolean-mask selection returns a new frame (no in-place edit of a slice of df);
# assign() aligns the label with the remaining rows by index.
df2 = biomarker_part.loc[~too_sparse].assign(label_crc=df['label_crc'])
df2.shape

In [None]:
# Separate the label from the feature columns. `pop` removes `label_crc` from df2
# in place, so this cell fails if run a second time without rebuilding df2.
#c,p = df2['label_crc'].value_counts(); print(p)
#df2 = get_subset_of_rows_balanced(df2, stratify_key='label_crc', random_state=0)
y = df2.pop('label_crc')
df2.shape

In [None]:
# Pre-processing for PCA: min-max scale, KNN-impute, then standardise.
feature_names = df2.columns

scaler = MinMaxScaler()  # normalise columns
df2 = pd.DataFrame(scaler.fit_transform(df2), columns=feature_names)

# Fill missing values from the 5 nearest rows (scaled first, presumably so no
# column dominates the neighbour distance — confirm).
imputer = KNNImputer(n_neighbors=5)
df2 = pd.DataFrame(imputer.fit_transform(df2), columns=feature_names)

x = StandardScaler().fit_transform(df2)  # scale columns

In [None]:
# Project the standardised features onto the first three principal components.
pca = PCA(n_components=3)
pc = pca.fit_transform(x)
pc_df = pd.DataFrame(data=pc, columns=['PC1','PC2','PC3'])

# pc_df has a fresh 0..n-1 index while y keeps the index of the filtered rows.
# Assigning the Series directly would align on index and mislabel rows, so attach
# the values positionally (rows of x are in the same order as y).
assert len(y) == len(pc_df), "label and component rows must match one-to-one"
pc_df['label'] = y.to_numpy()
pc_df.head()

In [None]:
# Amount of variance explained by each fitted component (absolute values, not ratios).
pca.explained_variance_

# Odds ratios

In [None]:
from scipy.stats import fisher_exact

In [None]:
# Columns for which odds ratios are computed.
biomarkers = [
    'age', 'ethnicity', 'townsend', 'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas',
    'sodium', 'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1',
    'ldl', 'shbg', 'tst', 'tprotein', 'tgly', 'vitd', 'age_at_diagnosis',
    'n_cancer_dx', 'n_cancer_occs', 'age_at_death', 'tumour_hist',
    'tumour_beh', 'baso', 'eos', 'mchc', 'mcv', 'mono', 'np', 'ret', 'pdw',
    'pct', 'alt', 'alb', 'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv', 'cit', 'gln',
    'gly', 'his', 'ile', 'bla', 'leu', 'mufa', 'phe', 'sph', 'tyr', 'val',
    'dha_tfa', 'grip_l', 'grip_r', 'trunk_fat', 'bmr', 'adj_ts_ratio',
    'ts_ratio_reg', 'unadj_ts_ratio', 'z_ts_ratio',
]
# Independent copy of those columns (deep copy is the default).
df2 = df.loc[:, biomarkers].copy()

In [None]:
# For every cancer site, compare the top quartile (> 75th percentile) with the
# bottom quartile (< 25th percentile) of each biomarker and write one table per site.
#
# 2x2 table per biomarker:
#              label True   label False
#   > p75          a            b
#   < p25          c            d
df_lab = pd.read_csv(s3_path+'labels/all_cancer_labels.csv')
ca_sites = df_lab.columns.to_list()

pd.set_option('display.max_rows', None)

for ca in ca_sites:

    # Loaded into its own name so the notebook-level `df` is not overwritten.
    site_df = pd.read_csv(s3_path+ca+'_dataset.csv')
    df2 = site_df[biomarkers].copy(deep=True)
    # Label masks are the same for every biomarker, so build them once per site.
    is_case = site_df['label'] == True
    is_control = site_df['label'] == False

    count_c = []; count_p = []
    perc_c = []; perc_p = []
    OR1 = []; OR2 = []; pval = []
    ci_l = []; ci_u = []

    for col in df2:
        ul = np.nanpercentile(df2[col],75)
        ll = np.nanpercentile(df2[col],25)
        high = df2[col] > ul
        low = df2[col] < ll

        a = int((high & is_case).sum())
        b = int((high & is_control).sum())
        c = int((low & is_case).sum())
        d = int((low & is_control).sum())

        count_c.append(b)
        count_p.append(a)
        # Share of each group's extreme-quartile rows that lie in the top quartile.
        perc_p.append(a/(a+c)*100 if (a+c)>0 else 0)
        perc_c.append(b/(b+d)*100 if (b+d)>0 else 0)

        table = np.array([[a, b], [c, d]])
        OR, p = fisher_exact(table)

        if np.all([a, b, c, d]):
            # 95% interval on the log odds ratio; needs all four cells non-zero.
            half_width = 1.96*np.sqrt((1/a)+(1/b)+(1/c)+(1/d))
            ci_l.append(np.exp(np.log(OR) - half_width))
            ci_u.append(np.exp(np.log(OR) + half_width))
        else:
            ci_l.append(np.nan)
            ci_u.append(np.nan)

        OR1.append(OR)
        # Reciprocal odds ratio; an odds ratio of 0 maps to inf instead of raising.
        with np.errstate(divide='ignore'):
            OR2.append(np.float64(1.0) / OR)
        # NOTE(review): the two-sided Fisher p-value is halved here; for an exact
        # test this is not generally equal to the one-sided p-value — confirm intent.
        pval.append(p/2)

    stats_df = pd.DataFrame()
    stats_df = stats_df.assign(biomarker=df2.columns, count_c=count_c, perc_c=perc_c, count_p=count_p, perc_p=perc_p, \
                           OR1=OR1, OR2=OR2, CI_L=ci_l, CI_U=ci_u, p=pval)
    stats_df.to_csv(s3_path+'tables/odds_ratio_75_25th_perc_stats'+ca+'.csv', index=False)

In [None]:
# Load the saved odds-ratio table for one site (the fourth entry of ca_sites)
# and display it ordered by p-value.
ca = ca_sites[3]
print(ca)
df = pd.read_csv(f'{s3_path}tables/odds_ratio_75_25th_perc_stats{ca}.csv')
df.sort_values('p')

# Hazard ratios - KM

In [None]:
pip install lifelines
from lifelines import KaplanMeierFitter

In [None]:
# Reload the dataset so the survival analysis starts from an unmodified frame.
s3_path = 's3://ukb-colorectal-cancer/analysis/'
df = read_csv(f'{s3_path}crc_dataset.csv')

In [None]:
# Remove rows missing either `label_crc` or `age`, then renumber the index from 0.
df = df.dropna(subset=['label_crc', 'age']).reset_index(drop=True)

In [None]:
# Fit a Kaplan-Meier estimator with `age` as the duration and `label_crc` as the event flag.
# The p_idx group has label 1 and a recorded age_at_diagnosis;
# the c_idx group has label 0 and no age_at_diagnosis.
p_idx = df.index[(df['label_crc']==1) & (df['age_at_diagnosis'].notnull())].to_list()
c_idx = df.index[(df['label_crc']==0) & (df['age_at_diagnosis'].isnull())].to_list()

# These lists hold index labels, so select with .loc; .iloc is positional and only
# agrees with the labels while the index happens to be 0..n-1.
cohort = df.loc[c_idx+p_idx]
# NOTE(review): the duration is `age` for both groups even though the p_idx group is
# filtered on age_at_diagnosis — confirm whether age_at_diagnosis should be used for it.
T = cohort['age']
C = cohort['label_crc']
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=C)

In [None]:
# Draw the fitted survival function on an explicitly sized axes.
# This replaces `%pylab inline` + `figsize(...)`, which star-import numpy/matplotlib
# into the namespace and change the figure size globally.
fig, ax = plt.subplots(figsize=(12, 8))
kmf.plot_survival_function(ax=ax)
plt.show()