# Stats and visualisation for cancer data

In [None]:
pip install statsmodels==0.13.2

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math, boto3, tempfile
import scipy.stats as sp
from sklearn import manifold
from utils import *
from label_utils import *
from sklearn.preprocessing import MinMaxScaler
import statsmodels as sm

In [None]:
# Base S3 prefix shared by every input and output of this notebook.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

# NOTE(review): `read_csv` is not defined in this notebook; presumably it is
# provided by one of the wildcard imports (`utils` / `label_utils`) — confirm.
crc_dataset_uri = f"{s3_path}crc_dataset_outlier_rem.csv"
df = read_csv(crc_dataset_uri)

In [None]:
# Number of participants of each sex within each CRC label group.
# (A DataFrame is not callable, so the previous `df([...])` raised TypeError;
# grouping by the label gives the intended per-group breakdown.)
df.groupby('label_crc')['sex'].value_counts()

# Chi-squared (χ²) tests

In [None]:
# Chi-squared tests of independence between each categorical covariate and
# the CRC label.
#
# For every column in `cols` a 2 x k table of raw counts is built (row 0 =
# controls, row 1 = patients, one column per category code) and passed to
# scipy's `chi2_contingency`. The test must be run on counts: a chi-squared
# statistic computed on percentages behaves as if each group contained 100
# participants, so its p-value ignores the real sample size. Percentages are
# still reported, but only as descriptive columns of the output table.
# Note that scipy applies Yates' continuity correction to 2 x 2 tables by
# default.

perc_c = []   # per covariate: % of controls falling in each category
perc_p = []   # per covariate: % of patients falling in each category
chi_res = []  # chi-squared statistic, rounded to 3 d.p.
pval = []     # p-value, rounded to 4 d.p.

# Rows with a missing label compare unequal to both True and False and are
# therefore excluded from both groups.
c_idx = df['label_crc'] == False   # controls
p_idx = df['label_crc'] == True    # patients

cols = ['sex', 'ethnicity', 'edu_level','centre', 'weight_change', 'aspirin', 'family_cancer','family_crc',
        'hist_cvd','hist_diabetes','hist_ibd','hist_livergall','crc_screening']

for col in cols:
    values = df[col]
    # dropna() removes missing codes and, unlike np.isnan, also works when the
    # codes are not numeric.
    codes = values.dropna().unique()
    c_n = np.array([((values == code) & c_idx).sum() for code in codes])
    p_n = np.array([((values == code) & p_idx).sum() for code in codes])

    # A code may occur only on rows whose label is missing. Such an all-zero
    # column would produce zero expected frequencies, so it is left out.
    seen = (c_n + p_n) > 0
    c_n, p_n = c_n[seen], p_n[seen]

    # An empty group yields NaN percentages instead of raising.
    with np.errstate(divide='ignore', invalid='ignore'):
        perc_c.append(np.around(c_n / c_n.sum() * 100, 2))
        perc_p.append(np.around(p_n / p_n.sum() * 100, 2))

    # The test is undefined with fewer than two categories or an empty group.
    if len(c_n) < 2 or c_n.sum() == 0 or p_n.sum() == 0:
        chi_res.append(np.nan)
        pval.append(np.nan)
        continue

    chi, ps, _dof, _expected = sp.chi2_contingency(np.vstack([c_n, p_n]))
    chi_res.append(round(chi, 3))
    pval.append(round(ps, 4))

stats_df = pd.DataFrame({
    'biomarker': cols,
    'perc_c': perc_c,
    'perc_p': perc_p,
    'chi2': chi_res,
    'p': pval,
})
stats_df.to_csv(s3_path+'tables/chisquare_results.csv', index=False)
stats_df

# Intercorrelations

In [None]:
# Columns entering the intercorrelation matrix: demographics, body
# measurements, lifestyle items, blood/urine markers and the CRC label.
corr_columns = [
    'age', 'sex', 'yoe', 'townsend', 'weight_change', 'bmi', 'waist_to_hip',
    'trunk_leg_fat', 'impedance', 'met_rate', 'pulse',
    'dbp', 'sbp', 'grip_strength', 'oily_fish', 'veg_fruit',
    'red_meat', 'sleep_dur', 'met_mins', 'alcohol', 'smoking',
    'aspirin', 'wbc', 'rbc', 'hgb', 'hct', 'plt',
    'lym', 'u_cr', 'u_potas', 'u_sodium', 'apoa', 'apob', 'urea', 'chol',
    'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shgb', 'tst', 'tprotein', 'tgly',
    'vitd', 'mcv', 'mono', 'neut', 'eos', 'baso', 'n_rbc', 'ret', 'alb',
    'alp', 'alt', 'ast', 'dbi', 'calc', 'ggt', 'glu', 'hba1c', 'phos',
    'tbil', 'urate', 'health_rating', 'label_crc',
]

# Independent (deep) copy so later edits to df2 never touch df.
df2 = df[corr_columns].copy()

In [None]:
# Remove participants whose CRC label is missing, then report the remaining
# (rows, columns) of the correlation frame.
unlabelled = df2.index[df2['label_crc'].isna()]
df2 = df2.drop(index=unlabelled)
df2.shape

In [None]:
# Intercorrelations: pairwise correlation matrix of df2 and its heatmap.

def _pearson_p(x, y):
    """Return the p-value of the Pearson correlation between x and y."""
    return sp.pearsonr(x, y)[1]

corrs = df2.corr()
# With a callable method pandas fills the diagonal with 1; subtracting the
# identity matrix turns those diagonal entries into 0.
pval = df2.corr(method=_pearson_p) - np.eye(*corrs.shape)

# Hide the upper triangle (diagonal included) so each pair is drawn once.
upper_triangle = np.triu(np.ones_like(corrs, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corrs,
    mask=upper_triangle,
    annot=False,
    annot_kws={'size': 9},
    fmt='.2f',
    xticklabels=1,
    yticklabels=1,
    square=True,
    cbar_kws={"shrink": 0.5},
    cmap='bwr',
    vmin=-0.5,
    vmax=0.5,
    ax=ax,
)
ax.set(title='Intercorrelations - rmap')

fig.savefig('./figures/biomarker_rmap_everyone.jpg', dpi=150)
plt.show()

# Group statistics (t-tests, Mann-Whitney U tests, FDR correction)

In [None]:
from sklearn.preprocessing import minmax_scale
import seaborn as sns

In [None]:
# Continuous measures compared between controls and CRC patients.
group_stat_columns = [
    'age', 'yoe', 'townsend', 'bmi',
    'waist_to_hip', 'trunk_leg_fat', 'impedance', 'health_rating',
    'met_rate', 'pulse', 'dbp', 'sbp', 'grip_strength', 'oily_fish',
    'veg_fruit', 'red_meat',
    'sleep_dur', 'met_mins', 'alcohol', 'smoking', 'wbc', 'rbc',
    'hgb', 'hct', 'plt', 'lym', 'u_cr', 'u_potas', 'u_sodium', 'apoa',
    'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1', 'ldl', 'shgb',
    'tst', 'tprotein', 'tgly', 'vitd', 'mcv', 'mono', 'neut', 'eos', 'baso',
    'ret', 'alb', 'alp', 'alt', 'ast', 'dbi', 'calc', 'ggt', 'glu',
    'hba1c', 'phos', 'tbil', 'urate',
]
df2 = df[group_stat_columns].copy()

# Boolean row masks built from the full frame; df2 is a column subset of df,
# so the masks line up with df2 row for row.
crc_label = df['label_crc']
c_idx = crc_label.eq(False)   # controls
p_idx = crc_label.eq(True)    # patients

In [None]:
# Per-biomarker descriptive and inferential statistics, controls vs patients.
#
# For every column of df2 the loop records: non-missing counts, degrees of
# freedom (n_c + n_p - 2), raw and min-max-scaled group means, group standard
# deviations, a normality-test p-value, an independent-samples t-test and a
# Mann-Whitney U test. Results accumulate in parallel lists, one entry per
# biomarker, in df2 column order.

mean_c, mean_p = [], []      # raw group means
mean_cn, mean_pn = [], []    # group means after min-max scaling to [0, 1]
sd_c, sd_p = [], []          # group standard deviations (np.nanstd default ddof)
count_c, count_p = [], []    # non-missing observations per group
deg_f = []

norm_test = []
tval = []
tval_abs = []                # |t|; previously appended to without being initialised (NameError)
t_pval = []
uval = []
u_pval = []

# minmax_scale output is indexed with these masks below, so keep plain arrays.
c_mask = np.asarray(c_idx)
p_mask = np.asarray(p_idx)

for col in df2:
    values = df2[col]
    controls = values[c_idx]
    patients = values[p_idx]

    cc = controls.count()
    pc = patients.count()
    count_c.append(cc)
    count_p.append(pc)
    deg_f.append(cc + pc - 2)

    mean_c.append(round(np.nanmean(controls), 2))
    mean_p.append(round(np.nanmean(patients), 2))

    # Scale the whole column (both groups together) to [0, 1] so that means of
    # different biomarkers can share one plot axis.
    scaled = minmax_scale(values, feature_range=(0, 1), axis=0)
    mean_cn.append(round(np.nanmean(scaled[c_mask]), 2))
    mean_pn.append(round(np.nanmean(scaled[p_mask]), 2))

    sd_c.append(round(np.nanstd(controls), 2))
    sd_p.append(round(np.nanstd(patients), 2))

    # Check whether the biomarker comes from a normal distribution; the
    # p-value is rounded to 1 d.p., so 0 = not normal.
    nm = sp.normaltest(values, nan_policy='omit')
    norm_test.append(round(nm[1], 1))

    # Between-samples t-test, ignoring missing values.
    res = sp.ttest_ind(controls, patients, nan_policy='omit')
    tval.append(round(res[0], 2))
    tval_abs.append(round(np.abs(res[0]), 2))
    t_pval.append(round(res[1], 6))

    # Mann-Whitney U test. Missing values are dropped explicitly so this test
    # sees the same observations as the t-test above; a group with no
    # observations yields NaN instead of an exception.
    controls_obs = controls.dropna()
    patients_obs = patients.dropna()
    if len(controls_obs) and len(patients_obs):
        res = sp.mannwhitneyu(controls_obs, patients_obs)
        uval.append(round(res[0], 2))
        u_pval.append(round(res[1], 5))
    else:
        uval.append(np.nan)
        u_pval.append(np.nan)

In [None]:
from statsmodels.stats.multitest import fdrcorrection

# False-discovery-rate correction of the t-test p-values.
# rej  : per-biomarker flag, True where the null hypothesis is rejected
# pcor : corrected p-values, in the same order as t_pval
fdr_options = {'alpha': 0.05, 'method': 'indep', 'is_sorted': False}
rej, pcor = fdrcorrection(t_pval, **fdr_options)

In [None]:
# Show every row when a frame is displayed (one row per biomarker).
pd.set_option('display.max_rows', None)

# Collect the per-biomarker lists into one table; the key order below is the
# column order of the saved CSV.
stats_df = pd.DataFrame({
    'biomarker': df2.columns,
    'count_c': count_c,
    'count_p': count_p,
    'deg_f': deg_f,
    'mean_c': mean_c,
    'mean_cn': mean_cn,
    'sd_c': sd_c,
    'mean_p': mean_p,
    'mean_pn': mean_pn,
    'sd_p': sd_p,
    'normality': norm_test,
    'tval': tval,
    'tval_abs': tval_abs,
    't_pval': t_pval,
    'uval': uval,
    'u_pval': u_pval,
    'pcor': pcor,
})

# The CSV keeps df2's column order; only the displayed table is ranked by |t|.
stats_df.to_csv(s3_path+'tables/group_stats.csv', index=False)
stats_df = stats_df.sort_values(by=['tval_abs'], ascending=False, ignore_index=True)
stats_df

In [None]:
# Bar plots: min-max-normalised group means per biomarker, ordered by |t|.
# Biomarkers whose FDR-corrected p-value exceeds 0.05 are greyed out.

stats_df = stats_df.sort_values(by=['tval_abs'], ascending=False, ignore_index=True)

# ignore_index=True resets the index to 0..n-1, so these index labels are also
# the positions of the corresponding bars.
idx = np.array(stats_df.index[stats_df['pcor']>0.05])
x = np.arange(len(stats_df))
width = 0.35  # the width of the bars

# Font settings must be in place before the figure is created: applying them
# afterwards left the first run and a re-run of this cell looking different.
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('font', size=13)

fig, ax = plt.subplots(figsize=(15,5))
rects1 = ax.bar(x - width/2, stats_df['mean_cn'], width, label='HC', color='navy')
rects2 = ax.bar(x + width/2, stats_df['mean_pn'], width, label='CRC', color='tomato')
ax.set_ylabel('au')
ax.set_title('Normalised means by group')
ax.set_xticks(x)
ax.set_xticklabels(stats_df['biomarker'])
ax.legend()
fig.autofmt_xdate(rotation=45)

# Remove the axes frame explicitly (plt.box() with no argument only toggles
# the current state).
ax.set_frame_on(False)
ax.set_facecolor("white")

for j in idx:
    rects1[j].set_color('grey')
    rects2[j].set_color('gainsboro')

fig.tight_layout()
plt.savefig('./figures/biomarker_group_effects.jpg', dpi=150)
plt.show()

# Odds ratios

In [None]:
from scipy.stats import fisher_exact

In [None]:
# Columns for which odds ratios are computed in the per-cancer loop below.
# NOTE(review): several names differ from the column lists used earlier in
# this notebook (e.g. 'shbg' vs 'shgb', 'hgba1c' vs 'hba1c', 'cr_urine' vs
# 'u_cr'); presumably the per-cancer datasets use a different schema — confirm
# that the selection from `df` below is valid.
biomarkers = [
    'age', 'ethnicity', 'townsend', 'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas',
    'sodium', 'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1',
    'ldl', 'shbg', 'tst', 'tprotein', 'tgly', 'vitd', 'age_at_diagnosis',
    'n_cancer_dx', 'n_cancer_occs', 'age_at_death', 'tumour_hist',
    'tumour_beh', 'baso', 'eos', 'mchc', 'mcv', 'mono', 'np', 'ret', 'pdw',
    'pct', 'alt', 'alb', 'alp', 'ast', 'calc', 'cr_blood', 'dbi', 'ggt',
    'glu', 'hgba1c', 'e2', 'phos', 'rf', 'tbil', 'phbv', 'cit', 'gln',
    'gly', 'his', 'ile', 'bla', 'leu', 'mufa', 'phe', 'sph', 'tyr', 'val',
    'dha_tfa', 'grip_l', 'grip_r', 'trunk_fat', 'bmr', 'adj_ts_ratio',
    'ts_ratio_reg', 'unadj_ts_ratio', 'z_ts_ratio',
]

# Deep copy of just those columns.
df2 = df[biomarkers].copy()

In [None]:
# Odds ratios per cancer site: top vs bottom quartile of every biomarker.
#
# For each cancer site the site's dataset is loaded and, per biomarker, a
# 2 x 2 table is built from participants above the 75th percentile ("high")
# and below the 25th percentile ("low"):
#
#               label True   label False
#     high          a            b
#     low           c            d
#
# Fisher's exact test gives the odds ratio and p-value; a 95% interval is
# computed on the log odds-ratio scale when all four cells are non-zero.
# One CSV of results is written per cancer site.

df_lab = pd.read_csv(s3_path+'labels/all_cancer_labels.csv')
# NOTE(review): every column header of the label table is used as a cancer
# site name; confirm the table holds no identifier column.
ca_sites = df_lab.columns.to_list()

# Display option only; set once rather than on every iteration.
pd.set_option('display.max_rows', None)

or_columns = ['biomarker', 'count_c', 'perc_c', 'count_p', 'perc_p',
              'OR1', 'OR2', 'CI_L', 'CI_U', 'p']

for ca in ca_sites:

    # Loop-local frame, so the notebook-level `df` loaded at the top of the
    # notebook is not overwritten by this loop.
    ca_df = pd.read_csv(s3_path+ca+'_dataset.csv')
    ca_values = ca_df[biomarkers]
    is_case = ca_df['label'] == True
    is_control = ca_df['label'] == False

    rows = []
    for col in ca_values:
        values = ca_values[col]
        ul = np.nanpercentile(values, 75)   # upper-quartile cut-off
        ll = np.nanpercentile(values, 25)   # lower-quartile cut-off
        high = values > ul
        low = values < ll

        a = int((high & is_case).sum())
        b = int((high & is_control).sum())
        c = int((low & is_case).sum())
        d = int((low & is_control).sum())

        # Share of each group's extreme-quartile members that are in the high
        # quartile; 0 when the group has nobody in either quartile.
        perc_p = a/(a+c)*100 if (a+c) > 0 else 0
        perc_c = b/(b+d)*100 if (b+d) > 0 else 0

        OR, p = fisher_exact(np.array([[a, b], [c, d]]))

        # The interval needs 1/cell for every cell, so it is only defined when
        # no cell is empty. (np.nan replaces np.NaN, which NumPy 2.0 removed.)
        if np.all([a, b, c, d]):
            half_width = 1.96*np.sqrt((1/a)+(1/b)+(1/c)+(1/d))
            ci_l = np.exp(np.log(OR) - half_width)
            ci_u = np.exp(np.log(OR) + half_width)
        else:
            ci_l = np.nan
            ci_u = np.nan

        # Reciprocal odds ratio; NumPy division gives inf for OR == 0 instead
        # of raising ZeroDivisionError.
        with np.errstate(divide='ignore'):
            inverse_or = np.float64(1.0) / OR

        rows.append({
            'biomarker': col,
            'count_c': b,
            'perc_c': perc_c,
            'count_p': a,
            'perc_p': perc_p,
            'OR1': OR,
            'OR2': inverse_or,
            'CI_L': ci_l,
            'CI_U': ci_u,
            # NOTE(review): the two-sided Fisher p-value is halved, presumably
            # to approximate a one-sided test; half the two-sided value is not
            # in general the one-sided p-value — confirm this is intended.
            'p': p/2,
        })

    stats_df = pd.DataFrame(rows, columns=or_columns)
    stats_df.to_csv(s3_path+'tables/odds_ratio_75_25th_perc_stats'+ca+'.csv', index=False)

In [None]:
# Reload the saved odds-ratio table of one cancer site (the fourth entry of
# ca_sites) and list its biomarkers from smallest to largest p-value.
ca = ca_sites[3]
print(ca)
odds_ratio_table_uri = f"{s3_path}tables/odds_ratio_75_25th_perc_stats{ca}.csv"
df = pd.read_csv(odds_ratio_table_uri)
df.sort_values('p')