# Select fCpGs

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import linregress, spearmanr, ttest_ind
from math import ceil
import MolecularClocks.src.methylation_util as m_util
from MolecularClocks.src.merge_pdfs import merge_pdfs
from MolecularClocks.src.invasiveCpGs_consts import getConsts
# from expandAnnotations import expand
import util as local_util

# Shared constants provided by MolecularClocks' invasiveCpGs_consts module.
consts = getConsts()

# Notebook-wide plot styling: scale seaborn fonts by 1.2 and have
# matplotlib's savefig use a tight bounding box.
sns.set(
    font_scale=1.2,
    rc={"savefig.bbox": 'tight'},
)

In [2]:
# Load the analysis inputs through the project-local `util` helpers.
# `data` is later indexed as data['tumor']['beta_values_PURE'].
# NOTE(review): the tumor-filtering summary printed under this cell comes from
# one of these calls (presumably getDataDict) — confirm in util.py.
neutral_DNA_CpG_list = local_util.getNeutralDNACpGs()
data = local_util.getDataDict()

Starting with 646 tumors
Removing 4 tumors for not being from unique patients
Removing 1 tumors for not having a purity estimate


## Generalizability/Sensitivity Tests

In [3]:
# Test generalizability of the method
# Select the balanced CpG set twice: once with only_ductals=True and once with only_ductals=False
# Calculate c_beta for each tumor using each set of sites
# Calculate correlation between the two c_beta values for each tumor
# NOTE(review): only_ductals presumably restricts site selection to ductal
# tumors — confirm in local_util.gen_CpG_set.

balanced_CpGs_only_ductals = local_util.gen_CpG_set(data, neutral_DNA_CpG_list, only_ductals=True)
balanced_CpGs_all_tumors = local_util.gen_CpG_set(data, neutral_DNA_CpG_list, only_ductals=False)

# Both c_beta vectors are computed on the same beta-value table; only the
# rows (sites) differ. c_beta = 1 - std of the selected rows, per column.
pure_beta_values = data['tumor']['beta_values_PURE']
c_beta_only_ductals = 1 - pure_beta_values.loc[balanced_CpGs_only_ductals].std(axis=0)
c_beta_all_tumors = 1 - pure_beta_values.loc[balanced_CpGs_all_tumors].std(axis=0)

# Overlap between the two site selections, then the correlation coefficient
# (linregress rvalue) of the resulting c_beta values.
n_shared_sites = np.intersect1d(balanced_CpGs_only_ductals, balanced_CpGs_all_tumors).shape[0]
print(f'{n_shared_sites} sites shared')
print(f'r = {linregress(c_beta_only_ductals, c_beta_all_tumors).rvalue}')

462 sites shared
r = 0.9976461891254752


In [3]:
# Test sensitivity of the sites
# Pick 1000 most variable sites and divide them in half randomly
# Calculate c_beta for each tumor using each set of sites
# Calculate correlation between the two c_beta values for each tumor

# Fixed seed so the random halves (and the counts / r printed by this cell)
# are reproducible on Restart & Run All. Change the seed to draw another split.
SPLIT_SEED = 0
N_SENSITIVITY_SITES = 1000
split_rng = np.random.default_rng(SPLIT_SEED)

# Reference set with the default size, used to count how many of the
# original sites land in each random half.
balanced_CpGs_all_tumors = local_util.gen_CpG_set(data, neutral_DNA_CpG_list, only_ductals=False)

balanced_CpGs_all_tumors_1000 = local_util.gen_CpG_set(
    data, neutral_DNA_CpG_list, only_ductals=False, n_select=N_SENSITIVITY_SITES
)
# In-place shuffle (as before), so the 1000-site set itself ends up in the
# randomized order that the two halves are sliced from.
split_rng.shuffle(balanced_CpGs_all_tumors_1000)

half = N_SENSITIVITY_SITES // 2
rand_split_1 = balanced_CpGs_all_tumors_1000[:half]
rand_split_2 = balanced_CpGs_all_tumors_1000[half:]

# c_beta = 1 - std of the selected rows (sites), per column of the table.
pure_beta_values = data['tumor']['beta_values_PURE']
c_beta_all_tumors = 1 - pure_beta_values.loc[balanced_CpGs_all_tumors].std(axis=0)
c_beta_rand_split_1 = 1 - pure_beta_values.loc[rand_split_1].std(axis=0)
c_beta_rand_split_2 = 1 - pure_beta_values.loc[rand_split_2].std(axis=0)

print(f'{np.intersect1d(rand_split_1, balanced_CpGs_all_tumors).shape[0]} original sites in rand split 1')
print(f'{np.intersect1d(rand_split_2, balanced_CpGs_all_tumors).shape[0]} original sites in rand split 2')

print(f'r = {linregress(c_beta_rand_split_1, c_beta_rand_split_2).rvalue}')

256 original sites in rand split 1
244 original sites in rand split 2
r = 0.9685833485738136


## Select 500 CpGs

In [4]:
# Build the final CpG set (all tumors, default set size) and write it to disk,
# one site identifier per line.
balanced_CpGs_final_set = local_util.gen_CpG_set(data, neutral_DNA_CpG_list, only_ductals=False)

# Save the set computed in this cell rather than `balanced_CpGs_all_tumors`
# left over from an earlier cell, so the file does not depend on which test
# cells were run (or re-run) before this one.
np.savetxt('balanced_CpGs_all_tumors.txt', balanced_CpGs_final_set, fmt='%s')