# DeepASM

## Install packages

In [82]:
# To manipulate HDF5 files (RUN FOR ALL MODELS)
!pip3 install --upgrade tables

## Import packages

In [1]:
import sys

# Python packages for data, stats, and visualization
from matplotlib import pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns 

# Machine learning libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras
#from tensorflow.keras.models import load_model
from tensorflow.keras import layers
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Kernel functions
from sklearn.neighbors import KernelDensity
from numpy import asarray
from matplotlib import pyplot
from numpy import exp

# Dimensionality reduction
from sklearn.decomposition import PCA, KernelPCA, NMF, TruncatedSVD
from sklearn.manifold import TSNE, LocallyLinearEmbedding, SpectralEmbedding

# To get the time
from datetime import datetime

 
# Figure parameters
mpl.rcParams['figure.figsize'] = (10, 10)
mpl.rcParams['axes.titlesize'] = 15
mpl.rcParams['axes.labelsize'] = 12
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [2]:
# Print different versions
print(sys.version)
print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)
print("Numpy version:", np.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]
TensorFlow version: 2.7.0
Keras version: 2.7.0
Numpy version: 1.19.5
Num GPUs Available:  1


## GCP Variables

In [3]:
# Import raw data from bucket. False if you want to import the processed dataset
IMPORT_RAW_FROM_BUCKET = True

# Export data after it's been prepared
EXPORT_PROCESSED_DATA = True

# Bucket name where the training datasets are
DEEPASM_BUCKET="deepasm"

# GCP variable
import os
projectid = 'hackensack-tyco'
os.environ["GOOGLE_CLOUD_PROJECT"] = 'hackensack-tyco'
cloud_bucket = "gs://deepasm/colab"


## Model variables

In [4]:
# MODELS FOR WHICH WE NEED TO RECORD THE RESULTS

models = ['linear', 'perceptron', 'simple_cnn',  'cnn', 'simple_rnn', 'rnn']
#models = ['simple_rnn']

# Loss is better than AUC for monitoring
PARAM_TO_CHANGE = "keep_chr"

#--------------------------------------------------
# Parameters common to all models

# Number of rows to take into the dataset after import
NB_ROWS_RAW_DATASET = int(1e3) # The maximum is 5e6. We use 200k to test the code

# Minimum correlation factor
MIN_CORR = 0.03

# Size of the genomic window
GENOMIC_INTERVAL = 1000

# Kernel values for probability estimates
KERNEL_FM_NB_VALUES = 10
KERNEL_FM_BANDWIDTH = 0.1
KERNEL_COV_NB_MAX = 200
KERNEL_COV_NB_STEP = 40
KERNEL_COV_BANDWIDTH = 20

# Normalization method
norm_method = "z_score" # "min_max" or "z_score"

# Keep the chromosomes in the model (correlate poorly with ASM)
KEEP_CHR_IN_MODEL = False

# Early stopping
EARLY_STOPPING = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=0,
    patience=5,
    mode='auto',
    restore_best_weights=True)

# Percentage of data points to be used in the Test dataset
TEST_SPLIT = 0.2

# Percentage of datapoints used between training and validation
VALIDATION_SPLIT = 0.3 # How to divide the training dataset for validation

EPOCHS = 100 # We have so many datapoints that 20 epochs are enough to stabilize the training
BATCH_SIZE = 1000 # to get a few identified ASM we need at a few hundreds since the
# frequency of ASM is 1.38%
# A batch size of 1000 will run into a memory error on TF 2.7

# Regularlization L1 and L2 (defaults are l1 = 0.01 and l2 = 0.01)
L1_R = 0
L2_R = 1e-3

#--------------------------------------------------
# Parameters common to neural network models
ACTIVATION_FUNCTION = 'relu' # 'tanh' # or 'relu' or 'gelu (Gaussian Error Linear Unit)'
NB_NODES_PERCEPTRON = 10
NB_LAYERS_PERCEPTRON = 5
NB_NODES_AFTER_CNN = 2
CNN_FILTERS = 8
CNN_KERNEL = 100 # Must be smaller than the genomic region (250). The av distance between CpG is 37 bp and the std dev of the distances between cpgs is 24 bp
LEARNING_RATE = 3e-4 

# Learning rate was taken from this
# http://karpathy.github.io/2019/04/25/recipe/#2-set-up-the-end-to-end-trainingevaluation-skeleton--get-dumb-baselines

#--------------------------------------------------
# Parameters common to RNN

RNN_UNITS = 64 # 64 orginally

#--------------------------------------------------
# SPECIFIC TO RANDOM FOREST ALGORITHM
use_raw_df_for_forest_models = False

## ML evaluation metrics

In [5]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='sensitivity'),
      keras.metrics.AUC(name='auc')
      ]

def plot_metrics(history):
  metrics =  ['loss', 'auc', 'precision', 'sensitivity']
  plt.figure(figsize=(10,10))
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    plt.ylim([0,1])

    plt.legend()


def display_results(df_results):
  print("Loss", np.round(df_results[0], 3))
  print("True positives", np.round(df_results[1], 3))
  print("False positives", np.round(df_results[2], 3))
  print("True negatives", np.round(df_results[3], 3))
  print("False negatives", np.round(df_results[4], 3))
  print("Accuracy", np.round(df_results[5], 3))
  print("Precision", np.round(df_results[6], 3))
  print("Sensitivity", np.round(df_results[7], 3))
  print("AUC", np.round(df_results[8], 3))

def plot_roc(name, labels, predictions, **kwargs):
  fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)

  plt.plot(100*fp, 100*tp, label=name, linewidth=2, **kwargs)
  plt.xlabel('False positives [%]')
  plt.ylabel('True positives [%]')
  plt.xlim([-0.5,80])
  plt.ylim([0,100.5])
  plt.grid(True)
  ax = plt.gca()
  ax.set_aspect('equal')



2022-03-08 18:25:48.975753: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-08 18:25:49.586203: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38444 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


## Import raw data

In [6]:
if IMPORT_RAW_FROM_BUCKET == True:
    !gsutil ls gs://$DEEPASM_BUCKET/$GENOMIC_INTERVAL*bp/encode_training_data/*.json > list_to_download.txt
    files_to_download_df = pd.read_csv('list_to_download.txt', header=None)
    print("Number of files to download:", files_to_download_df.shape[0])

    imported_df = pd.DataFrame()

    for index_file in range(range(files_to_download_df.shape[0])): #range(files_to_download_df.shape[0])
        file_name_bucket = files_to_download_df[0][index_file]
        local_file_name = "training_" + str(index_file) + ".json"
        
        # Download the file from bucket
        !gsutil cp $file_name_bucket $local_file_name
        
        print("Appending file...")
        imported_df = imported_df.append(pd.read_json(local_file_name, lines = True))

Number of files to download: 21
Appending file...


In [7]:
print("Size of the imported dataset:", imported_df.shape)

Size of the imported dataset: (214876, 18)


## Prepare the features

Note: we do not randomize the rows because the scripts preceding this notebook already sampled the rows.

### Copy & clean dataframe 

In [62]:
# Copy of the dataframe
raw_df = imported_df.copy()
raw_df = raw_df.head(NB_ROWS_RAW_DATASET)

# We remove the chromosomes X and Y (no ASM)
raw_df = raw_df.loc[raw_df['chr'] != 'X']
raw_df = raw_df.loc[raw_df['chr'] != 'Y']

In [63]:
print("Size of the dataset: ", raw_df.shape)

### Calculate the distance between CpGs

In [64]:
# Create a function to calculate the distance between CpGs (~3min)
def dist_cpg(cpg_pos):
  distances = []
  for index in range(len(cpg_pos)):
    if index >= len(cpg_pos)-1:
      return distances
    else:
      distances.append(cpg_pos[index + 1] - cpg_pos[index])
  return distances

# Apply the function "distance" to the array of CpG positions
raw_df['cpg_dist'] = raw_df['cpg_pos'].apply(lambda x: dist_cpg(x))

In [65]:
raw_df

Unnamed: 0,asm_snp,sample_category,chr,region_inf,region_sup,region_nb_cpg,nb_cpg_found,nb_reads,dnase,encode_ChiP_V2,tf_motifs,global_cpg_fm,tot_nb_cpg,tot_nb_reads,read_fm,cpg_fm,cpg_cov,cpg_pos,cpg_dist
0,0,0,5,167184001,167185000,3,3,71,1,0,0,0.815,21223988,307561222,"[0, 0, 0, 1, 0, 0.5, 0, 1, 0, 1, 0.5, 0.333, 1...","[0.742, 0.788, 0.25]","[31, 33, 40]","[167184497, 167184558, 167184725]","[61, 167]"
1,0,0,5,166743001,166744000,3,3,69,0,0,2,0.815,21223988,307561222,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5, 1, 1...","[1, 0.9500000000000001, 0.929]","[38, 40, 42]","[166743591, 166743715, 166743733]","[124, 18]"
2,0,0,6,22376001,22377000,3,3,75,0,0,0,0.815,21223988,307561222,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5, 1, 0...","[0.783, 0.838, 0.765]","[46, 37, 34]","[22376280, 22376368, 22376447]","[88, 79]"
3,0,0,14,63617001,63618000,3,3,57,1,0,3,0.815,21223988,307561222,"[1, 0, 1, 1, 1, 0.667, 1, 0.667, 1, 1, 1, 1, 0...","[0.636, 0.9570000000000001, 0.979]","[44, 47, 47]","[63617746, 63617751, 63617797]","[5, 46]"
4,0,0,2,188785001,188786000,3,3,144,2,1,0,0.815,21223988,307561222,"[1, 0.5, 0, 0, 1, 0.5, 0, 0, 0, 0.5, 1, 0, 0, ...","[0.543, 0.356, 0.646]","[46, 90, 96]","[188785538, 188785705, 188785753]","[167, 48]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,3,113881001,113882000,3,3,42,1,0,8,0.786,17213478,222365425,"[1, 1, 1, 1, 0.667, 0.667, 1, 1, 1, 1, 1, 0.66...","[0.97, 0.765, 0.8290000000000001]","[33, 34, 35]","[113881397, 113881438, 113881453]","[41, 15]"
996,0,0,5,166743001,166744000,3,3,60,0,0,2,0.786,17213478,222365425,"[0, 1, 1, 1, 1, 1, 1, 1, 0.5, 1, 1, 1, 1, 1, 1...","[0.919, 0.962, 0.9490000000000001]","[37, 26, 39]","[166743591, 166743715, 166743733]","[124, 18]"
997,0,0,1,83870001,83871000,3,3,32,1,0,6,0.786,17213478,222365425,"[1, 1, 1, 1, 1, 1, 1, 1, 0.667, 1, 1, 1, 1, 0....","[0.9520000000000001, 0.857, 0.909]","[21, 28, 22]","[83870175, 83870190, 83870236]","[15, 46]"
998,0,0,1,90796001,90797000,3,3,49,0,0,0,0.786,17213478,222365425,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.667, 1, 1,...","[0.962, 0.722, 0.92]","[26, 36, 25]","[90796083, 90796160, 90796178]","[77, 18]"


### Convert arrays into numerical features

To do this, we use kernel estimates as well as simpler metrics like mean and standard deviation

#### Kernel functions

In [66]:
# FRACTIONAL METHYLATION

# Values for fractional methylation (between 0 and 1)
values_for_kernel_fm = asarray([value for value in range(0, KERNEL_FM_NB_VALUES+1)])
values_for_kernel_fm = values_for_kernel_fm / KERNEL_FM_NB_VALUES
print("X-axis values used for the FM kernel estimate:", values_for_kernel_fm)
values_for_kernel_fm = values_for_kernel_fm.reshape((len(values_for_kernel_fm), 1))

# Build Kernel model
kernel_fm_model = KernelDensity(bandwidth=KERNEL_FM_BANDWIDTH, kernel='gaussian')

# Function to be applied to each array in the columns read_fm and cpg_fm
def estimate_kernels_fm(x):
  sample = np.reshape(x, (len(x), 1))
  kernel_fm_model.fit(sample)
  probabilities = kernel_fm_model.score_samples(values_for_kernel_fm)
  probabilities = exp(probabilities)
  return np.round(probabilities, 4)

# Try function
estimate_kernels_fm(raw_df['read_fm'][1])


array([0.1156, 0.0702, 0.0169, 0.0169, 0.0702, 0.1156, 0.0714, 0.0574,
       0.5099, 2.2795, 3.7582])

In [67]:
# COVERAGE AND CPG DISTANCE

# Values for fractional methylation (between 0 and 1)
values_for_kernel_cov = asarray([value for value in range(0, KERNEL_COV_NB_MAX, KERNEL_COV_NB_STEP)])
print("Values used in kernel estimate:", values_for_kernel_cov)
values_for_kernel_cov = values_for_kernel_cov.reshape((len(values_for_kernel_cov), 1))

# Build Kernel model
kernel_cov_model = KernelDensity(bandwidth=KERNEL_COV_BANDWIDTH, kernel='gaussian')

# Function to be applied to each array in the columns read_fm and cpg_fm
def estimate_kernels_cov(x):
  sample = np.reshape(x, (len(x), 1))
  kernel_fm_model.fit(sample)
  probabilities = kernel_fm_model.score_samples(values_for_kernel_cov)
  probabilities = exp(probabilities)
  return np.round(probabilities, 4)

# Try function
estimate_kernels_cov(raw_df['cpg_cov'][1])

array([0.    , 1.3298, 0.    , 0.    , 0.    ])

#### Test kernel estimates

In [68]:
variable_to_plot = 'cpg_fm' # cpg_fm or read_fm or cpg_dist or cpg_cov

n_extract = 10
extract_asm = raw_df[raw_df['asm_snp'] == 1].sample(n=n_extract, ignore_index = True)
extract_noasm = raw_df[raw_df['asm_snp'] == 0].sample(n=n_extract, ignore_index = True)
n_x = round(np.sqrt(n_extract))

ValueError: Cannot take a larger sample than population when 'replace=False'

##### Plots for regions with ASM

In [69]:
mpl.rcParams['figure.figsize'] = (10, 10)
fig, axs = plt.subplots(n_x, n_x, sharey=True, sharex=True, tight_layout=True)

for k in range(n_x):
  for m in range(n_x):

    # Print data distribution
    data_distribution = extract_asm[variable_to_plot][k+m]
    axs[k,m].hist(data_distribution, density = True, bins = 10)

    # Print kernel density
    if 'fm' in variable_to_plot:
        #print("Using the FM kernel estimates")
        kernel_probabilities = estimate_kernels_fm(data_distribution)
        values = values_for_kernel_fm
    else:
        #print("Using the COV kernel estimates")
        kernel_probabilities = estimate_kernels_cov(data_distribution)
        values = values_for_kernel_cov
    axs[k,m].plot(values[:], kernel_probabilities)


NameError: name 'n_x' is not defined

##### Plots for regions without ASM

In [70]:
mpl.rcParams['figure.figsize'] = (10, 10)
fig, axs = plt.subplots(n_x, n_x, sharey=True, sharex=True, tight_layout=True)

for k in range(n_x):
  for m in range(n_x):

    # Print data distribution
    data_distribution = extract_noasm[variable_to_plot][k+m]
    axs[k,m].hist(data_distribution, density = True, bins = 10)

    # Print kernel density
    if 'fm' in variable_to_plot:
        #print("Using the FM kernel estimates")
        kernel_probabilities = estimate_kernels_fm(data_distribution)
        values = values_for_kernel_fm
    else:
        #print("Using the COV kernel estimates")
        kernel_probabilities = estimate_kernels_cov(data_distribution)
        values = values_for_kernel_cov
    axs[k,m].plot(values[:], kernel_probabilities)

NameError: name 'n_x' is not defined

#### Calculate the mean, std, and kernel estimates of arrays

In [71]:
def convert_arrays(df, column_name):
  """Inputs: dataframe and a column name that contains arrays"""

  # Mean and Standard deviation
  std_name = "std_" + column_name
  av_name = "mean_" + column_name

  print("Calculating the standard deviation")
  df[std_name] = df[column_name].apply(lambda x: np.round(np.std(x), 4))
  print("Calculating the average")
  df[av_name] = df[column_name].apply(lambda x: np.round(np.mean(x), 4))
  
  # Kernel density estimates
  kernel_name = "kernel_" + column_name
  if (column_name == 'cpg_cov' or column_name == 'cpg_dist'):
    print("Calculating the proba distribution for cov or dist")
    df[kernel_name] = df[column_name].apply(lambda x: estimate_kernels_cov(x))
  else:
    print("Calculating the proba distribution for fractional methylation")
    df[kernel_name] = df[column_name].apply(lambda x: estimate_kernels_fm(x))


In [72]:
# Apply the function
for col in ['read_fm', 'cpg_fm', 'cpg_cov', 'cpg_dist']:
  print("Column: ", col)
  convert_arrays(raw_df, col)

In [73]:
def export_kernel_array(col):
    # Col must be a column of kernel estimate arrays
    print("Processing:", col)
    kernel_name_list = []
    if 'fm' in col:
        values = values_for_kernel_fm
    else:
        values = values_for_kernel_cov
    # Create a list of the new column names
    for k in range(0, values.shape[0]):
        kernel_name = col + "_kernel_" + str(k)
        kernel_name_list = kernel_name_list + [kernel_name]
    print(kernel_name_list)
    
    # Create the additional columns
    kernel_estimates_column = "kernel_" + col
    raw_df[kernel_name_list] = pd.DataFrame(raw_df[kernel_estimates_column].tolist(), index= raw_df.index)

In [74]:
# Use the function
for col in ['read_fm', 'cpg_fm', 'cpg_cov', 'cpg_dist']:
            export_kernel_array(col)

In [75]:
# Delete columns that we no longer need
for col in ['read_fm', 'cpg_dist', 'kernel_cpg_cov', 
            'kernel_cpg_dist', 'kernel_cpg_fm', 'kernel_read_fm']:
    raw_df.drop(col, axis = 1, inplace = True)

### Convert epigenetic signals into dummy variables

In [76]:
def convert_epi_signal(epi_signal):
  print("Processing signal", epi_signal)
  unique_values = raw_df[epi_signal].unique()
  print(unique_values)
  min_epi_value = 0 # It's always zero (no signal) for all signals
  median_epi_value = np.median(unique_values)
  print("Median epi value:", median_epi_value)
  epi_signal_null = epi_signal + "_null"
  epi_signal_low = epi_signal + "_low"
  epi_signal_high = epi_signal + "_high"
  raw_df[epi_signal_null] = raw_df[epi_signal].apply(lambda x: 1 if x == min_epi_value else 0)
  raw_df[epi_signal_low] = raw_df[epi_signal].apply(lambda x: 1 if (x > min_epi_value and x <= median_epi_value) else 0)
  raw_df[epi_signal_high] = raw_df[epi_signal].apply(lambda x: 1 if x > median_epi_value else 0)

In [77]:
# Apply the function to all epigenetic signals
for epi_signal in ['dnase', 'encode_ChiP_V2', 'tf_motifs']:
  convert_epi_signal(epi_signal)

In [78]:
# Delete the raw epigenetic signals
for epi_signal in ['dnase', 'encode_ChiP_V2', 'tf_motifs']:
  raw_df.drop(epi_signal, axis = 1, inplace = True)

In [79]:
raw_df.columns

Index(['asm_snp', 'sample_category', 'chr', 'region_inf', 'region_sup',
       'region_nb_cpg', 'nb_cpg_found', 'nb_reads', 'global_cpg_fm',
       'tot_nb_cpg', 'tot_nb_reads', 'cpg_fm', 'cpg_cov', 'cpg_pos',
       'std_read_fm', 'mean_read_fm', 'std_cpg_fm', 'mean_cpg_fm',
       'std_cpg_cov', 'mean_cpg_cov', 'std_cpg_dist', 'mean_cpg_dist',
       'read_fm_kernel_0', 'read_fm_kernel_1', 'read_fm_kernel_2',
       'read_fm_kernel_3', 'read_fm_kernel_4', 'read_fm_kernel_5',
       'read_fm_kernel_6', 'read_fm_kernel_7', 'read_fm_kernel_8',
       'read_fm_kernel_9', 'read_fm_kernel_10', 'cpg_fm_kernel_0',
       'cpg_fm_kernel_1', 'cpg_fm_kernel_2', 'cpg_fm_kernel_3',
       'cpg_fm_kernel_4', 'cpg_fm_kernel_5', 'cpg_fm_kernel_6',
       'cpg_fm_kernel_7', 'cpg_fm_kernel_8', 'cpg_fm_kernel_9',
       'cpg_fm_kernel_10', 'cpg_cov_kernel_0', 'cpg_cov_kernel_1',
       'cpg_cov_kernel_2', 'cpg_cov_kernel_3', 'cpg_cov_kernel_4',
       'cpg_dist_kernel_0', 'cpg_dist_kernel_1', 'cpg_dist

### Convert chromosome column in dummy variable

In [80]:
raw_df = pd.get_dummies(raw_df, columns = ['chr'], dtype=int)

## Get rid of features that poorly correlate with ASM

In [81]:
df_for_corr = raw_df.copy()
corr_matrix = pd.DataFrame(abs(df_for_corr.corr()['asm_snp'])).sort_values(by = 'asm_snp')
print(corr_matrix)

model_scalar_variables = corr_matrix[corr_matrix['asm_snp'] > MIN_CORR].index.tolist()
model_scalar_variables.remove('asm_snp')
model_scalar_variables


['cpg_fm_kernel_0',
 'read_fm_kernel_9',
 'read_fm_kernel_10',
 'read_fm_kernel_7',
 'read_fm_kernel_6',
 'cpg_fm_kernel_8',
 'sample_category',
 'cpg_fm_kernel_2',
 'mean_cpg_dist',
 'nb_reads',
 'region_sup',
 'region_inf',
 'tf_motifs_high',
 'tot_nb_reads',
 'cpg_fm_kernel_6',
 'tf_motifs_null',
 'chr_14',
 'cpg_fm_kernel_10',
 'global_cpg_fm',
 'cpg_fm_kernel_9',
 'std_read_fm',
 'chr_15',
 'tf_motifs_low',
 'mean_cpg_cov',
 'cpg_fm_kernel_5',
 'cpg_fm_kernel_3',
 'cpg_fm_kernel_4',
 'chr_13']

In [82]:
corr_matrix

Unnamed: 0,asm_snp
nb_cpg_found,0.003222
cpg_cov_kernel_4,0.003222
cpg_dist_kernel_4,0.003948
chr_19,0.004561
chr_22,0.004561
...,...
chr_13,0.156580
asm_snp,1.000000
region_nb_cpg,
cpg_cov_kernel_0,


In [83]:
corr_matrix[pd.isna(corr_matrix['asm_snp'])]

Unnamed: 0,asm_snp
region_nb_cpg,
cpg_cov_kernel_0,
cpg_dist_kernel_0,


In [84]:
# Remove features that poorly correlate with ASM
model_variables_to_remove = corr_matrix[corr_matrix['asm_snp'] <= MIN_CORR].index.tolist()

print("Removing", len(model_variables_to_remove), "variables from the datset")

for var in model_variables_to_remove:
  raw_df.drop(var, axis = 1, inplace = True)

In [85]:
# Remove features where there is no correlation
model_variables_to_remove = corr_matrix[pd.isna(corr_matrix['asm_snp'])].index.tolist()

print("Removing", len(model_variables_to_remove), "variables from the datset")

for var in model_variables_to_remove:
  raw_df.drop(var, axis = 1, inplace = True)

In [86]:
print("Size of dataset:", raw_df.shape)

## Create an "image" of the genomic region

We create an 1D-image (length: genomic region interval) with 3 information per "pixel": CpG presence (0 or 1), CpG coverage, CpG fractional methylation

In [87]:
# Create arrays of positions, fractional methylation, and coverage for CpGs (~2h30)
def create_genomic_array(df):
  genomic_positions = []
  genomic_fm = []
  genomic_cov = [] 
  for position in range(df['region_inf'], df['region_sup'] + 1):
    if position in df['cpg_pos']:
      new_pos = 1
      pos_index = df['cpg_pos'].index(position)
      new_fm = df['cpg_fm'][pos_index]
      new_cov = df['cpg_cov'][pos_index]
    else:
      new_pos = 0
      new_fm = 0
      new_cov = 0
    genomic_positions = genomic_positions + [new_pos]
    genomic_fm = genomic_fm + [new_fm]
    genomic_cov = genomic_cov + [new_cov]
  return np.transpose([genomic_positions, genomic_fm, genomic_cov])

In [88]:
# Apply the function to the dataframe which we split before because it takes a lot of memory.

# Split the dataframe into a list of dataframes of ~100k rows
nb_dataframe_pieces = max(1, round(raw_df.shape[0]/100000))
raw_df_pieces = np.array_split(raw_df, nb_dataframe_pieces)
print("The dataframe has been split into", nb_dataframe_pieces, "pieces")

for df_piece in range(nb_dataframe_pieces):
        print("processing the piece at position:", df_piece)
        raw_df_pieces[df_piece]['genomic_matrix'] = raw_df_pieces[df_piece].apply(lambda x: create_genomic_array(x), 
                                                                                      axis = 1)

In [89]:
raw_df_pieces[0]

Unnamed: 0,asm_snp,sample_category,region_inf,region_sup,nb_reads,global_cpg_fm,tot_nb_reads,cpg_fm,cpg_cov,cpg_pos,...,cpg_fm_kernel_8,cpg_fm_kernel_9,cpg_fm_kernel_10,tf_motifs_null,tf_motifs_low,tf_motifs_high,chr_13,chr_14,chr_15,genomic_matrix
0,0,0,167184001,167185000,71,0.815,307561222,"[0.742, 0.788, 0.25]","[31, 33, 40]","[167184497, 167184558, 167184725]",...,2.4442,1.0919,0.1882,1,0,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
1,0,0,166743001,166744000,69,0.815,307561222,"[1, 0.9500000000000001, 0.929]","[38, 40, 42]","[166743591, 166743715, 166743733]",...,1.1904,3.2552,3.5369,0,1,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
2,0,0,22376001,22377000,75,0.815,307561222,"[0.783, 0.838, 0.765]","[46, 37, 34]","[22376280, 22376368, 22376447]",...,3.7987,2.3026,0.5683,1,0,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
3,0,0,63617001,63618000,57,0.815,307561222,"[0.636, 0.9570000000000001, 0.979]","[44, 47, 47]","[63617746, 63617751, 63617797]",...,1.0022,2.1445,2.5149,0,1,0,0,1,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
4,0,0,188785001,188786000,144,0.815,307561222,"[0.543, 0.356, 0.646]","[46, 90, 96]","[188785538, 188785705, 188785753]",...,0.4553,0.0551,0.0026,1,0,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,113881001,113882000,42,0.786,222365425,"[0.97, 0.765, 0.8290000000000001]","[33, 34, 35]","[113881397, 113881438, 113881453]",...,2.8393,2.6090,1.6636,0,1,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
996,0,0,166743001,166744000,60,0.786,222365425,"[0.919, 0.962, 0.9490000000000001]","[37, 26, 39]","[166743591, 166743715, 166743733]",...,1.4513,3.5827,3.3627,0,1,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
997,0,0,83870001,83871000,32,0.786,222365425,"[0.9520000000000001, 0.857, 0.909]","[21, 28, 22]","[83870175, 83870190, 83870236]",...,2.2835,3.6985,2.5424,0,1,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."
998,0,0,90796001,90797000,49,0.786,222365425,"[0.962, 0.722, 0.92]","[26, 36, 25]","[90796083, 90796160, 90796178]",...,1.9863,2.6735,2.2307,1,0,0,0,0,0,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, ..."


In [None]:
# Delete the variables we no longer need
for var in ['cpg_cov', 'cpg_pos', 'cpg_fm', 'region_inf', 'region_sup']:
      for df_piece in range(nb_dataframe_pieces):
            print("processing the piece at position:", df_piece)
            raw_df_pieces[df_piece].drop(var, axis = 1, inplace = True)

## Save dataframe with features on Cloud Storage

In [92]:
# Check the columns
raw_df_pieces[0].columns

Index(['asm_snp', 'sample_category', 'nb_reads', 'global_cpg_fm',
       'tot_nb_reads', 'std_read_fm', 'mean_cpg_cov', 'mean_cpg_dist',
       'read_fm_kernel_6', 'read_fm_kernel_7', 'read_fm_kernel_9',
       'read_fm_kernel_10', 'cpg_fm_kernel_0', 'cpg_fm_kernel_2',
       'cpg_fm_kernel_3', 'cpg_fm_kernel_4', 'cpg_fm_kernel_5',
       'cpg_fm_kernel_6', 'cpg_fm_kernel_8', 'cpg_fm_kernel_9',
       'cpg_fm_kernel_10', 'tf_motifs_null', 'tf_motifs_low', 'tf_motifs_high',
       'chr_13', 'chr_14', 'chr_15', 'genomic_matrix'],
      dtype='object')

In [38]:
# Obtain the date/time
now = datetime.today()
dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
print(dt_string)

2022-03-08_18-26-50


In [41]:
# Export the variable names to the bucket
sys.stdout = open("variables.txt", "w")
%whos

In [42]:
# Export variable names to to Cloud Storage
!gsutil cp $var_file_name gs://$DEEPASM_BUCKET/notebook/$dt_string/

In [96]:
if EXPORT_PROCESSED_DATA == True:

    for df_piece in range(nb_dataframe_pieces):
        print("processing the piece at position:", df_piece)
        df_to_export = raw_df_pieces[df_piece]
        print("Size of dataframe:", df_to_export.shape)

        print("Saving the file as HDF5...")
        file_name = "prepared_df_" + str(df_piece) + ".h5"
        print("File name:", file_name)
        df_to_export.to_hdf(file_name, key = 'df', mode = 'w')

        print("Exporting file to bucket...")
        !gsutil cp $file_name gs://$DEEPASM_BUCKET/notebook/$dt_string/
else:
        print("Not exporting the scaled DF per variable")

In [None]:
## Importing prepared features from bucket