In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines, utils

import rpy2.robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
limma = importr('limma')
sva =  importr('sva')
pandas2ri.activate()

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Read in config variables
config_file = os.path.abspath(os.path.join(os.getcwd(),"../../configs", "config_Pa_sample.tsv"))
params = utils.read_config(config_file)

In [3]:
# Load parameters
dataset_name = params["dataset_name"]
analysis_name = params["analysis_name"]
NN_architecture = params["NN_architecture"]
num_simulated_samples = params["num_simulated_samples"]
lst_num_experiments = params["lst_num_experiments"]
use_pca = params["use_pca"]
num_PCs = params["num_PCs"]
local_dir = params["local_dir"]
correction_method = params["correction_method"]

iterations = params["iterations"] 
num_cores = params["num_cores"]

In [4]:
# Additional parameters
file_prefix = "Experiment_corrected"
corrected = True

In [5]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo


normalized_data_file = os.path.join(
    local_dir,
    "experiment_simulated",
    analysis_name,
    "Experiment_2_0.txt.xz")

map_file = os.path.join(
                local_dir,
                "experiment_simulated",
                analysis_name,
                "Experiment_map_2_0.txt.xz")

In [6]:
# Read batch data
mapping = pd.read_table(
                map_file,
                header=0,
                index_col=0,
                sep='\t')["experiment"]

print(mapping.shape)

(6000,)


In [7]:
# Read data
data = pd.read_table(
    normalized_data_file,
    header=0,
    index_col=0,
    sep='\t').T

print(data.shape)
data.head(10)

(5549, 6000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0.641,0.666,0.694,0.648,0.619,0.701,0.724,0.567,0.555,0.667,...,0.699,0.695,0.583,0.749,0.666,0.701,0.716,0.645,0.684,0.593
1,0.611,0.561,0.617,0.598,0.575,0.627,0.662,0.66,0.609,0.636,...,0.67,0.672,0.567,0.668,0.649,0.652,0.604,0.666,0.624,0.586
2,0.397,0.495,0.447,0.501,0.386,0.509,0.464,0.345,0.329,0.409,...,0.381,0.483,0.421,0.471,0.426,0.463,0.491,0.401,0.429,0.467
3,0.561,0.561,0.576,0.625,0.518,0.601,0.6,0.536,0.421,0.606,...,0.603,0.641,0.51,0.55,0.583,0.6,0.557,0.598,0.569,0.668
4,0.407,0.39,0.396,0.404,0.386,0.413,0.417,0.303,0.304,0.409,...,0.432,0.366,0.353,0.499,0.439,0.484,0.429,0.355,0.368,0.342
5,0.31,0.46,0.359,0.348,0.389,0.396,0.482,0.28,0.303,0.373,...,0.32,0.34,0.35,0.355,0.452,0.468,0.538,0.345,0.329,0.259
6,0.343,0.288,0.414,0.255,0.336,0.24,0.218,0.358,0.367,0.291,...,0.289,0.421,0.321,0.196,0.387,0.261,0.345,0.322,0.306,0.301
7,0.539,0.492,0.546,0.545,0.493,0.573,0.593,0.483,0.512,0.579,...,0.528,0.503,0.468,0.59,0.497,0.498,0.531,0.471,0.532,0.493
8,0.455,0.416,0.551,0.47,0.402,0.532,0.516,0.413,0.367,0.513,...,0.448,0.524,0.429,0.568,0.418,0.465,0.493,0.459,0.439,0.42
9,0.183,0.282,0.221,0.265,0.255,0.252,0.239,0.187,0.202,0.225,...,0.207,0.23,0.236,0.221,0.209,0.205,0.225,0.214,0.25,0.244


In [12]:
# Apply parametric correction
corrected_experiment_data = sva.ComBat(data, batch=mapping)

Standardizing Data across genes



In [15]:
# Convert R object to pandas df
corrected_experiment_data_df = pandas2ri.ri2py_dataframe(
    corrected_experiment_data)

corrected_experiment_data_df.columns = data.columns 

corrected_experiment_data_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0.637337,0.669828,0.697968,0.651739,0.615449,0.705003,0.719913,0.563714,0.551775,0.663204,...,0.695041,0.698973,0.586414,0.744786,0.662209,0.697031,0.711954,0.641316,0.687918,0.596464
1,0.591178,0.58078,0.637032,0.617946,0.555344,0.647076,0.641943,0.639952,0.589187,0.616062,...,0.649906,0.692278,0.586807,0.647915,0.629002,0.631989,0.58421,0.645924,0.644063,0.605892
2,0.430199,0.461829,0.413792,0.467834,0.419209,0.47584,0.497134,0.378248,0.362264,0.442187,...,0.414214,0.44982,0.387772,0.504127,0.459171,0.496135,0.524108,0.434195,0.395778,0.433808
3,0.57438,0.54759,0.562607,0.61166,0.531436,0.587634,0.61333,0.549413,0.43456,0.619323,...,0.616327,0.627678,0.496534,0.563395,0.596352,0.61333,0.570386,0.611333,0.555599,0.654707
4,0.406789,0.390279,0.396254,0.404221,0.385706,0.413184,0.416829,0.302375,0.303379,0.408797,...,0.431889,0.366379,0.353433,0.499156,0.438917,0.484096,0.428877,0.354582,0.36837,0.342478
5,0.313881,0.456835,0.355334,0.344279,0.392481,0.392518,0.48501,0.284033,0.306917,0.376562,...,0.323831,0.33624,0.346289,0.358653,0.455162,0.471081,0.540727,0.348704,0.325185,0.254838
6,0.325272,0.305729,0.431699,0.272737,0.318272,0.257741,0.200267,0.340273,0.349273,0.27327,...,0.27127,0.438697,0.338721,0.178266,0.369274,0.243269,0.327273,0.304272,0.323725,0.318726
7,0.553347,0.477703,0.531663,0.530664,0.507322,0.558643,0.607376,0.497317,0.526333,0.593369,...,0.542341,0.488695,0.453721,0.604375,0.511325,0.512325,0.545343,0.485311,0.517674,0.478703
8,0.471937,0.399244,0.533913,0.453112,0.418817,0.51496,0.533075,0.429842,0.383738,0.530068,...,0.464921,0.50698,0.412213,0.585193,0.434853,0.48196,0.510023,0.475946,0.422188,0.403235
9,0.201717,0.263257,0.20229,0.246266,0.273742,0.233273,0.257737,0.205718,0.220723,0.243732,...,0.225725,0.211285,0.217282,0.23973,0.227726,0.223725,0.243732,0.232728,0.231274,0.225278


In [53]:
experiment_data_file = os.path.join(
    local_dir,
    "experiment_simulated",
    analysis_name,
    "Experiment_corrected_99_0.txt.xz")

# Read data
experiment_data = pd.read_table(
    experiment_data_file,
    header=0,
    index_col=0,
    sep='\t')

print(experiment_data.shape)
experiment_data.head(10)

(5549, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [47]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
noisy_original_data_PCAencoded = pca.fit_transform(experiment_data)

In [48]:
noisy_original_data_df = pd.DataFrame(noisy_original_data_PCAencoded,
                                                  index=experiment_data.index
                                                  )

noisy_original_data_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,10.717331,3.950263,-0.857315,0.416618,1.412342,-0.207998,0.375075,0.467051,-1.303307,0.152026
1,10.621538,-0.869241,-0.787249,0.670089,-0.862229,-0.839117,-0.171907,-0.179746,-0.027764,-0.25897
2,10.542811,1.74499,-2.519382,-0.337754,-0.762247,-0.196248,0.402175,-0.923706,0.732003,-0.107632
3,10.693554,1.891839,1.087949,0.029707,0.083028,0.375865,-3.252747,0.795731,-1.315951,1.725408
4,10.630165,-2.055678,1.097558,-1.13458,-0.673059,-0.669811,0.790781,-0.898761,0.937599,1.16424
5,10.47377,1.41758,-1.536543,0.28067,-1.584535,2.422112,-0.459079,-1.720777,-0.135477,-0.892442
6,10.570233,-2.375633,-1.822969,-0.029594,-0.203375,-0.516339,1.534753,0.704612,0.150738,-0.108992
7,10.836473,-3.258357,4.746063,0.349297,2.655712,-1.853057,0.808543,-0.65514,0.093204,-0.561571
8,-10.718066,-2.640565,-0.675198,1.322582,-0.660536,0.65083,1.190913,-1.025084,-0.45341,-0.952223
9,10.649233,0.299781,-2.367123,-0.1099,0.693549,0.87954,-0.871574,-0.393942,1.346382,-0.037545


In [49]:
import numpy as np
noisy_original_data_df.isnull().any(axis=0) 

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool