### Goal
Preprocessing rnaseq data

This tutorial walks through the main workflow for preparing a training dataset for a basic linear regression model.
The output (X, Y) contains all pairwise combinations of tissues within each sample, where X is the input and Y is the target.

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except
# those excluded via %aimport) before each cell is executed, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
!pip install numpy
!pip install pandas
!pip install anndata

!pip install scanpy
!pip install argh



In [3]:
# project helpers: preprocessing functions (alias `p`) and the AnnData
# constants (alias `a`, used below as a.VAR / a.OBS in the filter dicts)
import rep.preprocessing as p
from rep.constants import ANNDATA_CST as a

# without arguments, %aimport lists which modules autoreload will reload or skip
%aimport

Modules to reload:
all-except-skipped

Modules to skip:



#### 1. Load raw count matrix + annotation and put them into an AnnData object

In [4]:
%%writefile data.csv
T1_s1,T2_s2,T3_s1,T4_s2,T5_s1,T6_s1
G1,10,20,30,40,5,6
G2,5,10,20,30,5,6
G3,6,7,8,9,1,1

Overwriting data.csv


In [5]:
%%writefile anno_var.csv
Sample,Gender,Tissue,Individual,Technology
T1_s1,F,Tissue1,Sample1,rnaseq
T2_s2,M,Tissue2,Sample2,rnaseq
T3_s1,F,Tissue3,Sample1,rnaseq
T4_s2,M,Tissue4,Sample2,rnaseq
T5_s1,M,Tissue5,Sample1,rnaseq
T6_s1,M,Tissue6,Sample1,rnaseq

Overwriting anno_var.csv


In [11]:
%%writefile anno_obs.csv
G1,hg19,T1,chr1,1111,-
G2,hg19,T2,chr2,2222,-
G3,hg19,T3,chr3,3333,-

Writing anno_obs.csv


In [6]:
# annobj is the equivalent of a SummarizedExperiment in R.
# It contains
# - annobj.X - counts
# - annobj.var - data.frame annotation object for the columns of the count matrix (in our case Tissue_Sample)
# - annobj.obs - data.frame annotation object for the rows of the count matrix (= features that we measure, in our case Genes)
#
# NOTE: anno_obs.csv is written without a header row; in the printed output
# its annotation columns show up with numeric labels (1..5).
annobj = p.load_count_matrix("data.csv", sep=",", varanno="anno_var.csv", obsanno="anno_obs.csv")
p.print_anndata(annobj)

anndata.X
[[10. 20. 30. 40.  5.  6.]
 [ 5. 10. 20. 30.  5.  6.]
 [ 6.  7.  8.  9.  1.  1.]]
anndata.var
Index(['T1_s1', 'T2_s2', 'T3_s1', 'T4_s2', 'T5_s1', 'T6_s1'], dtype='object')
      Gender   Tissue Individual Technology
T1_s1      F  Tissue1    Sample1     rnaseq
T2_s2      M  Tissue2    Sample2     rnaseq
T3_s1      F  Tissue3    Sample1     rnaseq
T4_s2      M  Tissue4    Sample2     rnaseq
T5_s1      M  Tissue5    Sample1     rnaseq
T6_s1      M  Tissue6    Sample1     rnaseq
anndata.obs
Index(['G1', 'G2', 'G3'], dtype='object')
       1   2     3     4  5
G1  hg19  T1  chr1  1111  -
G2  hg19  T2  chr2  2222  -
G3  hg19  T3  chr3  3333  -



#### Save data to h5ad format

In [16]:
# p.save is called without a target path; judging by the recorded output,
# the returned value is the path of the written .h5ad file
name = p.save(annobj)
name

... storing '1' as categorical
... storing '5' as categorical
... storing '1' as categorical
... storing '3' as categorical
... storing '4' as categorical


'/data/nasif12/home_if12/giurgiu/rep_gagneur/rep/notebooks/preprocessing/tmp1545316680.h5ad'

#### 2. Prepare training sample:
- select genes and tissue_samples for training
- slice the anndata object
- compute the cross tissue matrix pair (X,Y)

In [65]:
# Filter by values in anndata.var and anndata.obs.
# The filter is a JSON-like dict: {a.VAR or a.OBS: {column to filter on: list of admitted values}}.
# In this example we filter anndata.var by gender (M), and anndata.obs should contain only G1 and G2.
# The key 0 under a.OBS presumably addresses the index (the gene ids) - TODO confirm.
(var, obs) = p.filter_anndata_by_value(annobj, {a.VAR: {'Gender': ['M']}, a.OBS: {0: ['G1', 'G2']}})

# names of the columns (var) and rows (obs) selected for the training set
(var,obs)

      Gender   Tissue Individual Technology
T1_s1      F  Tissue1    Sample1     rnaseq
T2_s2      M  Tissue2    Sample2     rnaseq
T3_s1      F  Tissue3    Sample1     rnaseq
T4_s2      M  Tissue4    Sample2     rnaseq
T5_s1      M  Tissue5    Sample1     rnaseq
T6_s1      M  Tissue6    Sample1     rnaseq


(['T2_s2', 'T6_s1', 'T5_s1', 'T4_s2'], ['G1', 'G2'])

In [67]:
# Since we want to impute gene expression across different tissues, the
# cross-tissue pairs have to be computed within each sample, not across samples.

# For this purpose group_by(df, column, index_subset) groups the selected
# tissue samples by the value of the given column (here: 'Individual').

dict_samples_var = p.group_by(annobj.var, 'Individual', var)
dict_samples_var

{'Sample1': ['T6_s1', 'T5_s1'], 'Sample2': ['T2_s2', 'T4_s2']}

In [72]:
# compute cross tissue matrix pair (X, Y)
# NOTE(review): only `individuals` and `gene_ids` are passed here; the filtered
# `var` / `dict_samples_var` from the cells above are not, and the recorded
# X/Y below still contain T1_s1 and T3_s1 (Gender F) - confirm this is intended
(X, Y) = p.rnaseq_cross_tissue(annobj, individuals=['Sample1','Sample2'], gene_ids=obs)

4.3 Slice anndata

anndata.X
[[20.  5. 30.  6. 10. 40.]
 [10.  5. 20.  6.  5. 30.]]
anndata.var
Index(['T2_s2', 'T5_s1', 'T3_s1', 'T6_s1', 'T1_s1', 'T4_s2'], dtype='object')
      Gender   Tissue Individual Technology
T2_s2      M  Tissue2    Sample2     rnaseq
T5_s1      M  Tissue5    Sample1     rnaseq
T3_s1      F  Tissue3    Sample1     rnaseq
T6_s1      M  Tissue6    Sample1     rnaseq
T1_s1      F  Tissue1    Sample1     rnaseq
T4_s2      M  Tissue4    Sample2     rnaseq
anndata.obs
Index(['G1', 'G2'], dtype='object')
       1   2     3     4  5
G1  hg19  T1  chr1  1111  -
G2  hg19  T2  chr2  2222  -

4.4 Build the two matrices X and Y



In [73]:
# train (input) matrix: in the recorded output each row is an ordered tissue
# pair "<from>_<to>" and holds the gene counts of the first tissue of the pair
X

Unnamed: 0,G1,G2
T1_s1_T6_s1,10,5
T1_s1_T3_s1,10,5
T1_s1_T5_s1,10,5
T6_s1_T1_s1,6,6
T6_s1_T3_s1,6,6
T6_s1_T5_s1,6,6
T3_s1_T1_s1,30,20
T3_s1_T6_s1,30,20
T3_s1_T5_s1,30,20
T5_s1_T1_s1,5,5


In [74]:
# label (target) matrix: same row names as X; in the recorded output each row
# holds the gene counts of the second tissue of the pair
Y

Unnamed: 0,G1,G2
T1_s1_T6_s1,6,6
T1_s1_T3_s1,30,20
T1_s1_T5_s1,5,5
T6_s1_T1_s1,10,5
T6_s1_T3_s1,30,20
T6_s1_T5_s1,5,5
T3_s1_T1_s1,10,5
T3_s1_T6_s1,6,6
T3_s1_T5_s1,5,5
T5_s1_T1_s1,10,5


### 3. Create a large toy dataset

In [22]:
!pip install numpy
!pip install matplotlib
import random
import math
import numpy.random as nr
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

import anndata



In [8]:
# gene identifiers "Gene_1" ... "Gene_20000"
genes = ["Gene_" + str(gene_number) for gene_number in range(1, 20001)]
genes[19990:]

['Gene_19991',
 'Gene_19992',
 'Gene_19993',
 'Gene_19994',
 'Gene_19995',
 'Gene_19996',
 'Gene_19997',
 'Gene_19998',
 'Gene_19999',
 'Gene_20000']

In [9]:
# one example draw of simulated raw counts: negative binomial with n=3 and
# p=0.001, one value per gene (no seed is set, so the values differ per run)
raw_counts = nr.negative_binomial(n=3, p=0.001, size=20000)
# sns.distplot(raw_counts)
raw_counts[:10]

array([4223, 1803, 3079, 1555, 3025, 2965, 2215, 3119, 8627, 1596])

In [10]:
# tissues
# NOTE(review): 'Stomatch' is presumably a typo for 'Stomach'; it is left as is
# because the sample names in the recorded outputs below use this spelling
tissues = ['Muscle', 'Blood', 'Skin', 'Lung', 'Heart', 'Stomatch', 'Colon', 'Testis', 'Brain', 'Liver']

In [11]:
# simulate how many tissues are collected per patient:
# 1000 draws from a continuous uniform distribution between 1 and len(tissues);
# the draws are floats and are floored to integers where they are consumed
distribution_tissues = nr.uniform(1,len(tissues),1000)
distribution_tissues[:10]

array([1.43544583, 2.17944366, 1.84529393, 2.3820589 , 6.49733261,
       3.90651559, 5.98537936, 8.64013829, 2.92092658, 2.4304256 ])

In [12]:
# generate samples - raw counts per gene per tissue
# Result: df_big, one row per "Patient_<i>_<tissue>" sample and one column per gene.
frames = []
for i in range(1, 10): # patients 1..9

  # choose how many tissues this patient contributes.
  # random.randrange(n) yields a valid index in 0..n-1; random.randint(1, n)
  # is inclusive on both ends, so it could return n (IndexError) and it
  # never selected index 0.
  how_many_tissues = math.floor(distribution_tissues[random.randrange(len(distribution_tissues))])

  # draw the tissues without replacement; unlike random.shuffle this does not
  # reorder the shared `tissues` list as a side effect
  t_set = random.sample(tissues, how_many_tissues)

  # one row label per collected tissue
  row_names = ["Patient_" + str(i) + "_" + tissue for tissue in t_set]

  # generate raw counts for all tissues of the patient at once
  # (rows = tissues, columns = genes) instead of filling an empty
  # object-dtype frame row by row
  raw_counts = nr.negative_binomial(3, 0.001, size=(len(t_set), len(genes)))
  counts = pd.DataFrame(raw_counts, index=row_names, columns=genes)

  # append expression per tissue for the patient
  frames.append(counts)


df_big = pd.concat(frames)

In [13]:
# flip the frame so that genes are the rows and patient_tissue samples the columns
count_matrix = df_big.T
count_matrix.shape

(20000, 42)

In [27]:
# create annotation dataframes
# The sample names (columns of count_matrix) become the index of the
# annotation frame - careful: column 0 is the index, not a regular column.
# 'Individual' and 'Tissue' are parsed from each "Patient_<number>_<tissue>" name.
sample_records = []
for sample in count_matrix.columns:
  # maxsplit=2 keeps the tissue part intact even if it contains "_"
  pat, number, tissue = sample.split("_", 2)
  sample_records.append((pat + "_" + number, tissue))

# build the frame in one go instead of filling it cell by cell
samples_anno = pd.DataFrame(sample_records, index=count_matrix.columns, columns=['Individual','Tissue'])

# sanity check (replaces an unused drop_duplicates().shape expression):
# every (Individual, Tissue) combination must occur only once
assert not samples_anno.duplicated().any(), "duplicated (Individual, Tissue) annotation rows"
samples_anno[:10]


Unnamed: 0,Individual,Tissue
Patient_1_Liver,Patient_1,Liver
Patient_1_Blood,Patient_1,Blood
Patient_2_Testis,Patient_2,Testis
Patient_2_Brain,Patient_2,Brain
Patient_2_Liver,Patient_2,Liver
Patient_3_Colon,Patient_3,Colon
Patient_3_Heart,Patient_3,Heart
Patient_3_Blood,Patient_3,Blood
Patient_3_Muscle,Patient_3,Muscle
Patient_3_Skin,Patient_3,Skin


In [28]:
# gene annotation: a frame indexed by the gene ids, without annotation columns
genes_anno = pd.DataFrame(index=count_matrix.index)
genes_anno.shape

(20000, 0)

In [32]:
# create annobj: the counts go into X, then the sample annotation is attached
# as .var (columns of the count matrix) and the gene annotation as .obs (rows)
annobj = anndata.AnnData(count_matrix)
annobj.var = samples_anno
annobj.var_names = list(samples_anno.index)
annobj.obs = genes_anno
annobj.obs_names = list(genes_anno.index)

# print X, var and obs for a visual check
p.print_anndata(annobj)

anndata.X
[[5954. 3762. 2766. ... 2754. 2417. 3411.]
 [2390.  966. 6234. ... 1869. 1225. 2182.]
 [4451.  974. 3649. ...  571. 6226. 1150.]
 ...
 [2644. 2462. 5296. ... 5027. 3532. 2245.]
 [1821. 3628.  380. ... 3414. 1150. 4381.]
 [1416. 1910. 1993. ... 1053. 1768. 4921.]]
anndata.var
Index(['Patient_1_Liver', 'Patient_1_Blood', 'Patient_2_Testis',
       'Patient_2_Brain', 'Patient_2_Liver', 'Patient_3_Colon',
       'Patient_3_Heart', 'Patient_3_Blood', 'Patient_3_Muscle',
       'Patient_3_Skin', 'Patient_4_Heart', 'Patient_4_Colon',
       'Patient_4_Liver', 'Patient_4_Stomatch', 'Patient_4_Blood',
       'Patient_5_Muscle', 'Patient_5_Colon', 'Patient_5_Skin',
       'Patient_5_Brain', 'Patient_5_Lung', 'Patient_5_Testis',
       'Patient_5_Blood', 'Patient_5_Stomatch', 'Patient_5_Heart',
       'Patient_6_Brain', 'Patient_6_Blood', 'Patient_6_Muscle',
       'Patient_6_Stomatch', 'Patient_7_Skin', 'Patient_7_Heart',
       'Patient_7_Blood', 'Patient_7_Muscle', 'Patient_7_Liver',

In [33]:
# filter using a random subset of 5 tissues
# random.sample returns a new list, so the shared `tissues` list is not
# reordered in place and re-running this cell has no effect on other cells
subset_tissue = random.sample(tissues, 5)
subset_tissue

['Blood', 'Brain', 'Heart', 'Muscle', 'Testis']

In [34]:
# create filtering json: admit only the samples whose 'Tissue' is in subset_tissue
json_filter = {a.VAR: {'Tissue':subset_tissue}}
# optional gene filter (disabled), e.g. restrict to the first 10 genes:
# json_filter[a.OBS] = {0:genes[:10]}
json_filter

{'var': {'Tissue': ['Blood', 'Brain', 'Heart', 'Muscle', 'Testis']}}

In [36]:
# apply the filter and show the first 10 selected sample names
(var, obs) = p.filter_anndata_by_value(annobj, json_filter)
var[:10]

                   Individual    Tissue
Patient_1_Liver     Patient_1     Liver
Patient_1_Blood     Patient_1     Blood
Patient_2_Testis    Patient_2    Testis
Patient_2_Brain     Patient_2     Brain
Patient_2_Liver     Patient_2     Liver
Patient_3_Colon     Patient_3     Colon
Patient_3_Heart     Patient_3     Heart
Patient_3_Blood     Patient_3     Blood
Patient_3_Muscle    Patient_3    Muscle
Patient_3_Skin      Patient_3      Skin
Patient_4_Heart     Patient_4     Heart
Patient_4_Colon     Patient_4     Colon
Patient_4_Liver     Patient_4     Liver
Patient_4_Stomatch  Patient_4  Stomatch
Patient_4_Blood     Patient_4     Blood
Patient_5_Muscle    Patient_5    Muscle
Patient_5_Colon     Patient_5     Colon
Patient_5_Skin      Patient_5      Skin
Patient_5_Brain     Patient_5     Brain
Patient_5_Lung      Patient_5      Lung
Patient_5_Testis    Patient_5    Testis
Patient_5_Blood     Patient_5     Blood
Patient_5_Stomatch  Patient_5  Stomatch
Patient_5_Heart     Patient_5     Heart


['Patient_6_Blood',
 'Patient_7_Brain',
 'Patient_7_Heart',
 'Patient_4_Heart',
 'Patient_2_Testis',
 'Patient_6_Brain',
 'Patient_5_Muscle',
 'Patient_4_Blood',
 'Patient_3_Blood',
 'Patient_5_Brain']

In [46]:
# slice + compute all pairs
# NOTE(review): only `individuals` and `gene_ids` are passed here; the tissue
# filter result `var` is not, and the recorded slice below still contains Liver
# samples although Liver is not in subset_tissue - confirm this is intended
(X, Y) = p.rnaseq_cross_tissue(annobj, individuals=['Patient_1','Patient_2'], gene_ids=obs)
print(X.shape)
print(Y.shape)

4.3 Slice anndata

anndata.X
[[3762. 2766. 5954. 4545. 5159.]
 [ 966. 6234. 2390. 3354. 1559.]
 [ 974. 3649. 4451. 4068. 2982.]
 ...
 [2462. 5296. 2644. 1339. 1233.]
 [3628.  380. 1821. 3780. 1911.]
 [1910. 1993. 1416. 2386. 1513.]]
anndata.var
Index(['Patient_1_Blood', 'Patient_2_Testis', 'Patient_1_Liver',
       'Patient_2_Brain', 'Patient_2_Liver'],
      dtype='object')
                 Individual  Tissue
Patient_1_Blood   Patient_1   Blood
Patient_2_Testis  Patient_2  Testis
Patient_1_Liver   Patient_1   Liver
Patient_2_Brain   Patient_2   Brain
Patient_2_Liver   Patient_2   Liver
anndata.obs
Index(['Gene_1', 'Gene_2', 'Gene_3', 'Gene_4', 'Gene_5', 'Gene_6', 'Gene_7',
       'Gene_8', 'Gene_9', 'Gene_10',
       ...
       'Gene_19991', 'Gene_19992', 'Gene_19993', 'Gene_19994', 'Gene_19995',
       'Gene_19996', 'Gene_19997', 'Gene_19998', 'Gene_19999', 'Gene_20000'],
      dtype='object', length=20000)
Empty DataFrameView
Columns: []
Index: [Gene_1, Gene_2, Gene_3, Gene_4, Gene_

In [44]:
# train (input) data: first 10 rows
# NOTE(review): this cell's execution count (44) is lower than that of the
# cell computing X above (46), so the recorded output may stem from an earlier run
X[:10]

Unnamed: 0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,Gene_10,...,Gene_19991,Gene_19992,Gene_19993,Gene_19994,Gene_19995,Gene_19996,Gene_19997,Gene_19998,Gene_19999,Gene_20000
Patient_1_Blood_Patient_1_Liver,3762,966,974,2223,4763,1724,5866,2025,4844,5496,...,4696,1132,468,775,6098,1150,7856,2462,3628,1910
Patient_1_Liver_Patient_1_Blood,5954,2390,4451,2333,3371,3715,2096,2483,2641,7564,...,3967,1500,1003,6824,2320,7065,2660,2644,1821,1416
Patient_2_Testis_Patient_2_Brain,2766,6234,3649,1282,8075,5825,4944,108,1688,1189,...,1150,6609,1465,570,1969,4336,2644,5296,380,1993
Patient_2_Testis_Patient_2_Liver,2766,6234,3649,1282,8075,5825,4944,108,1688,1189,...,1150,6609,1465,570,1969,4336,2644,5296,380,1993
Patient_2_Brain_Patient_2_Testis,4545,3354,4068,4765,2463,4179,2773,3208,2042,2168,...,1524,6672,2991,2425,2158,4529,1961,1339,3780,2386
Patient_2_Brain_Patient_2_Liver,4545,3354,4068,4765,2463,4179,2773,3208,2042,2168,...,1524,6672,2991,2425,2158,4529,1961,1339,3780,2386
Patient_2_Liver_Patient_2_Testis,5159,1559,2982,6594,2101,3903,1786,3455,1685,1449,...,6190,2781,8495,1543,2569,2710,3536,1233,1911,1513
Patient_2_Liver_Patient_2_Brain,5159,1559,2982,6594,2101,3903,1786,3455,1685,1449,...,6190,2781,8495,1543,2569,2710,3536,1233,1911,1513
Patient_3_Heart_Patient_3_Blood,1816,2330,490,2100,4136,3471,2373,1224,2062,6767,...,2462,2730,1484,3102,3307,1028,1685,2926,345,1881
Patient_3_Heart_Patient_3_Muscle,1816,2330,490,2100,4136,3471,2373,1224,2062,6767,...,2462,2730,1484,3102,3307,1028,1685,2926,345,1881


In [47]:
# label (target) data: first 10 rows
Y[:10]

Unnamed: 0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,Gene_10,...,Gene_19991,Gene_19992,Gene_19993,Gene_19994,Gene_19995,Gene_19996,Gene_19997,Gene_19998,Gene_19999,Gene_20000
Patient_1_Blood_Patient_1_Liver,5954,2390,4451,2333,3371,3715,2096,2483,2641,7564,...,3967,1500,1003,6824,2320,7065,2660,2644,1821,1416
Patient_1_Liver_Patient_1_Blood,3762,966,974,2223,4763,1724,5866,2025,4844,5496,...,4696,1132,468,775,6098,1150,7856,2462,3628,1910
Patient_2_Testis_Patient_2_Brain,4545,3354,4068,4765,2463,4179,2773,3208,2042,2168,...,1524,6672,2991,2425,2158,4529,1961,1339,3780,2386
Patient_2_Testis_Patient_2_Liver,5159,1559,2982,6594,2101,3903,1786,3455,1685,1449,...,6190,2781,8495,1543,2569,2710,3536,1233,1911,1513
Patient_2_Brain_Patient_2_Testis,2766,6234,3649,1282,8075,5825,4944,108,1688,1189,...,1150,6609,1465,570,1969,4336,2644,5296,380,1993
Patient_2_Brain_Patient_2_Liver,5159,1559,2982,6594,2101,3903,1786,3455,1685,1449,...,6190,2781,8495,1543,2569,2710,3536,1233,1911,1513
Patient_2_Liver_Patient_2_Testis,2766,6234,3649,1282,8075,5825,4944,108,1688,1189,...,1150,6609,1465,570,1969,4336,2644,5296,380,1993
Patient_2_Liver_Patient_2_Brain,4545,3354,4068,4765,2463,4179,2773,3208,2042,2168,...,1524,6672,2991,2425,2158,4529,1961,1339,3780,2386
