In [24]:
import os

import numpy as np
import pandas as pd
from scipy.io import mmread

In [21]:
!zcat tar_file/GSM3271041_ATAC_sciCAR_A549_cell.txt.gz | head -3

sample,source,group,experiment
sci-RNA-A-071.AGGTCTATGG,Mouse,293T_3T3,co_assay
sci-RNA-A-071.GCGGAGTCGA,Human,A549_3h,co_assay

gzip: stdout: Broken pipe


In [22]:
!zcat tar_file/GSM3271040_RNA_sciCAR_A549_cell.txt.gz | head -3

sample,cell_name,experiment,treatment_time
sci-RNA-A-001.CGCCAGGCAT,293T,coassay,NA
sci-RNA-A-001.AAGTACGTTA,A549,coassay,3

gzip: stdout: Broken pipe


In [5]:
# Per-cell metadata for the filtered matrices. The files are read with
# header=None, so the column names are supplied here, and the `sample`
# barcode column becomes the index. Note that, despite the .tsv extension,
# the files are parsed with read_csv's default comma separator.
atac_columns = ['sample', 'source', 'group', 'experiment']
rna_columns = ['sample', 'cell_name', 'experiment', 'treatment_time']

atac_info = pd.read_csv('ATAC/filtered/barcodes.tsv', names=atac_columns,
                        header=None, index_col='sample')
rna_info = pd.read_csv('RNA/filtered/barcodes.tsv', names=rna_columns,
                       header=None, index_col='sample')

In [6]:
atac_info.head(2)

Unnamed: 0_level_0,source,group,experiment
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sci-RNA-A-071.GCGGAGTCGA,Human,A549_3h,co_assay
sci-RNA-A-071.CTGAAGAGAC,Human,A549_1h,co_assay


In [7]:
rna_info.head(2)

Unnamed: 0_level_0,cell_name,experiment,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sci-RNA-A-001.CGCCAGGCAT,293T,coassay,
sci-RNA-A-001.AAGTACGTTA,A549,coassay,3.0


In [15]:
# Keep only the barcodes present in both assays and place the ATAC and RNA
# metadata side by side. Both tables carry an `experiment` column, so that
# column name appears twice in the result.
df = pd.concat(
    [atac_info, rna_info],
    join='inner',
    axis=1,
)

In [16]:
df.shape

(2919, 6)

In [17]:
# Tally cells per (ATAC source, ATAC group, RNA cell_name) combination.
# count() reports the number of non-null values in each remaining column,
# which is why treatment_time shows 0 for the 293T_3T3 group (NaN there).
df.groupby(['source', 'group', 'cell_name']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,experiment,experiment,treatment_time
source,group,cell_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Human,293T_3T3,293T,462,462,0
Human,293T_3T3,3T3,2,2,0
Human,293T_3T3,mixed,13,13,0
Human,A549_0h,A549,533,533,533
Human,A549_1h,A549,881,881,881
Human,A549_3h,A549,978,978,978
Mixed,293T_3T3,293T,3,3,0
Mixed,293T_3T3,3T3,3,3,0
Mixed,293T_3T3,mixed,44,44,0


In [25]:
# Load the filtered count matrices (Matrix Market files) and densify each
# into a DataFrame whose columns are labelled with the cell barcodes.
# NOTE(review): this relies on the matrix columns being in the same order as
# the rows of the matching barcodes.tsv — confirm.
atac_mtx = mmread('ATAC/filtered/matrix.mtx')
atac_count = pd.DataFrame(atac_mtx.toarray(), columns=atac_info.index)

rna_mtx = mmread('RNA/filtered/matrix.mtx')
rna_count = pd.DataFrame(rna_mtx.toarray(), columns=rna_info.index)

In [28]:
# Barcodes whose RNA-side annotation (cell_name) is 293T, restricted to the
# cells found in both assays; the same selection is applied to both matrices.
hek_barcodes = df.query('cell_name == "293T"').index
hek_atac = atac_count.loc[:, hek_barcodes]
hek_rna = rna_count.loc[:, hek_barcodes]

In [34]:
# Write per-cell QC metrics for the HEK293T subset of each assay:
#   nCounts   - total counts in the cell (column sum)
#   nFeatures - number of features with a non-zero count in the cell
# One metrics.csv is written per assay, with the barcode in a `cell` column.
for out_dir, counts in [('HEK293T/ATAC/filtered', hek_atac),
                        ('HEK293T/RNA/filtered', hek_rna)]:
    # Create the directory from the very path we are about to write to, so
    # the list of directories cannot drift from the list of outputs (and the
    # cell no longer depends on a shell `mkdir`).
    os.makedirs(out_dir, exist_ok=True)
    out = pd.concat([counts.sum(0), (counts > 0).sum(0)], axis=1)
    out.columns = ['nCounts', 'nFeatures']
    out.to_csv('%s/metrics.csv' % out_dir, index_label='cell')

In [35]:
# Print the median total counts per cell and the median number of detected
# features per cell, first for ATAC and then for RNA.
# The loop variable is deliberately not named `df`: that name holds the merged
# ATAC/RNA metadata table, and rebinding it here would break any later re-run
# of the cells that query it.
for counts in [hek_atac, hek_rna]:
    print(counts.sum(0).median(), (counts > 0).sum(0).median())

412.0 187.0
2953.0 1672.0


# Hardly any NIH3T3 cells remain after filtering, so check the original matrix

In [40]:
# Read the cell annotation tables shipped in tar_file/ (first column, the
# barcode, becomes the index), then keep the barcodes annotated in both
# assays with the ATAC and RNA columns side by side.
raw_atac_cell_info, raw_rna_cell_info = [
    pd.read_csv(path, index_col=0)
    for path in ('tar_file/GSM3271041_ATAC_sciCAR_A549_cell.txt.gz',
                 'tar_file/GSM3271040_RNA_sciCAR_A549_cell.txt.gz')
]

original_info = pd.concat(
    [raw_atac_cell_info, raw_rna_cell_info],
    join='inner',
    axis=1,
)
original_info.head()

Unnamed: 0_level_0,source,group,experiment,cell_name,experiment,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sci-RNA-A-071.AGGTCTATGG,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-071.GCGGAGTCGA,Human,A549_3h,co_assay,A549,coassay,3.0
sci-RNA-A-071.TTGCAGCATT,Human,A549_1h,co_assay,A549,coassay,1.0
sci-RNA-A-071.GCCTGATATA,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-071.GCGGCCAATC,Human,A549_3h,co_assay,A549,coassay,3.0


In [41]:
# Rows of the joined annotation table whose RNA-side cell_name is 3T3.
is_3t3 = original_info['cell_name'] == '3T3'
nih_cells = original_info.loc[is_3t3]
nih_cells

Unnamed: 0_level_0,source,group,experiment,cell_name,experiment,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sci-RNA-A-071.AGGTCTATGG,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-071.GCCTGATATA,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-071.AATACCAGTT,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-071.GTAGATCGTT,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-A-023.TCTGACGAGG,Mouse,293T_3T3,co_assay,3T3,coassay,
...,...,...,...,...,...,...
sci-RNA-E-070.GAATGAGGAG,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-E-022.AGGCCGGTAA,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-E-022.CGAAGGCATG,Mouse,293T_3T3,co_assay,3T3,coassay,
sci-RNA-E-022.TCTGACGAGG,Mouse,293T_3T3,co_assay,3T3,coassay,


In [42]:
# Load the raw count matrices (Matrix Market files) and densify each into a
# DataFrame whose columns are labelled with the barcodes of the raw cell
# annotation tables.
# NOTE(review): assumes the matrix columns follow the row order of the
# corresponding *_cell.txt.gz table — confirm.
raw_atac_mtx = mmread('ATAC/raw/matrix.mtx')
raw_atac = pd.DataFrame(raw_atac_mtx.toarray(), columns=raw_atac_cell_info.index)

raw_rna_mtx = mmread('RNA/raw/matrix.mtx')
raw_rna = pd.DataFrame(raw_rna_mtx.toarray(), columns=raw_rna_cell_info.index)

In [43]:
# Apply the same 3T3 barcode selection to both raw count matrices.
nih_barcodes = nih_cells.index
nih_atac = raw_atac.loc[:, nih_barcodes]
nih_rna = raw_rna.loc[:, nih_barcodes]

In [45]:
# Write per-cell QC metrics for the NIH3T3 subset of each assay:
#   nCounts   - total counts in the cell (column sum)
#   nFeatures - number of features with a non-zero count in the cell
# One metrics.csv is written per assay, with the barcode in a `cell` column.
for out_dir, counts in [('NIH3T3/ATAC/filtered', nih_atac),
                        ('NIH3T3/RNA/filtered', nih_rna)]:
    # Create the directory from the very path we are about to write to, so
    # the list of directories cannot drift from the list of outputs (and the
    # cell no longer depends on a shell `mkdir`).
    os.makedirs(out_dir, exist_ok=True)
    out = pd.concat([counts.sum(0), (counts > 0).sum(0)], axis=1)
    out.columns = ['nCounts', 'nFeatures']
    out.to_csv('%s/metrics.csv' % out_dir, index_label='cell')