In [28]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata

from mgitools.os_helpers import listfiles

#### peng

In [2]:
# Load the peng dataset from its .h5ad file and show the AnnData summary.
peng_h5ad_fp = '../data/single_cell/original/peng/PRJCA001063_CRC_besca2.raw.h5ad'
adata = sc.read_h5ad(peng_h5ad_fp)
adata

AnnData object with n_obs × n_vars = 57530 × 18008
    obs: 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type'
    var: 'ENSEMBL', 'SYMBOL'

In [3]:
# Display the per-cell annotation table of the loaded dataset.
adata.obs

Unnamed: 0,CELL,CONDITION,Patient,Type,Cell_type
T1_AAACCTGAGATGTCGG,T1_AAACCTGAGATGTCGG,T,T1,T,Fibroblast cell
T1_AAACGGGGTCATGCAT,T1_AAACGGGGTCATGCAT,T,T1,T,Stellate cell
T1_AAAGATGCATGTTGAC,T1_AAAGATGCATGTTGAC,T,T1,T,Macrophage cell
T1_AAAGATGGTCGAGTTT,T1_AAAGATGGTCGAGTTT,T,T1,T,Macrophage cell
T1_AAAGATGGTCTCTCTG,T1_AAAGATGGTCTCTCTG,T,T1,T,Endothelial cell
...,...,...,...,...,...
N11_TTTGCGCGTGCGCTTG,N11_TTTGCGCGTGCGCTTG,N,N11,N,Endothelial cell
N11_TTTGGTTCATTGAGCT,N11_TTTGGTTCATTGAGCT,N,N11,N,Acinar cell
N11_TTTGGTTGTCCGACGT,N11_TTTGGTTGTCCGACGT,N,N11,N,Ductal cell type 1
N11_TTTGTCAAGGCTAGCA,N11_TTTGTCAAGGCTAGCA,N,N11,N,Acinar cell


In [4]:
# Distinct values found in the 'Patient' column of the cell annotations.
set(adata.obs['Patient'])

{'N1',
 'N10',
 'N11',
 'N2',
 'N3',
 'N4',
 'N5',
 'N6',
 'N7',
 'N8',
 'N9',
 'T1',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'T15',
 'T16',
 'T17',
 'T18',
 'T19',
 'T2',
 'T20',
 'T21',
 'T22',
 'T23',
 'T24',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T8',
 'T9'}

In [7]:
# Distinct values in row 0 of X after densifying it.
# NOTE(review): presumably a sanity check that X holds raw integer counts — confirm.
set(adata.X[0].toarray().flatten())

{0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 37.0,
 38.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 63.0,
 65.0,
 66.0,
 67.0,
 69.0,
 70.0,
 75.0,
 82.0,
 86.0,
 88.0,
 91.0,
 95.0,
 100.0,
 104.0,
 109.0,
 142.0,
 176.0,
 190.0,
 198.0}

#### lin

In [59]:
def read_lin(matrix_fp, features_fp, barcodes_fp):
    """Assemble one sample's matrix, features and barcodes files into an AnnData.

    Parameters
    ----------
    matrix_fp
        Path to the Matrix Market file, read with ``sc.read_mtx``. Its matrix
        is transposed so that rows line up with the barcodes and columns with
        the features.
    features_fp
        Path to a header-less, tab-separated features file. The first column
        becomes the ``var`` index (named ``'gene_id'``), the second column is
        labelled ``'gene_symbol'`` and a third column, when present, is
        labelled ``'feature_type'``.
    barcodes_fp
        Path to a header-less, tab-separated barcodes file whose first column
        becomes the ``obs`` index.

    Returns
    -------
    anndata.AnnData
        The assembled object. Its shape is printed as a side effect.
    """
    mat = sc.read_mtx(matrix_fp).X.transpose()

    feats = pd.read_csv(features_fp, sep='\t', header=None, index_col=0)
    feats.index.name = 'gene_id'
    # With header=None pandas labels the columns with integers. Give every
    # column a string label so no integer key ends up in `var`; files with
    # fewer columns simply use fewer of the known labels.
    known_labels = ['gene_symbol', 'feature_type']
    n_cols = feats.shape[1]
    feats.columns = known_labels[:n_cols] + [str(c) for c in feats.columns[len(known_labels):]]

    barcodes = pd.read_csv(barcodes_fp, sep='\t', header=None, index_col=0)

    a = anndata.AnnData(X=mat, obs=barcodes, var=feats)
    print(a.shape)
    return a

In [35]:
# Collect the gzipped files that listfiles finds under the GSE154778_RAW
# directory, sorted by path. The dot is escaped so the pattern requires a
# literal '.gz' suffix instead of "any character followed by gz".
fps = sorted(listfiles('../data/single_cell/original/lin/GSE154778_RAW', regex=r'\.gz$'))
fps

['../data/single_cell/original/lin/GSE154778_RAW/MET01/barcodes.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET01/features.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET01/matrix.mtx.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET02/barcodes.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET02/features.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET02/matrix.mtx.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET03/barcodes.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET03/features.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET03/matrix.mtx.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET04/barcodes.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET04/features.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET04/matrix.mtx.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET05/barcodes.tsv.gz',
 '../data/single_cell/original/lin/GSE154778_RAW/MET05/feat

In [36]:
# Group the flat file list into {sample: {'barcodes' | 'features' | 'matrix': path}}.
m = {}
for fp in fps:
    # The sample name is the directory that directly contains the file.
    sample = fp.split('/')[-2]
    files = m.setdefault(sample, {})
    # Kinds are tested in this fixed order and the first match wins.
    for kind in ('barcodes', 'features', 'matrix'):
        if kind in fp:
            files[kind] = fp
            break
m
        

{'MET01': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET01/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET01/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET01/matrix.mtx.gz'},
 'MET02': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET02/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET02/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET02/matrix.mtx.gz'},
 'MET03': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET03/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET03/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET03/matrix.mtx.gz'},
 'MET04': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET04/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET04/features.tsv.gz',
  'matrix': '../data/single

In [60]:
# Read every sample into its own AnnData and tag its cells with the sample name.
adatas = []
for sample, d in m.items():
    print(sample)
    a = read_lin(
        matrix_fp=d['matrix'],
        features_fp=d['features'],
        barcodes_fp=d['barcodes'],
    )
    a.obs['sample_id'] = sample
    # Prefix each barcode with its sample name.
    prefixed = [f'{sample}_{x}' for x in a.obs.index.to_list()]
    a.obs.index = prefixed
    adatas.append(a)
len(adatas)

MET01
(533, 32738)
(533, 32738)
MET02
(745, 35635)
(745, 35635)
MET03
(526, 35635)
(526, 35635)
MET04
(272, 35635)
(272, 35635)
MET05
(2905, 35635)
(2905, 35635)
MET06
(2484, 35635)
(2484, 35635)
P01
(585, 36601)
(585, 36601)
P02
(786, 32738)
(786, 32738)
P03
(837, 32738)
(837, 32738)
P04
(1026, 32738)
(1026, 32738)
P05
(913, 32738)
(913, 32738)
P06
(769, 32738)
(769, 32738)
P07
(1098, 32738)
(1098, 32738)
P08
(1139, 32738)
(1139, 32738)
P09
(898, 32738)
(898, 32738)
P10
(1570, 32738)
(1570, 32738)


16

In [61]:
# Track how many var index entries remain shared as each sample is added.
pool = None
for a in adatas:
    # Read the first row by position with .iloc: the obs index holds strings,
    # so a plain [0] would rely on pandas' deprecated positional fallback for
    # integer keys.
    print(a.obs['sample_id'].iloc[0])
    if pool is None:
        pool = set(a.var.index)
    else:
        pool.intersection_update(a.var.index)
    print(len(pool))

MET01
32738
MET02
32738
MET03
32738
MET04
32738
MET05
32738
MET06
32738
P01
30358
P02
30358
P03
30358
P04
30358
P05
30358
P06
30358
P07
30358
P08
30358
P09
30358
P10
30358


In [62]:
# Inspect the var table of the sample at position 6 in adatas.
adatas[6].var

Unnamed: 0_level_0,gene_symbol,2
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277836,AC141272.1,Gene Expression
ENSG00000278633,AC023491.2,Gene Expression
ENSG00000276017,AC007325.1,Gene Expression
ENSG00000278817,AC007325.4,Gene Expression


In [63]:
# Inspect the var table of the sample at position 10 in adatas.
adatas[10].var

Unnamed: 0_level_0,gene_symbol
gene_id,Unnamed: 1_level_1
ENSG00000243485,MIR1302-10
ENSG00000237613,FAM138A
ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8
...,...
ENSG00000215635,AC145205.1
ENSG00000268590,BAGE5
ENSG00000251180,CU459201.1
ENSG00000215616,AC002321.2


In [66]:
# Build a single gene_id -> gene_symbol lookup over all samples. Samples are
# visited in list order and later entries overwrite earlier ones, so a gene ID
# that carries different symbols in different samples ends up with the symbol
# from the last sample that contains it.
# `a.var` is read directly: slicing `a[0]` first only created a temporary
# one-cell view, which is unnecessary because `var` does not depend on which
# observations are selected.
gene_id_to_symbol = {i: s for a in adatas for i, s in zip(a.var.index, a.var['gene_symbol'])}
for a in adatas:
    # Give every sample the same symbol for the same gene ID.
    a.var['gene_symbol_v2'] = [gene_id_to_symbol[x] for x in a.var.index]

In [67]:
# Switch every sample's var index from gene IDs to the harmonized symbols.
# This mutates the objects in `adatas` in place and is not idempotent: the
# 'gene_symbol_v2' column read below is dropped at the end of each iteration.
for a in adatas:
    a.var.index = a.var['gene_symbol_v2'].to_list()
    # De-duplicate var names within this sample.
    a.var_names_make_unique()
    # Selecting an empty column list keeps the index and drops every var column.
    a.var = a.var[[]]
# NOTE(review): de-duplication runs per sample, so a de-duplicated name could
# presumably refer to different gene IDs in samples whose feature lists
# differ — verify before relying on name-based joins.
adatas[0].var

MIR1302-10
FAM138A
OR4F5
RP11-34P13.7
RP11-34P13.8
...
AC145205.1
BAGE5
CU459201.1
AC002321.2
AC002321.1


In [68]:
# Stack all samples into one AnnData. join='inner' is anndata.concat's default
# and is spelled out because it keeps only the variables present in every sample.
adata = anndata.concat(adatas, join='inner')
adata

AnnData object with n_obs × n_vars = 17086 × 30673
    obs: 'sample_id'

In [69]:
# Save the concatenated AnnData to an .h5ad file.
adata.write_h5ad('../data/single_cell/original/lin/lin.h5ad')