In [1]:
import os
from collections import Counter

import anndata
import numpy as np
import pandas as pd
import scanpy as sc

from mgitools.os_helpers import listfiles

#### htan surgical

In [2]:
# Folder that holds one 10x matrix directory per in-house sample.
root = '../data/single_cell/original/inhouse/filtered_gene_bc_matrices/'

# Input folder name -> sample id used from here on.
sample_map = {
    '1555-tumor': 'htan01',
    'H_ZY-1105-07': 'htan02',
    'H_ZY-1160-06': 'htan03',
    'H_ZY-1174-06': 'htan04',
    'TWCE-HT055P-S1PAG': 'htan05',
    'TWCE-HT081P1-XB1': 'htan06'
}

adatas = []
for sample, renamed_sample in sample_map.items():
    # Genes are indexed by gene id; symbols stay available in var['gene_symbols'].
    a = sc.read_10x_mtx(os.path.join(root, sample), var_names='gene_ids')
    a.var_names_make_unique()
    # Discard barcodes with fewer than 100 detected genes.
    sc.pp.filter_cells(a, min_genes=100)

    a.obs['sample_id'] = renamed_sample
    a.obs['dataset'] = 'htan_surgical'
    # Prefix each barcode with the sample id.
    a.obs.index = [f'{renamed_sample}_{barcode}' for barcode in a.obs.index]

    print(renamed_sample, a.shape)
    adatas.append(a)

htan01 (2940, 33538)
htan02 (1225, 33694)
htan03 (666, 33694)
htan04 (1346, 33694)
htan05 (3462, 33538)
htan06 (4145, 33538)


In [3]:
# Inspect the gene table of the first loaded sample.
adatas[0].var

Unnamed: 0,gene_symbols,feature_types
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277856,AC233755.2,Gene Expression
ENSG00000275063,AC233755.1,Gene Expression
ENSG00000271254,AC240274.1,Gene Expression
ENSG00000277475,AC213203.1,Gene Expression


In [4]:
# Inspect the per-cell table of the first loaded sample.
adatas[0].obs

Unnamed: 0,n_genes,sample_id,dataset
htan01_AAACCTGAGACCTAGG-1,1170,htan01,htan_surgical
htan01_AAACCTGAGTGCGTGA-1,4305,htan01,htan_surgical
htan01_AAACCTGCAATCGGTT-1,559,htan01,htan_surgical
htan01_AAACCTGCACCGTTGG-1,1765,htan01,htan_surgical
htan01_AAACCTGCATCCCACT-1,340,htan01,htan_surgical
...,...,...,...
htan01_TTTGTCAGTCGCTTCT-1,411,htan01,htan_surgical
htan01_TTTGTCAGTGACGGTA-1,877,htan01,htan_surgical
htan01_TTTGTCAGTTGTACAC-1,3228,htan01,htan_surgical
htan01_TTTGTCATCACTATTC-1,2133,htan01,htan_surgical


In [5]:
# Gene id -> gene symbol lookup collected from every loaded sample.
# Samples are visited in order; when two samples list the same gene id,
# the symbol from the later sample replaces the earlier one.
gene_id_to_symbol = {}
for a in adatas:
    gene_id_to_symbol.update(zip(a.var.index, a.var['gene_symbols']))

In [6]:
# Reduce every sample to the two obs columns shared by all datasets and an empty var table.
for a in adatas:
    a.obs = a.obs[['sample_id', 'dataset']]
    # Drop all var columns so the Seurat conversion does not fail; gene symbols are
    # added back later from gene_id_to_symbol.
    a.var = a.var[[]]

In [7]:
# Stack the per-sample objects into one AnnData (concat is called with its default join).
htan_surgical = anndata.concat(adatas)
htan_surgical

AnnData object with n_obs × n_vars = 13784 × 32991
    obs: 'sample_id', 'dataset'

In [8]:
# Check the per-cell metadata of the combined htan_surgical object.
htan_surgical.obs

Unnamed: 0,sample_id,dataset
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical
...,...,...
htan06_TTTGGAGGTCAAACGG-1,htan06,htan_surgical
htan06_TTTGGAGGTTGTCATG-1,htan06,htan_surgical
htan06_TTTGGTTCATCCTAAG-1,htan06,htan_surgical
htan06_TTTGGTTGTGCCCACA-1,htan06,htan_surgical


#### eus-fnb

In [9]:
# EUS-FNB samples to load; each name is also the input folder name.
keep = [
    'wu01', 'wu02', 'wu03', 'wu04', 'wu05',
    'wu07', 'wu09', 'wu14', 'wu15', 'wu16',
    'wu18', 'wu22', 'wu23', 'wu24', 'wu25',
    'wu26', 'wu27', 'wu28', 'wu29', 'wu30',
    'wu31', 'wu32', 'wu38', 'wu39', 'wu42',
]
root = '../data/single_cell/original/inhouse/filtered_gene_bc_matrices/'

adatas = []
for sample in keep:
    # Genes are indexed by gene id; symbols stay available in var['gene_symbols'].
    a = sc.read_10x_mtx(os.path.join(root, sample), var_names='gene_ids')
    a.var_names_make_unique()
    # Discard barcodes with fewer than 100 detected genes.
    sc.pp.filter_cells(a, min_genes=100)

    a.obs['sample_id'] = sample
    a.obs['dataset'] = 'eusfnb'
    # Prefix each barcode with the sample id.
    a.obs.index = [f'{sample}_{barcode}' for barcode in a.obs.index]

    print(sample, a.shape)
    adatas.append(a)

wu01 (892, 33538)
wu02 (1419, 33538)
wu03 (1881, 33538)
wu04 (141, 33538)
wu05 (687, 33538)
wu07 (3987, 33538)
wu09 (211, 33538)
wu14 (669, 33538)
wu15 (1241, 33538)
wu16 (3530, 33538)
wu18 (1179, 33538)
wu22 (755, 33538)
wu23 (399, 33538)
wu24 (408, 33538)
wu25 (925, 33538)
wu26 (2422, 33538)
wu27 (2386, 33538)
wu28 (2918, 33538)
wu29 (1779, 33538)
wu30 (2057, 33538)
wu31 (1235, 33538)
wu32 (973, 33538)
wu38 (983, 33538)
wu39 (9175, 33538)
wu42 (1178, 33538)


In [10]:
for a in adatas:
    # Harvest id -> symbol pairs only while var still carries the symbols, so that
    # re-running this cell after var has been emptied is a no-op instead of a KeyError.
    if 'gene_symbols' in a.var.columns:
        for gene_id, gene_symbol in zip(a.var.index, a.var['gene_symbols']):
            # Only add ids that are not in the dict yet; existing mappings are kept.
            gene_id_to_symbol.setdefault(gene_id, gene_symbol)

    # Keep the two shared obs columns and drop every var column.
    a.obs = a.obs[['sample_id', 'dataset']]
    a.var = a.var[[]]

In [11]:
# Stack the per-sample objects into one AnnData (concat is called with its default join).
eusfnb = anndata.concat(adatas)
eusfnb.shape

(43430, 33538)

In [12]:
# Check the per-cell metadata of the combined eusfnb object.
eusfnb.obs

Unnamed: 0,sample_id,dataset
wu01_AAACCTGCACATAACC-1,wu01,eusfnb
wu01_AAACGGGCAAGGCTCC-1,wu01,eusfnb
wu01_AAACGGGTCATTCACT-1,wu01,eusfnb
wu01_AAACGGGTCTGTGCAA-1,wu01,eusfnb
wu01_AAAGTAGAGAACTGTA-1,wu01,eusfnb
...,...,...
wu42_TTTGGTTGTACCGTAT-1,wu42,eusfnb
wu42_TTTGGTTTCTACTTAC-1,wu42,eusfnb
wu42_TTTGGTTTCTCTGCTG-1,wu42,eusfnb
wu42_TTTGTCATCCTTGCCA-1,wu42,eusfnb


#### peng

In [13]:
# Run identifier (used as the input folder name when loading) -> short sample label.
# The label becomes obs['sample_id'] and the barcode prefix; get_tissue_type later
# treats 'peng' labels starting with 'N' as normal tissue.
sample_map = {
    'CRR034520': 'N1',
    'CRR034521': 'N2',
    'CRR034522': 'N3',
    'CRR034523': 'N4',
    'CRR034524': 'N5',
    'CRR034525': 'N6',
    'CRR034526': 'N7',
    'CRR034527': 'N8',
    'CRR034528': 'N9',
    'CRR034529': 'N10',
    'CRR034530': 'N11',
    'CRR241805': 'T1',
    'CRR241798': 'T2',
    'CRR241799': 'T3',
    'CRR034499': 'T4',
    'CRR034500': 'T5',
    'CRR034501': 'T6',
    'CRR241800': 'T7',
    'CRR034503': 'T8',
    'CRR034504': 'T9',
    'CRR034505': 'T10',
    'CRR034506': 'T11',
    'CRR034507': 'T12',
    'CRR241801': 'T13',
    'CRR034509': 'T14',
    'CRR034510': 'T15',
    'CRR034511': 'T16',
    'CRR034512': 'T17',
    'CRR034513': 'T18',
    'CRR241802': 'T19',
    'CRR241804': 'T20',
    'CRR034516': 'T21',
    'CRR034517': 'T22',
    'CRR241803': 'T23',
    'CRR034519': 'T24',
}

In [14]:
# Folder that holds one 10x matrix directory per reprocessed peng run.
root = '../data/single_cell/original/peng_reprocessed'

adatas = []
for sample, renamed_sample in sample_map.items():
    # Genes are indexed by gene id; symbols stay available in var['gene_symbols'].
    a = sc.read_10x_mtx(os.path.join(root, sample), var_names='gene_ids')
    a.var_names_make_unique()
    # Discard barcodes with fewer than 100 detected genes.
    sc.pp.filter_cells(a, min_genes=100)

    a.obs['sample_id'] = renamed_sample
    a.obs['dataset'] = 'peng'
    # Prefix each barcode with the short sample label.
    a.obs.index = [f'{renamed_sample}_{barcode}' for barcode in a.obs.index]

    print(renamed_sample, a.shape)
    adatas.append(a)

N1 (9109, 33538)
N2 (7498, 33538)
N3 (3827, 33538)
N4 (7918, 33538)
N5 (4230, 33538)
N6 (9286, 33538)
N7 (6718, 33538)
N8 (6682, 33538)
N9 (9986, 33538)
N10 (4293, 33538)
N11 (11078, 33538)
T1 (2882, 33538)
T2 (8203, 33538)
T3 (4401, 33538)
T4 (3819, 33538)
T5 (3640, 33538)
T6 (4739, 33538)
T7 (3581, 33538)
T8 (3101, 33538)
T9 (7224, 33538)
T10 (2113, 33538)
T11 (6954, 33538)
T12 (5094, 33538)
T13 (7567, 33538)
T14 (4140, 33538)
T15 (6939, 33538)
T16 (4926, 33538)
T17 (7885, 33538)
T18 (6850, 33538)
T19 (11108, 33538)
T20 (4283, 33538)
T21 (2792, 33538)
T22 (5421, 33538)
T23 (6947, 33538)
T24 (9621, 33538)


In [15]:
for a in adatas:
    # Harvest id -> symbol pairs only while var still carries the symbols, so that
    # re-running this cell after var has been emptied is a no-op instead of a KeyError.
    if 'gene_symbols' in a.var.columns:
        for gene_id, gene_symbol in zip(a.var.index, a.var['gene_symbols']):
            # Only add ids that are not in the dict yet; existing mappings are kept.
            gene_id_to_symbol.setdefault(gene_id, gene_symbol)

    # Keep the two shared obs columns and drop every var column.
    a.obs = a.obs[['sample_id', 'dataset']]
    a.var = a.var[[]]

In [16]:
# Stack the per-sample objects into one AnnData (concat is called with its default join).
peng = anndata.concat(adatas)
peng.shape

(214855, 33538)

In [17]:
# Check the per-cell metadata of the combined peng object.
peng.obs

Unnamed: 0,sample_id,dataset
N1_AAACCTGAGAACAATC-1,N1,peng
N1_AAACCTGAGCTATGCT-1,N1,peng
N1_AAACCTGAGGTGTTAA-1,N1,peng
N1_AAACCTGAGTATGACA-1,N1,peng
N1_AAACCTGCAACCGCCA-1,N1,peng
...,...,...
T24_TTTGTCAGTCTAAAGA-1,T24,peng
T24_TTTGTCATCAGTTTGG-1,T24,peng
T24_TTTGTCATCCCATTTA-1,T24,peng
T24_TTTGTCATCCTTTCTC-1,T24,peng


###### old (not used anymore)

In [18]:
# peng = sc.read_h5ad('../data/single_cell/original/peng/PRJCA001063_CRC_besca2.raw.h5ad')
# # peng.var_names_make_unique()
# # peng.obs_names_make_unique()
# peng

In [19]:
# peng.obs

In [20]:
# peng.obs['sample_id'] = peng.obs['Patient'].to_list()
# peng.obs['dataset'] = 'peng'
# peng.obs = peng.obs[['sample_id', 'dataset']]
# peng.obs

In [21]:
# peng.var

In [22]:
# for gene_id, gene_symbol in zip(peng.var['ENSEMBL'], peng.var['SYMBOL']):
#     if gene_id not in gene_id_to_symbol:
#         gene_id_to_symbol[gene_id] = gene_symbol

In [23]:
# peng.var.index = peng.var['ENSEMBL'].to_list()

In [24]:
# pool = set(peng.var.index)
# len(pool)
# len(pool.intersection(set(eusfnb.var.index)))

In [25]:
# peng.var = peng.var[[]]

#### lin

In [26]:
def read_lin(matrix_fp, features_fp, barcodes_fp):
    """Assemble an AnnData object from one sample's matrix, features and barcodes files.

    The matrix is read with ``sc.read_mtx`` and transposed before use. The features
    table (tab-separated, no header) becomes ``var``: its first column is the index,
    named 'gene_ids', and its first remaining column is renamed 'gene_symbols'.
    The barcodes table (tab-separated, no header) becomes ``obs``, indexed by its
    first column.

    Returns the assembled ``anndata.AnnData``.
    """
    counts = sc.read_mtx(matrix_fp).X.transpose()

    var_table = pd.read_csv(features_fp, sep='\t', header=None, index_col=0)
    var_table.index.name = 'gene_ids'
    # Only the first non-index column gets a name; any others keep their labels.
    column_names = list(var_table.columns)
    column_names[0] = 'gene_symbols'
    var_table.columns = column_names

    obs_table = pd.read_csv(barcodes_fp, sep='\t', header=None, index_col=0)

    return anndata.AnnData(X=counts, obs=obs_table, var=var_table)

In [27]:
# Collect the gzipped files of every sample. The dot is escaped so that only a
# literal '.gz' suffix matches (a bare '.' would accept any character before 'gz').
fps = sorted(listfiles('../data/single_cell/original/lin/GSE154778_RAW', regex=r'\.gz$'))

# sample folder name -> {'barcodes' | 'features' | 'matrix': file path}
m = {}
for fp in fps:
    sample_dir, filename = os.path.split(fp)
    sample = os.path.basename(sample_dir)
    files = m.setdefault(sample, {})
    # Classify by file name only, so a keyword appearing in a directory name
    # cannot mislabel the file. The first matching kind wins, as before.
    for kind in ('barcodes', 'features', 'matrix'):
        if kind in filename:
            files[kind] = fp
            break
m

{'MET01': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET01/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET01/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET01/matrix.mtx.gz'},
 'MET02': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET02/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET02/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET02/matrix.mtx.gz'},
 'MET03': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET03/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET03/features.tsv.gz',
  'matrix': '../data/single_cell/original/lin/GSE154778_RAW/MET03/matrix.mtx.gz'},
 'MET04': {'barcodes': '../data/single_cell/original/lin/GSE154778_RAW/MET04/barcodes.tsv.gz',
  'features': '../data/single_cell/original/lin/GSE154778_RAW/MET04/features.tsv.gz',
  'matrix': '../data/single

In [28]:
# Build one AnnData per lin sample from its matrix/features/barcodes triple.
adatas = []
for sample, d in m.items():
    print(sample)
    a = read_lin(
        matrix_fp=d['matrix'],
        features_fp=d['features'],
        barcodes_fp=d['barcodes'],
    )
    a.obs['sample_id'] = sample
    # Prefix each barcode with the sample name.
    a.obs.index = [f'{sample}_{barcode}' for barcode in a.obs.index.to_list()]
    adatas.append(a)
len(adatas)

MET01
MET02
MET03
MET04
MET05
MET06
P01
P02
P03
P04
P05
P06
P07
P08
P09
P10


16

In [29]:
for a in adatas:
    # Harvest id -> symbol pairs only while var still carries the symbols, so that
    # re-running this cell after var has been emptied is a no-op instead of a KeyError.
    if 'gene_symbols' in a.var.columns:
        for gene_id, gene_symbol in zip(a.var.index, a.var['gene_symbols']):
            # Only add ids that are not in the dict yet; existing mappings are kept.
            gene_id_to_symbol.setdefault(gene_id, gene_symbol)
    # Drop every var column; only the gene-id index remains.
    a.var = a.var[[]]

In [30]:
# Stack the per-sample objects into one AnnData (concat is called with its default join).
lin = anndata.concat(adatas)
lin

AnnData object with n_obs × n_vars = 17086 × 30358
    obs: 'sample_id'

In [31]:
# Tag every lin cell with its dataset label (done after concat for this dataset).
lin.obs['dataset'] = 'lin'

In [32]:
# Check the per-cell metadata of the combined lin object.
lin.obs

Unnamed: 0,sample_id,dataset
MET01_AAAGATGAGCCCGAAA-1,MET01,lin
MET01_AAAGATGAGTCAAGGC-1,MET01,lin
MET01_AAAGATGCAGCGAACA-1,MET01,lin
MET01_AAAGTAGGTACCTACA-1,MET01,lin
MET01_AACTCAGAGCAGCGTA-1,MET01,lin
...,...,...
P10_TTTGTCAGTTGATTGC-1,P10,lin
P10_TTTGTCATCACTGGGC-1,P10,lin
P10_TTTGTCATCAGAGACG-1,P10,lin
P10_TTTGTCATCATGGTCA-1,P10,lin


#### chan-seng-yue

In [33]:
# List the per-sample .h5 files. The dot is escaped so that only a literal '.h5'
# suffix matches (a bare '.' would accept any character before 'h5').
fps = sorted(listfiles('../data/single_cell/original/chan-seng-yue/h5', regex=r'\.h5$'))
fps

['../data/single_cell/original/chan-seng-yue/h5/100070_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/85948_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/87235_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/87784_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/90209_CMP_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/91412_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/91610_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/91706_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/94930_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/95092_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/original/chan-seng-yue/h5/95373_filtered_gene_bc_matrices_h5.h5',
 '../data/single_cell/origi

In [34]:
adatas = []
for fp in fps:
    # Sample name = file name up to the '_filtered' marker.
    filename = fp.rsplit('/', 1)[-1]
    sample = filename.partition('_filtered')[0]

    a = sc.read_10x_h5(fp)
    a.var_names_make_unique()
    # Discard barcodes with fewer than 100 detected genes.
    sc.pp.filter_cells(a, min_genes=100)

    a.obs['sample_id'] = sample
    a.obs['dataset'] = 'chan-seng-yue'
    # Prefix each barcode with the sample name.
    a.obs.index = [f'{sample}_{barcode}' for barcode in a.obs.index]

    print(sample, a.shape)
    adatas.append(a)

  utils.warn_names_duplicates("var")


100070 (2789, 32738)
85948 (2323, 32738)
87235 (2718, 32738)
87784 (2325, 32738)
90209_CMP (1641, 32738)
91412 (2347, 32738)
91610 (1250, 32738)
91706 (2662, 32738)
94930 (1428, 32738)
95092 (2567, 32738)
95373 (2362, 32738)
96460 (1260, 32738)
97727 (2657, 32738)
COMP_0158_P (3159, 32738)
G9903 (2477, 32738)


In [35]:
# Inspect the gene table of the first sample: here the index holds symbols and
# the gene ids sit in the 'gene_ids' column.
adatas[0].var

Unnamed: 0,gene_ids
MIR1302-10,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945
...,...
AC145205.1,ENSG00000215635
BAGE5,ENSG00000268590
CU459201.1,ENSG00000251180
AC002321.2,ENSG00000215616


In [36]:
for a in adatas:
    # Run the var-dependent steps only while the 'gene_ids' column is still present,
    # so that re-running this cell after var has been emptied does not raise KeyError.
    if 'gene_ids' in a.var.columns:
        # For these objects the index holds the symbol and the column holds the id.
        # NOTE(review): the index went through var_names_make_unique() when loading,
        # so repeated symbols may carry a suffix here - confirm that is acceptable.
        for gene_id, gene_symbol in zip(a.var['gene_ids'], a.var.index):
            # Only add ids that are not in the dict yet; existing mappings are kept.
            gene_id_to_symbol.setdefault(gene_id, gene_symbol)
        # Re-index genes by gene id, matching the other datasets.
        a.var.index = a.var['gene_ids'].to_list()

    # Keep the two shared obs columns and drop every var column.
    a.obs = a.obs[['sample_id', 'dataset']]
    a.var = a.var[[]]

In [37]:
# Stack the per-sample objects into one AnnData (concat is called with its default join).
csy = anndata.concat(adatas)
csy.shape

(33965, 32738)

In [38]:
# Check the per-cell metadata of the combined chan-seng-yue object.
csy.obs

Unnamed: 0,sample_id,dataset
100070_AAACCTGAGCGTAATA-1,100070,chan-seng-yue
100070_AAACCTGCAATCCGAT-1,100070,chan-seng-yue
100070_AAACCTGCACCGATAT-1,100070,chan-seng-yue
100070_AAACCTGCACTAGTAC-1,100070,chan-seng-yue
100070_AAACGGGAGACTGGGT-1,100070,chan-seng-yue
...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue


#### combined

In [39]:
# Merge the five datasets into one object. join='outer' is passed explicitly here,
# unlike the per-dataset concats, which use the default join.
adata = anndata.concat([htan_surgical, eusfnb, lin, peng, csy], join='outer')
adata

AnnData object with n_obs × n_vars = 323120 × 35476
    obs: 'sample_id', 'dataset'

In [40]:
# Look up the symbol of every gene id in the combined object; an id missing
# from gene_id_to_symbol raises KeyError.
symbols = [gene_id_to_symbol[gene_id] for gene_id in adata.var.index.to_list()]
adata.var['gene_symbols'] = symbols
adata.var

Unnamed: 0,gene_symbols
ENSG00000000003,TSPAN6
ENSG00000000005,TNMD
ENSG00000000419,DPM1
ENSG00000000457,SCYL3
ENSG00000000460,C1orf112
...,...
ENSG00000285492,AL356417.3
ENSG00000285505,AC010616.2
ENSG00000285508,AL034430.1
ENSG00000285509,AP000646.1


In [41]:
# Number of distinct gene symbols; a value below the gene count means several
# gene ids share a symbol.
len(set(adata.var['gene_symbols']))

34985

In [42]:
# List the distinct sample ids present in the combined object.
set(adata.obs['sample_id'])

{'100070',
 '85948',
 '87235',
 '87784',
 '90209_CMP',
 '91412',
 '91610',
 '91706',
 '94930',
 '95092',
 '95373',
 '96460',
 '97727',
 'COMP_0158_P',
 'G9903',
 'MET01',
 'MET02',
 'MET03',
 'MET04',
 'MET05',
 'MET06',
 'N1',
 'N10',
 'N11',
 'N2',
 'N3',
 'N4',
 'N5',
 'N6',
 'N7',
 'N8',
 'N9',
 'P01',
 'P02',
 'P03',
 'P04',
 'P05',
 'P06',
 'P07',
 'P08',
 'P09',
 'P10',
 'T1',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'T15',
 'T16',
 'T17',
 'T18',
 'T19',
 'T2',
 'T20',
 'T21',
 'T22',
 'T23',
 'T24',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T8',
 'T9',
 'htan01',
 'htan02',
 'htan03',
 'htan04',
 'htan05',
 'htan06',
 'wu01',
 'wu02',
 'wu03',
 'wu04',
 'wu05',
 'wu07',
 'wu09',
 'wu14',
 'wu15',
 'wu16',
 'wu18',
 'wu22',
 'wu23',
 'wu24',
 'wu25',
 'wu26',
 'wu27',
 'wu28',
 'wu29',
 'wu30',
 'wu31',
 'wu32',
 'wu38',
 'wu39',
 'wu42'}

In [43]:
# Check the per-cell metadata of the combined object.
adata.obs

Unnamed: 0,sample_id,dataset
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical
...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue


In [44]:
def get_tissue_type(sample, dataset):
    """Classify a sample as normal, metastatic tumor or primary tumor.

    Parameters
    ----------
    sample : str
        Sample id, as stored in ``obs['sample_id']``.
    dataset : str
        Dataset label, as stored in ``obs['dataset']``.

    Returns
    -------
    str
        'normal' for 'peng' samples whose id starts with 'N', 'tumor_met' for
        'lin' samples whose id contains 'MET', 'tumor_primary' for everything else.
    """
    # startswith() rather than sample[0], so an empty id falls through to the
    # default instead of raising IndexError.
    if dataset == 'peng' and sample.startswith('N'):
        return 'normal'
    if dataset == 'lin' and 'MET' in sample:
        return 'tumor_met'
    return 'tumor_primary'

# Derive the tissue type of every cell from its sample id and dataset label.
adata.obs['tissue_type'] = list(
    map(get_tissue_type, adata.obs['sample_id'], adata.obs['dataset'])
)
adata.obs

Unnamed: 0,sample_id,dataset,tissue_type
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical,tumor_primary
...,...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue,tumor_primary


In [45]:
# Number of cells per tissue type, most frequent first.
# (Counter is imported with the other imports at the top of the notebook.)
Counter(adata.obs['tissue_type']).most_common()

[('tumor_primary', 235030), ('normal', 80625), ('tumor_met', 7465)]

In [46]:
# Checkpoint the combined object to the sandbox folder.
adata.write_h5ad('../data/sandbox/temp.h5ad')

###### build count matrix

In [47]:
# Reload the combined object from the sandbox checkpoint.
adata = sc.read_h5ad('../data/sandbox/temp.h5ad')
adata.shape

(323120, 35476)

In [48]:
# Switch the var index from gene ids to gene symbols.
# Order matters: the ids are copied into a column before the index is overwritten.
adata.var['gene_id'] = adata.var.index.to_list()
# Symbols are not guaranteed unique, so the new index can contain duplicates.
adata.var.index = adata.var['gene_symbols'].to_list()
adata.var

Unnamed: 0,gene_symbols,gene_id
TSPAN6,TSPAN6,ENSG00000000003
TNMD,TNMD,ENSG00000000005
DPM1,DPM1,ENSG00000000419
SCYL3,SCYL3,ENSG00000000457
C1orf112,C1orf112,ENSG00000000460
...,...,...
AL356417.3,AL356417.3,ENSG00000285492
AC010616.2,AC010616.2,ENSG00000285505
AL034430.1,AL034430.1,ENSG00000285508
AP000646.1,AP000646.1,ENSG00000285509


In [49]:
# Drop every var column; only the index (now gene symbols) is kept.
adata.var = adata.var[[]]

In [50]:
# Export the per-cell metadata as a tab-separated file; the index column is labelled 'cell_id'.
adata.obs.index.name = 'cell_id'
adata.obs.to_csv('../data/single_cell/original/count_matrix/metadata.txt', sep='\t')

In [51]:
# Number of distinct names in the var index.
len(set(adata.var.index))

34985

In [52]:
# Confirm that var now holds only the index.
adata.var

TSPAN6
TNMD
DPM1
SCYL3
C1orf112
...
AL356417.3
AC010616.2
AL034430.1
AP000646.1
AP006216.3


Keep only the first entry when multiple gene IDs map to the same gene symbol.

In [53]:
# Record the column position of the first occurrence of each var name.
seen = set()
keep = []
for position, symbol in enumerate(adata.var.index.to_list()):
    if symbol in seen:
        # a later duplicate of a name already recorded - skip it
        continue
    seen.add(symbol)
    keep.append(position)
len(keep), len(seen)

(34985, 34985)

In [54]:
# Number of distinct cell ids and distinct var names in the object.
len(set(adata.obs.index)), len(set(adata.var.index))

(323120, 34985)

In [55]:
# Select only the columns listed in `keep`. Indexing this way yields a view of
# `adata`, not an independent copy.
slim = adata[:, keep]
slim

View of AnnData object with n_obs × n_vars = 323120 × 34985
    obs: 'sample_id', 'dataset', 'tissue_type'

In [56]:
# Number of distinct var names in the subset.
len(set(slim.var.index))

34985

In [57]:
# Save the subset as the count matrix file.
slim.write_h5ad('../data/single_cell/original/count_matrix/counts.h5ad')

In [1]:
import scanpy as sc

In [2]:
# Load the backed-up final object for inspection.
a = sc.read_h5ad('../../sc_pdac/backups/final_adata.h5ad')
a

AnnData object with n_obs × n_vars = 140612 × 30840
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.1.25', 'seurat_clusters', 'integrated_snn_res.2', 'integrated_snn_res.3', 'integrated_snn_res.1', 'integrated_snn_res.0.5', 'active.ident', 'cell_type', 'dataset', 'sample_id', 'cell_type_level_2', 'cell_type_level_3', 'tumor_original_seurat', 'malignant_exclude', 'final_cell_types'
    var: 'is_highly_variable'
    uns: 'cell_type_colors', 'cell_type_level_2_colors', 'cell_type_level_3_colors', 'dataset_colors', 'final_cell_types_colors', 'rank_genes_groups'
    obsm: 'X_umap'
    layers: 'counts'

In [3]:
# Inspect the per-cell metadata of the loaded object.
a.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_SCT,nFeature_SCT,CELL,CONDITION,Patient,Type,Cell_type,...,integrated_snn_res.0.5,active.ident,cell_type,dataset,sample_id,cell_type_level_2,cell_type_level_3,tumor_original_seurat,malignant_exclude,final_cell_types
wu01_AAACCTGCACATAACC-1,wu01,11822.0,3295,3377.0,1314,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,14,Monocyte,Monocyte,In-house: EUS-FNB,wu01,Monocyte,Monocyte,Monocyte,False,Monocyte
wu01_AAACCTGCACCACGTG-1,wu01,469.0,293,2235.0,450,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,14,Monocyte,Monocyte,In-house: EUS-FNB,wu01,Monocyte,Monocyte,Monocyte,False,Monocyte
wu01_AAACCTGTCTAACTTC-1,wu01,482.0,243,2436.0,400,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,0,Malignant,Malignant,In-house: EUS-FNB,wu01,Malignant_0,Malignant_0,Malignant_0,False,Malignant - Classical Low
wu01_AAACGGGCAAGGCTCC-1,wu01,1516.0,717,2944.0,717,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,2,Monocyte,Monocyte,In-house: EUS-FNB,wu01,Monocyte,Monocyte,Monocyte,False,Monocyte
wu01_AAACGGGTCATTCACT-1,wu01,876.0,453,2491.0,524,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,14,Monocyte,Monocyte,In-house: EUS-FNB,wu01,Monocyte,Monocyte,Monocyte,False,Monocyte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA-1,G9903,8237.0,2326,9379.0,2326,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,1,CAF,CAF,Chan-Seng-Yue,G9903,CAF,CAF,CAF,False,CAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT-1,G9903,6484.0,1875,9150.0,1886,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,1,CAF,CAF,Chan-Seng-Yue,G9903,CAF,CAF,CAF,False,CAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC-1,G9903,8367.0,2286,9400.0,2286,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,1,CAF,CAF,Chan-Seng-Yue,G9903,CAF,CAF,CAF,False,CAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCATCCGAACGC-1,G9903,20314.0,3975,10684.0,3464,NA_character_,-2147483648,NA_character_,-2147483648,NA_character_,...,0,Malignant,Malignant,Chan-Seng-Yue,G9903,Malignant_0,Malignant_0,Malignant_0,False,Malignant - Classical Low


In [5]:
# Inspect the 'counts' layer of the loaded object.
a.layers['counts']

<140612x30840 sparse matrix of type '<class 'numpy.float32'>'
	with 295800072 stored elements in Compressed Sparse Row format>