In [1]:
import csv
import gzip
import logging
import os

import anndata
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Root folder of the raw GSE128639 download. Set the STUART_DATASET_DIR
# environment variable to point the notebook at another copy of the data
# without editing this cell; the original location stays as the fallback.
dataset_dir = os.environ.get(
    'STUART_DATASET_DIR',
    '/Users/evanli/Documents/Research_datasets/Stuart_GSE128639/Stuart_GSE128639_RAW',
)
data_file = "GSM3681518_MNC_RNA_counts.tsv"
# No separate name files are configured for this dataset.
var_names_file = ""
obs_names_file = ""
output_h5ad_file = "Stuart_bm.h5ad"

data_path = os.path.join(dataset_dir, data_file)
# NOTE: with an empty file name, os.path.join returns the dataset directory
# itself (plus a trailing separator), not a file path.
var_names_path = os.path.join(dataset_dir, var_names_file)
obs_names_path = os.path.join(dataset_dir, obs_names_file)
output_h5ad_path = os.path.join(dataset_dir, output_h5ad_file)

In [None]:
# Read variable names from the second column of an optional tab-separated file.
# var_names_file is empty in the configuration cell, so var_names_path points at
# the dataset directory and a bare open() would raise. Only read when a real
# file exists; otherwise var_names stays an empty list.
var_names = []
if os.path.isfile(var_names_path):
    # newline="" is the mode the csv module expects for files it parses.
    with open(var_names_path, "r", newline="") as var_file:
        var_read = csv.reader(var_file, delimiter='\t')
        # csv yields [] for blank lines; skip those, but let rows that lack a
        # second column raise so a malformed file is not silently truncated.
        var_names = [row[1] for row in var_read if row]

In [11]:
# Read observation names from the first column of an optional tab-separated file.
# obs_names_file is empty in the configuration cell, so obs_names_path points at
# the dataset directory and a bare open() would raise. Only read when a real
# file exists; otherwise obs_names stays an empty list.
obs_names = []
if os.path.isfile(obs_names_path):
    # newline="" is the mode the csv module expects for files it parses.
    with open(obs_names_path, "r", newline="") as obs_file:
        obs_read = csv.reader(obs_file, delimiter='\t')
        # csv yields [] for blank lines; skip them so row[0] cannot fail on
        # a trailing empty line.
        obs_names = [row[0] for row in obs_read if row]

In [7]:
# Load the count table at data_path (a .tsv file) with scanpy's generic reader.
adata = sc.read(data_path) 

In [9]:
# Swap the axes so that rows are cells and columns are genes (see the shape check below).
# NOTE(review): this rebinds `adata` to its own transpose, so re-running this cell
# without re-running the load cell flips the orientation back.
adata = adata.transpose()

In [10]:
# Sanity-check the orientation after the transpose: (n_obs, n_vars).
adata.shape
# (33454, 17009)
# 33454 cells, 17009 genes

(33454, 17009)

In [11]:
# Inspect the labels on the variable (column) axis.
adata.var_names  # genes

Index(['FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP11-206L10.9',
       'LINC00115', 'FAM41C', 'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1',
       ...
       'FAM19A5', 'RP3-522J7.6', 'CITF22-1A6.3', 'MOV10L1', 'MIR99AHG',
       'AP000223.42', 'DSCR9', 'AP001626.2', 'AP001046.5', 'AC004556.1'],
      dtype='object', length=17009)

In [12]:
# Inspect the labels on the observation (row) axis.
adata.obs_names  # cells

Index(['a_AAACCTGAGCTTATCG.1', 'a_AAACCTGAGGTGGGTT.1', 'a_AAACCTGAGTACATGA.1',
       'a_AAACCTGCAAACCTAC.1', 'a_AAACCTGCAAGGTGTG.1', 'a_AAACCTGCACGGTAGA.1',
       'a_AAACCTGCACTTGGAT.1', 'a_AAACCTGCAGATGAGC.1', 'a_AAACCTGCAGATGGGT.1',
       'a_AAACCTGCAGCCTTGG.1',
       ...
       'b_TTTGTCATCAACACTG.1', 'b_TTTGTCATCACTTATC.1', 'b_TTTGTCATCAGCGACC.1',
       'b_TTTGTCATCATAGCAC.1', 'b_TTTGTCATCCAAATGC.1', 'b_TTTGTCATCCGAGCCA.1',
       'b_TTTGTCATCCGTAGGC.1', 'b_TTTGTCATCCTCGCAT.1', 'b_TTTGTCATCGCCGTGA.1',
       'b_TTTGTCATCTACGAGT.1'],
      dtype='object', length=33454)

In [22]:
# Metadata: per-cell annotation table; the first CSV column becomes the index.
metadata_path = '/Users/evanli/Documents/EvanPys/Progress/Stuart_bm/Stuart_metadata.csv'
metadata = pd.read_csv(metadata_path, index_col=0)
# Replace "-" with "." so the index follows the same format as adata.obs_names
# (e.g. 'a_AAACCTGAGCTTATCG.1'). regex=False forces a literal replacement,
# independent of the default for `regex` in the installed pandas version.
metadata.index = metadata.index.str.replace("-", ".", regex=False)

metadata.head()  # 30672 cells (rows)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_ADT,nFeature_ADT,lane,donor,celltype.l1,celltype.l2,RNA.weight,ADT.weight,wsnn_res.2,seurat_clusters
a_AAACCTGAGCTTATCG.1,bmcite,7546,2136,1350,25,HumanHTO4,batch1,Progenitor cells,Prog_RBC,0.487299,0.512701,19,19
a_AAACCTGAGGTGGGTT.1,bmcite,1029,437,2970,25,HumanHTO1,batch1,T cell,gdT,0.245543,0.754457,10,10
a_AAACCTGAGTACATGA.1,bmcite,1111,429,2474,23,HumanHTO5,batch1,T cell,CD4 Naive,0.50168,0.49832,1,1
a_AAACCTGCAAACCTAC.1,bmcite,2741,851,4799,25,HumanHTO3,batch1,T cell,CD4 Memory,0.431308,0.568692,4,4
a_AAACCTGCAAGGTGTG.1,bmcite,2099,843,5434,25,HumanHTO2,batch1,Mono/DC,CD14 Mono,0.572097,0.427903,2,2


In [23]:
# Number of rows (cells) and columns in the metadata table.
metadata.shape

(30672, 13)

In [29]:
# Cells that appear in both the metadata index and adata.obs_names.
# set.intersection accepts any iterable, so only one side needs converting.
common_elements = set(metadata.index).intersection(adata.obs_names)
# Report the actual sizes rather than hard-coded numbers, so the message stays
# correct if either input changes.
print(f'No of common cells that are present in both the metadata ({metadata.shape[0]}) and the adata ({adata.n_obs}):')
print(len(common_elements))

# Filter the adata object to only include the cells that are present in the metadata.
# .copy() turns the subset into a standalone AnnData instead of a view of `adata`,
# so later writes to adata_common.obs do not trigger an implicit copy with a warning.
adata_common = adata[adata.obs_names.isin(metadata.index), :].copy()
print(adata_common.shape)
# Filter the metadata object to only include the cells that are present in the adata,
# in the same order as adata_common.obs_names.
metadata_common = metadata.loc[adata_common.obs_names, :]
print(metadata_common.shape)
# Duplicate labels in the metadata index would make .loc return extra rows and
# break the row-for-row correspondence that later cells rely on.
assert metadata_common.index.equals(adata_common.obs_names), \
    "metadata_common rows do not line up one-to-one with adata_common.obs_names"

No of common cells that are present in both the metadata (30672) and the adata (33454):
30011
(30011, 17009)
(30011, 13)


In [31]:
metadata_common.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT',
       'nFeature_ADT', 'lane', 'donor', 'celltype.l1', 'celltype.l2',
       'RNA.weight', 'ADT.weight', 'wsnn_res.2', 'seurat_clusters'],
      dtype='object')

In [32]:
# Detach from the parent AnnData before writing to .obs: assigning into a view
# makes anndata copy it implicitly and emit a warning.
if adata_common.is_view:
    adata_common = adata_common.copy()

# The values are assigned by position (.tolist() drops the index), so the
# metadata rows must be in exactly the same order as the cells.
if not metadata_common.index.equals(adata_common.obs_names):
    raise ValueError("metadata_common rows are not in the same order as adata_common.obs_names")

# Metadata columns to carry over into adata_common.obs, in the order they are added.
obs_columns = [
    'celltype.l1',
    'celltype.l2',
    'nCount_RNA',
    'nFeature_RNA',
    'nCount_ADT',
    'nFeature_ADT',
    'lane',
    'donor',
    'RNA.weight',
]
for column in obs_columns:
    adata_common.obs[column] = metadata_common[column].tolist()

  adata_common.obs['celltype.l1'] = metadata_common['celltype.l1'].tolist()


In [43]:
# Preview the first rows of adata_common.obs after the metadata columns were added.
adata_common.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,nCount_RNA,nFeature_RNA,nCount_ADT,nFeature_ADT,lane,donor,RNA.weight
a_AAACCTGAGCTTATCG.1,Progenitor cells,Prog_RBC,7546,2136,1350,25,HumanHTO4,batch1,0.487299
a_AAACCTGAGGTGGGTT.1,T cell,gdT,1029,437,2970,25,HumanHTO1,batch1,0.245543
a_AAACCTGAGTACATGA.1,T cell,CD4 Naive,1111,429,2474,23,HumanHTO5,batch1,0.50168
a_AAACCTGCAAACCTAC.1,T cell,CD4 Memory,2741,851,4799,25,HumanHTO3,batch1,0.431308
a_AAACCTGCAAGGTGTG.1,Mono/DC,CD14 Mono,2099,843,5434,25,HumanHTO2,batch1,0.572097


In [44]:
# Briefly view adata.X
# Print up to `max_shown` non-zero entries found in the first five rows of X.
max_shown = 20
nz_rows, nz_cols = adata_common.X[:5,:].nonzero()
# Slicing both index arrays caps the output without a manual counter.
for row, col in zip(nz_rows[:max_shown], nz_cols[:max_shown]):
    print(f'Row {row}, Column {col}, Value {adata_common.X[row, col]}')


Row 0, Column 23, Value 1.0
Row 0, Column 31, Value 1.0
Row 0, Column 49, Value 1.0
Row 0, Column 50, Value 1.0
Row 0, Column 58, Value 1.0
Row 0, Column 59, Value 2.0
Row 0, Column 75, Value 1.0
Row 0, Column 77, Value 1.0
Row 0, Column 78, Value 1.0
Row 0, Column 84, Value 19.0
Row 0, Column 88, Value 1.0
Row 0, Column 94, Value 1.0
Row 0, Column 101, Value 1.0
Row 0, Column 106, Value 1.0
Row 0, Column 108, Value 1.0
Row 0, Column 110, Value 7.0
Row 0, Column 136, Value 1.0
Row 0, Column 138, Value 2.0
Row 0, Column 147, Value 1.0
Row 0, Column 162, Value 1.0


In [45]:
# Display the destination path used by the write cell at the end of the notebook.
output_h5ad_path

'/Users/evanli/Documents/Research_datasets/Stuart_GSE128639/Stuart_GSE128639_RAW/Stuart_bm.h5ad'

In [46]:
# Final dimensions check before saving.
adata_common.shape

(30011, 17009)

In [47]:
# Save adata_common to disk at output_h5ad_path.
adata_common.write(filename=output_h5ad_path)