In [1]:
import csv
import gzip
import logging
import os

import anndata
import pandas as pd
import scanpy as sc

In [2]:
# Directory holding the MEX-format inputs; the .h5ad output is written here too.
dataset_dir = "/Users/evanli/Documents/Research_datasets/PBMC_68k_Zheng/filtered_matrices_mex/hg19"

# File names, all relative to dataset_dir.
data_file, var_names_file, obs_names_file = "matrix.mtx", "genes.tsv", "barcodes.tsv"
output_h5ad_file = "Zheng_PBMC.h5ad"

# Resolve every file name against dataset_dir in one pass.
data_path, var_names_path, obs_names_path, output_h5ad_path = (
    os.path.join(dataset_dir, file_name)
    for file_name in (data_file, var_names_file, obs_names_file, output_h5ad_file)
)

In [3]:
# Parse the tab-separated gene file: first field of each row goes to
# var_names, second field to gene_symbols.
var_names, gene_symbols = [], []
with open(var_names_path, "r") as genes_handle:
    for fields in csv.reader(genes_handle, delimiter='\t'):
        var_names.append(fields[0])
        gene_symbols.append(fields[1])

In [8]:
# Sanity check: show the first ten entries read from the first column of the gene file.
var_names[:10]

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000237683',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000228463',
 'ENSG00000237094']

In [6]:
# Sanity check: show the first ten entries read from the second column of the gene file.
gene_symbols[:10]

['MIR1302-10',
 'FAM138A',
 'OR4F5',
 'RP11-34P13.7',
 'RP11-34P13.8',
 'AL627309.1',
 'RP11-34P13.14',
 'RP11-34P13.9',
 'AP006222.2',
 'RP4-669L17.10']

In [4]:
# Collect the first tab-separated field of every row in the barcodes file.
with open(obs_names_path, "r") as barcodes_handle:
    obs_names = [fields[0] for fields in csv.reader(barcodes_handle, delimiter='\t')]

In [8]:
# Load the matrix file and flip its axes in one expression, so that rows line
# up with the barcode list and columns with the gene list assigned in the
# following cell.
adata = sc.read(data_path).transpose()

In [9]:
# Label both axes first (observations from the barcode list, variables from
# the gene ID list), then de-duplicate the labels on each axis.
adata.obs_names = obs_names
adata.var_names = var_names
adata.obs_names_make_unique()
adata.var_names_make_unique()

In [10]:
# Show the dimensions of the transposed matrix (observations x variables).
adata.shape

(68579, 32738)

In [11]:
# Attach the per-barcode annotation (cell type label and t-SNE coordinates)
# to adata.obs.
annot_path = "/Users/evanli/Documents/Research_datasets/PBMC_68k_Zheng/github_data/68k_pbmc_barcodes_annotation.tsv"

# index_col=2 makes the file's third column the row index. The label-based
# alignment below assumes that column holds the cell barcodes — TODO confirm
# against the file header.
annot = pd.read_csv(annot_path, sep="\t", index_col=2)

# Align by barcode rather than by row position: a positional copy would
# silently attach labels to the wrong cells if the annotation file were
# ordered differently from barcodes.tsv or covered a different set of cells.
if not annot.index.is_unique:
    raise ValueError(f"Duplicate barcodes in annotation file: {annot_path}")
unmatched_barcodes = adata.obs_names.difference(annot.index)
if len(unmatched_barcodes) > 0:
    raise ValueError(
        f"{len(unmatched_barcodes)} barcodes in adata have no annotation, "
        f"e.g. {list(unmatched_barcodes[:5])}"
    )

annot_columns = ['celltype', 'TSNE.1', 'TSNE.2']
annot_aligned = annot.loc[adata.obs_names, annot_columns]
for column in annot_columns:
    # Plain lists keep the assignment positional now that the rows are in
    # adata.obs order.
    adata.obs[column] = annot_aligned[column].tolist()

In [15]:
# Display the first rows of the obs table to confirm the annotation columns were added.
adata.obs.head()

Unnamed: 0,celltype,TSNE.1,TSNE.2
AAACATACACCCAA-1,CD8+ Cytotoxic T,7.56554,0.44137
AAACATACCCCTCA-1,CD8+/CD45RA+ Naive Cytotoxic,2.552626,-25.786672
AAACATACCGGAGA-1,CD4+/CD45RO+ Memory,-5.771831,11.830846
AAACATACTAACCG-1,CD19+ B,1.762556,25.979346
AAACATACTCTTCA-1,CD4+/CD25 T Reg,-16.793856,-16.58997


In [16]:
# Keep the second column of the gene file as a var column; the assignment is
# positional, so gene_symbols must be in the same order as var_names.
adata.var['gene_symbols'] = gene_symbols
# Display the first rows of the updated var table.
adata.var.head()

Unnamed: 0,gene_symbols
ENSG00000243485,MIR1302-10
ENSG00000237613,FAM138A
ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8


In [17]:
# Quick look at adata.X: print at most `preview_limit` of the non-zero
# entries found in the first five rows.
preview_limit = 20
nz_rows, nz_cols = adata.X[:5, :].nonzero()
for row, col in zip(nz_rows[:preview_limit], nz_cols[:preview_limit]):
    print(f'Row {row}, Column {col}, Value {adata.X[row, col]}')


Row 0, Column 53, Value 1.0
Row 0, Column 70, Value 1.0
Row 0, Column 81, Value 1.0
Row 0, Column 166, Value 1.0
Row 0, Column 178, Value 3.0
Row 0, Column 229, Value 1.0
Row 0, Column 316, Value 1.0
Row 0, Column 383, Value 1.0
Row 0, Column 415, Value 1.0
Row 0, Column 492, Value 10.0
Row 0, Column 519, Value 2.0
Row 0, Column 558, Value 1.0
Row 0, Column 618, Value 1.0
Row 0, Column 631, Value 1.0
Row 0, Column 671, Value 1.0
Row 0, Column 686, Value 2.0
Row 0, Column 763, Value 1.0
Row 0, Column 798, Value 1.0
Row 0, Column 799, Value 2.0
Row 0, Column 837, Value 1.0


In [18]:
# Show where the .h5ad file will be written.
output_h5ad_path

'/Users/evanli/Documents/Research_datasets/PBMC_68k_Zheng/filtered_matrices_mex/hg19/Zheng_PBMC.h5ad'

In [19]:
# Save the assembled AnnData object (matrix, obs annotation, var gene symbols)
# to the output path built in the configuration cell.
adata.write(filename=output_h5ad_path)

In [20]:
# Final check of the object's dimensions (observations x variables).
adata.shape

(68579, 32738)