In [1]:
import csv
import gzip
import logging
import os

import anndata
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Input/output locations for the 3' RNA count matrix and its name files.
# NOTE(review): the directory is named GSM5008740 while the files are prefixed
# GSM5008737 -- presumably just how the local copy is laid out; confirm.
# NOTE(review): output_h5ad_path is rebound to a different location in the
# cell that writes the file, so the value built here is not the one used.
dataset_dir = r"C:\Users\evanlee\Documents\Research_datasets\PBMC_Hao\GSE164378_Hao\GSE164378_RAW\GSM5008740_RNA_3P"

data_file = "GSM5008737_RNA_3P-matrix.mtx"
var_names_file = "GSM5008737_RNA_3P-features.tsv"
obs_names_file = "GSM5008737_RNA_3P-barcodes.tsv"
output_h5ad_file = "Hao_PBMC_GSE164378.h5ad"

# Resolve every file name against the dataset directory in one pass.
data_path, var_names_path, obs_names_path, output_h5ad_path = (
    os.path.join(dataset_dir, file_name)
    for file_name in (data_file, var_names_file, obs_names_file, output_h5ad_file)
)

In [3]:
# Read the features file and keep column index 1 of every tab-separated row;
# the resulting list is later assigned to adata.var_names.
# newline="" hands line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
with open(var_names_path, "r", newline="") as var_file:
    var_read = csv.reader(var_file, delimiter='\t')
    var_names = [row[1] for row in var_read]

In [4]:
# Read the barcodes file and keep column index 0 of every tab-separated row;
# the resulting list is later assigned to adata.obs_names.
# newline="" hands line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
with open(obs_names_path, "r", newline="") as obs_file:
    obs_read = csv.reader(obs_file, delimiter='\t')
    obs_names = [row[0] for row in obs_read]

In [5]:
# Load the matrix file and swap its axes in one step, so the object's rows
# line up with obs_names and its columns with var_names when those are
# assigned afterwards.
adata = sc.read(data_path).transpose()

In [6]:
# Label the rows with the names read from the barcodes file ...
adata.obs_names = obs_names
adata.obs_names_make_unique()

# ... and the columns with the names read from the features file.
# Duplicate names on either axis are disambiguated by the *_make_unique calls.
adata.var_names = var_names
adata.var_names_make_unique()

In [7]:
# Display the dimensions of the AnnData object after naming both axes.
adata.shape

(161764, 33538)

In [8]:
# Load the metadata table that accompanies the 3' dataset.
# pandas/numpy are imported in the notebook's top import cell rather than here,
# so the notebook's dependencies are visible in one place.
# index_col=0 turns the first CSV column into the DataFrame index.
meta_path = r"C:\Users\evanlee\Documents\Research_datasets\PBMC_Hao\GSE164378_Hao\GSE164378_sc.meta.data_3P.csv"
meta = pd.read_csv(meta_path, index_col=0)
meta.head()

Unnamed: 0,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,orig.ident,lane,donor,time,celltype.l1,celltype.l2,celltype.l3,Phase,Batch
L1_AAACCCAAGAAACTCA,7535,217,10823,2915,SeuratProject,L1,P2,7,Mono,CD14 Mono,CD14 Mono,G1,Batch1
L1_AAACCCAAGACATACA,6013,209,5864,1617,SeuratProject,L1,P1,7,CD4 T,CD4 TCM,CD4 TCM_1,G1,Batch1
L1_AAACCCACAACTGGTT,6620,213,5067,1381,SeuratProject,L1,P4,2,CD8 T,CD8 Naive,CD8 Naive,S,Batch1
L1_AAACCCACACGTACTA,3567,202,4786,1890,SeuratProject,L1,P3,7,NK,NK,NK_2,G1,Batch1
L1_AAACCCACAGCATACT,6402,215,6505,1621,SeuratProject,L1,P4,7,CD8 T,CD8 Naive,CD8 Naive,G1,Batch1


In [9]:
# Copy selected annotation columns from `meta` into adata.obs.
# Rows are matched on barcode (meta's index against adata.obs_names) instead of
# on row position, so a metadata table that is ordered differently or is
# missing cells cannot silently attach labels to the wrong rows.
obs_columns = [
    'celltype.l1', 'celltype.l2', 'celltype.l3',
    'Batch', 'donor', 'time', 'lane', 'Phase',
    'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA',
]

# A duplicated barcode in the metadata would make label-based lookup ambiguous.
if not meta.index.is_unique:
    raise ValueError(
        "Metadata index contains duplicate barcodes; cannot align it to adata.obs_names"
    )

# Every row of adata must have a metadata row; report a few offenders if not.
missing_barcodes = adata.obs_names.difference(meta.index)
if len(missing_barcodes) > 0:
    raise ValueError(
        f"{len(missing_barcodes)} barcodes in adata.obs_names are absent from the "
        f"metadata index, e.g. {list(missing_barcodes[:5])}"
    )

# Reorder the metadata rows to follow adata.obs_names, then copy column by column.
meta_aligned = meta.loc[adata.obs_names, obs_columns]
for column in obs_columns:
    adata.obs[column] = meta_aligned[column].tolist()

In [10]:
# Swap every space for an underscore in each of the three cell-type
# annotation columns.
for annotation_level in ('celltype.l1', 'celltype.l2', 'celltype.l3'):
    adata.obs[annotation_level] = adata.obs[annotation_level].str.replace(" ", "_")

In [11]:
# Preview the first rows of adata.obs to check the annotation columns.
adata.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,celltype.l3,Batch,donor,time,lane,Phase,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA
L1_AAACCCAAGAAACTCA,Mono,CD14_Mono,CD14_Mono,Batch1,P2,7,L1,G1,7535,217,10823,2915
L1_AAACCCAAGACATACA,CD4_T,CD4_TCM,CD4_TCM_1,Batch1,P1,7,L1,G1,6013,209,5864,1617
L1_AAACCCACAACTGGTT,CD8_T,CD8_Naive,CD8_Naive,Batch1,P4,2,L1,S,6620,213,5067,1381
L1_AAACCCACACGTACTA,NK,NK,NK_2,Batch1,P3,7,L1,G1,3567,202,4786,1890
L1_AAACCCACAGCATACT,CD8_T,CD8_Naive,CD8_Naive,Batch1,P4,7,L1,G1,6402,215,6505,1621


In [12]:
# Quick look at adata.X (raw counts): print at most the first 20 non-zero
# entries found in the first five rows.
nz_rows, nz_cols = adata.X[:5, :].nonzero()
for row, col in zip(nz_rows[:20], nz_cols[:20]):
    print(f'Row {row}, Column {col}, Value {adata.X[row, col]}')


Row 0, Column 12, Value 1.0
Row 0, Column 43, Value 2.0
Row 0, Column 48, Value 2.0
Row 0, Column 53, Value 2.0
Row 0, Column 55, Value 5.0
Row 0, Column 66, Value 1.0
Row 0, Column 77, Value 2.0
Row 0, Column 78, Value 5.0
Row 0, Column 89, Value 2.0
Row 0, Column 93, Value 3.0
Row 0, Column 102, Value 2.0
Row 0, Column 154, Value 17.0
Row 0, Column 178, Value 2.0
Row 0, Column 185, Value 1.0
Row 0, Column 190, Value 1.0
Row 0, Column 201, Value 8.0
Row 0, Column 224, Value 2.0
Row 0, Column 240, Value 1.0
Row 0, Column 259, Value 1.0
Row 0, Column 261, Value 6.0


In [13]:
# Persist the annotated object as an .h5ad file.
# This rebinds output_h5ad_path, replacing the value built in the path
# configuration cell with a file one directory tree higher and a "_raw" suffix.
output_h5ad_path = r"C:\Users\evanlee\Documents\Research_datasets\PBMC_Hao\GSE164378_Hao\Hao_PBMC_GSE164378_raw.h5ad"
adata.write(output_h5ad_path)

In [14]:
# Display the dimensions again after writing the file.
adata.shape

(161764, 33538)