In [1]:
from pathlib import Path

# Start from the directory the kernel is running in.
current_path = Path.cwd()
home_folder = 'evan_home'

# Candidate directories, nearest first: the working directory itself, then
# each of its ancestors. Keep only those whose final component is home_folder.
matching_dirs = [
    directory
    for directory in (current_path, *current_path.parents)
    if directory.name == home_folder
]
if not matching_dirs:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

# The first match is the one closest to the working directory.
home_path = matching_dirs[0]

print("Home Path:", home_path)
dataset_dir = home_path / 'Dataset'
source_code_dir = home_path / 'Source_code'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os
import csv

In [3]:
# The HCC files are looked up in the 'HCC_Lu' folder under dataset_dir
# (this replaces earlier machine-specific absolute paths).
HCC_dataset_dir = dataset_dir / 'HCC_Lu'

# File names inside HCC_dataset_dir. The var/obs name files are left empty here.
data_file = "GSE149614_HCC.scRNAseq.S71915.count.txt.gz"
metadata_file = 'GSE149614_HCC.metadata.updated.txt'
output_h5ad_file = "HCC_Lu_GSE149614_raw.h5ad"
var_names_file = ''
obs_names_file = ''


def _in_hcc_dir(file_name):
    """Return file_name joined onto HCC_dataset_dir with os.path.join."""
    return os.path.join(HCC_dataset_dir, file_name)


# Joined paths for every file named above.
data_path = _in_hcc_dir(data_file)
metadata_path = _in_hcc_dir(metadata_file)
output_h5ad_path = _in_hcc_dir(output_h5ad_file)
var_names_path = _in_hcc_dir(var_names_file)
obs_names_path = _in_hcc_dir(obs_names_file)

In [None]:
# Read the optional var-names file. When var_names_file is empty the joined
# path is just the dataset directory and open() would fail, so the file is
# only read when a name was actually configured; otherwise var_names is None.
var_names = None
if var_names_file:
    # newline='' lets the csv module handle line endings itself.
    with open(var_names_path, "r", newline='') as var_file:
        # Take the second tab-separated field of every non-blank row.
        var_names = [
            row[1]
            for row in csv.reader(var_file, delimiter='\t')
            if row
        ]

In [None]:
# Read the optional obs-names file. When obs_names_file is empty the joined
# path is just the dataset directory and open() would fail, so the file is
# only read when a name was actually configured; otherwise obs_names is None.
obs_names = None
if obs_names_file:
    # newline='' lets the csv module handle line endings itself.
    with open(obs_names_path, "r", newline='') as obs_file:
        # Take the first tab-separated field of every non-blank row.
        obs_names = [
            row[0]
            for row in csv.reader(obs_file, delimiter='\t')
            if row
        ]

In [4]:
# Read the count table and transpose it in a single expression, so the
# orientation stored on disk is flipped before anything else touches adata.
adata = sc.read(data_path).transpose()

Only considering the two last: ['.txt', '.gz'].
Only considering the two last: ['.txt', '.gz'].
Only considering the two last: ['.txt', '.gz'].


In [5]:
# Make the obs and var labels loaded with the count table unique.
# Labels from the separate var/obs name files are not applied here.
adata.obs_names_make_unique()
adata.var_names_make_unique()

# (n_obs, n_vars)
adata.shape

(71915, 25712)

In [6]:
# Check which container type currently backs the expression matrix.
type(adata.X)

numpy.ndarray

In [7]:
# Build a compressed-sparse-column (CSC) copy of the dense matrix in adata.X.
# adata itself is not modified in this cell; the copy is only inspected.
from scipy.sparse import csc_matrix

sparse_matrix = csc_matrix(adata.X)
type(sparse_matrix)


scipy.sparse._csc.csc_matrix

In [8]:
# Replace the matrix stored in adata with the sparse copy, then confirm the
# type that adata.X now reports.
adata.X = sparse_matrix
type(adata.X)

scipy.sparse._csc.csc_matrix

In [9]:
# Load the tab-separated metadata table and preview its first rows.
metadata = pd.read_table(metadata_path)
metadata.head()

Unnamed: 0,Cell,sample,res.3,site,patient,stage,virus,celltype
0,HCC01T_AAACCTGAGGGCATGT,HCC01T,13,Tumor,HCC01,I,HBV,T/NK
1,HCC01T_AAACCTGAGTCGCCGT,HCC01T,16,Tumor,HCC01,I,HBV,Myeloid
2,HCC01T_AAACCTGCATTACCTT,HCC01T,25,Tumor,HCC01,I,HBV,T/NK
3,HCC01T_AAACCTGGTCACACGC,HCC01T,2,Tumor,HCC01,I,HBV,T/NK
4,HCC01T_AAACCTGTCCAGTATG,HCC01T,2,Tumor,HCC01,I,HBV,T/NK


In [10]:
# Attach the metadata columns to adata.obs, matched on the 'Cell' identifier
# instead of on row position, so a reordered or incomplete metadata table
# raises an error rather than silently mislabelling observations.
metadata_by_cell = metadata.set_index('Cell')
if not metadata_by_cell.index.is_unique:
    raise ValueError("Metadata column 'Cell' contains duplicated identifiers.")

missing_cells = adata.obs_names.difference(metadata_by_cell.index)
if len(missing_cells) > 0:
    raise ValueError(
        f"{len(missing_cells)} observations in adata have no metadata row, "
        f"e.g. {list(missing_cells[:5])}"
    )

# Metadata rows reordered to follow adata.obs_names exactly.
metadata_aligned = metadata_by_cell.loc[adata.obs_names]

# adata.obs column name -> metadata column name ('res.3' is stored as 'res_3_clus').
obs_to_metadata_column = {
    'celltype': 'celltype',
    'res_3_clus': 'res.3',
    'sample': 'sample',
    'site': 'site',
    'patient': 'patient',
    'stage': 'stage',
    'virus': 'virus',
}
for obs_column, metadata_column in obs_to_metadata_column.items():
    adata.obs[obs_column] = metadata_aligned[metadata_column].tolist()

adata.obs.head()

Unnamed: 0,celltype,res_3_clus,sample,site,patient,stage,virus
HCC01T_AAACCTGAGGGCATGT,T/NK,13,HCC01T,Tumor,HCC01,I,HBV
HCC01T_AAACCTGAGTCGCCGT,Myeloid,16,HCC01T,Tumor,HCC01,I,HBV
HCC01T_AAACCTGCATTACCTT,T/NK,25,HCC01T,Tumor,HCC01,I,HBV
HCC01T_AAACCTGGTCACACGC,T/NK,2,HCC01T,Tumor,HCC01,I,HBV
HCC01T_AAACCTGTCCAGTATG,T/NK,2,HCC01T,Tumor,HCC01,I,HBV


In [11]:
# Briefly view adata.X (raw counts): print the first few non-zero entries
# found in the first five rows.
max_entries = 20
rows, cols = adata.X[:5, :].nonzero()

# Slicing the index arrays caps the printout without a manual counter/break.
# The slice starts at row 0, so its row indices are valid for adata.X too.
for row, col in zip(rows[:max_entries], cols[:max_entries]):
    print(f'Row {row}, Column {col}, Value {adata.X[row, col]}')


Row 0, Column 2, Value 1.0
Row 0, Column 10, Value 1.0
Row 0, Column 18, Value 1.0
Row 0, Column 20, Value 2.0
Row 0, Column 21, Value 1.0
Row 0, Column 22, Value 2.0
Row 0, Column 43, Value 3.0
Row 0, Column 68, Value 1.0
Row 0, Column 92, Value 1.0
Row 0, Column 99, Value 8.0
Row 0, Column 109, Value 1.0
Row 0, Column 125, Value 1.0
Row 0, Column 150, Value 1.0
Row 0, Column 168, Value 1.0
Row 0, Column 181, Value 1.0
Row 0, Column 183, Value 2.0
Row 0, Column 185, Value 1.0
Row 0, Column 196, Value 1.0
Row 0, Column 220, Value 1.0
Row 0, Column 222, Value 2.0


## Write as .h5ad

In [12]:
# Display the destination path before writing anything.
output_h5ad_path

'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Dataset\\HCC_Lu\\HCC_Lu_GSE149614_raw.h5ad'

In [12]:
# Final (n_obs, n_vars) check before the object is written out.
adata.shape

(71915, 25712)

In [13]:
# Persist the AnnData object to the .h5ad path built in the configuration cell.
adata.write_h5ad(output_h5ad_path)