In [1]:
import os
import scanpy as sc
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so that any later np.random
# draws in this session are repeatable from run to run.
np.random.seed(2023)

In [2]:
# Directory holding the simulation data for this notebook.
res_dir = "../Data/RNA/Simulation-PBMC"
# Input AnnData (.h5ad) file that the notebook reads.
source_file = "../Data/RNA/Simulation-PBMC/covid_portal_210320_with_raw.h5ad"

In [3]:
# Read the AnnData object stored at `source_file`.
adata = sc.read_h5ad(source_file)

In [4]:
adata

AnnData object with n_obs × n_vars = 647366 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [5]:
# Drop the cells collected under the two LPS conditions ("LPS_10hours" and
# "LPS_90mins"); cells with any other collection status are kept.
lps_statuses = ["LPS_10hours", "LPS_90mins"]
cells = ~adata.obs["Status_on_day_collection_summary"].isin(lps_statuses)
adata = adata[cells, :]

In [6]:
adata

View of AnnData object with n_obs × n_vars = 639482 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [7]:
# Some patients were sampled more than once. Keep exactly one sample per
# patient: the sample ID that sorts first among that patient's samples
# (np.unique returns its values sorted), so the choice is arbitrary but
# deterministic.
#
# obs has one row per cell, so reduce it to the distinct
# (patient_id, sample_id) pairs first; the per-patient lookup below then scans
# that small table instead of building a mask over every cell for each patient.
patient_sample_pairs = adata.obs[["patient_id", "sample_id"]].drop_duplicates()
patients = np.unique(patient_sample_pairs["patient_id"])
keep_samples = [
    np.unique(
        patient_sample_pairs.loc[
            patient_sample_pairs["patient_id"] == patient, "sample_id"
        ]
    )[0]
    for patient in patients
]

In [8]:
print(len(keep_samples))
keep_samples[0:10]

118


['AP1', 'AP10', 'AP11', 'AP12', 'AP2', 'AP3', 'AP4', 'AP5', 'AP6', 'AP8']

In [9]:
# Restrict the data to cells that come from one of the retained samples.
sample_mask = np.isin(adata.obs["sample_id"], keep_samples)
adata = adata[sample_mask, :]

# Sanity check: after the filter there must be as many distinct patients as
# distinct samples.
n_patients = len(np.unique(adata.obs["patient_id"]))
n_samples = len(np.unique(adata.obs["sample_id"]))
assert n_patients == n_samples

In [10]:
adata

View of AnnData object with n_obs × n_vars = 614473 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [11]:
# Save the filtered AnnData object into the results directory.
output_path = f"{res_dir}/stephenson.h5ad"
adata.write_h5ad(output_path)

In [12]:


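# NOTE: this cell contains only commented-out scratch code and is never executed.
# The file paths below are absolute paths on the original author's machine.
# outfile = "/Users/erahmani/Dropbox/TCAx/code/TCAx-R/Stephenson_raw.h5ad"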
# ## (1) Stephenson et al. data (https://pubmed.ncbi.nlm.nih.gov/33879890/) (downloaded from https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-10026/)
# filename = "/Users/erahmani/Dropbox/keris/data/E-MTAB-10026/covid_portal_210320_with_raw.h5ad"

# # read data
# adata = sc.read_h5ad(filename)




# # keep only top 10000 most variable genes 
# var_genes = sc.pp.highly_variable_genes(adata, n_top_genes=10000, layer = "raw", batch_key="Site", flavor='seurat_v3', inplace=False)
# adata = adata[:,var_genes["highly_variable"]]










# NOTE: the remaining commented-out lines are R code (Seurat / SeuratDisk), not Python.
# library(Seurat)
# library(SeuratData)
# library(SeuratDisk)

# ?Convert

# Convert("../Data/RNA/Simulation-PBMC/haniffa21.processed.h5ad", dest = "h5seurat", overwrite = TRUE)

# pbmc <- LoadH5Seurat("../Data/RNA/Simulation-PBMC/haniffa21.processed.h5seurat")
# pbmc

# saveRDS(pbmc, "../Data/RNA/Simulation-PBMC/haniffa21.processed.rds")