In [1]:
import os
import random
import pandas as pd
import scanpy as sc
from scipy import sparse
import tqdm

import warnings
from anndata import ImplicitModificationWarning

warnings.filterwarnings("ignore", category=ImplicitModificationWarning)

# sc.settings.verbosity = 0
sc._settings.ScanpyConfig.verbosity = 0

### OneK1K to single matrices:

In [2]:
!wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE196nnn/GSE196735/suppl/GSE196735_RAW.tar

--2024-04-24 18:51:01--  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE196nnn/GSE196735/suppl/GSE196735_RAW.tar
           => ‘GSE196735_RAW.tar’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.11, 2607:f220:41e:250::7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /geo/series/GSE196nnn/GSE196735/suppl ... done.
==> SIZE GSE196735_RAW.tar ... 1551697920
==> PASV ... done.    ==> RETR GSE196735_RAW.tar ... done.
Length: 1551697920 (1.4G) (unauthoritative)


2024-04-24 18:52:23 (18.7 MB/s) - ‘GSE196735_RAW.tar’ saved [1551697920]



In [3]:
!mkdir raw
!mv GSE196735_RAW.tar ./raw/

In [13]:
!tar -xf ./raw/GSE196735_RAW.tar -C ./raw

!mkdir raw_counts
!cd ./raw && for f in *RawCounts.csv.gz ; do gunzip -c "$f" > ../raw_counts/"${f%.*}" ; done
!rm ./raw/*RawCounts.csv.gz

!mkdir single_matrices

In [32]:
metadata = pd.read_csv('metadata_all.csv')
metadata.set_index('barcode', inplace=True)
metadata.drop(columns=['percent.mt', 'cell_type_ontology_term_id',
                       'cell_type', 'observation_joinid'],
              inplace=True)
metadata['sex'] = metadata['sex'].replace(to_replace={'female': 'Female', 'male': 'Male'})
metadata['dataset'] = 'OneK1K'

raw_counts = os.listdir('./raw_counts')
with tqdm.tqdm(total=len(raw_counts)) as progress_bar:
    for raw in raw_counts:
        if raw.startswith('GSM'):
            # read pool mtx:
            adata = sc.read_csv(f"./raw_counts/{raw}")
            adata = adata.T
            adata.X = sparse.csr_matrix(adata.X)
    
            # add metadata:
            # adata.obs = pd.merge(adata.obs, metadata, left_index=True, right_on='barcode').reindex(adata.obs_names)
            adata.obs = pd.concat([adata.obs, metadata], axis=1).reindex(adata.obs_names)
            adata = adata[adata.obs.dropna().index]
            pool = int(adata.obs['pool_number'].unique()[0])
    
            # parse mtx with metadata:
            for id in adata.obs['donor_id'].unique():
                child = adata[adata.obs['donor_id'] == id]
                age = int(child.obs['age'].unique()[0])
                sex = child.obs['sex'].unique()[0]
                filename = f'{pool}-{id}-{age}-{sex}.h5ad'
                if age >= 65:
                    child.write(os.path.join('./single_matrices/', '65+', filename))
                else:                                    
                    child.write(os.path.join('./single_matrices/', filename))
            progress_bar.update()

100%|█████████████████████████████████████████| 75/75 [1:00:34<00:00, 48.46s/it]


### CLUES (healthy) to single matrices: 

Source: [GSE174188](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE174188)

> Examination of 1.2 million PBMCs in 162 SLE donors and 99 healthy individuals to find cellular and genetic correlates of SLE.

In [8]:
# try to run several times, it's valid link!
!wget -c ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174188/suppl/GSE174188%5FCLUES1%5Fadjusted.h5ad.gz

--2024-05-07 01:05:41--  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174188/suppl/GSE174188%5FCLUES1%5Fadjusted.h5ad.gz
           => ‘GSE174188_CLUES1_adjusted.h5ad.gz’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.12, 2607:f220:41e:250::7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /geo/series/GSE174nnn/GSE174188/suppl ... done.
==> SIZE GSE174188_CLUES1_adjusted.h5ad.gz ... 7833149210
==> PASV ... done.    ==> REST 6890045440 ... done.    
==> RETR GSE174188_CLUES1_adjusted.h5ad.gz ... done.
Length: 7833149210 (7.3G), 943103770 (899M) remaining (unauthoritative)

GSE174188_CLUES1_ad 100%[+++++++++++++++++==>]   7.29G  19.5MB/s    in 48s     

2024-05-07 01:06:32 (18.6 MB/s) - ‘GSE174188_CLUES1_adjusted.h5ad.gz’ saved [7833149210]



In [9]:
!gunzip GSE174188_CLUES1_adjusted.h5ad.gz

In [3]:
cl = sc.read_h5ad('./raw/GSE174188_CLUES1_adjusted.h5ad')
cl = cl[cl.obs['SLE_status'] == 'Healthy']

In [4]:
cl

View of AnnData object with n_obs × n_vars = 486418 × 1999
    obs: 'batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov', 'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status', 'SLE_status'
    var: 'gene_ids'
    uns: 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [14]:
cl_raw = cl.raw.to_adata()
cl_raw

AnnData object with n_obs × n_vars = 486418 × 32738
    obs: 'batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov', 'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status', 'SLE_status'
    var: 'gene_ids', 'feature_types-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0'
    uns: 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_pca', 'X_umap'
    obsp: 'connectivities', 'distances'

> Since CLUES data were parsed after I started working with OneK1K, I bring it into line with OneK1K:

In [17]:
cl_raw.var.drop(columns='feature_types-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0', 
                inplace=True)

In [18]:
cl_new = sc.AnnData(X=cl_raw.X,
                    obs=cl_raw.obs[['batch_cov', 'ind_cov', 'ct_cov', 'Age', 'Sex', 'pop_cov']],
                    var=cl_raw.var)

In [22]:
cl_new.obs.rename(columns={'batch_cov': 'pool_number', 
                           'ind_cov': 'donor_id', 
                           'Age': 'age', 
                           'Sex': 'sex', 
                           'pop_cov': 'ethnicity'}, inplace=True)
cl_new.obs['dataset'] = 'CLUES'

In [23]:
for id in cl_new.obs['donor_id'].unique():
    child = cl_new[cl_new.obs['donor_id'] == id]
    age = child.obs['age'].unique()[0]
    sex = child.obs['sex'].unique()[0]
    pool = child.obs['pool_number'].unique()[0]
    filename = f'CLUES-{pool}-{id}-{float(age):.0f}-{sex}.h5ad'
    if float(age) >= 65:
        child.write(f'./single_matrices/65+/{filename}')
    else: 
        child.write(f'./single_matrices/{filename}')

In [24]:
# let's check:
test = sc.read_h5ad('./single_matrices/CLUES-dmx_count_AH7TNHDMXX_YE_8-30-IGTB1793_IGTB1793-25-Female.h5ad')
test

AnnData object with n_obs × n_vars = 4152 × 32738
    obs: 'pool_number', 'donor_id', 'ct_cov', 'age', 'sex', 'ethnicity', 'dataset'
    var: 'gene_ids'

In [25]:
test.obs

Unnamed: 0,pool_number,donor_id,ct_cov,age,sex,ethnicity,dataset
CGTTCTGGTTTGTTTC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,CytoT_GZMK+,25.0,Female,European,CLUES
ACAGCCGCATTCACTT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,T8_naive,25.0,Female,European,CLUES
GTGCAGCGTTCCATGA-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,NK_dim,25.0,Female,European,CLUES
CAGCATAAGAGGGCTT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,,25.0,Female,European,CLUES
AAACCTGTCTTGCAAG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,T_mait,25.0,Female,European,CLUES
...,...,...,...,...,...,...,...
ATCGAGTCAATCACAC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,B_mem,25.0,Female,European,CLUES
CGAACATTCTGAAAGA-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,T4_naive,25.0,Female,European,CLUES
TACCTTAGTGATGTCT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,B_naive,25.0,Female,European,CLUES
TCAATCTCAAGTTAAG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0,dmx_count_AH7TNHDMXX_YE_8-30,IGTB1793_IGTB1793,T8_naive,25.0,Female,European,CLUES


In [26]:
len(os.listdir('./single_matrices/65+'))

576

In [27]:
len(os.listdir('./single_matrices/'))

507