# Download the scRNA data

In [1]:
data_dir = '/root/datos/maestria/netopaas/'
backup_dir = '/root/datos/maestria/netopaas/luca_explore/surgeries'

In [191]:
import numpy as np
import pandas as pd
import utils.functions as ut
import urllib

import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects

# Set the graphics device
robjects.r('options(bitmapType="cairo")')
import anndata2ri

# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


## Zuani 2024

### Download

In [119]:
zuani_dir = f'{data_dir}/Zuani2024'
zuani_samples = pd.read_csv(f'{zuani_dir}/E-MTAB-13526.sdrf.txt', sep='\t')

# Filter for non enriched MDSC = Myeloid derived supressor cells, CD235a- filters for erythrocites
zuani_samples

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[individual],Characteristics[original source name],Characteristics[age],Unit[time unit],Term Source REF,Term Accession Number,...,Derived Array Data File,Protocol REF.6,Protocol REF.7,Derived Array Data File.1,Protocol REF.8,Protocol REF.9,Derived Array Data File.2,Factor Value[disease],Factor Value[FACS],Factor Value[sampling site]
0,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
1,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
2,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
3,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
4,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1563,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1564,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1565,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor


We filter for non-immune enriched samples and only tumor tissue. All the non-tumor enirhced ones are filtered for no plasma cells (erythrocites CD235a-)
that is also the case in many of the other samples... and we can also enrich for imput ethe as they are only 1 of the many cell types.

In [120]:
facs_filter = ['CD235a-']
zuani_samples = zuani_samples[zuani_samples['Factor Value[FACS]'].isin(facs_filter)]

zuani_samples = zuani_samples[zuani_samples['Factor Value[sampling site]']=='tumor']

In [76]:
base_url = 'https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files'
zuani_patients = zuani_samples['Source Name'].unique()

urls_paths = []

for pat in zuani_patients:
    filename = f'{pat}-barcodes.tsv.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

    filename = f'{pat}-features.tsv.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

    filename = f'{pat}-matrix.mtx.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

ut.download_parallel(urls_paths, cpus=8)

CPUS:  8
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-features.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-matrix.mtx.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-features.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-matrix.mtx.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P15_T2-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P15_T2-features.tsv.gz
url: https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-features.tsv.gz time (s): 1.5921189785003662
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files

### Convert to adata

In [187]:
import os
import io
import csv
import gzip

import scanpy as sc
import anndata as ad

def concatenate_datasets(directory):
    """
    Concatenate multiple .mtx datasets from a single directory into a single AnnData object.
    
    Parameters:
    - directory (str): The directory containing .mtx, features.tsv, and barcodes.tsv files.
    
    Returns:
    - AnnData: The concatenated AnnData object.
    """
    adatas = {}
    features_file = ''
    
    file_pattern = sorted([f for f in os.listdir(directory) if 'matrix' in f])
    for mtx_file in file_pattern:
        idx = mtx_file.replace('matrix.mtx.gz', '')
        print(idx)
        features_file = f'{idx}features.tsv'
        
        # Load data and append to list
        adata = sc.read_10x_mtx(directory,prefix=idx)
        adatas[idx[:-1]] = adata
    
    # Concatenate all AnnData objects
    concatenated_adata = ad.concat(adatas, label='sample')

    gene_names = []
    with gzip.open(features_file, 'rb') as f:
        reader = csv.reader(io.TextIOWrapper(f, encoding='utf-8'), delimiter='\t')
        for row in reader:
            gene_names.append(row[1])
    concatenated_adata.var['gene_symbols'] = gene_names

    return concatenated_adata

# Example usage:
adata = concatenate_datasets(f'{zuani_dir}')
print(adata)

P15_T2-
P16_T2-
P17_T2-
P17_T3-


KeyboardInterrupt: 

In [188]:
concatenated_adata.write_h5ad('Zuani.h5ad')

## Deng 2024

In [193]:
urllib.request.urlretrieve('https://figshare.com/ndownloader/files/44695465',f'{data_dir}/Deng2024/dengluad.rds')

ModuleNotFoundError: No module named 'curl'

In [215]:
%%R -i data_dir
library(Seurat)
library(SeuratDisk)

deng <- readRDS(paste0(data_dir, '/Deng2024/dengluad.rds'))
deng = UpdateSeuratObject(deng)

# After P49 there is no metadata so better delete. Also the ones with N as they are healthy controls,
cells_to_exclude <- which(grepl("P05|N", deng@meta.data$orig.ident))
deng <- subset(deng, cells = setdiff(Cells(deng), cells_to_exclude))


SaveH5Seurat(deng, filename=paste0(data_dir, '/Deng2024/deng.h5Seurat'))
Convert(paste0(data_dir, '/Deng2024/deng.h5Seurat'), dest = "h5ad")
Convert(paste0(data_dir, '/Deng2024/deng.h5Seurat'), dest = paste0(data_dir, '/Deng2024/dengRNA.h5ad'), assay = 'RNA')

Validating h5Seurat file
Adding data from RNA as X
Adding counts from RNA as raw
Transfering meta.data to obs
Adding dimensional reduction information for tsne (global)
Adding dimensional reduction information for umap (global)


In [None]:
! rm {data_dir}/Deng2024/deng.h5Seurat

In [None]:
# adata = sc.read_h5ad(f'{data_dir}/Deng2024/deng.h5ad')
adata_raw = sc.read_h5ad(f'{data_dir}/Deng2024/dengRNA.h5ad')

# After P49 there is no metadata so better delete. Also the ones with N as they are healthy controls,
# Should be done before in R to be faster
# adata = adata[np.logical_not(adata.obs['orig.ident'].str.contains('P05'))]
# adata = adata[np.logical_not(adata.obs['orig.ident'].str.contains('N'))]

# The assignment fo slots and assays is not very good with Convert from SeuratDisk so we make some arrangements
adata.layers['data'] = adata.raw.X.copy()
del adata.raw
adata.layers['scale.data'] = adata.X

adata_raw.layers['counts'] = adata_raw.raw.X.copy()
del adata_raw.raw
adata_raw.layers['data'] = adata_raw.X

adata.raw = adata_raw.copy()

del adata_raw

path_metadata = '/root/datos/maestria/netopaas/Zuani2024/metadata.xlsx'
if not os.path.exists(path_metadata):
    urllib.request.urlretrieve('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11031428/bin/mmc2.xlsx', path_metadata)
xl_zuani = pd.ExcelFile(path_metadata)
metada_zuani = xl_zuani.parse(' lung cancer in scRNAseq')
adata.obs = pd.merge(adata.obs, metada_zuani, how='left', left_on='orig.ident', right_on='Patient Number')


adata.write_h5ad(f'{backup_dir}/filtered_deng.h5ad')

## Alexandra 2023

In [77]:
urllib.request.urlretrieve('https://zenodo.org/records/7852154/files/data_code.zip?download=1',f'{data_dir}/Alexandra2023/data.zip')

('/root/datos/maestria/netopaas//alexandra2023/data.zip',
 <http.client.HTTPMessage at 0x7f7325a5cf10>)

## Zhong 2024


In [79]:
zhong_samples = pd.read_csv(f'{data_dir}/Zhong2024/GSE241934_IIT_Meta.txt', sep='\t')
zhong_samples

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,sampleID,cellID,RNA_snn_res.0.8,seurat_clusters,major.cell.type,RNA_snn_res.1,...,Gender,Age,Histology,Cycles,PD1,Pathological Response,EGFR,Pathological Response Rate,Smoking_History,PD-L1 TPS
0,P343,17606,4574,2.891060,P343,P343_AAACCTGAGCTATGCT-1,6,6,Fibro,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
1,P343,9542,3230,4.003354,P343,P343_AAACCTGCATGATCCA-1,8,8,Fibro,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
2,P343,10729,3832,2.302172,P343,P343_AAACCTGGTGAGTATA-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
3,P343,7788,2859,5.225989,P343,P343_AAACCTGTCCAGTATG-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
4,P343,15449,4074,3.301185,P343,P343_AAACGGGCACGACGAA-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78686,P591,6608,2528,3.344431,P591,P591_TGCCCTAGTCAGATAA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78687,P591,1064,707,7.518797,P591,P591_TGGCTGGAGCGTTGCC-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78688,P591,13226,3318,1.126569,P591,P591_TTCTCCTAGTGCGTGA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78689,P591,7335,2912,6.693933,P591,P591_TTTCCTCAGCGTAATA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
