# Download the scRNA data

In [1]:
data_dir = '/root/datos/maestria/netopaas/'
backup_dir = '/root/datos/maestria/netopaas/luca_explore/surgeries'

In [2]:
import os

import numpy as np
import pandas as pd
import utils.functions as ut
import urllib

import scanpy as sc

import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects

# Set the graphics device
robjects.r('options(bitmapType="cairo")')
import anndata2ri

# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


## Zuani 2024

### Download

In [3]:
data_dir

'/root/datos/maestria/netopaas/'

In [6]:
! ls {data_dir}/Zuani2024

E-MTAB-13526.idf.txt	P17_T2-barcodes.tsv.gz	P22_T1-matrix.mtx.gz
E-MTAB-13526.sdrf.txt	P17_T2-features.tsv.gz	P23_T1-barcodes.tsv.gz
EmptyDrop_P15_T2-.sh	P17_T2-matrix.mtx.gz	P23_T1-features.tsv.gz
EmptyDrop_P16_T2-.sh	P17_T3-barcodes.tsv.gz	P23_T1-matrix.mtx.gz
EmptyDrop_P17_T2-.sh	P17_T3-features.tsv.gz	P24_T1-barcodes.tsv.gz
EmptyDrop_P17_T3-.sh	P17_T3-matrix.mtx.gz	P24_T1-features.tsv.gz
EmptyDrop_P18_T2-.sh	P18_T2-barcodes.tsv.gz	P24_T1-matrix.mtx.gz
EmptyDrop_P19_T2-.sh	P18_T2-features.tsv.gz	P4_T2-barcodes.tsv.gz
EmptyDrop_P20_T2-.sh	P18_T2-matrix.mtx.gz	P4_T2-features.tsv.gz
EmptyDrop_P21_T1-.sh	P19_T2-barcodes.tsv.gz	P4_T2-matrix.mtx.gz
EmptyDrop_P21_T2-.sh	P19_T2-features.tsv.gz	P4_T3-barcodes.tsv.gz
EmptyDrop_P22_T1-.sh	P19_T2-matrix.mtx.gz	P4_T3-features.tsv.gz
EmptyDrop_P23_T1-.sh	P20_T2-barcodes.tsv.gz	P4_T3-matrix.mtx.gz
EmptyDrop_P24_T1-.sh	P20_T2-features.tsv.gz	P8_T2-barcodes.tsv.gz
EmptyDrop_P4_T2-.sh	P20_T2-matrix.mtx.gz	P8_T2-features.tsv.gz
EmptyDrop_P4_T3-.sh	P21

In [28]:
zuani_dir = f'{data_dir}/Zuani2024'
zuani_samples = pd.read_csv(f'{zuani_dir}/E-MTAB-13526.sdrf.txt', sep='\t')

# Filter for non enriched MDSC = Myeloid derived supressor cells, CD235a- filters for erythrocites
zuani_samples

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[individual],Characteristics[original source name],Characteristics[age],Unit[time unit],Term Source REF,Term Accession Number,...,Derived Array Data File,Protocol REF.6,Protocol REF.7,Derived Array Data File.1,Protocol REF.8,Protocol REF.9,Derived Array Data File.2,Factor Value[disease],Factor Value[FACS],Factor Value[sampling site]
0,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
1,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
2,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
3,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
4,P1_B1,ERS16703054,SAMEA114591031,Homo sapiens,Patient 1,TB18.0655,73,year,EFO,UO_0000036,...,P1_B1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P1_B1-features.tsv.gz,non-small cell lung cancer,MDSC,normal tissue adjacent to tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1563,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1564,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor
1565,P18_T1,ERS16703118,SAMEA114591092,Homo sapiens,Patient 18,TB21.0006,77,year,EFO,UO_0000036,...,P18_T1-matrix.mtx.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-barcodes.tsv.gz,P-MTAB-137970,P-MTAB-137971,P18_T1-features.tsv.gz,lung squamous cell carcinoma,CD45+,tumor


We filter for non-immune enriched samples and only tumor tissue. All the non-tumor enirhced ones are filtered for no plasma cells (erythrocites CD235a-)
that is also the case in many of the other samples... we can consider the eryhrocites enriched depleted, for they are only one cell type that is not very present. But we should impute some abundance values in these samples.

In [5]:
facs_filter = ['CD235a-']
zuani_samples = zuani_samples[zuani_samples['Factor Value[FACS]'].isin(facs_filter)]

zuani_samples = zuani_samples[zuani_samples['Factor Value[sampling site]']=='tumor']

In [7]:
zuani_samples.columns

Index(['Source Name', 'Comment[ENA_SAMPLE]', 'Comment[BioSD_SAMPLE]',
       'Characteristics[organism]', 'Characteristics[individual]',
       'Characteristics[original source name]', 'Characteristics[age]',
       'Unit[time unit]', 'Term Source REF', 'Term Accession Number',
       'Characteristics[developmental stage]', 'Characteristics[sex]',
       'Characteristics[organism part]', 'Characteristics[disease]',
       'Characteristics[FACS]', 'Characteristics[sampling site]',
       'Characteristics[tumor grading]', 'Description', 'Material Type',
       'Protocol REF', 'Protocol REF.1', 'Protocol REF.2', 'Extract Name',
       'Comment[LIBRARY_LAYOUT]', 'Comment[LIBRARY_SELECTION]',
       'Comment[LIBRARY_SOURCE]', 'Comment[LIBRARY_STRATEGY]',
       'Comment[cdna read]', 'Comment[cdna read offset]',
       'Comment[cdna read size]', 'Comment[cell barcode offset]',
       'Comment[cell barcode read]', 'Comment[cell barcode size]',
       'Comment[end bias]', 'Comment[input molecu

In [11]:
zuani_samples['Characteristics[tumor grading]'].unique()

array(['T3N0M0', 'T2bN0M0', nan, 'T2aN0M0', 'T4N2M0', 'T2aN1M0', 'T3M0N0',
       'T2bN1M0', 'T1cN0M0', 'T4N0'], dtype=object)

In [76]:
base_url = 'https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files'
zuani_patients = zuani_samples['Source Name'].unique()

urls_paths = []

for pat in zuani_patients:
    filename = f'{pat}-barcodes.tsv.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

    filename = f'{pat}-features.tsv.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

    filename = f'{pat}-matrix.mtx.gz'
    urls_paths.append((f'{base_url}/{filename}',f'{zuani_dir}/{filename}'))

ut.download_parallel(urls_paths, cpus=8)

CPUS:  8
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-features.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P1_T1-matrix.mtx.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-features.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-matrix.mtx.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P15_T2-barcodes.tsv.gz
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P15_T2-features.tsv.gz
url: https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/P2_T1-features.tsv.gz time (s): 1.5921189785003662
getting https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files

The authors from Zuani alos offer an nanotated dataset:

In [10]:
%%bash
cd /root/datos/maestria/netopaas/Zuani2024/
wget https://www.ebi.ac.uk/biostudies/files/E-MTAB-13526/10X_Lung_Tumour_Annotated_v2.h5ad




Connected to hl-xfer-public.ebi.ac.uk.
220-Welcome to ftp.ebi.ac.uk
220 
230 Login successful.
Remote system type is UNIX.
Using binary mode to transfer files.
250 Directory successfully changed.
200 Switching to Binary mode.
local: 10X_Lung_Healthy_Background_Annotated_v2.h5ad remote: 10X_Lung_Healthy_Background_Annotated_v2.h5ad
229 Entering Extended Passive Mode (|||7072|)
150 Opening BINARY mode data connection for 10X_Lung_Healthy_Background_Annotated_v2.h5ad (45505872062 bytes).

receive aborted. Waiting for remote to finish abort.


### Remove empty drops

In [23]:
from os import system as sys
import os

codeDir = '/root/host_home/luca/utils/' # CHANGE ME TO THE DIRECTORY WHERE run_EmptyDrop.R IS CLONED
directory = '/root/datos/maestria/netopaas/Zuani2024/' # change me to the dir where the matrix files are stored

# All CellRanger samples for the lung data
samples = []

file_pattern = sorted([f for f in os.listdir(directory) if 'matrix' in f])
for mtx_file in file_pattern:
    idx = mtx_file.replace('matrix.mtx.gz', '')
    samples.append(idx)

for sample in samples:
    fname = directory+"/EmptyDrop_"+sample+".sh"
    f = open(f'{fname}', 'w')
    f.write("cd /root/datos/maestria/netopaas/Zuani2024/\n") # CHANGE ME TO A PLACE YOU WOULD LIKE TO WRITE OUTPUTS TO
    # f.write("conda activate minimal_env\n" )
    f.write("Rscript " + codeDir + "run_EmptyDrop.R {0}\n".format(str(sample)))
    f.close()

    # Example batch calling on the codon LSF cluster (this will vary depending on the system, bsub vs. qsub etc)
    print(f'bash {fname}')
    sys(f'bash {fname}')

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P16_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P16_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P16_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P17_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P17_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P17_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P17_T3-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P17_T3-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P17_T3-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P18_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P18_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P18_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P19_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P19_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P19_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P20_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P20_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P20_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P21_T1-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P21_T1-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P21_T1-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P21_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P21_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P21_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P22_T1-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P22_T1-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P22_T1-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P23_T1-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P23_T1-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P23_T1-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P24_T1-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P24_T1-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P24_T1-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P4_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P4_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P4_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P4_T3-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P4_T3-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P4_T3-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

bash /root/datos/maestria/netopaas/Zuani2024//EmptyDrop_P8_T2-.sh
[1] "/root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P8_T2-"
[1] "Creating folder  /root/datos/maestria/netopaas/Zuani2024//outputEmptyDrops/P8_T2-"


Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

### Convert to adata

In [33]:
import os
import io
import csv
import gzip

import scanpy as sc
import anndata as ad

def concatenate_datasets(directory):
    """
    Concatenate multiple .mtx datasets from a single directory into a single AnnData object.
    
    Parameters:
    - directory (str): The directory containing .mtx, features.tsv, and barcodes.tsv files.
    
    Returns:
    - AnnData: The concatenated AnnData object.
    """
    adatas = {}
    features_file = ''
    
    # List all subdirectories in the main directory
    subdirectories = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
    
    for subdirectory in subdirectories:
        # Construct the path to the subdirectory
        subdirectory_path = os.path.join(directory, subdirectory)
        
        # Read the .mtx file and associated files from the subdirectory
        adata = sc.read_10x_mtx(subdirectory_path, var_names='gene_symbols', cache=True)
        
        # Use the subdirectory name as the identifier
        idx = subdirectory
        
        # Store the AnnData object in the dictionary
        adatas[idx] = adata
    
    # Concatenate all AnnData objects, labeling them by the subdirectory name which is used as the sample identifier
    concatenated_adata = ad.concat(adatas, label='sample')

    return concatenated_adata

# Example usage:
adata = concatenate_datasets(f'{zuani_dir}/outputEmptyDrops')
print(adata)



AnnData object with n_obs × n_vars = 318313 × 33538
    obs: 'sample'


  utils.warn_names_duplicates("obs")


In [3]:
import anndata as ad
adata2 = ad.read_h5ad('/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/full_atlas_hvg_integrated_scvi_integrated_scanvi.h5ad')

In [8]:
adata2.obs.cell_type_tumor.cat.categories

Index(['Alveolar cell type 1', 'Alveolar cell type 2', 'B cell',
       'B cell dividing', 'Ciliated', 'Club', 'DC mature',
       'Endothelial cell arterial', 'Endothelial cell capillary',
       'Endothelial cell lymphatic', 'Endothelial cell venous',
       'Fibroblast adventitial', 'Fibroblast alveolar',
       'Fibroblast peribronchial', 'Macrophage', 'Macrophage alveolar',
       'Mast cell', 'Mesothelial', 'Monocyte classical',
       'Monocyte non-classical', 'NK cell', 'NK cell dividing', 'Neutrophils',
       'Pericyte', 'Plasma cell', 'Plasma cell dividing',
       'ROS1+ healthy epithelial', 'Smooth muscle cell', 'T cell CD4',
       'T cell CD4 dividing', 'T cell CD8 activated', 'T cell CD8 dividing',
       'T cell CD8 effector memory', 'T cell CD8 naive',
       'T cell CD8 terminally exhausted', 'T cell NK-like',
       'T cell regulatory', 'Tumor cells LUAD', 'Tumor cells LUAD EMT',
       'Tumor cells LUAD MSLN', 'Tumor cells LUAD NE',
       'Tumor cells LUAD mitotic

In [36]:
adata.write_h5ad(f'{zuani_dir}/Zuani2024.h5ad')

In [63]:
adata = sc.read_h5ad('/root/datos/maestria/netopaas/Zuani2024/Zuani.h5ad')

  utils.warn_names_duplicates("obs")


Before emptydrops with FDR 0.001

In [5]:
adata

AnnData object with n_obs × n_vars = 101923200 × 33538
    obs: 'sample'
    var: 'gene_symbols'

after EmptyDrops

In [116]:
adata

AnnData object with n_obs × n_vars = 312502 × 33538
    obs: 'sample', 'barcode', 'Source Name', 'Characteristics[tumor grading]'

In [94]:
adata.obs['sample'] = adata.obs['sample'].str[:-1]
zuani_samples_unique = zuani_samples.drop_duplicates(subset=['Source Name'], keep='first')
adata.obs = pd.merge(adata.obs, zuani_samples_unique.loc[:, ['Source Name', 'Characteristics[tumor grading]']],
                            left_on='sample', right_on='Source Name', how='left')

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [100]:
import re
def map_tnm_to_roman(tnm_stage):
    stage_mappings = {
        r"^TisN0M0$": "0",
        r"^T1[ab]?N0M0$": "I",
        r"^T2[ab]?N0M0$": "II",
        r"^T2[ab]?N0/1$": "II",
        r"^T1[ab]?N1M0$": "II",
        r"^T2[ab]?N1M0$": "II",
        r"^T3N[01]M0$": "II",
        r"^T1[ab]?N2M0$": "III",
        r"^T2[ab]?N2M0$": "III",
        r"^T3N2M0$": "III",
        r"^T4N[012]M0$": "III",
        r"^T4N0$": "III",
        r"^T[1-4][ab]?N3M0$": "III",
        r"^T[1-4][ab]?N[0-3]M1[abc]$": "IV",
    }
    
    for pattern, stage in stage_mappings.items():
        if re.match(pattern, tnm_stage):
            return stage
    return "Unknown stage"

Remove the patients with no tumor grading

In [111]:
adata = adata[~adata.obs['Characteristics[tumor grading]'].isna()]

In [None]:
adata.obs['stage'] = adata.obs['Characteristics[tumor grading]'].apply(map_tnm_to_roman)
# adata.obs.stage

In [115]:
adata.write_h5ad(f'{zuani_dir}/Zuani2024.h5ad')

  df[key] = c
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  df[key] = c
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  df[key] = c
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  df[key] = c
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


## Deng 2024

In [None]:
# urllib.request.urlretrieve('https://figshare.com/ndownloader/files/44695465',f'{data_dir}/Deng2024/dengluad.rds')
urllib.request.urlretrieve('https://figshare.com/ndownloader/files/46617667',f'{data_dir}/Deng2024/dengEcoTyper.rds')


In [25]:
# ! rm /root/datos/maestria/netopaas/Deng2024/deng.h5Seurat
# ! rm /root/datos/maestria/netopaas/Deng2024/deng.h5ad
# ! rm /root/datos/maestria/netopaas/Deng2024/dengRNA.h5ad

In [26]:
%%R -i data_dir
library(Seurat)
library(SeuratDisk)

deng <- readRDS(paste0(data_dir, '/Deng2024/dengEcoTyper.rds'))
deng = UpdateSeuratObject(deng)

# After P49 there is no metadata so better delete. Also the ones with N as they are healthy controls,
cells_to_exclude <- Cells(deng)[grepl("P05|N", deng@meta.data$orig.ident)]
deng <- subset(deng, cells = setdiff(Cells(deng), cells_to_exclude))


SaveH5Seurat(deng, filename=paste0(data_dir, '/Deng2024/deng.h5Seurat'))
Convert(paste0(data_dir, '/Deng2024/deng.h5Seurat'), dest = "h5ad")
Convert(paste0(data_dir, '/Deng2024/deng.h5Seurat'), dest = paste0(data_dir, '/Deng2024/dengRNA.h5ad'), assay = 'RNA')

Validating object structure
Updating object slots
Ensuring keys are in the proper structure
Updating matrix keys for DimReduc ‘pca’
Updating matrix keys for DimReduc ‘umap’
Updating matrix keys for DimReduc ‘tsne’
Ensuring keys are in the proper structure
Ensuring feature names don't have underscores or pipes
Updating slots in RNA
Updating slots in integrated
Updating slots in integrated_nn
Setting default assay of integrated_nn to integrated
Updating slots in integrated_snn
Setting default assay of integrated_snn to integrated
Updating slots in pca
Updating slots in umap
Setting umap DimReduc to global
Updating slots in tsne
Setting tsne DimReduc to global
No assay information could be found for FindIntegrationAnchors
No assay information could be found for IntegrateData
Setting assay used for ScaleData.integrated to integrated
Setting assay used for RunPCA.integrated to integrated
Setting assay used for RunUMAP.integrated.pca to integrated
No assay information could be found for RunT

In [27]:
%%R
unique(deng@meta.data$orig.ident)

 [1] "P001" "P002" "P003" "P004" "P005" "P006" "P007" "P008" "P009" "P010"
[11] "P011" "P012" "P013" "P014" "P015" "P016" "P017" "P018" "P019" "P020"
[21] "P021" "P022" "P023" "P024" "P025" "P026" "P027" "P028" "P029" "P030"
[31] "P031" "P032" "P033" "P034" "P035" "P036" "P037" "P038" "P039" "P040"
[41] "P041" "P042" "P043" "P044" "P045" "P046" "P047" "P048" "P049"


In [28]:
adata_integ = sc.read_h5ad(f'{data_dir}/Deng2024/deng.h5ad')
adata = sc.read_h5ad(f'{data_dir}/Deng2024/dengRNA.h5ad')

# The assignment fo slots and assays is not very good with Convert from SeuratDisk so we make some arrangements
# We also arrange the X so that it is like the other filtered datasets in the repo
adata.layers['data'] = adata.X.copy()
adata.X = adata.raw.X

adata.raw = adata_integ

path_metadata = f'{data_dir}/Deng2024/metadata.xlsx'
if not os.path.exists(path_metadata):
    urllib.request.urlretrieve('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11031428/bin/mmc2.xlsx', path_metadata)
xl_deng = pd.ExcelFile(path_metadata)
metada_deng = xl_deng.parse(' lung cancer in scRNAseq')
adata.obs = pd.merge(adata.obs, metada_deng, how='left', left_on='orig.ident', right_on='Patient Number')


adata.write_h5ad(f'{backup_dir}/filtered_Deng_Liu_LUAD_2024.h5ad')

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [29]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,assigned_cell_type,stage,Ecotype,ecoRecovery,Patient Number,Group,...,Current or former smoking,Cancer history,Tumor location,Tumor size（cm）,Pathology,Histological subtype,T stage,N stage,M stage,Pathological stage
0,P001,4348.0,2099,12.531686,5,1,unassigned,unassigned,P001,pGGO,...,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
1,P001,5701.0,2614,4.821479,0,1,unassigned,CD8.T.cells_S04,P001,pGGO,...,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
2,P001,5636.0,2489,2.784740,0,1,unassigned,CD4.T.cells_S04,P001,pGGO,...,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
3,P001,5991.0,2591,6.061365,0,1,unassigned,CD4.T.cells_S05,P001,pGGO,...,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
4,P001,3464.0,1625,9.162717,5,1,unassigned,Mast.cells_S01,P001,pGGO,...,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421706,P049,1497.0,723,0.407106,1,9,unassigned,unassigned,P049,SolidN,...,No,No,RUL,1.6,ADC,Acinar predominant,T1b,N2,M0,IIIA
421707,P049,7093.0,541,0.027082,4,9,unassigned,unassigned,P049,SolidN,...,No,No,RUL,1.6,ADC,Acinar predominant,T1b,N2,M0,IIIA
421708,P049,5584.0,2466,5.115492,4,9,unassigned,unassigned,P049,SolidN,...,No,No,RUL,1.6,ADC,Acinar predominant,T1b,N2,M0,IIIA
421709,P049,5451.0,2348,9.361350,0,9,CD8.T.cells_S04,CD8.T.cells_S04,P049,SolidN,...,No,No,RUL,1.6,ADC,Acinar predominant,T1b,N2,M0,IIIA


In [19]:
np.max(adata.obs['nCount_RNA'])

139394.0

In [5]:
deng_meta = pd.read_excel('/root/datos/maestria/netopaas/Deng2024/metadata.xlsx')
deng_meta

Unnamed: 0,Patient Number,Group,Sex,Age （years）,Current or former smoking,Cancer history,Tumor location,Tumor size（cm）,Pathology,Histological subtype,T stage,N stage,M stage,Pathological stage
0,P001,pGGO,Female,61.0,No,No,LUL,0.9,ADC,MIA,T1a,N0,M0,IA1
1,P002,pGGO,Female,51.0,No,Yes,LUL,3.8,ADC,Lepidic+Acinar,T2a,N0,M0,IB
2,P003,pGGO,Female,55.0,No,No,RUL,1.1,ADC,Unkown,T1a,N0,M0,IA2
3,P004,dGGO,Female,63.0,No,No,RUL,1.1,ADC,MIA,T1a,N0,M0,IA1
4,P005,pGGO,Female,42.0,No,No,RLL,1.0,ADC,Lepidic+Acinar,T1b,N0,M0,IA2
5,P006,pGGO,Female,75.0,No,No,RUL,1.5,ADC,Lepidic predominant,T1c,N0,M0,IA3
6,P007,GGO25,Female,63.0,No,No,RUL,1.4,ADC,Lepidic predominant,T1b,N0,M0,IA2
7,P008,GGO25,Female,60.0,No,No,RUL,1.7,ADC,Lepidic+Acinar,T1b,N0,M0,IA2
8,P009,GGO25,Female,77.0,No,No,RUL,2.5,ADC,Lepidic predominant,T2a,N0,M0,IB
9,P010,GGO25,Male,49.0,No,No,RUL,1.4,ADC,MIA,T1b,N0,M0,IA2


In [21]:
adata.obs.ecoRecovery.unique()

array(['unassigned', 'CD8.T.cells_S04', 'CD4.T.cells_S04',
       'CD4.T.cells_S05', 'Mast.cells_S01', 'CD8.T.cells_S03',
       'NK.cells_S04', 'CD4.T.cells_S03', 'CD8.T.cells_S01',
       'B.cells_S02', 'CD4.T.cells_S01', 'Monocytes.and.Macrophages_S05',
       'NK.cells_S02', 'NK.cells_S01', 'CD4.T.cells_S02',
       'CD8.T.cells_S02', 'NK.cells_S03', 'Dendritic.cells_S01',
       'PMNs_S01', 'Monocytes.and.Macrophages_S04', 'B.cells_S01',
       'Monocytes.and.Macrophages_S02', 'B.cells_S03',
       'Monocytes.and.Macrophages_S03', 'PMNs_S03', 'Dendritic.cells_S02',
       'Mast.cells_S02', 'Monocytes.and.Macrophages_S01',
       'Epithelial.cells_S02', 'PMNs_S02', 'Fibroblasts_S03',
       'Endothelial.cells_S02', 'Epithelial.cells_S03', 'Fibroblasts_S01',
       'Endothelial.cells_S01', 'Endothelial.cells_S04',
       'Fibroblasts_S04', 'Epithelial.cells_S01', 'Endothelial.cells_S05',
       'Mast.cells_S03', 'Mast.cells_S04', 'Fibroblasts_S02',
       'Endothelial.cells_S03'], d

## Alexandra 2023

In [77]:
urllib.request.urlretrieve('https://zenodo.org/records/7852154/files/data_code.zip?download=1',f'{data_dir}/Alexandra2023/data.zip')

('/root/datos/maestria/netopaas//alexandra2023/data.zip',
 <http.client.HTTPMessage at 0x7f7325a5cf10>)

## Zhong 2024


In [79]:
zhong_samples = pd.read_csv(f'{data_dir}/Zhong2024/GSE241934_IIT_Meta.txt', sep='\t')
zhong_samples

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,sampleID,cellID,RNA_snn_res.0.8,seurat_clusters,major.cell.type,RNA_snn_res.1,...,Gender,Age,Histology,Cycles,PD1,Pathological Response,EGFR,Pathological Response Rate,Smoking_History,PD-L1 TPS
0,P343,17606,4574,2.891060,P343,P343_AAACCTGAGCTATGCT-1,6,6,Fibro,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
1,P343,9542,3230,4.003354,P343,P343_AAACCTGCATGATCCA-1,8,8,Fibro,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
2,P343,10729,3832,2.302172,P343,P343_AAACCTGGTGAGTATA-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
3,P343,7788,2859,5.225989,P343,P343_AAACCTGTCCAGTATG-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
4,P343,15449,4074,3.301185,P343,P343_AAACGGGCACGACGAA-1,10,10,Endo,,...,F,60,LUAD,3,Sintilimab,non-MPR,L858R,0.85,N,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78686,P591,6608,2528,3.344431,P591,P591_TGCCCTAGTCAGATAA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78687,P591,1064,707,7.518797,P591,P591_TGGCTGGAGCGTTGCC-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78688,P591,13226,3318,1.126569,P591,P591_TTCTCCTAGTGCGTGA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%
78689,P591,7335,2912,6.693933,P591,P591_TTTCCTCAGCGTAATA-1,7,7,Epi,,...,M,51,LUAD,3,Sintilimab,non-MPR,KDD,0.58,Y,<1%


## Thomas 2023

In [None]:
! cd /root/datos/maestria/netopaas/Thomas2023 & wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE153nnn/GSE153935/suppl/GSE153935%5FTLDS%5FAllCells%2Etxt%2Egz

In [4]:
thomas_mtx = pd.read_csv(f'{data_dir}/Thomas2023/GSE153935_TLDS_AllCells.txt.gz', sep='\t')
thomas_mtx

Unnamed: 0,D2T_GTTCAGCTTCTT,D2T_GCGCCCCATCAC,D2T_TGTCGGGGATCA,D2T_CCAATAATCTTA,D2T_CCTTTCCGTTTG,D2T_TCGCTGAATGTA,D2T_CAATACAACGAT,D2T_CTCTCCCTGTGG,D2T_GCTCGTGCTCTA,D2T_GATCCTTCTAGA,...,D4T_GCGCGGGATTTT,D12T_CACGCGTTTGTG,D12T_CGCCAACCACTG,D12T_GCTGACATATTT,D12T_TCCCTAAAGTGT,D12T_ACATGGGTAAGC,D12T_TGGTGATGGCCA,D12T_CACTATTCCTGA,D12T_TGGAGTATTGCA,D12T_TGTTCGAAGCTT
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,7,9,0,2,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hsa-mir-5195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-mir-6080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-mir-7515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-mir-8072,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
