In [None]:
import os
import scanpy as sc
import pandas as pd
import anndata

Set directory variables for the samples

In [None]:
def set_directories(patient_id, base_dir):
    """
    Build the lookup of input directories for one patient sample.

    Parameters:
    - patient_id (str): Sample path prefix, relative to base_dir. The gene
      expression folder is nested inside it; the dextramer, TCR and hashing
      folders are siblings whose names start with this prefix.
    - base_dir (str): Root directory that every sample path is joined onto.

    Returns:
    - dict: Paths stored under the keys "dir_gex", "dir_dex", "dir_TCR" and
      "dir_CITE", in that order.
    """
    # Text appended directly to patient_id for each data modality. Only the
    # gene-expression entry begins with "/", i.e. lives inside the sample folder.
    relative_tails = {
        "dir_gex": "/CellRangerGex_results",
        "dir_dex": "_dextramer_count/umi_count",
        "dir_TCR": "_TCR_VDJ/CellRangerVdj_results",
        "dir_CITE": "_hash_count/umi_count",
    }
    return {
        key: os.path.join(base_dir, f"{patient_id}{tail}")
        for key, tail in relative_tails.items()
    }

# Define base directory and sample path prefixes.
# The default is the original author's local Desktop folder. To run the
# notebook on another machine, set the DEXTRAMER_BASE_DIR environment variable
# to the folder that contains the batch directories instead of editing this cell.
base_dir = os.environ.get("DEXTRAMER_BASE_DIR", "/Users/ecrosse/Desktop/")

# Set directories for each patient. The first argument is the batch folder plus
# the sample name; set_directories appends the per-modality suffixes to it.
dirs_SRSF2_9 = set_directories("data_for_edie_third_batch_january/WJK-2859_SRSF2_9", base_dir)
dirs_SRSF2_10 = set_directories("dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10", base_dir)

# Print to verify the directory structure
print(dirs_SRSF2_9)
print(dirs_SRSF2_10)


{'dir_gex': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9/CellRangerGex_results', 'dir_dex': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_dextramer_count/umi_count', 'dir_TCR': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_TCR_VDJ/CellRangerVdj_results', 'dir_CITE': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_hash_count/umi_count'}
{'dir_gex': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10/CellRangerGex_results', 'dir_dex': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_dextramer_count/umi_count', 'dir_TCR': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_TCR_VDJ/CellRangerVdj_results', 'dir_CITE': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_hash_count/umi_count'}


Define functions to load and integrate the expression and CITE-seq data

In [None]:
# Load expression data
def load_expression_data(data_dir, project_name="dextramer_pilot"):
    """
    Read a Cell Ranger gene-expression matrix into an AnnData object.

    Parameters:
    - data_dir (str): Directory that holds the "filtered_feature_bc_matrix" folder.
    - project_name (str): Label recorded under .uns['project_name'] of the result.

    Returns:
    - anndata.AnnData: The loaded matrix, with variables named by gene symbol
      and the project label stored in .uns.
    """
    gex = sc.read_10x_mtx(
        os.path.join(data_dir, "filtered_feature_bc_matrix"),
        var_names='gene_symbols',
        cache=True,
    )

    # Keep the project label alongside the data so it travels with the object
    gex.uns['project_name'] = project_name
    return gex

# Load and integrate CITE data

def load_cite_data(data_dir, adata):
    """
    Load CITE-seq data from 10X and integrate it into an existing AnnData object.

    The CITE-seq counts are attached as ``.obsm["CITE"]`` (a cells x features
    DataFrame whose columns are the CITE-seq feature names). They are not stored
    as a layer because AnnData layers must have the same shape as ``.X``
    (cells x genes), and a CITE-seq matrix has its own, different feature set.

    Parameters:
    - data_dir (str): Directory containing the CITE-seq data.
    - adata (anndata.AnnData): The RNA expression AnnData object.

    Returns:
    - anndata.AnnData: Copy of ``adata`` restricted to the barcodes present in
      both datasets, with the CITE-seq counts available in ``.obsm["CITE"]``.

    Raises:
    - ValueError: If the RNA and CITE-seq data share no barcodes.
    """
    # Load CITE-seq data
    cite_adata = sc.read_10x_mtx(data_dir, var_names='gene_symbols', cache=True)

    # Append "-1" to barcodes (to match Cell Ranger format)
    cite_adata.obs_names = [f"{barcode}-1" for barcode in cite_adata.obs_names]

    # Find common barcodes
    common_barcodes = adata.obs_names.intersection(cite_adata.obs_names)
    if len(common_barcodes) == 0:
        # Fail early with an actionable message instead of an opaque shape error
        raise ValueError(
            "No barcodes are shared between the RNA and CITE-seq data; "
            "check the barcode suffix and that both come from the same sample."
        )

    # Subset both datasets to the same barcodes, in the same order, so that
    # row i of the CITE-seq matrix describes the same cell as row i of adata_common
    adata_common = adata[common_barcodes].copy()
    cite_adata_common = cite_adata[common_barcodes].copy()

    # Densify if the matrix is sparse so it can be wrapped in a labelled DataFrame
    cite_counts = cite_adata_common.X
    if hasattr(cite_counts, "toarray"):
        cite_counts = cite_counts.toarray()

    # Per-cell annotation with its own feature axis belongs in .obsm, not .layers
    adata_common.obsm["CITE"] = pd.DataFrame(
        cite_counts,
        index=adata_common.obs_names,
        columns=cite_adata_common.var_names,
    )

    return adata_common


Load the data

In [None]:
# Build the AnnData object for sample SRSF2_9 in a single assignment: the gene
# expression matrix is loaded first, then handed to load_cite_data, which
# returns it restricted to the barcodes shared with the CITE-seq data.
adata = load_cite_data(
    dirs_SRSF2_9["dir_CITE"],
    load_expression_data(dirs_SRSF2_9["dir_gex"]),
)

Data QC and filtering