In [18]:
import os
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata

Set directory variables for the samples

In [19]:
def set_directories(patient_id, base_dir):
    """
    Build the lookup of input directories for one sample.

    Every entry is `base_dir` joined with a sub-path that starts with `patient_id`.

    Parameters:
    - patient_id (str): Identifier used as the prefix of each sub-path. Callers in
      this notebook pass a relative path of the form "<batch folder>/<sample name>".
    - base_dir (str): Root directory that the sub-paths are joined onto.

    Returns:
    - dict: Keys "dir_gex", "dir_dex", "dir_TCR" and "dir_CITE", each mapped to the
      corresponding path string.
    """
    # (key, sub-path) pairs, kept in the order the keys appear in the result
    relative_paths = (
        ("dir_gex", f"{patient_id}/CellRangerGex_results"),
        ("dir_dex", f"{patient_id}_dextramer_count/umi_count"),
        ("dir_TCR", f"{patient_id}_TCR_VDJ/CellRangerVdj_results"),
        ("dir_CITE", f"{patient_id}_hash_count/umi_count"),
    )
    return {key: os.path.join(base_dir, rel_path) for key, rel_path in relative_paths}

# Root folder that holds every sequencing batch
base_dir = "/Users/ecrosse/Desktop/"

# Per-sample directory lookups; the first argument is the sample's path prefix
# relative to base_dir
dirs_SRSF2_9 = set_directories(
    patient_id="data_for_edie_third_batch_january/WJK-2859_SRSF2_9",
    base_dir=base_dir,
)
dirs_SRSF2_10 = set_directories(
    patient_id="dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10",
    base_dir=base_dir,
)

# Echo both lookups so the resolved paths can be checked by eye
print(dirs_SRSF2_9)
print(dirs_SRSF2_10)


{'dir_gex': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9/CellRangerGex_results', 'dir_dex': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_dextramer_count/umi_count', 'dir_TCR': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_TCR_VDJ/CellRangerVdj_results', 'dir_CITE': '/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_hash_count/umi_count'}
{'dir_gex': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10/CellRangerGex_results', 'dir_dex': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_dextramer_count/umi_count', 'dir_TCR': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_TCR_VDJ/CellRangerVdj_results', 'dir_CITE': '/Users/ecrosse/Desktop/dextramer_data_for_edie_january_part_2/WJK-2864_SRSF2_10_hash_count/umi_count'}


In [34]:
# Resolve the input folders for sample SRSF2_9 from its directory lookup
dir_gex = dirs_SRSF2_9["dir_gex"]
dir_dex = dirs_SRSF2_9["dir_dex"]
dir_CITE = dirs_SRSF2_9["dir_CITE"]

print(dir_gex)
print(dir_dex)
print(dir_CITE)

# Read the filtered feature-barcode matrix stored under the gene-expression folder
adata = sc.read_10x_h5(os.path.join(dir_gex, "filtered_feature_bc_matrix.h5"))

# scanpy warned about duplicated `var` names when this file was loaded (see the
# saved cell output). De-duplicate them so that name-based lookups on the
# variables are unambiguous.
adata.var_names_make_unique()


/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9/CellRangerGex_results
/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_dextramer_count/umi_count
/Users/ecrosse/Desktop/data_for_edie_third_batch_january/WJK-2859_SRSF2_9_hash_count/umi_count


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Load the data

In [4]:
# Load gene expression data
# NOTE(review): `load_expression_data` is not defined in the visible part of this
# notebook — confirm it is defined or imported before this cell on a fresh kernel.
adata = load_expression_data(dirs_SRSF2_9["dir_gex"])

# Load and integrate CITE-seq data
# NOTE(review): `load_cite_data` is likewise not defined in the visible cells. The
# saved output of this cell is `KeyError: 2`; which of the two calls raised it
# cannot be determined from here — TODO investigate.
adata = load_cite_data(dirs_SRSF2_9["dir_CITE"], adata)

KeyError: 2

Data QC and filtering

In [None]:
# Step 0: Make sure the per-cell QC columns used by the filter below exist.
# They are only computed when missing, so values supplied by an earlier cell win.
# NOTE(review): presumably the loader was meant to create these columns — confirm.
if "n_counts" not in adata.obs:
    adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
if "n_genes" not in adata.obs:
    adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Step 1: Calculate mitochondrial gene ratio
adata.var["mt_gene"] = adata.var_names.str.startswith("MT-")
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten both
# sums to 1-D so the ratio can be stored as an ordinary obs column.
mito_counts = np.asarray(adata[:, adata.var["mt_gene"].to_numpy()].X.sum(axis=1)).ravel()
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
# Cells with zero total counts yield NaN here and are dropped by the `<` filter below.
adata.obs["mitoRatio"] = mito_counts / total_counts

# Step 2: Subset data based on quality control thresholds
threshold_nCount_RNA = 500  # Minimum total counts per cell
threshold_nFeature_RNA = 200  # Minimum detected genes per cell
threshold_mito = 0.2  # Maximum (exclusive) mitochondrial fraction per cell

passes_qc = (
    (adata.obs["n_counts"] >= threshold_nCount_RNA)
    & (adata.obs["n_genes"] >= threshold_nFeature_RNA)
    & (adata.obs["mitoRatio"] < threshold_mito)
)
# Take an explicit copy: the steps below modify the object in place, which must
# not be done on a view of the unfiltered data.
adata = adata[passes_qc.to_numpy(), :].copy()

# Step 3: Identify highly variable genes.
# flavor="seurat_v3" expects raw counts, so this has to run BEFORE normalisation.
sc.pp.highly_variable_genes(adata, n_top_genes=4000, flavor="seurat_v3")

# Step 4: Remove genes whose symbol starts with "TR" from the highly variable set.
# NOTE(review): presumably aimed at T-cell receptor genes; the bare "TR" prefix
# also matches any other symbol that begins with TR — confirm this is intended.
adata.var["highly_variable"] = (
    adata.var["highly_variable"].to_numpy() & ~adata.var_names.str.startswith("TR")
)

# Step 5: Normalize data (library-size normalisation followed by log1p)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Step 6: Scale the data
sc.pp.scale(adata, max_value=10)  # Scale all genes

# Step 7: Run PCA
sc.tl.pca(adata)


CITE-seq analysis - defining the Dex+ population

In [None]:
# Normalize the CITE-seq layer: divide by the median taken along axis 0, then log1p.
# NOTE(review): this is a median-ratio transform rather than a centred log-ratio
# (CLR); wherever the median is 0 the division produces inf/NaN — confirm intended.
cite_counts = adata.layers["CITE"]
adata.layers["CITE"] = np.log1p(cite_counts / np.median(cite_counts, axis=0))

# Histogram-based density view of a CITE-seq layer
def generate_cite_density_plot(adata, assay_layer="CITE", title="CITE-seq Density Plot"):
    """
    Draw a density-normalised histogram of every value stored in a CITE-seq layer.

    Parameters:
    - adata: AnnData object holding the layer to plot.
    - assay_layer: Name of the layer in `adata.layers` to read.
    - title: Text used as the plot title.

    Returns:
    - The Matplotlib figure that contains the histogram.
    """
    # Pool all entries of the layer into one flat collection of values
    pooled_values = adata.layers[assay_layer].flatten()

    # `plt` is expected to be matplotlib.pyplot, supplied by the surrounding notebook
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(
        pooled_values,
        bins=50,
        density=True,
        alpha=0.6,
        color="blue",
    )
    ax.set(
        xlabel="CITE-seq Expression (CLR-normalized)",
        ylabel="Density",
        title=title,
    )

    return fig

# View the density plot
# NOTE(review): `plt` is not imported in the visible import cell — confirm that
# matplotlib.pyplot is imported as `plt` before this cell runs.
fig = generate_cite_density_plot(adata)
plt.show()

def apply_cite_threshold(adata, assay_layer="CITE", hash_threshold=1.5):
    """
    Apply a threshold to CITE-seq data to classify Dex+ populations.

    A cell is flagged Dex+ when at least one value in its row of the chosen layer
    is strictly greater than `hash_threshold`.

    Parameters:
    - adata: AnnData object containing the CITE-seq layer. It is modified in place.
    - assay_layer: Name of the layer in `adata.layers` holding the CITE-seq data.
      Dense arrays and SciPy sparse matrices are both accepted.
    - hash_threshold: Threshold value to define Dex+ populations.

    Returns:
    - The same AnnData object, with a new integer 'dex_positive' column in obs
      (1 for Dex+, 0 for Dex−).
    """
    cite_data = adata.layers[assay_layer]

    # Count, per cell, how many entries exceed the threshold. A row sum of the
    # boolean comparison is used instead of `.any(axis=1)` because it is available
    # on dense arrays and SciPy sparse matrices alike, without densifying the layer.
    # np.asarray(...).ravel() flattens the (n, 1) matrix that sparse sums return.
    n_above = np.asarray((cite_data > hash_threshold).sum(axis=1)).ravel()
    dex_positive = n_above > 0

    # Store the result in metadata
    adata.obs["dex_positive"] = dex_positive.astype(int)  # 1 for Dex+, 0 for Dex−

    return adata

# Apply threshold to classify Dex+ and Dex− cells (done once; the result is stored
# in adata.obs["dex_positive"])
adata = apply_cite_threshold(adata, assay_layer="CITE", hash_threshold=1.5)

# Check summary of Dex+ classifications. Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print(adata.obs["dex_positive"].value_counts())

# Create independent subsets for the Dex+ and Dex− populations
adata_dex_pos = adata[adata.obs["dex_positive"] == 1].copy()
adata_dex_neg = adata[adata.obs["dex_positive"] == 0].copy()

# Check the number of cells in each subset
print(f"Dex+ cells: {adata_dex_pos.n_obs}")
print(f"Dex− cells: {adata_dex_neg.n_obs}")

