<a href="https://colab.research.google.com/github/dtabuena/Workshop/blob/main/PatchSeq_Pilot_scAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install pandas
# !pip install scanpy
# !pip install wget


In [12]:
import pandas as pd
import scanpy as sc
import os
import wget
import gzip
import shutil
import chardet
import matplotlib.pyplot as plt
import requests

In [3]:
def load_anndata(file_path):
    """
    Read the file at ``file_path`` with ``sc.read`` and return the result.
    """
    return sc.read(file_path)


In [56]:
def download_and_decompress_gtf(gtf_url, gtf_file):
    """
    Download the GTF file and decompress it, if not already done.

    Parameters:
    gtf_url (str): The URL to download the GTF file from.
    gtf_file (str): The path where the GTF file (compressed) will be saved.

    Returns:
    str: The path to the decompressed GTF file.

    Raises:
    OSError: If the archive cannot be read as gzip. No partial output file is
        left behind in that case, so a later call will retry.
    """
    # Strip only a trailing ".gz". str.replace() would also rewrite ".gz"
    # occurring elsewhere in the path (e.g. inside a directory name).
    if gtf_file.endswith('.gz'):
        decompressed_file = gtf_file[:-len('.gz')]
    else:
        decompressed_file = gtf_file

    # Cached result: nothing to do
    if os.path.isfile(decompressed_file):
        print(f"{decompressed_file} already exists. Skipping download and decompression.")
        return decompressed_file

    # Fetch the archive only when it is not already on disk
    if not os.path.isfile(gtf_file):
        wget.download(gtf_url, gtf_file, bar=None)  # `bar=None` to suppress the download progress bar

    # Decompress into a scratch file and rename on success, so an interrupted
    # or failed decompression is never mistaken for a finished one.
    partial_file = decompressed_file + '.part'
    try:
        with gzip.open(gtf_file, 'rb') as f_in, open(partial_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_file, decompressed_file)
    finally:
        if os.path.exists(partial_file):
            os.remove(partial_file)

    return decompressed_file


def detect_file_encoding(file_path):
    """
    Guess a file's text encoding by running chardet over its first 10000 bytes.
    """
    with open(file_path, 'rb') as handle:
        sample = handle.read(10000)
    guess = chardet.detect(sample)
    return guess['encoding']

def create_ensembl_to_gene_mapping(gtf_file):
    """
    Create a mapping of Ensembl IDs to gene names from a GTF file, focusing only on genes.

    Parameters:
    gtf_file (str): Path to a GTF file that is opened as plain text
        (a gzip archive must be decompressed first).

    Returns:
    tuple: ``(ensembl_to_gene, ensembl_df)`` where ``ensembl_to_gene`` is a dict
        keyed by the gene_id with anything after the first '.' removed, and
        ``ensembl_df`` is the same data as a DataFrame with columns
        'Ensembl_ID' and 'Gene_Name'.

    Side effects:
    Prints the first rows of the table and writes
    'ensembl_to_gene_mapping.csv' into the current working directory.
    """
    ensembl_to_gene = {}

    # Detect file encoding; fall back to utf-8 if detection fails
    encoding = detect_file_encoding(gtf_file) or 'utf-8'

    with open(gtf_file, 'r', encoding=encoding, errors='replace') as file:
        for line in file:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Need the feature (3rd) and attributes (9th) columns; only 'gene' rows matter
            if len(fields) < 9 or fields[2] != 'gene':
                continue

            # Parse `key "value";` pairs. Splitting on ';' and stripping each piece
            # handles a trailing ';', a ';' without a following space, and
            # key-only items, none of which a plain split('; ') copes with.
            attributes_dict = {}
            for item in fields[8].split(';'):
                key, _, value = item.strip().partition(' ')
                if key:
                    attributes_dict[key] = value.strip().strip('"')

            ensembl_id = attributes_dict.get('gene_id', '')
            gene_name = attributes_dict.get('gene_name', '')

            if ensembl_id and gene_name:
                # Drop the version suffix so keys look like 'ENSMUSG00000051285'
                ensembl_to_gene[ensembl_id.split('.')[0]] = gene_name

    # Create a DataFrame for inspection
    ensembl_df = pd.DataFrame(list(ensembl_to_gene.items()), columns=['Ensembl_ID', 'Gene_Name'])
    print(ensembl_df.head())  # Print first few rows for verification

    # Save the mapping to CSV
    ensembl_df.to_csv('ensembl_to_gene_mapping.csv', index=False)

    return ensembl_to_gene, ensembl_df


In [6]:
def add_gene_names_metadata(adata, ensembl_to_gene):
    """
    Add gene names as metadata to the AnnData object.

    Parameters:
    adata: Object exposing ``var_names`` (iterable of gene IDs) and ``var``
        (supports column assignment); modified in place.
    ensembl_to_gene (dict): Mapping from Ensembl gene ID to gene name.

    Returns:
    The same ``adata`` object, with ``adata.var['gene_name']`` set. IDs that
    cannot be resolved are labelled 'unknown'.
    """
    gene_names = []
    for gene_id in adata.var_names:
        # Exact match first; otherwise retry without a '.<version>' suffix so
        # versioned IDs can still hit version-less mapping keys.
        gene_name = ensembl_to_gene.get(gene_id)
        if gene_name is None:
            gene_name = ensembl_to_gene.get(str(gene_id).split('.')[0], 'unknown')
        gene_names.append(gene_name)

    adata.var['gene_name'] = gene_names
    return adata

In [7]:
def load_coding_genes(csv_file):
    """
    Read a CSV and collect the 'external_gene_name' values of every row whose
    'is_coding' column equals True, returned as a set.
    """
    table = pd.read_csv(csv_file)
    coding_rows = table['is_coding'] == True
    return set(table.loc[coding_rows, 'external_gene_name'])

In [35]:
def filter_noncoding_genes(adata, coding_genes):
    """
    Return a copy of ``adata`` restricted to the variables whose
    ``var['gene_name']`` appears in ``coding_genes``.
    """
    keep_mask = adata.var['gene_name'].isin(coding_genes)
    return adata[:, keep_mask].copy()

In [9]:
def consolidate_counts_by_gene(adata):
    """
    Consolidate counts for variables with the same gene name by summing the counts.

    Parameters:
    adata: Object exposing ``X`` (observations x variables), ``obs_names``,
        ``obs`` and ``var['gene_name']``.

    Returns:
    A new AnnData whose variables are the unique gene names (in sorted order)
    and whose observations keep the original order of ``adata``. Only the
    gene-name index is carried over into ``var``.
    """
    # X may be a sparse matrix exposing .toarray(); pandas needs a dense array here
    matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X
    counts_df = pd.DataFrame(matrix, index=adata.obs_names, columns=pd.Index(adata.var['gene_name']))

    # Sum columns sharing a gene name. Grouping the transposed frame avoids the
    # deprecated DataFrame.groupby(axis=1) and leaves the row order untouched,
    # so the rows stay aligned with adata.obs.
    consolidated_df = counts_df.T.groupby(level=0).sum().T

    # Variables are the consolidated columns (gene names), not the row index
    adata_consolidated = sc.AnnData(
        X=consolidated_df.values,
        obs=adata.obs.copy(),
        var=pd.DataFrame(index=consolidated_df.columns),
    )
    return adata_consolidated

In [None]:
# Paths to your files
# NOTE(review): absolute, machine-specific paths — this cell only runs where this folder exists.
os.chdir(r'D:\Dropbox (Gladstone)\Gladstone Dropbox\Dennis Tabuena\0_Projects\_ApoE Patch Seq\pilot_patchseq\analysis_workspace')
counts_file = r"D:\Dropbox (Gladstone)\Gladstone Dropbox\Dennis Tabuena\0_Projects\_ApoE Patch Seq\pilot_patchseq\counts.STAR.MOUSE.txt"

# Define paths and URLs for the GTF file
gtf_url = 'ftp://ftp.ensembl.org/pub/release-110/gtf/mus_musculus/Mus_musculus.GRCm39.110.gtf.gz'
gtf_file = 'Mus_musculus.GRCm39.110.gtf.gz'

# Download (if needed) and decompress the GTF archive
decompressed_file = download_and_decompress_gtf(gtf_url, gtf_file)

# Load the counts table and wrap it in an AnnData object.
# The AnnData is built in the table's own orientation and then transposed.
patchseq_pilot_df = pd.read_csv(counts_file, sep='\t', index_col=0)
patchseq_pilotadata = sc.AnnData(X=patchseq_pilot_df.values,
                   obs=pd.DataFrame(index=patchseq_pilot_df.index),
                   var=pd.DataFrame(index=patchseq_pilot_df.columns)).T

# Get Ensembl to Gene mapping.
# Parse the *decompressed* GTF: the parser opens its argument as plain text,
# so the .gz archive itself cannot be read as tab-separated records.
ensembl_to_gene, _ = create_ensembl_to_gene_mapping(decompressed_file)

# Add gene names as metadata
patchseq_pilotadata = add_gene_names_metadata(patchseq_pilotadata, ensembl_to_gene)

# Load coding genes
coding_genes_csv = 'mmusculus_coding_noncoding.csv'
coding_genes = load_coding_genes(coding_genes_csv)

# Filter out non-coding genes
patchseq_pilotadata_filt= filter_noncoding_genes(patchseq_pilotadata, coding_genes)


# # Consolidate counts by gene
# # consolidated_adata_cons = consolidate_counts_by_gene(patchseq_pilotadata_filt)

In [None]:

# No data is loaded in this cell: `patchseq_pilot_df` must already exist in the
# kernel (other cells in this notebook create it with pd.read_csv).



# Convert to AnnData. The frame is transposed before being handed to AnnData.
# NOTE(review): presumably this puts cells on rows and genes on columns — confirm against the counts file layout.
patchseq_pilot_adata = sc.AnnData(X=patchseq_pilot_df.T)


In [None]:
def download_and_decompress_gtf(gtf_url, gtf_file):
    """
    Download the GTF file and decompress it, overwriting if the file already exists.

    Parameters:
    gtf_url (str): The URL to download the GTF file from.
    gtf_file (str): The path where the compressed GTF file will be saved.

    Returns:
    str: The path to the decompressed GTF file (``gtf_file`` without a
        trailing '.gz').

    Raises:
    OSError: If the downloaded archive cannot be read as gzip; no partial
        output file is left behind in that case.
    """
    # wget.download() never overwrites: when the target already exists it saves
    # under a numbered name and returns that name. Move the fresh download over
    # the requested path so a stale archive is not decompressed.
    saved_path = wget.download(gtf_url, gtf_file, bar=None)  # `bar=None` to suppress the download progress bar
    if saved_path and os.path.abspath(saved_path) != os.path.abspath(gtf_file):
        os.replace(saved_path, gtf_file)

    # Strip only a trailing ".gz"; str.replace() would also rewrite ".gz"
    # occurring elsewhere in the path.
    if gtf_file.endswith('.gz'):
        decompressed_file = gtf_file[:-len('.gz')]
    else:
        decompressed_file = gtf_file

    # Decompress into a scratch file and rename on success, so a failure cannot
    # leave a truncated GTF in place of the real one.
    partial_file = decompressed_file + '.part'
    try:
        with gzip.open(gtf_file, 'rb') as f_in, open(partial_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_file, decompressed_file)
    finally:
        if os.path.exists(partial_file):
            os.remove(partial_file)

    return decompressed_file




In [None]:
def detect_file_encoding(file_path):
    """
    Return chardet's encoding guess for the leading 10000 bytes of a file.
    """
    with open(file_path, 'rb') as raw:
        head_bytes = raw.read(10000)
    detection = chardet.detect(head_bytes)
    return detection['encoding']

def create_ensembl_to_gene_mapping(gtf_file):
    """
    Create a mapping of Ensembl IDs to gene names from a GTF file, focusing only on gene features.

    Parameters:
    gtf_file (str): Path to a GTF file that is opened as plain text.

    Returns:
    tuple: ``(ensembl_to_gene, ensembl_df)`` — a dict keyed by the gene_id with
        anything after the first '.' removed, and the same data as a DataFrame
        with columns 'Ensembl_ID' and 'Gene_Name'.

    Side effects:
    Writes 'ensembl_to_gene_mapping.csv' into the current working directory.
    """
    ensembl_to_gene = {}

    # Fall back to utf-8 if encoding detection fails
    encoding = detect_file_encoding(gtf_file) or 'utf-8'

    with open(gtf_file, 'r', encoding=encoding, errors='replace') as file:
        for line in file:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue  # skip lines that do not have enough fields
            # Index the two columns that are used instead of unpacking exactly
            # nine names, which raises on lines carrying extra columns.
            feature, attributes = fields[2], fields[8]
            if feature != 'gene':
                continue

            # Parse `key "value";` pairs. Splitting on ';' and stripping each piece
            # handles a trailing ';', a ';' without a following space, and
            # key-only items.
            attributes_dict = {}
            for item in attributes.split(';'):
                key, _, value = item.strip().partition(' ')
                if key:
                    attributes_dict[key] = value.strip().strip('"')

            ensembl_id = attributes_dict.get('gene_id', '')
            gene_name = attributes_dict.get('gene_name', '')

            if ensembl_id and gene_name:
                ensembl_to_gene[ensembl_id.split('.')[0]] = gene_name

    ensembl_df = pd.DataFrame(list(ensembl_to_gene.items()), columns=['Ensembl_ID', 'Gene_Name'])
    ensembl_df.to_csv('ensembl_to_gene_mapping.csv', index=False)

    return ensembl_to_gene, ensembl_df

In [None]:
def filter_adata_by_coding_genes(adata, ensembl_to_gene):
    """
    Filter the AnnData object to the variables whose Ensembl ID is a key of
    ``ensembl_to_gene``.

    Note: despite the name, no biotype check happens here; a variable is kept
    purely because its ID (with anything after the first '.' removed) is
    present in the mapping.

    Parameters:
    - adata: AnnData object
    - ensembl_to_gene: Dictionary mapping Ensembl IDs to gene names

    Returns:
    - filtered_adata: copy of ``adata`` restricted to the matching variables
    """
    # Test membership against the dict directly (constant time per lookup)
    # rather than against an intermediate list, which made this quadratic.
    filtered_var_names = [
        name for name in adata.var_names
        if name.split('.')[0] in ensembl_to_gene
    ]
    filtered_adata = adata[:, filtered_var_names].copy()

    return filtered_adata



In [None]:

def add_gene_names_metadata(adata, ensembl_to_gene):
    """
    Attach a 'gene_name' column to ``adata.var`` and return ``adata``.

    Each variable name is looked up in ``ensembl_to_gene`` after dropping
    anything from the first '.' onward; names without a match keep their
    original variable name as the label.
    """
    resolved_names = []
    for var_name in adata.var_names:
        bare_id = var_name.split('.')[0]
        resolved_names.append(ensembl_to_gene.get(bare_id, var_name))

    adata.var['gene_name'] = resolved_names
    return adata

def consolidate_counts_by_gene(adata):
    """
    Consolidate counts for variables with the same gene name by summing the counts.

    Parameters:
    adata: Object exposing ``X`` (observations x variables), ``obs_names``,
        ``obs`` and ``var['gene_name']``.

    Returns:
    A new AnnData whose variables are the unique gene names (in sorted order)
    and whose observations keep the original order of ``adata``. Only the
    gene-name index is carried over into ``var``.
    """
    # X may be a sparse matrix exposing .toarray(); pandas needs a dense array here
    matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X

    # Extract counts and gene names
    counts_df = pd.DataFrame(matrix, index=adata.obs_names, columns=pd.Index(adata.var['gene_name']))

    # Sum counts by gene name. Grouping the transposed frame avoids the
    # deprecated DataFrame.groupby(axis=1). After grouping every gene name is
    # already a unique column, so no further de-duplication is needed (the
    # columns are gene names; there is no 'gene_name' column to de-duplicate on).
    consolidated_df = counts_df.T.groupby(level=0).sum().T

    # Create a new AnnData object with the consolidated counts
    adata_consolidated = sc.AnnData(
        X=consolidated_df.values,
        obs=adata.obs.copy(),
        var=pd.DataFrame(index=consolidated_df.columns)
    )

    return adata_consolidated


In [None]:
# Load RNA-seq data into an AnnData object
# NOTE(review): absolute, machine-specific path — this cell only runs where this folder exists.
counts_file = r"D:\Dropbox (Gladstone)\Gladstone Dropbox\Dennis Tabuena\0_Projects\_ApoE Patch Seq\pilot_patchseq\counts.STAR.MOUSE.txt"
# Tab-separated table; the first column becomes the row index.
patchseq_pilot_df = pd.read_csv(counts_file, sep='\t', index_col=0)
# The table is transposed before being wrapped in AnnData.
patchseq_pilot_adata = sc.AnnData(X=patchseq_pilot_df.T)

# Define paths and URLs for the GTF file
gtf_url = 'ftp://ftp.ensembl.org/pub/release-110/gtf/mus_musculus/Mus_musculus.GRCm39.110.gtf.gz'
gtf_file = 'Mus_musculus.GRCm39.110.gtf.gz'

# Decompress the archive, then build the Ensembl-ID -> gene-name dict from the
# decompressed file (the DataFrame half of the return value is discarded).
decompressed_file = download_and_decompress_gtf(gtf_url, gtf_file)
ensembl_to_gene, _ = create_ensembl_to_gene_mapping(decompressed_file)



# NOTE(review): everything below is commented out, so this cell does not assign
# `filtered_adata`; any later cell reading it depends on leftover kernel state.
# # Filter the AnnData object to include only coding genes
# filtered_adata = filter_adata_by_coding_genes(patchseq_pilot_adata, ensembl_to_gene)

# filtered_adata = add_gene_names_metadata(filtered_adata, ensembl_to_gene)

# filtered_adata = consolidate_counts_by_gene(filtered_adata)

# num_genes_filtered = len(filtered_adata.var_names)
# num_cells_filtered = filtered_adata.shape[0]
# print(num_genes_filtered,num_cells_filtered)

51887 24


  consolidated_df = counts_df.groupby(level=0, axis=1).sum()


In [None]:
# Number of Ensembl IDs (dict keys) in the mapping
len(ensembl_to_gene)

56631

In [None]:
# Invert the mapping: each gene name points to the list of every Ensembl ID
# that carries that name, in the iteration order of `ensembl_to_gene`.
gene_to_ensembl = {}

for key, value in ensembl_to_gene.items():
    # setdefault creates the empty list the first time a gene name is seen
    gene_to_ensembl.setdefault(value, []).append(key)

# Number of distinct gene names
print(len(gene_to_ensembl))

56481


In [None]:
# Number of distinct gene names among the mapping's values
len(set(ensembl_to_gene.values()))

56481

In [None]:
# Displays the entire mapping as the cell output.
# NOTE(review): this dumps every entry of the dict; consider showing only a small slice.
ensembl_to_gene

{'ENSMUSG00000104478': 'Gm38212',
 'ENSMUSG00000104385': 'Gm7449',
 'ENSMUSG00000101231': 'Gm28283',
 'ENSMUSG00000102135': 'Gm37108',
 'ENSMUSG00000103282': 'Gm37275',
 'ENSMUSG00000101097': 'Gm6679',
 'ENSMUSG00000100764': 'Gm29155',
 'ENSMUSG00000102534': 'Gm37225',
 'ENSMUSG00000100831': 'Gm17847',
 'ENSMUSG00000100884': 'Gm28281',
 'ENSMUSG00000100635': 'Gm29157',
 'ENSMUSG00000102213': 'Gm37489',
 'ENSMUSG00000100480': 'Gm29156',
 'ENSMUSG00000089037': 'Gm22159',
 'ENSMUSG00000103629': 'Gm5694',
 'ENSMUSG00002076531': 'Gm54681',
 'ENSMUSG00000100025': 'Gm8141',
 'ENSMUSG00000051285': 'Pcmtd1',
 'ENSMUSG00000098201': 'Gm26983',
 'ENSMUSG00000103509': 'Gm38372',
 'ENSMUSG00000048538': 'Gm9826',
 'ENSMUSG00000103709': 'Nras-ps2',
 'ENSMUSG00000077244': 'Gm23274',
 'ENSMUSG00000102768': 'Gm19002',
 'ENSMUSG00000097797': 'Gm26901',
 'ENSMUSG00000103498': 'Gm18984',
 'ENSMUSG00000103067': 'Gm30414',
 'ENSMUSG00000102320': 'Gm37791',
 'ENSMUSG00000104226': 'Gm7470',
 'ENSMUSG00000103903

In [None]:
# Report the size of `filtered_adata`: number of variable names, then shape[0].
# NOTE(review): in the visible notebook `filtered_adata` is only assigned in
# commented-out code, so on a fresh kernel this cell would raise NameError — confirm.
num_genes_filtered = len(filtered_adata.var_names)
num_cells_filtered = filtered_adata.shape[0]
print(num_genes_filtered,num_cells_filtered)

51887 24
