<a href="https://colab.research.google.com/github/dtabuena/Workshop/blob/main/PatchSeq_Pilot_scAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install pandas
# !pip install scanpy
# !pip install wget


In [10]:
import pandas as pd
import scanpy as sc
import os
import wget
import gzip
import shutil
import chardet
import matplotlib.pyplot as plt
import requests

In [11]:
def load_anndata(file_path):
    """
    Load an AnnData object from the specified file path.
    """
    adata = sc.read(file_path)
    return adata


In [46]:
def download_and_decompress_gtf(gtf_url, gtf_file):
    """
    Download the GTF file and decompress it, if not already done.

    Parameters:
    gtf_url (str): The URL to download the GTF file from.
    gtf_file (str): The path where the GTF file (compressed) will be saved.

    Returns:
    str: The path to the decompressed GTF file.
    """
    # Define the decompressed file path
    decompressed_file = gtf_file.replace('.gz', '')

    # Check if the decompressed file already exists
    if not os.path.isfile(decompressed_file):
        # Check if the compressed file already exists
        if not os.path.isfile(gtf_file):
            # Download the GTF file
            wget.download(gtf_url, gtf_file, bar=None)  # `bar=None` to suppress the download progress bar

        # Decompress the GTF file
        with gzip.open(gtf_file, 'rb') as f_in:
            with open(decompressed_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    else:
        print(f"{decompressed_file} already exists. Skipping download and decompression.")

    return decompressed_file


def detect_file_encoding(file_path):
    """
    Detect the encoding of a file using chardet.
    """
    with open(file_path, 'rb') as file:
        result = chardet.detect(file.read(10000))
    return result['encoding']

def create_ensembl_to_gene_mapping(gtf_file):
    """
    Create a mapping of Ensembl IDs to gene names from a GTF file, focusing only on coding regions (exons).
    """
    ensembl_to_gene = {}

    with open(gtf_file, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue  # skip lines that do not have enough fields
            seqname, source, feature, start, end, score, strand, frame, attributes = fields

            if feature == 'exon':
                # Extract attributes
                attributes_dict = dict(item.split(None, 1) for item in attributes.split('; ') if item)
                ensembl_id = attributes_dict.get('gene_id', '').strip('"')
                gene_name = attributes_dict.get('gene_name', '').strip('"')

                if ensembl_id and gene_name:
                    ensembl_to_gene[ensembl_id.split('.')[0]] = gene_name

    ensembl_df = pd.DataFrame(list(ensembl_to_gene.items()), columns=['Ensembl_ID', 'Gene_Name'])
    ensembl_df.to_csv('ensembl_to_gene_mapping.csv', index=False)

    return ensembl_to_gene, ensembl_df

In [15]:
def add_gene_names_metadata(adata, ensembl_to_gene):
    """
    Add gene names as metadata to the AnnData object.
    """
    gene_names = [ensembl_to_gene.get(gene_id, 'unknown') for gene_id in adata.var_names]
    adata.var['gene_name'] = gene_names
    return adata

In [6]:
def load_coding_genes(csv_file):
    """
    Load coding genes from a CSV file and return a set of gene names.
    """
    coding_df = pd.read_csv(csv_file)
    coding_genes = set(coding_df[coding_df['is_coding'] == True]['external_gene_name'])
    return coding_genes

In [7]:
def filter_noncoding_genes(adata, coding_genes):
    """
    Filter out non-coding genes from the AnnData object.
    """
    is_coding = adata.var['gene_name'].isin(coding_genes)
    filtered_adata = adata[:, is_coding].copy()
    return filtered_adata

In [59]:
def consolidate_counts_by_gene(adata):
    """
    Consolidate counts for genes by summing counts for identical gene names.
    """
    # Convert AnnData to DataFrame
    counts_df = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var['gene_name'])

    # Consolidate counts by gene
    # Group by gene names (columns), summing counts for each gene
    consolidated_df = counts_df.groupby(counts_df.columns, axis=1).sum()

    # Ensure the new DataFrame has the same index and columns
    consolidated_df = consolidated_df.groupby(consolidated_df.index).sum()

    # Create a new AnnData object with consolidated counts
    # The new var DataFrame should have gene names as its index
    adata_consolidated = sc.AnnData(
        X=consolidated_df.values,
        obs=adata.obs,
        var=pd.DataFrame(index=consolidated_df.columns)
    )

    return adata_consolidated

In [60]:
# Paths to your files
os.chdir(r'D:\Dropbox (Gladstone)\Gladstone Dropbox\Dennis Tabuena\0_Projects\_ApoE Patch Seq\pilot_patchseq\analysis_workspace')
counts_file = r"D:\Dropbox (Gladstone)\Gladstone Dropbox\Dennis Tabuena\0_Projects\_ApoE Patch Seq\pilot_patchseq\counts.STAR.MOUSE.txt"

# Define paths and URLs for the GTF file
gtf_url = 'ftp://ftp.ensembl.org/pub/release-110/gtf/mus_musculus/Mus_musculus.GRCm39.110.gtf.gz'
gtf_file = 'Mus_musculus.GRCm39.110.gtf.gz'

# Decompress and create the Ensembl to Gene mapping
decompressed_file = download_and_decompress_gtf(gtf_url, gtf_file)

# Load the AnnData object
patchseq_pilot_df = pd.read_csv(counts_file, sep='\t', index_col=0)
patchseq_pilotadata = sc.AnnData(X=patchseq_pilot_df.values,
                   obs=pd.DataFrame(index=patchseq_pilot_df.index),
                   var=pd.DataFrame(index=patchseq_pilot_df.columns)).T

# Get Ensembl to Gene mapping
ensembl_to_gene, df = create_ensembl_to_gene_mapping(decompressed_file)
# display(df)

# Add gene names as metadata
patchseq_pilotadata = add_gene_names_metadata(patchseq_pilotadata, ensembl_to_gene)

# Load coding genes
coding_genes_csv = 'mmusculus_coding_noncoding.csv'
coding_genes = load_coding_genes(coding_genes_csv)
# display(coding_genes)

# Filter out non-coding genes
patchseq_pilotadata_filt= filter_noncoding_genes(patchseq_pilotadata, coding_genes)
display(patchseq_pilotadata_filt)

# Consolidate counts by gene
consolidated_adata_cons = consolidate_counts_by_gene(patchseq_pilotadata_filt)
display(consolidated_adata_cons)

Mus_musculus.GRCm39.110.gtf already exists. Skipping download and decompression.


AnnData object with n_obs × n_vars = 24 × 21470
    var: 'gene_name'

  consolidated_df = counts_df.groupby(counts_df.columns, axis=1).sum()


AnnData object with n_obs × n_vars = 24 × 21446