In [1]:
import os
import pandas as pd 
import polars as pl

# Define path that contains some junction files (only 2 files are used for this example, corresponding to 2 individual cells)
juncs_path = "/gpfs/commons/home/kisaev/LeafletSC/data/raw/junctions/"
print("The junctions are loaded from the following path: " + juncs_path) 

# print the files in the path 
print("The files in the path are: " + str(os.listdir(juncs_path)))

# define path for saving the output data 
output_path = "/gpfs/commons/home/kisaev/LeafletSC/data/processed/"

The junctions are loaded from the following path: /gpfs/commons/home/kisaev/LeafletSC/data/raw/junctions/
The files in the path are: ['B107926_O8_Blue_Blood_S250.homo.gencode.v30.ERCC.chrM.juncswbarcodes', 'B107925_B5_S284.homo.gencode.v30.ERCC.chrM.juncswbarcodes']


In [2]:
import pandas as pd
import numpy as np
import argparse
import pyranges as pr
from gtfparse import read_gtf #initially tested with version 1.3.0)
from tqdm import tqdm
import time
import warnings
import glob 
import time
import gzip
from pathlib import Path
import concurrent.futures
import concurrent.futures

In [3]:
def process_gtf(gtf_file): #make this into a seperate script that processes the gtf file into gr object that can be used in the main scriptas input 
    """
    Process the GTF file into a pyranges object.

    Parameters:
    - gtf_file (str): Path to the GTF file.

    Returns:
    - gtf_exons_gr (pyranges.GenomicRanges): Processed pyranges object.
    """

    print("The gtf file you provided is " + gtf_file)
    print("This step may take a while depending on the size of your gtf file")

    # calculate how long it takes to read gtf_file and report it 
    start_time = time.time()
    #[1] extract all exons from gtf file provided 
    gtf = read_gtf(gtf_file, result_type="pandas") #to reduce the speed of this, can just get rows with exon in the feature column (preprocess this before running package)? check if really necessary
    end_time = time.time()

    print("Reading gtf file took " + str(round((end_time-start_time), 2)) + " seconds")
    # assert that gtf is a non empty dataframe otherwise return an error
    if gtf.empty or type(gtf) != pd.DataFrame:
        raise ValueError("The gtf file provided is empty or not a pandas DataFrame. Please provide a valid gtf file and ensure you have the \
                         latest version of gtfparse installed by running 'pip install gtfparse --upgrade'")
    
    # Convert the seqname column to a string in gtf 
    gtf["seqname"] = gtf["seqname"].astype(str)

    # Make a copy of the DataFrame
    gtf_exons = gtf[(gtf["feature"] == "exon")].copy()

    if gtf_exons['seqname'].str.contains('chr').any():
        gtf_exons.loc[gtf_exons['seqname'].str.contains('chr'), 'seqname'] = gtf_exons['seqname'].map(lambda x: x.lstrip('chr').rstrip('chr'))

    if not set(['seqname', 'start', 'end', 'score', 'strand', 'gene_id', 'gene_name', 'transcript_id', 'exon_id']).issubset(gtf_exons.columns):
        # print the columns that the file is missing
        missing_cols = set(['seqname', 'start', 'end', 'score', 'strand', 'gene_id', 'gene_name', 'transcript_id', 'exon_id']).difference(gtf_exons.columns)
        print("Your gtf file is missing the following columns: " + str(missing_cols))

        # if the missing column is just exon_id, we can generate it
        if "exon_id" in missing_cols:
            # add exon_id to gtf_exons
            print("Adding exon_id column to gtf file")
            gtf_exons.loc[:, "exon_id"] = gtf_exons["transcript_id"] + "_" + gtf_exons["start"].astype(str) + "_" + gtf_exons["end"].astype(str)
        else:
            pass

    # Convert the DataFrame to a PyRanges object
    gtf_exons_gr = pr.from_dict({"Chromosome": gtf_exons["seqname"], "Start": gtf_exons["start"], "End": gtf_exons["end"], "Strand": gtf_exons["strand"], "gene_id": gtf_exons["gene_id"], "gene_name": gtf_exons["gene_name"], "transcript_id": gtf_exons["transcript_id"], "exon_id": gtf_exons["exon_id"]})

    # Remove rows where exon start and end are the same or when gene_name is empty
    gtf_exons_gr = gtf_exons_gr[ ~ (gtf_exons_gr.Start == gtf_exons_gr.End)]
    gtf_exons_gr = gtf_exons_gr[ ~ (gtf_exons_gr.gene_name == "")]

    # When do I need to do this? depends on gtf file used? base 0 or 1? probably need this to be a parameter 
    gtf_exons_gr.Start = gtf_exons_gr.Start-1

    # Drop duplicated positions on same strand 
    gtf_exons_gr = gtf_exons_gr.drop_duplicate_positions(strand=True) # Why are so many gone after this? 

    # Print the number of unique exons, transcript ids, and gene ids
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("The number of unique exons is " + str(len(gtf_exons_gr.exon_id.unique())))
    print("The number of unique transcript ids is " + str(len(gtf_exons_gr.transcript_id.unique())))
    print("The number of unique gene ids is " + str(len(gtf_exons_gr.gene_id.unique())))
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    return(gtf_exons_gr)

In [4]:
def refine_clusters(clust_info):
    # for all start positions that are same for each cluster get the sum counts_total
    clust_info_5ss = clust_info.groupby(['Cluster', 'Start']).agg({'counts_total': 'sum'}).reset_index()
    clust_info_3_ss = clust_info.groupby(['Cluster', 'End']).agg({'counts_total': 'sum'}).reset_index()
    # rename columns in 5ss to be total 5ss counts
    clust_info_5ss.rename(columns={'counts_total': 'total_5ss_counts'}, inplace=True)
    clust_info_3_ss.rename(columns={'counts_total': 'total_3ss_counts'}, inplace=True)
    # remove Start and End column from each
    clust_info = clust_info.merge(clust_info_5ss, on=['Cluster', 'Start'])
    clust_info = clust_info.merge(clust_info_3_ss, on=['Cluster', 'End'])

    # give each junction a 5ss fraction and 3ss fraction and then add column counts_total 
    clust_info['5SS_usage'] = clust_info['counts_total'] / clust_info['total_5ss_counts']
    clust_info['3SS_usage'] = clust_info['counts_total'] / clust_info['total_3ss_counts']
    clust_info["min_usage"] = clust_info[["5SS_usage", "3SS_usage"]].min(axis=1)
    print("Done refining clusters!")
    return(clust_info)

In [5]:
def filter_junctions_by_shared_splice_sites(df):
    # Function to apply to each group (cluster)
    def filter_group(group):
        # Find duplicated start and end positions within the group
        duplicated_starts = group['Start'].duplicated(keep=False)
        duplicated_ends = group['End'].duplicated(keep=False)
        
        # Keep rows where either start or end position is duplicated
        return group[duplicated_starts | duplicated_ends]
    
    # Group by 'Cluster' and apply the filtering function
    filtered_df = df.groupby('Cluster').apply(filter_group).reset_index(drop=True)
    return filtered_df.Cluster.unique()

In [6]:
def read_junction_files(junc_files, junc_suffix):
    """
    Read junction files.

    Parameters:
    - junc_files (list): List of paths to junction files.
    - junc_suffix (str): Suffix of junction files.

    Returns:
    - pd.DataFrame: Concatenated DataFrame of junction files.
    """
    all_juncs_list = []

    for junc_path in junc_files:
        junc_path = Path(junc_path)
        print(f"Reading in junction files from {junc_path}")

        junc_files_in_path = list(junc_path.rglob(junc_suffix))
        if not junc_files_in_path:
            print(f"No junction files found in {junc_path} with suffix {junc_suffix}")
            continue

        print(f"The number of junction files to be processed is {len(junc_files_in_path)}")

        files_not_read = []

        for junc_file in tqdm(junc_files_in_path):
            try:
                juncs = pd.read_csv(junc_file, sep="\t", header=None)
                juncs['file_name'] = junc_file  # Add the file name as a new column
                juncs['cell_type'] = junc_file
                all_juncs_list.append(juncs)  # Append the DataFrame to the list
            except Exception as e:
                print(f"Could not read in {junc_file}: {e}")
                files_not_read.append(junc_file)

    if len(files_not_read) > 0:
        print("The total number of files that could not be read is " + str(len(files_not_read)) + " as these had no junctions")

    # Concatenate all DataFrames into a single DataFrame
    all_juncs = pd.concat(all_juncs_list, ignore_index=True) if all_juncs_list else pd.DataFrame()

    return all_juncs

In [7]:
def clean_up_juncs(all_juncs, col_names, min_intron, max_intron):
    
    # Apply column names to the DataFrame
    all_juncs.columns = col_names
    
    # Split 'blockSizes' into two new columns and convert them to integers (this step takes a while)
    all_juncs[['block_add_start', 'block_subtract_end']] = all_juncs["blockSizes"].str.split(',', expand=True).astype(int)

    # Adjust 'chromStart' and 'chromEnd' based on 'block_add_start' and 'block_subtract_end'
    all_juncs["chromStart"] += all_juncs['block_add_start']
    all_juncs["chromEnd"] -= all_juncs['block_subtract_end']

    # Calculate 'intron_length' and filter based on 'min_intron' and 'max_intron'
    all_juncs["intron_length"] = all_juncs["chromEnd"] - all_juncs["chromStart"]
    mask = (all_juncs["intron_length"] >= min_intron) & (all_juncs["intron_length"] <= max_intron)
    all_juncs = all_juncs[mask]

    # Filter for 'chrom' column to handle "chr" prefix
    all_juncs = all_juncs.copy()

    # New filter for 'chrom' column to handle "chr" prefix, using .loc for safe in-place modification
    standard_chromosomes_pattern = r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|X|Y|MT)$'
    all_juncs = all_juncs[all_juncs['chrom'].str.match(standard_chromosomes_pattern)]

    print("Cleaning up 'chrom' column")
    # Remove "chr" prefix from 'chrom' column
    all_juncs['chrom'] = all_juncs['chrom'].str.replace(r'^chr', '', regex=True)
    
    # Add 'junction_id' column
    all_juncs['junction_id'] = all_juncs['chrom'] + '_' + all_juncs['chromStart'].astype(str) + '_' + all_juncs['chromEnd'].astype(str)
    
    # Get total score for each junction and merge with all_juncs with new column "total_counts"
    all_juncs = all_juncs.groupby('junction_id').agg({'score': 'sum'}).reset_index().merge(all_juncs, on='junction_id', how='left')

    # rename score_x and score_y to total_junc_counts and score 
    all_juncs.rename(columns={'score_x': 'counts_total', 'score_y': 'score'}, inplace=True)

    return(all_juncs)

In [8]:
def mapping_juncs_exons(juncs_gr, gtf_exons_gr, singletons):
    print("Annotating junctions with known exons based on input gtf file")
    
    # for each junction, the start of the junction should equal end of exons and end of junction should equal start of exon 
    juncs_gr = juncs_gr.k_nearest(gtf_exons_gr, strandedness = "same", ties="different", k=2, overlap=False)
    # ensure distance parameter is still 1 
    juncs_gr = juncs_gr[abs(juncs_gr.Distance) == 1]

    # group juncs_gr by gene_id and ensure that each junction has Start and End aligning with at least one End_b and Start_b respectively
    grouped_gr = juncs_gr.df.groupby("gene_id")
    juncs_keep = []
    for name, group in grouped_gr:
        group = group[(group.Start.isin(group.End_b)) & (group.End.isin(group.Start_b))]
        # save junctions that are found here after filtering for matching start and end positions
        juncs_keep.append(group.junction_id.unique())

    # flatten the list of lists
    juncs_keep = [item for sublist in juncs_keep for item in sublist]
    juncs_gr = juncs_gr[juncs_gr.junction_id.isin(juncs_keep)]
    
    print("The number of junctions after assessing distance to exons is " + str(len(juncs_gr.junction_id.unique())))
    if len(juncs_gr.junction_id.unique()) < 5000:
        print("There are less than 5000 junctions after assessing distance to exons. Please check your gtf file and ensure that it is in the correct format (start and end positions are not off by 1).", flush=True)
    
    print("Clustering intron splicing events by gene_id")
    juncs_coords_unique = juncs_gr[['Chromosome', 'Start', 'End', 'Strand', 'junction_id', 'gene_id']].drop_duplicate_positions()
    clusters = juncs_coords_unique.cluster(by="gene_id", slack=-1, count=True)
    print("The number of clusters after removing singletons is " + str(len(clusters.Cluster.unique())))

    if singletons == False:
        # remove singletons 
        clusters = clusters[clusters.Count > 1]
        print("The number of clusters after removing singletons is " + str(len(clusters.Cluster.unique())))
        # update juncs_gr to only include junctions that are part of clusters
        juncs_gr = juncs_gr[juncs_gr.junction_id.isin(clusters.junction_id)]
        # update juncs_coords_unique to only include junctions that are part of clusters
        juncs_coords_unique = juncs_coords_unique[juncs_coords_unique.junction_id.isin(clusters.junction_id)]
        print("The number of junctions after removing singletons is " + str(len(juncs_coords_unique.junction_id.unique())))
        return juncs_gr, juncs_coords_unique, clusters
    else:
        return juncs_gr, juncs_coords_unique, clusters

In [9]:
# junc_files defines a path for where junction files can be found, in this case, the path is defined above
junc_files = juncs_path

# we provide a gtf file for the human genome as well to make better sense of the junctions that are detected in cells
# please replace with the path to the gtf file on your system
gtf_file="/gpfs/commons/groups/knowles_lab/Karin/genome_files/gencode.v43.basic.annotation.gtf" 

# define additional parameters 
sequencing_type = "single_cell"

# ensure output files are to be saved in output_path 
output_file = output_path + "test_intron_clusters"
junc_bed_file= output_path + "test_juncs.bed" # you can load this file into IGV to visualize the junction coordinates 
min_intron_length = 50
max_intron_length = 500000
threshold_inc = 0.05 
min_junc_reads = 2
min_num_cells_wjunc = 2
keep_singletons = False # ignore junctions that do not share splice sites with any other junction (likely const)
junc_suffix = "*.juncswbarcodes" 

if "," in junc_files:
    junc_files = junc_files.split(",")
else:
    junc_files = [junc_files]
all_juncs_list = []

In [10]:
junc_files

['/gpfs/commons/home/kisaev/LeafletSC/data/raw/junctions/']

In [11]:
if gtf_file is not None:
    gtf_exons_gr = process_gtf(gtf_file)
    print("Done extracting exons from gtf file")
else:
    pass

The gtf file you provided is /gpfs/commons/groups/knowles_lab/Karin/genome_files/gencode.v43.basic.annotation.gtf
This step may take a while depending on the size of your gtf file


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']


Reading gtf file took 115.49 seconds


  return {k: v for k, v in df.groupby(grpby_key)}


+++++++++++++++++++++++++++++++++++++++++++++++++++++++
The number of unique exons is 411865
The number of unique transcript ids is 115526
The number of unique gene ids is 62668
+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Done extracting exons from gtf file


In [12]:
all_juncs = read_junction_files(junc_files, junc_suffix)
# Define column names based on sequencing type
col_names = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", 
         "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "blockStarts"]
if sequencing_type == "single_cell":
    col_names += ["num_cells_wjunc", "cell_readcounts"]
col_names += ["file_name", "cell_type"]

Reading in junction files from /gpfs/commons/home/kisaev/LeafletSC/data/raw/junctions
The number of junction files to be processed is 2


100%|██████████| 2/2 [00:00<00:00, 15.99it/s]


In [13]:
all_juncs = clean_up_juncs(all_juncs, col_names, min_intron_length, max_intron_length)
print("Number of unique junctions is " + str(len(all_juncs.junction_id.unique())))

Cleaning up 'chrom' column
Number of unique junctions is 14416


In [14]:
# 7. Make gr object from ALL junctions across all cell types 
print("Making gr object from all junctions across all cell types")
juncs_gr = pr.from_dict({"Chromosome": all_juncs["chrom"], "Start": all_juncs["chromStart"], "End": all_juncs["chromEnd"], "Strand": all_juncs["strand"], "Cell": all_juncs["cell_type"], "junction_id": all_juncs["junction_id"], "counts_total": all_juncs["counts_total"]})
juncs_gr = juncs_gr[["Chromosome", "Start", "End", "Strand", "junction_id", "counts_total"]].drop_duplicate_positions()

  return {k: v for k, v in df.groupby(grpby_key)}


Making gr object from all junctions across all cell types


In [15]:
# if min_junc_reads is not none then remove junctions with less than min_junc_reads
if min_junc_reads is not None:
    juncs_gr = juncs_gr[juncs_gr.counts_total > min_junc_reads]

In [16]:
if gtf_file is not None:
    juncs_gr, juncs_coords_unique, clusters = mapping_juncs_exons(juncs_gr, gtf_exons_gr, False)

Annotating junctions with known exons based on input gtf file
The number of junctions after assessing distance to exons is 11689
Clustering intron splicing events by gene_id
The number of clusters after removing singletons is 11189
The number of clusters after removing singletons is 223
The number of junctions after removing singletons is 723


In [17]:
juncs_gr 

Unnamed: 0,Chromosome,Start,End,Strand,junction_id,counts_total,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,exon_id,Distance
0,1,101237099,101238821,+,1_101237099_101238821,128,101237018,101237099,+,ENSG00000170989.10,S1PR1,ENST00000305352.7,ENSE00001356737.6,1
1,1,101237099,101238821,+,1_101237099_101238821,128,101238821,101241518,+,ENSG00000170989.10,S1PR1,ENST00000305352.7,ENSE00001167649.10,1
2,1,101237099,101238821,+,1_101237099_101238821,128,101238821,101241189,+,ENSG00000170989.10,S1PR1,ENST00000649383.1,ENSE00003834657.1,1
3,1,101237099,101238821,+,1_101237099_101238821,128,101238821,101241372,+,ENSG00000170989.10,S1PR1,ENST00000475289.2,ENSE00001918985.2,1
4,1,101237099,101238821,+,1_101237099_101238821,128,101238821,101241308,+,ENSG00000170989.10,S1PR1,ENST00000648480.1,ENSE00003835205.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,X,119616742,119625334,-,X_119616742_119625334,172,119625334,119625379,-,ENSG00000125354.24,SEPTIN6,ENST00000360156.11,ENSE00003594905.1,-1
1900,X,119620051,119629317,-,X_119620051_119629317,6,119616944,119620051,-,ENSG00000125354.24,SEPTIN6,ENST00000394610.7,ENSE00001455353.4,-1
1901,X,119620051,119629317,-,X_119620051_119629317,6,119629317,119629508,-,ENSG00000125354.24,SEPTIN6,ENST00000360156.11,ENSE00003693275.1,-1
1902,X,119625379,119629317,-,X_119625379_119629317,167,119625334,119625379,-,ENSG00000125354.24,SEPTIN6,ENST00000360156.11,ENSE00003594905.1,-1


In [18]:
# now for each cluster we want to check that each junction shares a splice site with at least one other junction in the cluster
clusts_keep = filter_junctions_by_shared_splice_sites(clusters.df)
# update clusters, juncs_gr, and juncs_coords_unique to only include clusters
clusters = clusters[clusters.Cluster.isin(clusts_keep)]
juncs_gr = juncs_gr[juncs_gr.junction_id.isin(clusters.junction_id)]
juncs_coords_unique = juncs_coords_unique[juncs_coords_unique.junction_id.isin(clusters.junction_id)]
print("The number of clusters after filtering for shared splice sites is " + str(len(clusters.Cluster.unique())))
print("The number of junctions after filtering for shared splice sites is " + str(len(juncs_coords_unique.junction_id.unique())))

  filtered_df = df.groupby('Cluster').apply(filter_group).reset_index(drop=True)


The number of clusters after filtering for shared splice sites is 214
The number of junctions after filtering for shared splice sites is 703


In [19]:
# update our all_juncs file to only include junctions that are part of clusters
all_juncs = all_juncs[all_juncs.junction_id.isin(juncs_coords_unique.junction_id)]
# double check that juncs_gr and all_juncs have the same number of unique junctions
print("The number of unique junctions in juncs_gr is " + str(len(juncs_gr.junction_id.unique())))
print("The number of unique junctions in all_juncs is " + str(len(all_juncs.junction_id.unique())))
# also check clusters 
print("The number of clusters in clusters is " + str(len(clusters.Cluster.unique())))
print("The number of unique junctions in clusters is " + str(len(clusters.junction_id.unique())))

The number of unique junctions in juncs_gr is 703
The number of unique junctions in all_juncs is 703
The number of clusters in clusters is 214
The number of unique junctions in clusters is 703


In [20]:
print("Refining intron clusters to account for junction usage ratio threshold...")
juncs_counts = juncs_gr.df[['junction_id', 'Start', 'End', 'counts_total']].drop_duplicates()
clust_info = clusters.df[['Cluster', 'junction_id']].drop_duplicates()
clust_info = clust_info.merge(juncs_counts)
junc_scores_all = refine_clusters(clust_info)
junc_scores_all = junc_scores_all[junc_scores_all.min_usage >= threshold_inc]
# add 5ss and 3ss usatio of each junction to all_juncs
junc_scores_all.head()

Refining intron clusters to account for junction usage ratio threshold...
Done refining clusters!


Unnamed: 0,Cluster,junction_id,Start,End,counts_total,total_5ss_counts,total_3ss_counts,5SS_usage,3SS_usage,min_usage
0,69,1_175004814_175006744,175004814,175006744,8,8,153,1.0,0.052288,0.052288
1,69,1_175004833_175006744,175004833,175006744,145,145,153,1.0,0.947712,0.947712
2,150,1_145608175_145616233,145608175,145616233,218,238,218,0.915966,1.0,0.915966
3,150,1_145608175_145618319,145608175,145618319,20,238,158,0.084034,0.126582,0.084034
4,150,1_145616293_145618319,145616293,145618319,138,138,158,1.0,0.873418,0.873418


In [21]:
all_juncs = all_juncs.merge(junc_scores_all[['junction_id', 'total_5ss_counts', 'total_3ss_counts', "5SS_usage", "3SS_usage"]], on='junction_id')
all_juncs.head()

Unnamed: 0,junction_id,counts_total,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,...,cell_readcounts,file_name,cell_type,block_add_start,block_subtract_end,intron_length,total_5ss_counts,total_3ss_counts,5SS_usage,3SS_usage
0,10_118326243_118335113,6,10,118326243,118335113,JUNC00005275,6,-,118326153,118335119,...,B107926_O8_Blue_Blood_S250.homo.gencode.v30.ER...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,90,6,8870,22,6,0.272727,1.0
1,10_118326243_118335553,16,10,118326243,118335553,JUNC00005276,16,-,118326164,118335637,...,B107926_O8_Blue_Blood_S250.homo.gencode.v30.ER...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,79,84,9310,22,16,0.727273,1.0
2,10_12210322_12215739,111,10,12210322,12215739,JUNC00004927,111,+,12210289,12215835,...,B107926_O8_Blue_Blood_S250.homo.gencode.v30.ER...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,33,96,5417,114,111,0.973684,1.0
3,10_12215835_12217360,117,10,12215835,12217360,JUNC00004929,117,+,12215739,12217459,...,B107926_O8_Blue_Blood_S250.homo.gencode.v30.ER...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,96,99,1525,117,120,1.0,0.975
4,10_3779590_3780105,341,10,3779590,3780105,JUNC00004884,104,-,3779495,3780203,...,B107926_O8_Blue_Blood_S250.homo.gencode.v30.ER...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,/gpfs/commons/home/kisaev/LeafletSC/data/raw/j...,95,98,515,347,341,0.982709,1.0


In [22]:
# remove junctions that are in junc_scores_all from juncs_gr, clusters, all_juncs and juncs_coords_unique
juncs_gr = juncs_gr[juncs_gr.junction_id.isin(junc_scores_all.junction_id)]
clusters = clusters[clusters.junction_id.isin(junc_scores_all.junction_id)]
all_juncs = all_juncs[all_juncs.junction_id.isin(junc_scores_all.junction_id)]
juncs_coords_unique = juncs_coords_unique[juncs_coords_unique.junction_id.isin(junc_scores_all.junction_id)]
print("The number of clusters after removing low confidence junctions is " + str(len(clusters.Cluster.unique())))


The number of clusters after removing low confidence junctions is 214


In [23]:
# double check that juncs_gr and all_juncs have the same number of unique junctions
print("The number of unique junctions in juncs_gr is " + str(len(juncs_gr.junction_id.unique())))
print("The number of unique junctions in all_juncs is " + str(len(all_juncs.junction_id.unique())))
# also check clusters 
print("The number of clusters in clusters is " + str(len(clusters.Cluster.unique())))
print("The number of unique junctions in clusters is " + str(len(clusters.junction_id.unique())))

The number of unique junctions in juncs_gr is 577
The number of unique junctions in all_juncs is 577
The number of clusters in clusters is 214
The number of unique junctions in clusters is 577


In [24]:
# 12. given junctions that remain, see if need to recluster introns (low confidence junctions removed)
print("Reclustering intron splicing events after low confidence junction removal")
# check if there are any duplicate entried in pyranges object 
juncs_gr = juncs_gr.drop_duplicate_positions()
# drop original cluster column and add new one
clusters = juncs_gr.cluster(by="gene_id", slack=-1, count=True)
 

Reclustering intron splicing events after low confidence junction removal


In [25]:
clusters = clusters[clusters.Count > 1]

In [26]:
juncs_gr = juncs_gr[juncs_gr.junction_id.isin(clusters.junction_id)]
juncs_gr

  df = pd.concat([plus, minus])
  df = pd.concat([plus, minus])
  df = pd.concat([plus, minus])


Unnamed: 0,Chromosome,Start,End,Strand,junction_id,counts_total,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,exon_id,Distance
0,1,101237099,101238821,+,1_101237099_101238821,128,101237018,101237099,+,ENSG00000170989.10,S1PR1,ENST00000305352.7,ENSE00001356737.6,1
1,1,101238425,101238821,+,1_101238425_101238821,38,101238334,101238425,+,ENSG00000170989.10,S1PR1,ENST00000648480.1,ENSE00003837007.1,1
2,1,111477428,111477650,+,1_111477428_111477650,75,111477333,111477428,+,ENSG00000143110.12,C1orf162,ENST00000343534.9,ENSE00000958163.1,1
3,1,111477428,111477725,+,1_111477428_111477725,253,111477333,111477428,+,ENSG00000143110.12,C1orf162,ENST00000343534.9,ENSE00000958163.1,1
4,1,120824146,120824865,+,1_120824146_120824865,9,120823973,120824146,+,ENSG00000273136.8,NBPF26,ENST00000620612.5,ENSE00003709317.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,X,40599741,40600761,+,X_40599741_40600761,371,40599648,40599741,+,ENSG00000182220.15,ATP6AP2,ENST00000638153.1,ENSE00003795580.1,1
371,X,40599741,40600781,+,X_40599741_40600781,25,40599648,40599741,+,ENSG00000182220.15,ATP6AP2,ENST00000638153.1,ENSE00003795580.1,1
372,X,48575283,48575560,+,X_48575283_48575560,207,48575167,48575283,+,ENSG00000102317.18,RBM3,ENST00000376759.8,ENSE00003606332.1,1
373,X,48575283,48576313,+,X_48575283_48576313,15,48575167,48575283,+,ENSG00000102317.18,RBM3,ENST00000376759.8,ENSE00003606332.1,1


In [27]:
juncs_coords_unique = juncs_coords_unique[juncs_coords_unique.junction_id.isin(clusters.junction_id)]
juncs_coords_unique

  df = pd.concat([plus, minus])
  df = pd.concat([plus, minus])


Unnamed: 0,Chromosome,Start,End,Strand,junction_id,gene_id
0,1,101237099,101238821,+,1_101237099_101238821,ENSG00000170989.10
1,1,101238425,101238821,+,1_101238425_101238821,ENSG00000170989.10
2,1,111477428,111477650,+,1_111477428_111477650,ENSG00000143110.12
3,1,111477428,111477725,+,1_111477428_111477725,ENSG00000143110.12
4,1,120824146,120824865,+,1_120824146_120824865,ENSG00000273136.8
...,...,...,...,...,...,...
370,X,40599741,40600761,+,X_40599741_40600761,ENSG00000182220.15
371,X,40599741,40600781,+,X_40599741_40600781,ENSG00000182220.15
372,X,48575283,48575560,+,X_48575283_48575560,ENSG00000102317.18
373,X,48575283,48576313,+,X_48575283_48576313,ENSG00000102317.18
