## scQuint Data Preprocessing

In [1]:
## Processing pipeline for the cell x SJ matrix output of STARsolo, for use with scQuint. Scripts by Gonzalo Benegas

In [2]:
!date; hostname

Thu Sep  5 12:42:48 PDT 2024
c4-dev2


In [3]:
!echo $CONDA_PREFIX

/c4/home/derek/miniconda3/envs/scquint_4


In [4]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scanpy as sc
import seaborn as sns
from umap import UMAP

from scquint.data import add_gene_annotation, group_introns, load_adata_from_starsolo, filter_singletons

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
output_dir='/c4/home/derek/data1/derek/data_scSLR/prenatal_brain/'

### Load Gene Expression Anndata

In [6]:
# we will extract obs and X_umap
adata_exp = anndata.read_h5ad(output_dir+"scANVI_label.h5ad")
adata_exp

AnnData object with n_obs × n_vars = 74327 × 36385
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple'
    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'C_scANVI_simple_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    obsp: 'connectivities', 'distances'

### Load Splice Junction Matrices into scQuint

In [7]:
input_dir = "/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/STAR_outs/"

In [8]:
%%time
adatas_spl = {}
samples = ["GW16_1", "GW16_2", "GW17_1", "GW21_1", "GW21_2", "GW23_1"]
for sample in samples:
    print(sample)
    try:
        adata_spl_sample = load_adata_from_starsolo(input_dir+f"/{sample}/StarOut/Solo.out/SJ/raw/", var_filename="SJ.out.tab")
    except:
        adata_spl_sample = load_adata_from_starsolo(input_dir+f"/{sample}/StarOut/Solo.out/SJ/raw/", var_filename="SJ.out.tab")
    adatas_spl[sample] = adata_spl_sample

GW16_1
Filtering out undefined strand.
GW16_2
Filtering out undefined strand.
GW17_1
Filtering out undefined strand.
GW21_1
Filtering out undefined strand.
GW21_2
Filtering out undefined strand.
GW23_1
Filtering out undefined strand.
CPU times: user 12min 58s, sys: 28.2 s, total: 13min 26s
Wall time: 13min 52s


In [9]:
adata_spl = anndata.concat(adatas_spl, index_unique="-", merge="same")
adata_spl

AnnData object with n_obs × n_vars = 40769280 × 254570
    var: 'chromosome', 'start', 'end', 'strand', 'intron_motif'

In [10]:
# solo_doublet = pd.read_csv('./solo_preds.csv', index_col=0)

# adata_spl = adata_spl[adata_spl.obs_names.isin(solo_doublet.index)]
# adata_spl.obs['solo_doublet'] = solo_doublet 

In [11]:
##fix obs names
# Swap each sample label inside the observation names for its numeric code,
# applying the replacements one after another in the listed order.
sample_to_code = {
    'GW16_1': '0',
    'GW16_2': '1',
    'GW17_1': '2',
    'GW21_1': '3',
    'GW21_2': '4',
    'GW23_1': '5',
}
renamed_obs = adata_spl.obs_names
for sample_label, code in sample_to_code.items():
    renamed_obs = renamed_obs.str.replace(sample_label, code)
adata_spl.obs_names = renamed_obs


In [12]:
# Keep only the cells that are present in the expression object, in its order.
# .copy() turns the subset into a standalone AnnData, so the assignments below
# modify a real object instead of writing to a view of the concatenated matrix.
adata_spl = adata_spl[adata_exp.obs.index.values].copy()
# Bring the expression-side cell metadata and the UMAP coordinates across.
adata_spl.obs = pd.concat([adata_spl.obs,adata_exp.obs],axis=1)
adata_spl.obsm["X_umap"] = adata_exp.obsm["X_umap"]

In [13]:
# adata_spl = adata_spl[adata_spl.obs.solo_doublet == 'singlet']

In [14]:
adata_spl

AnnData object with n_obs × n_vars = 74327 × 254570
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple'
    var: 'chromosome', 'start', 'end', 'strand', 'intron_motif'
    obsm: 'X_umap'

In [15]:
# MAJIQ_input_dir ='/c4/home/derek/data1//derek/data_scSLR/prenatal_brain/majiq/voila_modulize_out_PSI_0.01/'
# os.listdir(MAJIQ_input_dir)

In [16]:
# cass = pd.read_csv(MAJIQ_input_dir+'cassette.tsv',sep='\t',comment='#')

In [17]:
# cass[cass['merged_complete_median_reads'] > 100]['junction_coord'].nunique()

### Filter splice junctions

In [18]:
%%time
##get junction overlap from all alignments from alignnment SJ.out.tab files

SJ_table = {}

samples = ["GW16_1", "GW16_2", "GW17_1", "GW21_1", "GW21_2", "GW23_1"]
for sample in samples:
    print(sample)
    try:
        SJ_table_sample = pd.read_csv(input_dir+f"/{sample}/StarOut/SJ.out.tab",sep='\t',header=None,
                            names=['chromosome','start','end','strand','intron_motif','annotated','coverage','multi_map_coverage','max_ovehang'])
    except:
        SJ_table_sample = pd.read_csv(input_dir+f"/{sample}/StarOut/SJ.out.tab_new",sep='\t',header=None,
                            names=['chromosome','start','end','strand','intron_motif','annotated','coverage','multi_map_coverage','max_ovehang'])
        
    
    SJ_table[sample] = SJ_table_sample



GW16_1
GW16_2
GW17_1
GW21_1
GW21_2
GW23_1
CPU times: user 12.3 s, sys: 1.3 s, total: 13.6 s
Wall time: 15.8 s


In [19]:
SJ_table_df = pd.concat(SJ_table.values(), ignore_index=True)

In [20]:
# Collapse the stacked per-sample rows into one row per junction key:
# coverage is summed, the other two statistics keep their maximum.
# NOTE(review): 'annotated' is part of the key, so the same coordinates can
# yield two rows (annotated 0 and 1) that are aggregated separately - confirm
# this is intended.
junction_key = ['chromosome','start','end','strand','intron_motif','annotated']
SJ_table_df = SJ_table_df.groupby(junction_key).agg(
    coverage=('coverage', 'sum'),
    multi_map_coverage=('multi_map_coverage', 'max'),
    max_ovehang=('max_ovehang', 'max'),
)


In [21]:
SJ_table_df=SJ_table_df.reset_index()

In [22]:
SJ_table_df_bac = SJ_table_df.copy()

In [23]:
# Keep only junction rows whose aggregated coverage is at least 100.
SJ_table_df = SJ_table_df.loc[SJ_table_df['coverage'].ge(100)]

In [24]:
SJ_table_df

Unnamed: 0,chromosome,start,end,strand,intron_motif,annotated,coverage,multi_map_coverage,max_ovehang
170,GL000008.2,196215,198475,2,2,1,100,47,62
434,GL000194.1,93657,93718,1,1,1,594,94,59
453,GL000194.1,112851,114985,2,2,1,279,9,53
481,GL000195.1,5786,51859,1,1,1,127,10,12
502,GL000195.1,30998,31555,1,3,1,160,0,18
...,...,...,...,...,...,...,...,...,...
8367241,chrY,25726514,25727293,2,4,1,359,138,68
8367535,chrY,26346833,26407744,1,1,1,114,29,47
8367585,chrY,26409251,26409698,1,1,0,120,0,50
8367586,chrY,26409251,26409698,1,1,1,197,8,52


In [25]:
SJ_table_df.index = SJ_table_df['chromosome']+':'+SJ_table_df['start'].astype(str)+'-'+SJ_table_df['end'].astype(str)

In [26]:
adata_spl = adata_spl[:,adata_spl.var_names.isin(SJ_table_df.index)]

In [27]:
print(adata_spl.var.head())
print(len(adata_spl.var))

                  chromosome  start     end strand  intron_motif
chr1:14830-14969        chr1  14830   14969      -             2
chr1:16718-187230       chr1  16718  187230      +             1
chr1:17056-17232        chr1  17056   17232      -             2
chr1:17056-17914        chr1  17056   17914      -             2
chr1:17369-17605        chr1  17369   17605      -             2
214469


In [28]:
# Drop junctions seen in fewer than 10 cells (modifies adata_spl in place).
sc.pp.filter_genes(adata_spl, min_cells=10)
# Restrict to chr1..chr22 plus chrX and chrY.
chromosomes = [f"chr{label}" for label in [*range(1, 23), "X", "Y"]]
on_listed_chromosome = adata_spl.var.chromosome.isin(chromosomes)
adata_spl = adata_spl[:, on_listed_chromosome]

  adata.var['n_cells'] = number


In [29]:
print(len(adata_spl.var))

186051


In [33]:
pwd

'/nowakowskilab/data1/derek/scSLR/notebooks/fig3'

### Annotate junctions with gene names

In [30]:
%%time
##add annotations

gtf_path = '/c4/home/derek/data1/derek/reference/human_hp3_reference/genes/genes.gtf'
filter_unique_gene=True
    

gtf = pd.read_csv(
    gtf_path,
    sep="\t",
    header=None,
    comment="#",
    names=[
        "chromosome",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ],
)

gtf = gtf[gtf.feature == "exon"]
gtf["gene_id"] = gtf.attribute.str.extract(r'gene_id "([^;]*)";')
gtf["gene_name"] = gtf.attribute.str.extract(r'gene_name "([^;]*)";')

gtf["transcript_id"] = gtf.attribute.str.extract(r'transcript_id "([^;]*)";')
#gtf.chromosome = "chr" + gtf.chromosome.astype(str)

gene_id_name = gtf[["gene_id", "gene_name"]].drop_duplicates()

exon_starts = (
    gtf[["chromosome", "start", "gene_id"]].copy().rename(columns={"start": "pos"})
)
exon_starts.pos -= 1
exon_ends = (
    gtf[["chromosome", "end", "gene_id"]].copy().rename(columns={"end": "pos"})
)
exon_ends.pos += 1
exon_boundaries = pd.concat(
    [exon_starts, exon_ends], ignore_index=True
).drop_duplicates()

genes_by_exon_boundary = exon_boundaries.groupby(
    ["chromosome", "pos"]
).gene_id.unique()

adata_spl.var = (
    adata_spl.var.merge(
        genes_by_exon_boundary,
        how="left",
        left_on=["chromosome", "start"],
        right_on=["chromosome", "pos"],
    )
    .rename(columns={"gene_id": "gene_id_start"})
    .set_index(adata_spl.var.index)
)
adata_spl.var = (
    adata_spl.var.merge(
        genes_by_exon_boundary,
        how="left",
        left_on=["chromosome", "end"],
        right_on=["chromosome", "pos"],
    )
    .rename(columns={"gene_id": "gene_id_end"})
    .set_index(adata_spl.var.index)
)

def fill_na_with_empty_array(val):
    return val if isinstance(val, np.ndarray) else np.array([])

adata_spl.var.gene_id_start = adata_spl.var.gene_id_start.apply(fill_na_with_empty_array)
adata_spl.var.gene_id_end = adata_spl.var.gene_id_end.apply(fill_na_with_empty_array)

adata_spl.var["gene_id_list"] = adata_spl.var.apply(
    lambda row: np.unique(np.concatenate([row.gene_id_start, row.gene_id_end])),
    axis=1,
)
adata_spl.var["n_genes"] = adata_spl.var.gene_id_list.apply(len)
adata_spl.var.gene_id_list = adata_spl.var.gene_id_list.apply(
    lambda x: ",".join(x.tolist())
)
adata_spl.var.gene_id_start = adata_spl.var.gene_id_start.apply(
    lambda x: ",".join(x.tolist())
)
adata_spl.var.gene_id_end = adata_spl.var.gene_id_end.apply(
    lambda x: ",".join(x.tolist())
)





if filter_unique_gene:
    print("Filtering to introns associated to 1 and only 1 gene.")
    adata_spl = adata_spl[:, adata_spl.var.n_genes == 1]
    adata_spl.var["gene_id"] = adata_spl.var.gene_id_list
    adata_spl.var.drop(columns=["gene_id_list",], inplace=True)
    adata_spl.var = adata_spl.var.merge(gene_id_name, how="left", on="gene_id").set_index(
        adata_spl.var.index
    )
    adata_spl.var.index = adata_spl.var.gene_name.astype(str) + "_" + adata_spl.var.index.astype(str)




Filtering to introns associated to 1 and only 1 gene.




CPU times: user 3min 33s, sys: 17.5 s, total: 3min 51s
Wall time: 3min 56s


In [31]:
#adata_spl.write_h5ad(output_dir+'adata_spl_all.h5ad')

### group introns by 3' and 5' splice sites

In [34]:
def group_introns(adata, by="three_prime", filter_unique_gene_per_group=True):
    """Assign every intron in ``adata.var`` to an intron group.

    Defining this function rebinds the name ``group_introns`` that the import
    cell takes from ``scquint.data``. Unlike a singleton filter, groups of
    size 1 are kept here (that step is left commented out below).

    Parameters
    ----------
    adata
        Object exposing a ``var`` DataFrame with the columns ``chromosome``,
        ``start``, ``end``, ``strand``, ``gene_id`` and ``gene_name``, and
        supporting ``adata[:, mask]`` and ``.copy()``. Columns are added to
        ``adata.var`` of the object passed in, so pass a copy if the input
        must stay untouched.
    by
        ``"three_prime"``: group label uses ``end`` on the ``+`` strand and
        ``start`` otherwise. ``"five_prime"``: the reverse. ``"gene"``: the
        group is the ``gene_id``.
    filter_unique_gene_per_group
        When true, keep only groups whose introns all carry a single
        ``gene_id`` and prefix the group label with ``gene_name``.

    Returns
    -------
    The (possibly subset) object with ``intron_group`` and
    ``intron_group_size`` in ``var`` and, when filtering,
    ``n_genes_per_intron_group`` as well.

    Raises
    ------
    ValueError
        If ``by`` is not one of the supported values.
    """
    var = adata.var
    if by in ("three_prime", "five_prime"):
        plus_strand = var["strand"].astype(str) == "+"
        # three_prime -> end on '+', start otherwise; five_prime -> the reverse.
        take_end = plus_strand if by == "three_prime" else ~plus_strand
        site = var["end"].astype(str).where(take_end, var["start"].astype(str))
        groups = (
            var["chromosome"].astype(str) + "_" + site + "_" + var["strand"].astype(str)
        )
    elif by == "gene":
        groups = var["gene_id"]
    else:
        raise ValueError(f"Grouping by {by} not yet supported.")
    adata.var["intron_group"] = groups

    # Size of each group, mapped back by label. This does not depend on what
    # column name value_counts().to_frame() produces, which differs between
    # pandas versions.
    group_sizes = adata.var["intron_group"].value_counts()
    adata.var["intron_group_size"] = adata.var["intron_group"].map(group_sizes)

    # print("Filtering singletons.")
    # adata = adata[:, adata.var.intron_group_size > 1]

    if filter_unique_gene_per_group:
        print("Filtering intron groups associated with more than 1 gene.")
        genes_per_group = adata.var.groupby("intron_group")["gene_id"].nunique()
        adata.var["n_genes_per_intron_group"] = adata.var["intron_group"].map(genes_per_group)
        # .copy() so the relabelling below writes to a real object, not a view.
        adata = adata[:, adata.var["n_genes_per_intron_group"] == 1].copy()
        adata.var["intron_group"] = (
            adata.var["gene_name"].astype(str) + "_" + adata.var["intron_group"].astype(str)
        )

    return adata

In [35]:
adata_spl_3p = group_introns(adata_spl.copy(), by="three_prime")
adata_spl_3p.var["grouping"] = "three_prime"
adata_spl_3p

Filtering intron groups associated with more than 1 gene.


  self[name] = value


AnnData object with n_obs × n_vars = 74327 × 169541
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple'
    var: 'chromosome', 'start', 'end', 'strand', 'intron_motif', 'n_cells', 'gene_id_start', 'gene_id_end', 'n_genes', 'gene_id', 'gene_name', 'intron_group', 'intron_group_size', 'n_genes_per_intron_group', 'grouping'
    obsm: 'X_umap'

In [37]:
adata_spl_3p.write_h5ad(output_dir+'3prime_grouped_spl.h5ad')

In [38]:
adata_spl_3p.var.to_csv(output_dir+'3prime_grouped_introns.csv')

In [36]:
adata_spl_5p = group_introns(adata_spl.copy(), by="five_prime")
adata_spl_5p.var["grouping"] = "five_prime"
adata_spl_5p

Filtering intron groups associated with more than 1 gene.


  self[name] = value


AnnData object with n_obs × n_vars = 74327 × 169541
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple'
    var: 'chromosome', 'start', 'end', 'strand', 'intron_motif', 'n_cells', 'gene_id_start', 'gene_id_end', 'n_genes', 'gene_id', 'gene_name', 'intron_group', 'intron_group_size', 'n_genes_per_intron_group', 'grouping'
    obsm: 'X_umap'

In [40]:
adata_spl_5p.write_h5ad(output_dir+'5prime_grouped_spl.h5ad')

In [41]:
#adata_spl_5p.var.to_csv(output_dir+'5prime_grouped_introns.csv')

## combine 3p and 5p groupings

#### keep groups with greater cell coverage

In [None]:
%%time

var_concat = pd.concat([adata_spl_3p.var, adata_spl_5p.var])

direction_dict = {}


for intron in var_concat[var_concat.index.duplicated()].index.unique():

    pair = var_concat[var_concat.index == intron]
    
    count_dict={}
    for intron_group in pair.intron_group:
        n_cells = var_concat[var_concat.intron_group == intron_group].n_cells.sum()
        direction = pair[pair.intron_group == intron_group].grouping[0]

        count_dict[intron_group] = n_cells
        direction_dict[intron_group] = direction
    
    
    #keep introns that exist in directional group with higher cell counts
    keep = max(count_dict,key=count_dict.get)
    direction = direction_dict[keep]
    
    
    direction_dict[intron] = direction
    
    
#     if direction == 'five_prime':
#         direction_dict[intron] = 'five_prime'
        
#     elif direction == 'three_prime':
#         direction_dict[intron] = 'three_prime'

var_concat = pd.concat([adata_spl_3p.var, adata_spl_5p.var])
keep_5p = var_concat[~var_concat.index.duplicated()].query('grouping == "five_prime"').index.values


adata_spl__ = anndata.concat(
    [adata_spl_3p, adata_spl_5p[:, keep_5p]],
    axis=1,
    merge="same",
)
adata_spl__

In [None]:
    
# Rows of introns that occur under both groupings (every occurrence).
df=var_concat[var_concat.index.duplicated(False)]#.index.unique()

# Keep, for each shared intron, only the row whose grouping matches the one
# recorded for it in direction_dict. A single boolean mask replaces growing a
# DataFrame row by row, and also works when there are no shared introns.
chosen = df.grouping.to_numpy() == df.index.map(direction_dict).to_numpy()
df_ = df[chosen]

# Introns that occur under exactly one grouping, split by that grouping.
keep__5p_nondups = var_concat[~var_concat.index.duplicated(keep=False)].query('grouping == "five_prime"').index.values

keep__3p_nondups = var_concat[~var_concat.index.duplicated(keep=False)].query('grouping == "three_prime"').index.values

keep_3p_all = np.concatenate([keep__3p_nondups, df_[df_.grouping == 'three_prime'].index])
keep_5p_all = np.concatenate([keep__5p_nondups, df_[df_.grouping == 'five_prime'].index])

# Each intron now appears once, under the grouping selected for it.
adata_spl_ = anndata.concat(
    [adata_spl_3p[:,keep_3p_all], adata_spl_5p[:, keep_5p_all]],
    axis=1,
    merge="same",
)
adata_spl_

In [None]:
# NOTE(review): this path is relative, so the file lands in the current working
# directory, whereas the 3'/5' grouped objects are written under output_dir - confirm.
adata_spl_.write_h5ad('singlets_spl.h5ad')

In [None]:
pwd

In [None]:
adata_spl_

In [50]:
# sc.read_h5ad('singlets_spl_.h5ad')

AnnData object with n_obs × n_vars = 74327 × 83187
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'chromosome', 'start', 'end', 'strand', 'intron_motif', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'gene_id_start', 'gene_id_end', 'n_genes', 'gene_id', 'gene_name', 'intron_group', 'intron_group_size', 'n_genes_per_intron_group', 'grouping'
    obsm: 'X_umap'

#### graph based AS structure definition

In [137]:
## Because of the definition of AS structures, it can be proven that a given sub-AS-junction can only be included in up to 
## two AS structures (i.e., SI can only be 1 or 2).

##For the A5SS or A3SS patterns, the representative graphs are asymmetric and 
## are composed of one AS structure with SI value equal to 1 for all sub-AS-junctions

## For the SE pattern, the representative graphs are symmetric, composed of two AS structures, each containing two sub-AS-junctions 
## with SI values equal to 1 and 2, respectively

## For MXE with 𝑛 mutually exclusive exons, the representative graph is composed of one pair of AS structures that each has n sub-AS-junctions
## with SI values all equal to 1. For the MXE pattern, JUM utilizes extra quality control steps, including that coordinates of MXEs
## meet the condition ai < bi < a(i + 1), where i = 1, …, n


In [138]:
#define AS-structure
# An AS structure is a set of splice junctions that share the same start site or the same ending site, 
# with each splice junction in an AS structure defined as a sub-AS-junction

# AS structures are scquint splicing groups

# define SI for each sub-AS-junction as the number of AS structures that share the specific sub-AS-junction.

In [139]:
import networkx as nx

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [140]:
var_concat = pd.concat([adata_spl_3p.var, adata_spl_5p.var]).sort_values(by='gene_name')

In [141]:
#define SI for each sub-AS-junction as the number of AS structures that share the specific sub-AS-junction

var_concat['SI'] = var_concat.index.map(var_concat.index.value_counts().to_dict())

In [142]:
# Build a graph with one edge per row of var_concat: nodes are the 'start' and
# 'end' values, and the listed columns are stored as edge attributes.
# NOTE(review): nodes are bare coordinates - 'chromosome' is only an edge
# attribute - so equal positions on different chromosomes would share a node;
# confirm this cannot link unrelated junctions.
# NOTE(review): var_concat holds two rows for introns present in both
# groupings; presumably only one set of attributes survives per start/end
# pair - verify which 'intron_group' is kept.
G = nx.from_pandas_edgelist(var_concat,
                        source='start',
                        target='end',
                        edge_attr=['chromosome','gene_name','n_cells','intron_group','SI']
                       )

In [143]:
var_concat

Unnamed: 0,chromosome,start,end,strand,intron_motif,n_cells,gene_id_start,gene_id_end,n_genes,gene_id,gene_name,intron_group,intron_group_size,n_genes_per_intron_group,grouping,SI
AADAT_chr4:170073346-170078508,chr4,170073346,170078508,-,2,236,ENSG00000109576,ENSG00000109576,1,ENSG00000109576,AADAT,AADAT_chr4_170078508_-,2,1,five_prime,1
AADAT_chr4:170075462-170078508,chr4,170075462,170078508,-,2,106,,ENSG00000109576,1,ENSG00000109576,AADAT,AADAT_chr4_170078508_-,2,1,five_prime,1
AAMDC_chr11:77821242-77842478,chr11,77821242,77842478,+,1,614,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77842478_+,2,1,three_prime,2
AAMDC_chr11:77821242-77842478,chr11,77821242,77842478,+,1,614,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77821242_+,2,1,five_prime,2
AAMDC_chr11:77841061-77842478,chr11,77841061,77842478,+,1,119,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77842478_+,2,1,three_prime,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSWIM7_chr17:15993779-15999505,chr17,15993779,15999505,-,2,134,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15993779_-,3,1,three_prime,1
ZSWIM7_chr17:15977093-15978043,chr17,15977093,15978043,-,2,131,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15977093_-,2,1,three_prime,2
ZSWIM7_chr17:15977093-15977578,chr17,15977093,15977578,-,2,164,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15977093_-,2,1,three_prime,1
ZSWIM7_chr17:15977093-15978043,chr17,15977093,15978043,-,2,131,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15978043_-,2,1,five_prime,2


In [None]:
#G_ = G.subgraph(set(var_concat.head(100).tail(11)[['start','end']].to_numpy().ravel().tolist()))

In [None]:
%%time

counter=0
nxFrame = pd.DataFrame()


for c in  nx.connected_components(G):
    
    #print(c)
    frame = nx.to_pandas_edgelist(G.subgraph(c))
    frame['event'] = counter
    nxFrame = nxFrame.append(frame)
    
    counter +=1
    

In [None]:
# Order the two endpoints of every edge: 'a' is the smaller, 'b' the larger.
endpoints = nxFrame[['source','target']]
nxFrame['a'] = endpoints.min(axis=1)
nxFrame['b'] = endpoints.max(axis=1)

# Keep the ordered endpoints and the edge attributes; 'source'/'target' are dropped.
kept_columns = ['a','b','chromosome','gene_name','intron_group','SI','n_cells','event']
nxFrame = nxFrame[kept_columns]



In [None]:
nxFrame = nxFrame.sort_values(by=['chromosome','a','b'])

In [102]:
nxFrame

Unnamed: 0,a,b,chromosome,gene_name,intron_group,SI,n_cells,event
1,944801,945041,chr1,NOC2L,NOC2L_chr1_944801_-,1,77,4042
0,944801,945056,chr1,NOC2L,NOC2L_chr1_944801_-,1,244,4042
1,999433,999525,chr1,HES4,HES4_chr1_999433_-,1,1790,2644
0,999433,999532,chr1,HES4,HES4_chr1_999433_-,1,63,2644
2,999614,999691,chr1,HES4,HES4_chr1_999614_-,1,724,2643
...,...,...,...,...,...,...,...,...
0,12702056,12707759,chrY,USP9Y,USP9Y_chrY_12702056_+,1,151,6935
2,12702056,12708633,chrY,USP9Y,USP9Y_chrY_12702056_+,1,192,6935
1,12702056,12709387,chrY,USP9Y,USP9Y_chrY_12709387_+,2,1134,6935
3,12707882,12709387,chrY,USP9Y,USP9Y_chrY_12709387_+,1,92,6935


In [202]:
#for each unique event 

In [104]:
df = pd.read_csv('sig_dfc_introns_PT_annotated.csv',index_col=0)

In [105]:
df_ = dict(zip(df['intron_group'], df['event_type']))


In [106]:
nxFrame_ = nxFrame.reset_index(drop=True)

In [107]:
nxFrame_[nxFrame_.gene_name == 'GRIA2']

Unnamed: 0,a,b,chromosome,gene_name,intron_group,SI,n_cells,event
16878,157220145,157220907,chr4,GRIA2,GRIA2_chr4_157220907_+,1,80,2519
16879,157220367,157221666,chr4,GRIA2,GRIA2_chr4_157221666_+,1,50,2519
16880,157220726,157220907,chr4,GRIA2,GRIA2_chr4_157220907_+,2,83,2519
16881,157220726,157221666,chr4,GRIA2,GRIA2_chr4_157221666_+,2,536,2519
16882,157221131,157221666,chr4,GRIA2,GRIA2_chr4_157221666_+,1,4423,2519
16883,157221543,157221666,chr4,GRIA2,GRIA2_chr4_157221666_+,1,140,2519
16884,157332987,157333090,chr4,GRIA2,GRIA2_chr4_157332987_+,1,169,2520
16885,157332987,157333248,chr4,GRIA2,GRIA2_chr4_157332987_+,1,4085,2520
16886,157336572,157341263,chr4,GRIA2,GRIA2_chr4_157341263_+,1,55,2521
16887,157336748,157341263,chr4,GRIA2,GRIA2_chr4_157341263_+,1,508,2521


In [208]:
df[df['event_type'] == 'MXE'].iloc[:,:20].sort_values(by='gene_name')

Unnamed: 0,chromosome,start,end,strand,intron_motif,n_cells,gene_id_start,gene_id_end,n_genes,gene_id,gene_name,size,intron_group,intron_group_size,n_genes_per_intron_group,grouping,coeff,abs_coeff,ivl,cluster
500,chr2,100008933,100011496,-,2,1771,ENSG00000144218,ENSG00000144218,1,ENSG00000144218,AFF3,2563,AFF3_chr2_100008933_-,2,1,three_prime,0.319742,0.319742,343,#5BBCD6
1931,chr4,113500512,113509637,-,2,25,ENSG00000145349,ENSG00000145349,1,ENSG00000145349,CAMK2D,9125,CAMK2D_chr4_113509637_-,2,1,five_prime,-0.915905,0.915905,1978,#00A08A
170,chrX,70492597,70495407,+,1,197,ENSG00000082458,ENSG00000082458,1,ENSG00000082458,DLG3,2810,DLG3_chrX_70492597_+,2,1,five_prime,0.629177,0.629177,1740,#5BBCD6
1720,chrX,70492597,70493375,+,1,67,ENSG00000082458,ENSG00000082458,1,ENSG00000082458,DLG3,778,DLG3_chrX_70492597_+,2,1,five_prime,-0.629177,0.629177,1739,#F2AD00
388,chr3,33297726,33358966,+,1,447,ENSG00000153558,ENSG00000153558,1,ENSG00000153558,FBXL2,61240,FBXL2_chr3_33297726_+,2,1,five_prime,0.233373,0.233373,1956,#5BBCD6
1437,chrX,136207962,136209870,+,1,28,ENSG00000022267,ENSG00000022267,1,ENSG00000022267,FHL1,1908,FHL1_chrX_136209870_+,3,1,three_prime,-0.233489,0.233489,1346,#F2AD00
295,chr16,56336861,56351383,+,1,635,ENSG00000087258,ENSG00000087258,1,ENSG00000087258,GNAO1,14522,GNAO1_chr16_56336861_+,2,1,five_prime,0.396543,0.396543,1497,#5BBCD6
1684,chr16,56336861,56340833,+,1,43,ENSG00000087258,ENSG00000087258,1,ENSG00000087258,GNAO1,3972,GNAO1_chr16_56336861_+,2,1,five_prime,-0.396543,0.396543,1496,#F2AD00
457,chr4,157360144,157361537,+,1,295,ENSG00000120251,ENSG00000120251,1,ENSG00000120251,GRIA2,1393,GRIA2_chr4_157360144_+,5,1,five_prime,0.300084,0.300084,1980,#5BBCD6
2065,chr4,157360144,157361009,+,1,327,ENSG00000120251,ENSG00000120251,1,ENSG00000120251,GRIA2,865,GRIA2_chr4_157360144_+,5,1,five_prime,0.319485,0.319485,1979,#00A08A


In [186]:
nxFrame_['annot'] = nxFrame_.intron_group.map(df_)

In [191]:
nxFrame_[nxFrame_['annot'] == 'MXE']

Unnamed: 0,a,b,chromosome,intron_group,SI,n_cells,event,annot
679,154170712,154171412,chr1,TPM3_chr1_154170712_-,1,469,3695,MXE
680,154170712,154172028,chr1,TPM3_chr1_154170712_-,1,1679,3695,MXE
681,154171489,154172907,chr1,TPM3_chr1_154172907_-,1,521,3694,MXE
682,154172105,154172907,chr1,TPM3_chr1_154172907_-,1,2669,3694,MXE
1937,73716363,73718612,chr11,RAB6A_chr11_73716363_-,1,849,2759,MXE
1938,73716363,73718784,chr11,RAB6A_chr11_73716363_-,1,178,2759,MXE
1939,73718719,73720845,chr11,RAB6A_chr11_73720845_-,1,789,2760,MXE
1940,73718891,73720845,chr11,RAB6A_chr11_73720845_-,1,188,2760,MXE
3972,63060940,63061197,chr15,TPM1_chr15_63060940_+,1,394,3691,MXE
3973,63060940,63061712,chr15,TPM1_chr15_63060940_+,2,872,3691,MXE


In [None]:
# NOTE(review): this indexes nxFrame_ with itself and has no recorded output;
# it looks like an unfinished filter expression - complete the condition or remove the cell.
nxFrame_[nxFrame_]

In [61]:
set(var_concat.head(6)[['start','end']].to_numpy().ravel().tolist())

{77821242, 77841018, 77841061, 77842478, 170073346, 170075462, 170078508}

In [82]:
var_concat.reset_index()[var_concat.reset_index().intron_group_size > 2]

Unnamed: 0,index,chromosome,start,end,strand,intron_motif,n_cells,gene_id_start,gene_id_end,n_genes,gene_id,gene_name,intron_group,intron_group_size,n_genes_per_intron_group,grouping
92,AC093484.3_chr17:16382511-16382567,chr17,16382511,16382567,-,2,283,ENSG00000265401,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
93,AC093484.3_chr17:16382112-16382567,chr17,16382112,16382567,-,2,4797,,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
94,AC093484.3_chr17:16382340-16382567,chr17,16382340,16382567,-,2,5294,,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
120,ACAT2_chr6:159762778-159762918,chr6,159762778,159762918,+,1,263,ENSG00000120437,ENSG00000120437,1,ENSG00000120437,ACAT2,ACAT2_chr6_159762918_+,3,1,three_prime
121,ACAT2_chr6:159762480-159762918,chr6,159762480,159762918,+,1,239,,ENSG00000120437,1,ENSG00000120437,ACAT2,ACAT2_chr6_159762918_+,3,1,three_prime
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14774,ZNF92_chr7:65374001-65387901,chr7,65374001,65387901,+,1,1271,ENSG00000146757,ENSG00000146757,1,ENSG00000146757,ZNF92,ZNF92_chr7_65374001_+,3,1,five_prime
14775,ZNF92_chr7:65374001-65398340,chr7,65374001,65398340,+,1,185,ENSG00000146757,ENSG00000146757,1,ENSG00000146757,ZNF92,ZNF92_chr7_65374001_+,3,1,five_prime
14800,ZSWIM7_chr17:15993779-15999659,chr17,15993779,15999659,-,2,346,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15993779_-,3,1,three_prime
14801,ZSWIM7_chr17:15993779-15999518,chr17,15993779,15999518,-,2,670,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15993779_-,3,1,three_prime


In [77]:
var_concat[var_concat.intron_group_size > 2]

Unnamed: 0,chromosome,start,end,strand,intron_motif,n_cells,gene_id_start,gene_id_end,n_genes,gene_id,gene_name,intron_group,intron_group_size,n_genes_per_intron_group,grouping
AC093484.3_chr17:16382511-16382567,chr17,16382511,16382567,-,2,283,ENSG00000265401,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
AC093484.3_chr17:16382112-16382567,chr17,16382112,16382567,-,2,4797,,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
AC093484.3_chr17:16382340-16382567,chr17,16382340,16382567,-,2,5294,,ENSG00000265401,1,ENSG00000265401,AC093484.3,AC093484.3_chr17_16382567_-,3,1,five_prime
ACAT2_chr6:159762778-159762918,chr6,159762778,159762918,+,1,263,ENSG00000120437,ENSG00000120437,1,ENSG00000120437,ACAT2,ACAT2_chr6_159762918_+,3,1,three_prime
ACAT2_chr6:159762480-159762918,chr6,159762480,159762918,+,1,239,,ENSG00000120437,1,ENSG00000120437,ACAT2,ACAT2_chr6_159762918_+,3,1,three_prime
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF92_chr7:65374001-65387901,chr7,65374001,65387901,+,1,1271,ENSG00000146757,ENSG00000146757,1,ENSG00000146757,ZNF92,ZNF92_chr7_65374001_+,3,1,five_prime
ZNF92_chr7:65374001-65398340,chr7,65374001,65398340,+,1,185,ENSG00000146757,ENSG00000146757,1,ENSG00000146757,ZNF92,ZNF92_chr7_65374001_+,3,1,five_prime
ZSWIM7_chr17:15993779-15999659,chr17,15993779,15999659,-,2,346,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15993779_-,3,1,three_prime
ZSWIM7_chr17:15993779-15999518,chr17,15993779,15999518,-,2,670,ENSG00000214941,ENSG00000214941,1,ENSG00000214941,ZSWIM7,ZSWIM7_chr17_15993779_-,3,1,three_prime


In [57]:
var_concat.head(20)

Unnamed: 0,chromosome,start,end,strand,intron_motif,n_cells,gene_id_start,gene_id_end,n_genes,gene_id,gene_name,intron_group,intron_group_size,n_genes_per_intron_group,grouping
AADAT_chr4:170073346-170078508,chr4,170073346,170078508,-,2,236,ENSG00000109576,ENSG00000109576,1,ENSG00000109576,AADAT,AADAT_chr4_170078508_-,2,1,five_prime
AADAT_chr4:170075462-170078508,chr4,170075462,170078508,-,2,106,,ENSG00000109576,1,ENSG00000109576,AADAT,AADAT_chr4_170078508_-,2,1,five_prime
AAMDC_chr11:77821242-77842478,chr11,77821242,77842478,+,1,614,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77842478_+,2,1,three_prime
AAMDC_chr11:77821242-77842478,chr11,77821242,77842478,+,1,614,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77821242_+,2,1,five_prime
AAMDC_chr11:77841061-77842478,chr11,77841061,77842478,+,1,119,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77842478_+,2,1,three_prime
AAMDC_chr11:77821242-77841018,chr11,77821242,77841018,+,1,195,ENSG00000087884,ENSG00000087884,1,ENSG00000087884,AAMDC,AAMDC_chr11_77821242_+,2,1,five_prime
AARSD1_chr17:42951895-42954875,chr17,42951895,42954875,-,2,183,ENSG00000266967,ENSG00000266967,1,ENSG00000266967,AARSD1,AARSD1_chr17_42954875_-,2,1,five_prime
AARSD1_chr17:42953779-42954875,chr17,42953779,42954875,-,2,1666,ENSG00000266967,ENSG00000266967,1,ENSG00000266967,AARSD1,AARSD1_chr17_42954875_-,2,1,five_prime
AARSD1_chr17:42964238-42964401,chr17,42964238,42964401,-,2,2290,ENSG00000266967,ENSG00000266967,1,ENSG00000266967,AARSD1,AARSD1_chr17_42964238_-,2,1,three_prime
AARSD1_chr17:42951895-42954875,chr17,42951895,42954875,-,2,183,ENSG00000266967,ENSG00000266967,1,ENSG00000266967,AARSD1,AARSD1_chr17_42951895_-,2,1,three_prime


In [40]:
!conda list

# packages in environment at /c4/home/derek/miniconda3/envs/scquint_4:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiofiles                  22.1.0             pyhd8ed1ab_0    conda-forge
aiosqlite                 0.18.0             pyhd8ed1ab_0    conda-forge
anndata                   0.8.0                    pypi_0    pypi
anyio                     3.6.2              pyhd8ed1ab_0    conda-forge
appdirs                   1.4.4                    pypi_0    pypi
argon2-cffi               21.3.0             pyhd8ed1ab_0    conda-forge
argon2-cffi-bindings      21.2.0          py310h5764c6d_3    conda-forge
asttokens                 2.2.1              pyhd8ed1ab_0    conda-forge
attrs                     22.2.0             pyh71513ae_0    conda-forge
babel                     2.12.1             pyhd8ed1ab_1    conda-forg