In [1]:
import gseapy as gp
import pandas as pd
import numpy as np
import swan_vis as swan

In [2]:
# Load the previously saved Swan graph from the parent directory.
swan_pickle = '../swan.p'
sg = swan.read(swan_pickle)

Read in graph from ../swan.p


In [10]:
# Keep only the text before the first '.' of every gene ID in the transcript
# table. Duplicates are kept, so a gene appears once per row of `sg.t_df`.
gid_parts = sg.t_df['gid'].str.split('.', expand=True)
gids = gid_parts[0].tolist()
gids[:10]

['ENSG00000227232',
 'ENSG00000227232',
 'ENSG00000227232',
 'ENSG00000225210',
 'ENSG00000279457',
 'ENSG00000279457',
 'ENSG00000279457',
 'ENSG00000177951',
 'ENSG00000244306',
 'ENSG00000226210']

In [4]:
# Fetch the Enrichr gene-set library names for human and preview the first 20.
organism = 'Human'
names = gp.get_library_name(organism)
names[:20]

['ARCHS4_Cell-lines',
 'ARCHS4_IDG_Coexp',
 'ARCHS4_Kinases_Coexp',
 'ARCHS4_TFs_Coexp',
 'ARCHS4_Tissues',
 'Achilles_fitness_decrease',
 'Achilles_fitness_increase',
 'Aging_Perturbations_from_GEO_down',
 'Aging_Perturbations_from_GEO_up',
 'Allen_Brain_Atlas_10x_scRNA_2021',
 'Allen_Brain_Atlas_down',
 'Allen_Brain_Atlas_up',
 'Azimuth_Cell_Types_2021',
 'BioCarta_2013',
 'BioCarta_2015',
 'BioCarta_2016',
 'BioPlanet_2019',
 'BioPlex_2017',
 'CCLE_Proteomics_2020',
 'CORUM']

In [5]:
# Enrichr libraries queried for every comparison. The rest of this notebook
# works on human data (organism='Human', hsapiens_gene_ensembl), so the KEGG
# library is the human one rather than KEGG_2019_Mouse.
dbs = [
    'GO_Biological_Process_2021',
    'GO_Cellular_Component_2021',
    'GO_Molecular_Function_2021',
    'KEGG_2019_Human',
]


In [6]:
# Open a BioMart client and show the Ensembl datasets whose description
# mentions 'Human'.
bm = gp.parser.Biomart()
datasets = bm.get_datasets(mart='ENSEMBL_MART_ENSEMBL')
is_human = datasets['Description'].str.contains('Human')
datasets[is_human]

Unnamed: 0,Name,Description
66,hsapiens_gene_ensembl,Human genes (GRCh38.p13)


In [11]:
# Map the Ensembl gene IDs collected in `gids` to gene names through BioMart.
dataset = 'hsapiens_gene_ensembl'
attrs = bm.get_attributes(dataset=dataset)  # not used below; kept for interactive inspection

# `gids` repeats the same gene ID many times, so de-duplicate (preserving
# order) before sending the IDs as a filter.
unique_gids = list(dict.fromkeys(gids))
results = bm.query(dataset=dataset,
                   attributes=['ensembl_gene_id', 'external_gene_name'],
                   filters={'ensembl_gene_id': unique_gids})

In [12]:
results.head()

Unnamed: 0,ensembl_gene_id,external_gene_name
0,ENSG00000210049,MT-TF
1,ENSG00000211459,MT-RNR1
2,ENSG00000210077,MT-TV
3,ENSG00000210082,MT-RNR2
4,ENSG00000209082,MT-TL1


In [13]:
# For every ordered pair of samples whose name contains 'pgp1', read that
# pair's iso_die_*.tsv table, keep the significant genes, convert their
# Ensembl IDs to gene names, and run Enrichr against the libraries in `dbs`.
# Results are written to '<c1>_<c2>_die_GO/'.

# NOTE(review): `tested` and `n_die` are initialised here but never updated in
# this cell -- confirm whether they were meant to de-duplicate pairs / hold
# per-pair counts, or are filled in elsewhere.
tested = []

conds = sg.adata.obs.loc[sg.adata.obs['sample'].str.contains('pgp1'), 'sample'].unique().tolist()
n_die = pd.DataFrame(0, index=conds, columns=conds)

for c1 in conds:
    for c2 in conds:
        if c1 == c2:
            continue

        # The table may have been saved under either ordering of the pair, so
        # fall back to the swapped name -- but only when the file is missing.
        # Any other read error (malformed file, interrupt, ...) propagates.
        fname = 'iso_die_{}_{}.tsv'.format(c1, c2)
        try:
            df = pd.read_csv(fname, sep='\t')
        except FileNotFoundError:
            fname = 'iso_die_{}_{}.tsv'.format(c2, c1)
            df = pd.read_csv(fname, sep='\t')

        pref = '{}_{}_die'.format(c1, c2)

        # filter: significance and effect-size thresholds, then drop rows
        # whose gene ID contains 'SIRV' or 'ERCC'
        df = df.loc[(df.adj_p_val < 0.05) & (df.dpi > 10)]
        df = df.loc[~df.gid.str.contains('SIRV|ERCC')]

        # GO enrichment
        # Drop the trailing '.<suffix>' from each gene ID. No squeeze() here:
        # squeezing a one-row result yields a scalar and breaks the .str
        # accessor.
        die_gids = (df.gid.str.rsplit('.', n=1).str[0]
                      .str.strip()
                      .dropna()
                      .unique()
                      .tolist())
        if not die_gids:
            print('No genes pass the filter for {}; skipping enrichment'.format(pref))
            continue

        gene_map = bm.query(dataset='hsapiens_gene_ensembl',
                            attributes=['ensembl_gene_id', 'external_gene_name'],
                            filters={'ensembl_gene_id': die_gids})
        gene_map = gene_map.loc[~gene_map.external_gene_name.isna()]
        gnames = gene_map.external_gene_name.str.strip().unique().tolist()
        if not gnames:
            print('No gene names found for {}; skipping enrichment'.format(pref))
            continue

        go = gp.enrichr(gene_list=gnames,
                        gene_sets=dbs,
                        organism='Human',
                        description=pref,
                        outdir='{}_GO'.format(pref),
                        cutoff=0.5)

