In [1]:
import pickle
from pathlib import Path

import gseapy as gp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from gseapy import barplot, dotplot


## Gene sets from databases

In [2]:
# Empty placeholder; GENE_SETS is rebound to the curated dictionary in a later cell.
GENE_SETS = dict()
# Library names that gseapy lists for the 'Mouse' organism.
MOUSE_DB = gp.get_library_name(organism="Mouse")
MOUSE_DB

['ARCHS4_Cell-lines',
 'ARCHS4_IDG_Coexp',
 'ARCHS4_Kinases_Coexp',
 'ARCHS4_TFs_Coexp',
 'ARCHS4_Tissues',
 'Achilles_fitness_decrease',
 'Achilles_fitness_increase',
 'Aging_Perturbations_from_GEO_down',
 'Aging_Perturbations_from_GEO_up',
 'Allen_Brain_Atlas_10x_scRNA_2021',
 'Allen_Brain_Atlas_down',
 'Allen_Brain_Atlas_up',
 'Azimuth_2023',
 'Azimuth_Cell_Types_2021',
 'BioCarta_2013',
 'BioCarta_2015',
 'BioCarta_2016',
 'BioPlanet_2019',
 'BioPlex_2017',
 'CCLE_Proteomics_2020',
 'CORUM',
 'COVID-19_Related_Gene_Sets',
 'COVID-19_Related_Gene_Sets_2021',
 'Cancer_Cell_Line_Encyclopedia',
 'CellMarker_2024',
 'CellMarker_Augmented_2021',
 'ChEA_2013',
 'ChEA_2015',
 'ChEA_2016',
 'ChEA_2022',
 'Chromosome_Location',
 'Chromosome_Location_hg19',
 'ClinVar_2019',
 'DGIdb_Drug_Targets_2024',
 'DSigDB',
 'Data_Acquisition_Method_Most_Popular_Genes',
 'DepMap_CRISPR_GeneDependency_CellLines_2023',
 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
 'DepMap_WG_CRISPR_Screens_Sanger_Cell

### WikiPathways_2019_Mouse: PluriNetWork WP1763

In [3]:
# Fetch the 'WikiPathways_2019_Mouse' library through gseapy; individual
# pathway terms are looked up by name (wp[term]) further down.
wp = gp.get_library(
    name="WikiPathways_2019_Mouse",
    organism="Mouse",
)

In [4]:
# WikiPathways term names, grouped by theme (one term per line).

# Embryonic stem cell / pluripotency-related signalling.
ESC_PATHWAYS = [
    "Hedgehog Signaling Pathway WP116",
    "PluriNetWork WP1763",
    "ESC Pluripotency Pathways WP339",
    "Wnt Signaling Pathway WP403",
]

# Cardiac / muscle-related terms.
CARDIO_PATHWAYS = [
    "Striated Muscle Contraction WP216",
    "Heart Development WP2067",
    "Delta-Notch Signaling Pathway WP265",
]

# Mesenchymal transition.
MES_PATHWAYS = [
    "Epithelial Mesenchymal Transition WP306",
]

### Add early mesodermal lineage markers
Source: https://www.rndsystems.com/research-area/early-mesodermal-lineage-markers

In [5]:
# Early mesodermal lineage markers as gene symbols.
# Several listed proteins (homo-/heterodimers) map onto the same gene, so each
# symbol is written once and the proteins it stands for are named alongside.
MES = {
    'Cdh2',    # N-Cadherin

    # Early Mesodermal Cell Secreted Factors
    'Inhba',   # Activin A; component of Activin AB and Activin AC; INHBA
    'Inhbb',   # Activin B; component of Activin AB; INHBB
    'Inhbc',   # Activin C; component of Activin AC

    # BMP and Other Activin Receptor Activators/Inhibitors
    'Bmp2',    # BMP-2; component of the BMP-2/-4, -2/-6, -2/-7 dimers; BMP-2a (no specific `Bmp2a` in mouse)
    'Bmp4',    # BMP-4; component of BMP-2/BMP-4
    'Bmp6',    # BMP-6; component of the BMP-2/BMP-6 heterodimer
    'Bmp7',    # BMP-7; component of the BMP-2/BMP-7 heterodimer
    'Cfc1',    # Cryptic
    'Fabp4',   # FABP4/A-FABP
    'Fgf5',    # FGF-5
    'Gdf1',    # GDF-1
    'Gdf3',    # GDF-3
    'Inhbe',   # INHBE
    'Nodal',   # Nodal
    'Tgfb1',   # TGF-beta 1
    'Tgfb2',   # TGF-beta 2
    'Tgfb3',   # TGF-beta 3
    'Wnt3a',   # Wnt-3a
    'Wnt8a',   # Wnt-8a
    'Wnt3',    # Wnt-3

    # ER Proteins in Early Mesodermal Cells
    'Mesd',    # MESDC2
    'Ncln',    # Nicalin

    # Transcription Factors in Early Mesodermal Cells
    'T',       # Brachyury
    'Eomes',   # EOMES
    'Foxc1',   # FoxC1
    'Foxf1',   # FoxF1
    'Gsc',     # Goosecoid
    'Hand1',   # HAND1
    'Mixl1',   # MIXL1
    'Snai2',   # Slug
    'Snai1',   # Snail
    'Tbx6',    # TBX6
    'Twist1',  # Twist-1
    'Twist2',  # Twist-2
}

### Add early endodermal lineage markers
Source: https://www.rndsystems.com/research-area/early-endodermal-lineage-markers

In [6]:
# Early endodermal lineage markers, stored directly as sets of gene symbols.

# Definitive endoderm (DE)
DE = {
    'Cldn6',   # Claudin-6
    'Krt19',   # Cytokeratin 19
    'Eomes',   # EOMES
    'Fabp1',   # FABP1/L-FABP
    'Fabp2',   # FABP2/I-FABP
    'Gata4',   # GATA-4
    'Gsc',     # Goosecoid
    'Foxa1',   # FoxA1/HNF-3 alpha
    'Foxa2',   # FoxA2/HNF-3 beta
    'Sox7',    # SOX7
    'Sox17',   # SOX17
    'Hnf1b',   # HNF-1 beta/TCF-2
}

# Primitive endoderm (PrE)
PrE = {
    'Afp',     # alpha-Fetoprotein/AFP
    'Ctnnb1',  # beta-Catenin
    'Gata4',   # GATA-4
    'Gata6',   # GATA-6
    'Gdf1',    # GDF-1
    'Gdf3',    # GDF-3
    'Hnf4a',   # HNF-4 alpha/NR2A1
    'Mixl1',   # MIXL1
    'Sall4',   # SALL4
    'Sox7',    # SOX7
    'Sox17',   # SOX17
}

In [7]:
# Gene symbols present in the expression matrix ('GENE' column), as a set.
# Built in a single step so the name never refers to a Series in between.
mouse_genes = set(pd.read_csv('../data/matrices/RNA_FPKM2500.csv')['GENE'])

# Every curated marker must occur in the matrix. On failure the assertion
# message lists the offending symbols instead of failing silently.
for label, markers in (('MES', MES), ('PrE', PrE), ('DE', DE)):
    missing = sorted(markers.difference(mouse_genes))
    assert len(missing) == 0, (
        f'{label}: {len(missing)} marker(s) not found in the GENE column: {missing}'
    )

In [8]:
# Final term -> genes mapping. WikiPathways terms are pulled from `wp` by name;
# the curated marker sets are inserted under their own labels. Key order is
# the order written here. 'PluriNetWork WP1763' is currently left out.
GENE_SETS = {
    **{
        term: wp[term]
        for term in (
            'Hedgehog Signaling Pathway WP116',
            'ESC Pluripotency Pathways WP339',
            'Wnt Signaling Pathway WP403',
        )
    },
    'Early Mesodermal Lineage Markers': MES,
    **{
        term: wp[term]
        for term in (
            'Heart Development WP2067',
            'Delta-Notch Signaling Pathway WP265',
            'Neural Crest Differentiation WP2074',
            'Striated Muscle Contraction WP216',
        )
    },
    'Definitive Endoderm Markers': DE,
}

### Map gene names from human to mouse style (manual case conversion: AAA123A -> Aaa123a)

In [9]:
def hum2mouse(gene):
    """Convert a gene symbol to mouse-style casing (``AAA123A`` -> ``Aaa123a``).

    The first character is kept exactly as given; every following character
    is lower-cased. This is a purely textual conversion, no orthology lookup
    is performed.

    Parameters
    ----------
    gene : str
        Gene symbol to convert. May be empty.

    Returns
    -------
    str
        The converted symbol; an empty input yields an empty string.
    """
    # Slicing (instead of gene[0]) keeps an empty symbol from raising IndexError.
    return gene[:1] + gene[1:].lower()


In [10]:
# Same terms as GENE_SETS, with every symbol passed through hum2mouse and the
# results collected into a list per term.
GENE_SETS_low = {
    term: [hum2mouse(symbol) for symbol in members]
    for term, members in GENE_SETS.items()
}

In [11]:
# Create the output directory from Python instead of shelling out, so the cell
# does not depend on a POSIX `mkdir` and stays valid when exported as a script.
Path('../data/gene_sets').mkdir(parents=True, exist_ok=True)

# Serialise the case-normalised gene sets for later use.
with open('../data/gene_sets/gene_sets_dict.pkl', 'wb') as f:
    pickle.dump(GENE_SETS_low, f)

# Bivalent/active promoters in ESCs (Gonzalez)

In [12]:
# Promoter gene lists: the 'gene' column of each workbook, collapsed to a set.
active_table = pd.read_excel('../../00_RegionAnnotation/Gonzalez/active_promoters.xlsx')
ACT = set(active_table['gene'])

bivalent_table = pd.read_excel('../../00_RegionAnnotation/Gonzalez/bivalent_promoters.xlsx')
BIV = set(bivalent_table['gene'])

GONZALEZ = dict(Active=ACT, Bivalent=BIV)

# Written next to the other gene-set pickles; the target directory must already exist.
with open('../data/gene_sets/gonzalez_dict.pkl', 'wb') as handle:
    pickle.dump(GONZALEZ, handle)