In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.stats.multitest as smm
import numpy as np
#from bioinfokit import analys, visuz
from matplotlib.patches import Circle
from matplotlib.patheffects import withStroke
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
import matplotlib

# Palette shared by every figure in this notebook (seaborn's "colorblind" set).
color_palette = sns.color_palette("colorblind")

# Apply the global plotting theme: white background, Arial, default font scale.
sns.set_theme(
    style="white",
    palette=color_palette,
    font="Arial",
    font_scale=1.0,
)

import os


In [2]:
# Widen the pandas display limits so long result tables are shown
# without being truncated.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 25
# Optional tweaks, currently disabled:
#pd.set_option('display.max_colwidth', 300)
#pd.set_option('format.precision', 2)




In [4]:
# Load the results table from the R DESeq analysis (gene-level, LFC shrinkage).
# The CSV's unnamed first column is relabelled GENEID.
results_genes = pd.read_csv("gene_resAnno_only_female.csv").rename(
    columns={"Unnamed: 0": "GENEID"}
)

results_genes  # author's note: all GENEIDs are unique

Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,GENENAME,SYMBOL
0,ENSMUSG00000000001,6486.858231,0.319575,0.084545,3.779926,0.000157,0.004735,Gnai3,Gnai3
1,ENSMUSG00000000003,0.000000,,,,,,Pbsn,Pbsn
2,ENSMUSG00000000028,179.606989,0.496340,0.209930,2.364310,0.018064,0.128084,Cdc45,Cdc45
3,ENSMUSG00000000037,33.474291,-0.258078,0.615465,-0.419322,0.674981,,Scml2,Scml2
4,ENSMUSG00000000049,2.997353,2.376936,1.844183,1.288883,0.197439,,Apoh,Apoh
...,...,...,...,...,...,...,...,...,...
35677,ENSMUSG00000118670,2.865875,-0.266241,1.296495,-0.205355,0.837295,,,
35678,ENSMUSG00000118671,251.083854,0.196333,0.251715,0.779979,0.435403,0.727331,EPPK1,EPPK1
35679,ENSMUSG00000118672,0.000000,,,,,,Muc4,Muc4
35680,ENSMUSG00001074846,0.000000,,,,,,,


In [5]:
# Formatting: add label/plot helper columns to the results table.
# NOTE: `data` is an alias of `results_genes`, not a copy, so the columns
# added below are also visible through `results_genes`.
data = results_genes

# Adjusted p-value as a one-decimal scientific-notation string (NaN -> "nan").
data["padj_rounded"] = data["padj"].map(lambda padj: format(padj, ".1e"))

# Two-line label: gene name on top, formatted adjusted p-value underneath.
data["GENENAME_padj"] = data["GENENAME"] + "\np=" + data["padj_rounded"]

# Negative log10 of the adjusted p-value.
data["-log10_padj"] = np.negative(np.log10(data["padj"]))

In [6]:
# Cut-offs that define a differentially expressed gene:
# adjusted p-value below `padj_thr` and |log2 fold change| above `lfc_thr`.
padj_thr = 0.05
lfc_thr = 1

# Keep only genes that have a fold change, an adjusted p-value and a name.
data_all = data.dropna(subset=["log2FoldChange", "padj", "GENENAME"])

# Use `padj_thr` here instead of a second hard-coded 0.05, so that editing the
# threshold above actually changes the selection.
is_significant = data_all["padj"] < padj_thr
data_all_up = data_all[(data_all["log2FoldChange"] > lfc_thr) & is_significant]
data_all_down = data_all[(data_all["log2FoldChange"] < -lfc_thr) & is_significant]

In [7]:
# Stack the up-regulated genes on top of the down-regulated ones, each subset
# ordered from highest to lowest log2 fold change.
ranked_subsets = [
    subset.sort_values(by="log2FoldChange", ascending=False)
    for subset in (data_all_up, data_all_down)
]
data_DE_all_sortL2FC = pd.concat(ranked_subsets)

# Number of differentially expressed genes, followed by the table itself.
print(data_DE_all_sortL2FC.shape[0])
data_DE_all_sortL2FC

52


Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,GENENAME,SYMBOL,padj_rounded,GENENAME_padj,-log10_padj
7672,ENSMUSG00000030228,295.80341,2.157526,0.532874,4.048851,5.146978e-05,0.002093235,Pik3c2g,Pik3c2g,0.0021,Pik3c2g\np=2.1e-03,2.679182
9074,ENSMUSG00000032776,64.235516,2.09138,0.54992,3.803064,0.0001429173,0.004376582,Mctp2,Mctp2,0.0044,Mctp2\np=4.4e-03,2.358865
2959,ENSMUSG00000021214,5813.879494,2.008155,0.499583,4.019661,5.828192e-05,0.002259628,Akr1c18,Akr1c18,0.0023,Akr1c18\np=2.3e-03,2.645963
3388,ENSMUSG00000021983,41.360301,1.994092,0.66632,2.992696,0.00276525,0.03721639,Atp8a2,Atp8a2,0.037,Atp8a2\np=3.7e-02,1.429266
3089,ENSMUSG00000021453,248.881327,1.799815,0.297282,6.054226,1.410942e-09,4.259029e-07,Gadd45g,Gadd45g,4.3e-07,Gadd45g\np=4.3e-07,6.370689
15194,ENSMUSG00000053007,4644.332243,1.637491,0.175025,9.355749,8.300911e-21,3.507965e-17,Creb5,Creb5,3.5e-17,Creb5\np=3.5e-17,16.454945
8565,ENSMUSG00000031872,158.7693,1.517528,0.315108,4.815893,1.465432e-06,0.0001317641,Bean1,Bean1,0.00013,Bean1\np=1.3e-04,3.880203
33028,ENSMUSG00000113395,61.554082,1.48249,0.354932,4.176826,2.956047e-05,0.001348085,Gm48365,Gm48365,0.0013,Gm48365\np=1.3e-03,2.870283
4141,ENSMUSG00000023868,1014.548081,1.455722,0.402095,3.620343,0.000294213,0.007627878,Pde10a,Pde10a,0.0076,Pde10a\np=7.6e-03,2.117596
6334,ENSMUSG00000027894,52.771663,1.453162,0.46646,3.115299,0.001837582,0.02786707,Slc6a17,Slc6a17,0.028,Slc6a17\np=2.8e-02,1.554909


In [8]:
# Export the DE genes, ordered from lowest to highest log2 fold change,
# keeping only the name, fold change and formatted adjusted p-value.
export_columns = ["GENENAME", "log2FoldChange", "padj_rounded"]
deg_table = (
    data_DE_all_sortL2FC
    .sort_values(by="log2FoldChange", ascending=True)[export_columns]
    .round(3)  # rounds the numeric fold change; the string columns are untouched
)
deg_table.to_excel("data_l2fc_p_DEG_only_female_231212.xlsx")

In [10]:
# Gene names to inspect individually, assembled per family prefix:
#   Cacna1a ... Cacna1i, Cacna2d1-2, Cacnb1-4
genes_of_interest = (
    [f"Cacna1{letter}" for letter in "abcdefghi"]
    + [f"Cacna2d{number}" for number in (1, 2)]
    + [f"Cacnb{number}" for number in (1, 2, 3, 4)]
)

In [11]:
# Pull the rows for the genes of interest out of the full results table,
# then show them alphabetically by gene name.
is_goi = data["GENENAME"].isin(genes_of_interest)
data_goi = data.loc[is_goi]
data_goi.sort_values(by="GENENAME", ascending=True)

Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,GENENAME,SYMBOL,padj_rounded,GENENAME_padj,-log10_padj
9696,ENSMUSG00000034656,59.315295,-0.704451,0.455928,-1.545093,0.1223237,0.3976461,Cacna1a,Cacna1a,0.4,Cacna1a\np=4.0e-01,0.400503
673,ENSMUSG00000004113,45.643195,0.799207,0.406709,1.965056,0.0494077,0.2316534,Cacna1b,Cacna1b,0.23,Cacna1b\np=2.3e-01,0.635161
14851,ENSMUSG00000051331,3815.011557,0.200981,0.121228,1.657872,0.09734339,0.3479333,Cacna1c,Cacna1c,0.35,Cacna1c\np=3.5e-01,0.458504
1674,ENSMUSG00000015968,256.024213,0.620129,0.188,3.29856,0.0009718218,0.01817157,Cacna1d,Cacna1d,0.018,Cacna1d\np=1.8e-02,1.740608
672,ENSMUSG00000004110,9.562112,0.596897,0.856821,0.696641,0.4860274,,Cacna1e,Cacna1e,,Cacna1e\np=nan,
8133,ENSMUSG00000031142,2.967424,-1.218559,1.589666,-0.766551,0.4433487,,Cacna1f,Cacna1f,,Cacna1f\np=nan,
2768,ENSMUSG00000020866,23.656461,-0.435087,0.520216,-0.836359,0.4029532,,Cacna1g,Cacna1g,,Cacna1g\np=nan,
4278,ENSMUSG00000024112,2000.688488,-1.682962,0.11876,-14.171075,1.383663e-45,8.771041e-42,Cacna1h,Cacna1h,8.8e-42,Cacna1h\np=8.8e-42,41.056949
3640,ENSMUSG00000022416,25.848093,0.204966,0.638909,0.320806,0.7483576,,Cacna1i,Cacna1i,,Cacna1i\np=nan,
11696,ENSMUSG00000040118,7865.999032,0.480032,0.113218,4.2399,2.236196e-05,0.001065808,Cacna2d1,Cacna2d1,0.0011,Cacna2d1\np=1.1e-03,2.972321


In [12]:
# Read the KEGG gene/pathway annotation and keep the members of pathway
# mmu04925 (presumably the aldosterone pathway, going by the variable names).
kegg_columns = ["GeneID", "PathwayID", "Symbol", "ENSEMBL", "Genename"]
kegg_list = pd.read_csv("Kegg_list_mmu.csv", usecols=kegg_columns)
kegg_aldo = kegg_list.loc[kegg_list["PathwayID"] == "path:mmu04925"]

# Attach the DESeq results to every pathway gene via its Ensembl ID.
# how="left" keeps pathway genes that have no result row; validate="1:1"
# makes pandas raise if an ID is duplicated on either side.
data_aldo_genes = kegg_aldo.merge(
    results_genes,
    how="left",
    left_on="ENSEMBL",
    right_on="GENEID",
    validate="1:1",
)

# Row count, then the table ordered from lowest to highest log2 fold change.
aldo_by_lfc = data_aldo_genes.sort_values(by="log2FoldChange", ascending=True)
print(len(aldo_by_lfc))
aldo_by_lfc

102


Unnamed: 0,GeneID,PathwayID,Symbol,ENSEMBL,Genename,GENEID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,GENENAME,SYMBOL,padj_rounded,GENENAME_padj,-log10_padj
57,18227,path:mmu04925,Nr4a2,ENSMUSG00000026826,"nuclear receptor subfamily 4, group A, member 2",ENSMUSG00000026826,901.539706,-2.363813,1.133981,-2.084526,0.03711236,0.1988112,Nr4a2,Nr4a2,0.2,Nr4a2\np=2.0e-01,0.701559
93,58226,path:mmu04925,Cacna1h,ENSMUSG00000024112,"calcium channel, voltage-dependent, T type, al...",ENSMUSG00000024112,2000.688488,-1.682962,0.11876,-14.171075,1.383663e-45,8.771041e-42,Cacna1h,Cacna1h,8.8e-42,Cacna1h\np=8.8e-42,41.056949
41,15370,path:mmu04925,Nr4a1,ENSMUSG00000023034,"nuclear receptor subfamily 4, group A, member 1",ENSMUSG00000023034,5313.643828,-1.260267,1.114075,-1.131222,0.2579616,0.5753217,Nr4a1,Nr4a1,0.58,Nr4a1\np=5.8e-01,0.240089
92,54652,path:mmu04925,Cacna1f,ENSMUSG00000031142,"calcium channel, voltage-dependent, alpha 1F s...",ENSMUSG00000031142,2.967424,-1.218559,1.589666,-0.766551,0.4433487,,Cacna1f,Cacna1f,,Cacna1f\np=nan,
85,269060,path:mmu04925,Dagla,ENSMUSG00000035735,"diacylglycerol lipase, alpha",ENSMUSG00000035735,65.765206,-1.09905,0.302819,-3.629395,0.0002840865,0.007441422,Dagla,Dagla,0.0074,Dagla\np=7.4e-03,2.128344
25,12292,path:mmu04925,Cacna1s,ENSMUSG00000026407,"calcium channel, voltage-dependent, L type, al...",ENSMUSG00000026407,3.606254,-1.084701,1.213527,-0.893842,0.3714064,,Cacna1s,Cacna1s,,Cacna1s\np=nan,
62,18752,path:mmu04925,Prkcg,ENSMUSG00000078816,"protein kinase C, gamma",ENSMUSG00000078816,5.168329,-1.048316,1.192,-0.87946,0.379152,,Prkcg,Prkcg,,Prkcg\np=nan,
51,16521,path:mmu04925,Kcnj5,ENSMUSG00000032034,"potassium inwardly-rectifying channel, subfami...",ENSMUSG00000032034,45.683308,-0.867625,0.382738,-2.266889,0.02339703,0.1515726,Kcnj5,Kcnj5,0.15,Kcnj5\np=1.5e-01,0.819379
94,67821,path:mmu04925,Atp1b4,ENSMUSG00000016327,"ATPase, (Na+)/K+ transporting, beta 4 polypeptide",ENSMUSG00000016327,0.162192,-0.782954,4.080473,-0.191878,0.8478375,,Atp1b4,Atp1b4,,Atp1b4\np=nan,
69,18976,path:mmu04925,Pomc,ENSMUSG00000020660,pro-opiomelanocortin-alpha,ENSMUSG00000020660,5.121558,-0.696873,0.994334,-0.700844,0.4834003,,Pomc,Pomc,,Pomc\np=nan,


In [14]:
# Display the merged pathway table as produced by the merge (no sorting applied).
data_aldo_genes

Unnamed: 0,GeneID,PathwayID,Symbol,ENSEMBL,Genename,GENEID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,GENENAME,SYMBOL,padj_rounded,GENENAME_padj,-log10_padj
0,100043456,path:mmu04925,Hsd3b8,ENSMUSG00000095388,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",ENSMUSG00000095388,0.0,,,,,,Gm10681,Gm10681,,Gm10681\np=nan,
1,100043461,path:mmu04925,Hsd3b9,ENSMUSG00000090817,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",ENSMUSG00000090817,0.0,,,,,,Gm4450,Gm4450,,Gm4450\np=nan,
2,101540,path:mmu04925,Prkd2,ENSMUSG00000041187,protein kinase D2,ENSMUSG00000041187,239.025892,0.001991,0.212465,0.00937,0.9925238,0.9980344,Prkd2,Prkd2,1.0,Prkd2\np=1.0e+00,0.000855
3,104110,path:mmu04925,Adcy4,ENSMUSG00000022220,adenylate cyclase 4,ENSMUSG00000022220,157.549892,-0.115343,0.218021,-0.529046,0.5967737,0.8352464,Adcy4,Adcy4,0.84,Adcy4\np=8.4e-01,0.078185
4,104111,path:mmu04925,Adcy3,ENSMUSG00000020654,adenylate cyclase 3,ENSMUSG00000020654,771.103325,-0.061749,0.113386,-0.544593,0.5860331,0.8282924,Adcy3,Adcy3,0.83,Adcy3\np=8.3e-01,0.081816
5,108058,path:mmu04925,Camk2d,ENSMUSG00000053819,calcium/calmodulin-dependent protein kinase II...,ENSMUSG00000053819,4659.073264,0.32222,0.086392,3.729727,0.0001916873,0.00547345,Camk2d,Camk2d,0.0055,Camk2d\np=5.5e-03,2.261739
6,109305,path:mmu04925,Orai1,ENSMUSG00000049686,ORAI calcium release-activated calcium modulat...,ENSMUSG00000049686,151.411743,-0.257652,0.208723,-1.234422,0.2170459,0.5308078,Orai1,Orai1,0.53,Orai1\np=5.3e-01,0.275063
7,11512,path:mmu04925,Adcy6,ENSMUSG00000022994,adenylate cyclase 6,ENSMUSG00000022994,962.38789,0.14767,0.113638,1.299482,0.1937785,0.5004622,Adcy6,Adcy6,0.5,Adcy6\np=5.0e-01,0.300629
8,11513,path:mmu04925,Adcy7,ENSMUSG00000031659,adenylate cyclase 7,ENSMUSG00000031659,690.529756,-0.07404,0.136668,-0.541753,0.5879889,0.8286306,Adcy7,Adcy7,0.83,Adcy7\np=8.3e-01,0.081639
9,11514,path:mmu04925,Adcy8,ENSMUSG00000022376,adenylate cyclase 8,ENSMUSG00000022376,70.962579,0.682274,0.572679,1.191371,0.2335078,0.5495474,Adcy8,Adcy8,0.55,Adcy8\np=5.5e-01,0.259995


In [13]:
# Export the pathway genes ordered by adjusted p-value (ties broken by log2
# fold change), writing only the symbol, fold change and adjusted p-value.
aldo_export = data_aldo_genes.sort_values(by=["padj", "log2FoldChange"], ascending=True)
aldo_export.to_excel(
    "kegg_aldo_mmu04925_genes_only_female_231212.xlsx",
    columns=["Symbol", "log2FoldChange", "padj"],
)