In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.stats.multitest as smm
import numpy as np
#from bioinfokit import analys, visuz
from matplotlib.patches import Circle
from matplotlib.patheffects import withStroke
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
import matplotlib

# Plot styling: pick seaborn's "colorblind" palette and register it, together
# with the font and background style, as the notebook-wide plotting theme.
color_palette = sns.color_palette("colorblind")

sns.set_theme(
    style="white",
    font="Arial",
    font_scale=1.0,
    palette=color_palette,
)

import os


In [2]:
# Limits for how many rows / columns pandas shows when a DataFrame is displayed.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 25
# Further display tweaks, currently disabled:
#pd.set_option('display.max_colwidth', 300)
#pd.set_option('format.precision', 2)




In [3]:
# Load the results table from the R DESeq analysis (gene-level, LFC shrinkage).
# The column pandas reads as "Unnamed: 0" is renamed to GENEID.
results_genes = (
    pd.read_csv("shrink_Anno_female_only.csv")
    .rename(columns={"Unnamed: 0": "GENEID"})
)

# A later cell merges on GENEID with validate="1:1"; check the uniqueness
# assumption here so a violation fails at load time with a clear message.
assert results_genes["GENEID"].is_unique, "duplicate GENEID values in results table"

results_genes

Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,pvalue,padj,GENENAME,SYMBOL,ENTREZID
0,ENSMUSG00000000001,6486.858231,0.283231,0.087065,0.000157,0.004735,Gnai3,Gnai3,14679.0
1,ENSMUSG00000000003,0.000000,,,,,Pbsn,Pbsn,54192.0
2,ENSMUSG00000000028,179.606989,0.265318,0.259028,0.018064,0.128084,Cdc45,Cdc45,12544.0
3,ENSMUSG00000000037,33.474291,-0.009928,0.122518,0.674981,,Scml2,Scml2,107815.0
4,ENSMUSG00000000049,2.997353,0.009978,0.124762,0.197439,,Apoh,Apoh,11818.0
...,...,...,...,...,...,...,...,...,...
35677,ENSMUSG00000118670,2.865875,-0.002393,0.123846,0.837295,,,,
35678,ENSMUSG00000118671,251.083854,0.040241,0.118604,0.435403,0.727331,EPPK1,EPPK1,
35679,ENSMUSG00000118672,0.000000,,,,,Muc4,Muc4,140474.0
35680,ENSMUSG00001074846,0.000000,,,,,,,68265.0


In [4]:
# Derived columns for labelling and plotting.
# NOTE: `data` is a second name for `results_genes`, not a copy, so every
# column added below also appears on `results_genes`.
data = results_genes

# Adjusted p-value as text in scientific notation with one decimal
# (e.g. "2.1e-03"); a missing padj is rendered as the text "nan".
data["padj_rounded"] = data["padj"].map(lambda value: f"{value:.1e}")

# Two-line label: gene name on the first line, "p=<padj>" on the second.
data["GENENAME_padj"] = data["GENENAME"] + "\np=" + data["padj_rounded"].astype(str)

data["-log10_padj"] = -np.log10(data["padj"])

In [6]:
# Cut-offs for calling a gene differentially expressed:
# adjusted p-value below padj_thr and |log2 fold change| above lfc_thr.
padj_thr = 0.05
lfc_thr = 1

# Keep only genes that have a fold change, an adjusted p-value and a gene name.
data_all = data.dropna(subset=["log2FoldChange", "padj", "GENENAME"])

# Both filters read the thresholds defined above, so changing padj_thr or
# lfc_thr changes the selection (previously 0.05 was hard-coded here).
data_all_up = data_all[(data_all["log2FoldChange"] > lfc_thr) & (data_all["padj"] < padj_thr)]
data_all_down = data_all[(data_all["log2FoldChange"] < -lfc_thr) & (data_all["padj"] < padj_thr)]

In [9]:
# Stack the two filtered sets: first data_all_up, then data_all_down, each
# ordered by descending log2 fold change.
data_DE_all_sortL2FC = pd.concat(
    [
        subset.sort_values(by="log2FoldChange", ascending=False)
        for subset in (data_all_up, data_all_down)
    ]
)

print(len(data_DE_all_sortL2FC))
data_DE_all_sortL2FC

30


Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,pvalue,padj,GENENAME,SYMBOL,ENTREZID,padj_rounded,GENENAME_padj,-log10_padj
7672,ENSMUSG00000030228,295.80341,1.855896,0.584566,5.146978e-05,0.002093235,Pik3c2g,Pik3c2g,18705.0,0.0021,Pik3c2g\np=2.1e-03,2.679182
9074,ENSMUSG00000032776,64.235516,1.740238,0.624347,0.0001429173,0.004376582,Mctp2,Mctp2,244049.0,0.0044,Mctp2\np=4.4e-03,2.358865
2959,ENSMUSG00000021214,5813.879494,1.736374,0.547065,5.828192e-05,0.002259628,Akr1c18,Akr1c18,105349.0,0.0023,Akr1c18\np=2.3e-03,2.645963
3089,ENSMUSG00000021453,248.881327,1.696914,0.306713,1.410942e-09,4.259029e-07,Gadd45g,Gadd45g,23882.0,4.3e-07,Gadd45g\np=4.3e-07,6.370689
15194,ENSMUSG00000053007,4644.332243,1.600735,0.177159,8.300911e-21,3.507965e-17,Creb5,Creb5,231991.0,3.5e-17,Creb5\np=3.5e-17,16.454945
9082,ENSMUSG00000032803,11430.532646,1.435071,0.093609,2.5670320000000002e-54,3.2544829999999995e-50,Cdv3,Cdv3,321022.0,3.3e-50,Cdv3\np=3.3e-50,49.487518
8565,ENSMUSG00000031872,158.7693,1.376045,0.332561,1.465432e-06,0.0001317641,Bean1,Bean1,65115.0,0.00013,Bean1\np=1.3e-04,3.880203
3388,ENSMUSG00000021983,41.360301,1.299773,1.002006,0.00276525,0.03721639,Atp8a2,Atp8a2,50769.0,0.037,Atp8a2\np=3.7e-02,1.429266
33028,ENSMUSG00000113395,61.554082,1.291985,0.383196,2.956047e-05,0.001348085,Gm48365,Gm48365,,0.0013,Gm48365\np=1.3e-03,2.870283
4141,ENSMUSG00000023868,1014.548081,1.191616,0.455417,0.000294213,0.007627878,Pde10a,Pde10a,23984.0,0.0076,Pde10a\np=7.6e-03,2.117596


In [10]:
# Write gene name, log2 fold change and formatted adjusted p-value to Excel,
# ordered by ascending log2 fold change; numeric columns are rounded to 3 decimals.
(
    data_DE_all_sortL2FC
    .sort_values(by="log2FoldChange", ascending=True)
    .loc[:, ["GENENAME", "log2FoldChange", "padj_rounded"]]
    .round(3)
    .to_excel("data_l2fc_p_DEG_only_female_231212.xlsx")
)

  data_DE_all_sortL2FC.sort_values(by=["log2FoldChange"], ascending=True)[["GENENAME",


In [11]:
# Gene names to pull out of the results table, grouped by name prefix.
genes_of_interest = [
    # Cacna1*
    "Cacna1a", "Cacna1b", "Cacna1c", "Cacna1d", "Cacna1e",
    "Cacna1f", "Cacna1g", "Cacna1h", "Cacna1i",
    # Cacna2d*
    "Cacna2d1", "Cacna2d2",
    # Cacnb*
    "Cacnb1", "Cacnb2", "Cacnb3", "Cacnb4",
]

In [12]:
# Rows whose GENENAME is listed in genes_of_interest, shown in alphabetical order.
data_goi = data.loc[data["GENENAME"].isin(genes_of_interest)]
data_goi.sort_values(by="GENENAME")

Unnamed: 0,GENEID,baseMean,log2FoldChange,lfcSE,pvalue,padj,GENENAME,SYMBOL,ENTREZID,padj_rounded,GENENAME_padj,-log10_padj
9696,ENSMUSG00000034656,59.315295,-0.051312,0.135061,0.1223237,0.3976461,Cacna1a,Cacna1a,12286.0,0.4,Cacna1a\np=4.0e-01,0.400503
673,ENSMUSG00000004113,45.643195,0.079853,0.155987,0.0494077,0.2316534,Cacna1b,Cacna1b,12287.0,0.23,Cacna1b\np=2.3e-01,0.635161
14851,ENSMUSG00000051331,3815.011557,0.123166,0.109874,0.09734339,0.3479333,Cacna1c,Cacna1c,12288.0,0.35,Cacna1c\np=3.5e-01,0.458504
1674,ENSMUSG00000015968,256.024213,0.493105,0.210393,0.0009718218,0.01817157,Cacna1d,Cacna1d,12289.0,0.018,Cacna1d\np=1.8e-02,1.740608
672,ENSMUSG00000004110,9.562112,0.012211,0.123999,0.4860274,,Cacna1e,Cacna1e,12290.0,,Cacna1e\np=nan,
8133,ENSMUSG00000031142,2.967424,-0.007046,0.124331,0.4433487,,Cacna1f,Cacna1f,54652.0,,Cacna1f\np=nan,
2768,ENSMUSG00000020866,23.656461,-0.023761,0.124163,0.4029532,,Cacna1g,Cacna1g,12291.0,,Cacna1g\np=nan,
4278,ENSMUSG00000024112,2000.688488,-1.665987,0.119378,1.383663e-45,8.771041e-42,Cacna1h,Cacna1h,58226.0,8.8e-42,Cacna1h\np=8.8e-42,41.056949
3640,ENSMUSG00000022416,25.848093,0.007496,0.122432,0.7483576,,Cacna1i,Cacna1i,239556.0,,Cacna1i\np=nan,
11696,ENSMUSG00000040118,7865.999032,0.426686,0.118528,2.236196e-05,0.001065808,Cacna2d1,Cacna2d1,12293.0,0.0011,Cacna2d1\np=1.1e-03,2.972321


In [13]:
# Write gene name, log2 fold change and formatted adjusted p-value of the
# selected genes to Excel; numeric columns are rounded to 3 decimals.
(
    data_goi
    .loc[:, ["GENENAME", "log2FoldChange", "padj_rounded"]]
    .round(3)
    .to_excel("data_l2fc_p_calcium_channels_only_female_231212.xlsx")
)

  data_goi[["GENENAME", "log2FoldChange", "padj_rounded"]].round(3).to_excel("data_l2fc_p_calcium_channels_only_female_231212.xlsx")


In [22]:
# Read the KEGG gene/pathway table, keeping only the columns used below.
kegg_list = pd.read_csv(
    "Kegg_list_mmu.csv",
    usecols=["GeneID", "PathwayID", "Symbol", "ENSEMBL", "Genename"],
)

# Genes annotated to KEGG pathway mmu04925.
kegg_aldo = kegg_list[kegg_list["PathwayID"] == "path:mmu04925"]

# Left join: every pathway gene is kept, with missing values where no row in
# results_genes has a matching GENEID. validate="1:1" raises if the join key
# is duplicated on either side.
data_aldo_genes = kegg_aldo.merge(
    results_genes,
    how="left",
    left_on="ENSEMBL",
    right_on="GENEID",
    validate="1:1",
)

# Row count of the merged table (sorting is not needed to count rows).
print(len(data_aldo_genes))
data_aldo_genes.sort_values(by=["log2FoldChange"], ascending=True)

102


Unnamed: 0,GeneID,PathwayID,Symbol,ENSEMBL,Genename,GENEID,baseMean,log2FoldChange,lfcSE,pvalue,padj,GENENAME,SYMBOL,ENTREZID,padj_rounded,GENENAME_padj,-log10_padj
93,58226,path:mmu04925,Cacna1h,ENSMUSG00000024112,"calcium channel, voltage-dependent, T type, al...",ENSMUSG00000024112,2000.688488,-1.665987,0.119378,1.383663e-45,8.771041e-42,Cacna1h,Cacna1h,58226.0,8.8e-42,Cacna1h\np=8.8e-42,41.056949
85,269060,path:mmu04925,Dagla,ENSMUSG00000035735,"diacylglycerol lipase, alpha",ENSMUSG00000035735,65.765206,-0.90282,0.338525,0.0002840865,0.007441422,Dagla,Dagla,269060.0,0.0074,Dagla\np=7.4e-03,2.128344
47,15497,path:mmu04925,Hsd3b6,ENSMUSG00000027869,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",ENSMUSG00000027869,755.756331,-0.45778,0.187055,0.0008135599,0.0158195,Hsd3b6,Hsd3b6,15497.0,0.016,Hsd3b6\np=1.6e-02,1.800807
67,18797,path:mmu04925,Plcb3,ENSMUSG00000024960,"phospholipase C, beta 3",ENSMUSG00000024960,1247.440152,-0.254912,0.102648,0.001898105,0.02847832,Plcb3,Plcb3,18797.0,0.028,Plcb3\np=2.8e-02,1.545486
38,14672,path:mmu04925,Gna11,ENSMUSG00000034781,"guanine nucleotide binding protein, alpha 11",ENSMUSG00000034781,2066.673464,-0.233123,0.087663,0.001405941,0.02343536,Gna11,Gna11,14672.0,0.023,Gna11\np=2.3e-02,1.630128
91,52163,path:mmu04925,Camk1,ENSMUSG00000030272,calcium/calmodulin-dependent protein kinase I,ENSMUSG00000030272,1212.418361,-0.171562,0.094744,0.01898799,0.1324874,Camk1,Camk1,52163.0,0.13,Camk1\np=1.3e-01,0.877825
37,13079,path:mmu04925,Cyp21a1,ENSMUSG00000024365,"cytochrome P450, family 21, subfamily a, polyp...",ENSMUSG00000024365,228217.747073,-0.147201,0.074824,0.02068337,0.1394666,Cyp21a1,Cyp21a1,13079.0,0.14,Cyp21a1\np=1.4e-01,0.85553
55,17200,path:mmu04925,Mc2r,ENSMUSG00000045569,melanocortin 2 receptor,ENSMUSG00000045569,6355.987068,-0.142694,0.07274,0.01939924,0.1339276,Mc2r,Mc2r,17200.0,0.13,Mc2r\np=1.3e-01,0.87313
80,231871,path:mmu04925,Daglb,ENSMUSG00000039206,"diacylglycerol lipase, beta",ENSMUSG00000039206,1057.103909,-0.120612,0.101924,0.09398634,0.3407568,Daglb,Daglb,231871.0,0.34,Daglb\np=3.4e-01,0.467555
51,16521,path:mmu04925,Kcnj5,ENSMUSG00000032034,"potassium inwardly-rectifying channel, subfami...",ENSMUSG00000032034,45.683308,-0.110779,0.193679,0.02339703,0.1515726,Kcnj5,Kcnj5,16521.0,0.15,Kcnj5\np=1.5e-01,0.819379


In [26]:
# Write the pathway genes to Excel, ordered by adjusted p-value and then by
# log2 fold change; only the three listed columns are exported.
(
    data_aldo_genes
    .sort_values(by=["padj", "log2FoldChange"], ascending=True)
    .to_excel(
        "kegg_aldo_mmu04925_genes_only_female_231212.xlsx",
        columns=["Symbol", "log2FoldChange", "padj"],
    )
)

  data_aldo_genes.sort_values(by=["padj", "log2FoldChange"], ascending=True).to_excel("kegg_aldo_mmu04925_genes_only_female_231212.xlsx",
