# DESeq2 in Python using rpy2

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd 
import numpy as np

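We will read the table; it should contain only count data. Because the input here is a topic-distribution table rather than raw counts, its values are shifted to be non-negative and rescaled to positive integers before being passed to DESeq2.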

In [None]:
# Location of the dataset and the topSBM hierarchy level to analyse.
directory = "../topics/datasets/gtex10/"
L = 3
filename = f"{directory}/topsbm/topsbm_level_{L}_topic-dist.csv"

# The second CSV column becomes the row index; the "i_doc" column is discarded.
# NOTE(review): rows are presumably samples and the remaining columns topics - confirm.
df = pd.read_csv(filename, index_col=1).drop("i_doc", axis=1)

# Centre every column on its mean, then shift it so its minimum is exactly 0.
df = df - df.mean(axis=0)
df = df + df.min(axis=0).abs()

# DESeq2 expects one row per feature and one column per sample, so transpose
# and name the row index "id" (it is turned into the gene-ID column below).
df = df.transpose()
df.index.name = "id"

# Rescale to positive integer pseudo-counts: round(value * 1000) + 1.
# Vectorised replacement for the deprecated DataFrame.applymap; both use
# round-half-to-even, so the resulting integers are the same.
df = (df * 1000).round().add(1).astype(int)

# Expose "id" as a regular column, as required by gene_column="id".
df = df.reset_index()
print(df.shape)
df.head()

Here we create a design matrix based on the samples in the count table. Note that the sample names have to be used as the index of the `pd.DataFrame`.

In [None]:
# Sample annotations: keep only the SMTS column, indexed by sample id (SAMPID).
# NOTE(review): SMTS is presumably the tissue annotation of each sample - confirm.
sample_df = pd.read_csv(f"{directory}/files.dat")
sample_df["file_name"] = sample_df.SAMPID
sample_df = sample_df.reindex(columns=["file_name", "SMTS"]).set_index("file_name")

# Restrict to samples that appear as columns of the count table. The explicit
# copy makes the column assignment below act on an independent frame instead
# of a slice (avoids SettingWithCopyWarning).
sample_df = sample_df[sample_df.index.isin(df.columns)].copy()

# Tissue of interest: the third distinct SMTS value, in order of appearance.
tissue = sample_df["SMTS"].unique()[2]
print(tissue)

# Two-level factor for the design: "A" = selected tissue, "B" = everything else.
# Exact comparison is used on purpose: a substring test would also label as "A"
# any tissue whose name merely contains the selected one.
sample_df["tissue"] = ["A" if d == tissue else "B" for d in sample_df["SMTS"]]

# Order the rows like the count-table columns; entries without an annotation
# (including the "id" column itself) become NaN and are dropped.
# NOTE(review): df is not restricted to the samples kept here - confirm that
# py_DESeq2 accepts count columns that are missing from the design matrix.
sample_df = sample_df.reindex(index=df.columns, columns=["tissue"]).dropna(how="any", axis=0)
sample_df.head()

Running DESeq2 is just like how it is run in `R`, but instead of using the gene ID as the row name of the count table, we can just tell the function which column holds the gene ID:

In [None]:
import sys
# Add ./diffexpr/ (relative to the kernel's working directory) to the module
# search path so that `diffexpr.py_deseq` can be imported in a later cell.
# NOTE(review): presumably a local checkout of the diffexpr repository - confirm.
sys.path.append("./diffexpr/")

In [None]:
#df=pd.DataFrame(columns=["id", "s1","s2","s3"], data=[["g1",1,1,1],["g2",2,3,3]])
#sample_df = pd.DataFrame(columns=["subtype"], index=["s1","s2","s3"], data=["A","B","B"])

In [None]:
from diffexpr.py_deseq import py_DESeq2

# Build the DESeq2 wrapper from the count table and the design matrix.
# `gene_column` names the column of `df` that holds the feature identifiers.
dds = py_DESeq2(
    count_matrix=df,
    design_matrix=sample_df,
    design_formula="~ tissue",
    gene_column="id",
)

# Run the analysis, then ask the wrapper to collect the results table.
dds.run_deseq()
dds.get_deseq_result()

# Keep a handle on the results table and preview its first rows.
res = dds.deseq_result
res.head()

In [None]:
import plotly.graph_objects as go

# Scatter of log2FoldChange against baseMean, one labelled marker per row of `res`.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=res["baseMean"],
        y=res["log2FoldChange"],
        mode="markers+text",
        marker={"size": 20, "color": "gray"},
        text=res.index,
        textposition="top center",
    )
)

# Axis titles and font sizes. `title.font` replaces the deprecated `titlefont`
# axis attribute, which recent Plotly releases no longer accept.
layout = {
    "xaxis": {
        "title": {"text": "baseMean", "font": {"size": 35}},
        "tickfont": {"size": 30},
    },
    "yaxis": {
        "title": {"text": "log2FoldChange", "font": {"size": 35}},
        "tickfont": {"size": 30},
    },
}
fig.update_layout(layout)
fig.show()

# Save a PDF copy next to the topSBM output (static export needs an image
# export backend to be installed alongside plotly).
fig.write_image(f"{directory}/topsbm/desqe2_level{L}_{tissue}.pdf")

In [None]:
# Print the first 15 characters of every non-missing entry of the "Topic 4" column.
# A plain loop is used because print() is called only for its side effect;
# wrapping it in list(map(...)) also echoed a list of None as the cell output.
topic_table = pd.read_csv(f'{directory}/topsbm/topsbm_level_{L}_topics.csv')
for g in topic_table["Topic 4"].dropna():
    print(g[:15])

In [None]:
# Display the DESeq2-normalized count table.
dds.normalized_count() #DESeq2 normalized count

In [None]:
# The order of the entries shown here is what the `coef` argument of
# lfcShrink (used in a later cell) refers to.
dds.comparison # show coefficients for GLM

In [None]:
# The previous cell (dds.comparison) shows the arrangement of the GLM
# coefficients, so we can pick the one to shrink by its position via `coef`.
# Here coef=2 is passed.
# NOTE(review): with the '~ tissue' design this presumably selects the tissue
# B-vs-A coefficient; an earlier version of this comment referred to
# 'sample_B_vs_A' and coef = 4, which does not match this call - confirm
# against the output of dds.comparison.
lfc_res = dds.lfcShrink(coef=2, method='apeglm')
lfc_res.head()

In [None]:
# Display the results table stored on the wrapper object.
dds.deseq_result