# Pseudobulk analysis of differential expression

In [1]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

pandas2ri.activate()
DESeq2 = importr("DESeq2")

from rpy2.robjects import default_converter
from rpy2.robjects.conversion import rpy2py
base = importr("base")

1: Setting LC_COLLATE failed, using "C" 
2: Setting LC_TIME failed, using "C" 
3: Setting LC_MESSAGES failed, using "C" 
4: Setting LC_MONETARY failed, using "C" 


# prep expression marix and gene list

In [27]:
def deseq(meta: pd.DataFrame, counts: pd.DataFrame, formula: str, ref: str, exp: str):
    # Calculate normalization factors
    dds = DESeq2.DESeqDataSetFromMatrix(
        countData=counts, colData=meta, design=ro.Formula(formula))
    
    dds = DESeq2.DESeq(dds) #parallel=True
    #estimateSizeFactors(dds, type = 'iterate')
    
    print(f"experiment_{exp}_vs_{ref}")
    resR = DESeq2.results(dds, name=f"experiment_{ref}_vs_{exp}")
    res = r_to_df(resR)
    res = res.sort_values("padj")
    res = res.loc[res["padj"] < 0.05]
    res = res.loc[res["log2FoldChange"].abs() > 0.5]

    return res


def r_to_df(r_df):
    with localconverter(default_converter + pandas2ri.converter):
        return rpy2py(base.as_data_frame(r_df))

In [12]:
counts4 = pd.read_csv('Calu3_4h_sum.csv')
counts4 = counts4.set_index('Unnamed: 0')
counts4

Unnamed: 0_level_0,Calu3-S2-4h-A,Calu3-S2-4h-B,Calu3-mock-4h-A,Calu3-mock-4h-B
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000146038,10460.0,5557.0,7117.0,10530.0
ENSG00000136536,1581.0,834.0,961.0,1324.0
ENSG00000116679,2695.0,1646.0,1969.0,2911.0
ENSG00000160360,90.0,92.0,77.0,141.0
ENSG00000165282,1252.0,776.0,911.0,1289.0
...,...,...,...,...
ENSG00000163596,147.0,53.0,81.0,113.0
ENSG00000224152,37.0,13.0,19.0,36.0
ENSG00000145388,464.0,234.0,297.0,444.0
ENSG00000038219,4126.0,3086.0,2865.0,3428.0


### compare calu3 mock with calu3 4 hours post infection

In [13]:
counts = counts4

design = pd.DataFrame({
    "experiment": [sample_name.split("-")[1] for sample_name in counts.columns],
    #"hpi": [sample_name.split("-")[2] for sample_name in counts.columns],
}, index=counts.columns)

design

Unnamed: 0,experiment
Calu3-S2-4h-A,S2
Calu3-S2-4h-B,S2
Calu3-mock-4h-A,mock
Calu3-mock-4h-B,mock


In [14]:
res = deseq(meta=design, counts=counts, formula="~experiment", ref="mock", exp="S2")
res

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing



experiment_S2_vs_mock


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ENSG00000140988,36943.635938,0.599186,0.099922,5.996564,2.015359e-09,0.000005
ENSG00000089157,31618.988306,0.611304,0.099873,6.120838,9.308423e-10,0.000005
ENSG00000187840,4844.628679,0.654048,0.108288,6.039900,1.542093e-09,0.000005
ENSG00000138061,4746.343772,-0.598298,0.108016,-5.538964,3.042669e-08,0.000061
ENSG00000173207,1912.589301,0.633724,0.120555,5.256740,1.466314e-07,0.000236
...,...,...,...,...,...,...
ENSG00000115020,542.930130,-0.522573,0.173558,-3.010947,2.604343e-03,0.044635
ENSG00000263503,532.686247,-0.660317,0.219582,-3.007158,2.637031e-03,0.044908
ENSG00000277778,460.633287,-0.545188,0.182154,-2.993000,2.762494e-03,0.046125
ENSG00000177169,516.022820,-0.520622,0.174931,-2.976151,2.918912e-03,0.047980


In [6]:
res.to_csv('Calu3_DEGs_4h.csv')

### compare calu3 mock versus calu3 8 hours post infection

In [18]:
counts8 = pd.read_csv('Calu3_8h_sum.csv')
counts8 = counts8.set_index('Unnamed: 0')
counts8[['Calu3-mock-4h-A', 'Calu3-mock-4h-B']] = counts4[['Calu3-mock-4h-A', 'Calu3-mock-4h-B']] # add controls
counts8

Unnamed: 0_level_0,Calu3-S2-8h-A,Calu3-S2-8h-B,Calu3-mock-4h-A,Calu3-mock-4h-B
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000146038,5267.0,9900.0,7117.0,10530.0
ENSG00000136536,855.0,1489.0,961.0,1324.0
ENSG00000116679,1507.0,2347.0,1969.0,2911.0
ENSG00000160360,63.0,89.0,77.0,141.0
ENSG00000165282,634.0,894.0,911.0,1289.0
...,...,...,...,...
ENSG00000163596,50.0,119.0,81.0,113.0
ENSG00000224152,13.0,33.0,19.0,36.0
ENSG00000145388,254.0,427.0,297.0,444.0
ENSG00000038219,2874.0,3736.0,2865.0,3428.0


In [19]:
counts = counts8

design = pd.DataFrame({
    "experiment": [sample_name.split("-")[1] for sample_name in counts.columns],
    #"hpi": [sample_name.split("-")[2] for sample_name in counts.columns],
}, index=counts.columns)

design

res = deseq(meta=design, counts=counts, formula="~experiment", ref="mock", exp="S2")

res.to_csv('Calu3_DEGs_8h.csv')
res

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing



experiment_S2_vs_mock


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ENSG00000119922,7423.809814,-6.226836,0.129548,-48.065930,0.000000e+00,0.000000e+00
ENSG00000177409,5020.768078,-4.523739,0.116871,-38.707083,0.000000e+00,0.000000e+00
ENSG00000119917,6030.690255,-4.863030,0.133102,-36.536171,2.956829e-292,1.380839e-288
ENSG00000185745,7111.139505,-4.597371,0.127863,-35.955514,4.150595e-283,1.453746e-279
ENSG00000134321,4116.517565,-4.691396,0.146039,-32.124358,2.015076e-226,5.646244e-223
...,...,...,...,...,...,...
ENSG00000129451,185.158496,-0.547175,0.227326,-2.407006,1.608390e-02,4.875279e-02
ENSG00000148908,146.234506,0.607679,0.252543,2.406234,1.611792e-02,4.882658e-02
ENSG00000132881,62.393709,0.881456,0.366897,2.402466,1.628494e-02,4.926625e-02
ENSG00000162949,48.416227,-0.994920,0.414475,-2.400433,1.637571e-02,4.948742e-02


In [25]:
counts12 = pd.read_csv('Calu3_12h_sum.csv')
counts12 = counts12.set_index('Unnamed: 0')
counts12

Unnamed: 0_level_0,Calu3-S2-12h-A,Calu3-S2-12h-B,Calu3-mock-12h-A,Calu3-mock-12h-B
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000146038,5845.0,10123.0,4154.0,8402.0
ENSG00000136536,808.0,1407.0,806.0,1282.0
ENSG00000116679,1240.0,2182.0,1693.0,2784.0
ENSG00000160360,28.0,68.0,71.0,80.0
ENSG00000165282,512.0,880.0,706.0,1033.0
...,...,...,...,...
ENSG00000163596,37.0,63.0,57.0,85.0
ENSG00000224152,13.0,29.0,25.0,32.0
ENSG00000145388,194.0,346.0,195.0,417.0
ENSG00000038219,2042.0,3866.0,1877.0,3355.0


In [26]:
counts = counts12

design = pd.DataFrame({
    "experiment": [sample_name.split("-")[1] for sample_name in counts.columns],
    #"hpi": [sample_name.split("-")[2] for sample_name in counts.columns],
}, index=counts.columns)

design


res = deseq(meta=design, counts=counts, formula="~experiment", ref="mock", exp="S2")
res.to_csv('Calu3_DEGs_12h.csv')
res

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing



experiment_S2_vs_mock


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ENSG00000134321,8184.263213,-6.287534,0.167302,-37.581962,0.000000e+00,0.000000e+00
ENSG00000135114,4278.544006,-5.257256,0.146653,-35.848254,1.957942e-281,1.340701e-277
ENSG00000162654,3284.357935,-5.797959,0.168995,-34.308526,5.855898e-258,2.673218e-254
ENSG00000119917,10001.660663,-5.691165,0.166328,-34.216441,1.377139e-256,4.714979e-253
ENSG00000183486,2771.785008,-5.336498,0.156089,-34.188800,3.547414e-256,9.716366e-253
...,...,...,...,...,...,...
ENSG00000138100,94.221195,0.819677,0.322752,2.539647,1.109643e-02,4.876945e-02
ENSG00000181896,285.034964,-0.520578,0.205372,-2.534806,1.125096e-02,4.933778e-02
ENSG00000148400,99.007291,-0.805763,0.317981,-2.533999,1.127691e-02,4.941992e-02
ENSG00000246859,25.566954,-1.430677,0.564702,-2.533505,1.129281e-02,4.947378e-02
