## Female and male differential expression analysis of LT and MPP cells - MAST

**LT-HSCs**


Run this model:

`zlmCond_all <- zlm(formula = ~condition + leiden +n_genes, sca=sca)`


Comparisons:

compare both replicates of old and new


done with this docker image:

docker run --rm -d --name scanpy -p 8883:8888 -e JUPYTER_ENABLE_LAB=YES -v /Users/efast/Documents/:/home/jovyan/work r_scanpy:vs5

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [2]:
# Silence R warning messages by raising the rpy2 callback logger level to ERROR.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# Also activate the anndata2ri converter.
# NOTE(review): presumably this is what lets `%%R -i adata_raw` hand an AnnData
# object to R as a SingleCellExperiment - confirm against the anndata2ri docs.
anndata2ri.activate()
# Load the rpy2 IPython extension, which provides the %%R cell magic.
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print the versions of scanpy and its main dependencies for reproducibility.
sc.logging.print_versions()

scanpy==1.4.5.1 anndata==0.7.1 umap==0.3.10 numpy==1.17.3 scipy==1.3.0 pandas==0.25.3 scikit-learn==0.22.2.post1 statsmodels==0.10.0 python-igraph==0.7.1 louvain==0.6.1


In [3]:
%%R
# Environment-specific: put the first three library paths in reverse order so
# packages are looked up in the intended location - ignore this elsewhere!
.libPaths(rev(.libPaths()[1:3]))

# Attach every R package the notebook relies on
library(scran)
library(ggplot2)
library(plyr)
library(MAST)

## Female LT

In [4]:
# Load the AnnData object for the female LT cells from its .h5ad file.
# NOTE(review): `cache = True` presumably only matters for slow text formats - confirm against the scanpy docs.

adata = sc.read('./sc_objects/LT_female.h5ad', cache = True)

In [5]:
# Build the AnnData object handed to MAST: a copy of `adata` whose expression
# matrix is replaced by the one stored in `adata.raw` (the non-batch-corrected
# data, as before).
adata_raw = adata.copy()
adata_raw.X = adata.raw.X
# Recompute the number of genes detected (value > 0) per cell on this matrix.
# If X is a scipy sparse matrix, `.sum(1)` returns an (n_cells, 1) np.matrix,
# so flatten it to a plain 1-D array before storing it as an obs column.
adata_raw.obs['n_genes'] = np.asarray((adata_raw.X > 0).sum(1)).ravel()
# Drop the reference to the original object so its memory can be reclaimed.
adata = None

In [6]:
# Preview the first rows of the per-cell metadata (obs) table.
adata_raw.obs.head()

Unnamed: 0,sample,n_counts,log_counts,n_genes,percent_mito,Female,Female_cat,Female_str,sex_sample,batch,rXist,leiden,umap_density_sample
AAACCCAGTCTGTCAA,ct,9978.0,9.208138,3203,0.05021,True,True,True,ct_true,batch1,3.259153,0,0.493924
AAACCCAGTGAACTAA,ct,8042.0,8.992682,2777,0.061288,True,True,True,ct_true,batch1,3.019387,1,0.920127
AAACCCATCCAATCTT,ct,17477.0,9.769098,4695,0.052159,True,True,True,ct_true,batch1,0.525301,2,0.931675
AAACCCATCGAAATCC,ct,7697.0,8.948586,2738,0.050149,True,True,True,ct_true,batch1,3.12124,1,0.842655
AAACGAAAGGTGCTTT,ct,5694.0,8.647344,2177,0.08885,True,True,True,ct_true,batch1,3.571342,2,0.996808


### Run MAST on total cells - Select genes expressed in >5% of cells (no adaptive thresholding)

In [7]:
%%R -i adata_raw

# Wrap the incoming SingleCellExperiment in the SingleCellAssay class MAST works on
sca <- SceToSingleCellAssay(adata_raw, class = "SingleCellAssay")

# Centre and scale the per-cell gene detection count (a covariate of the model)
colData(sca)$n_genes <- scale(colData(sca)$n_genes)

# Hard expression cutoff: keep a gene only if the fraction of cells expressing
# it exceeds 5% (no adaptive thresholding)
freq_expressed <- 0.05
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes, ]

# Expose `sample` as a factor named `condition`, with "ct" (control) as the
# reference level
cond <- relevel(factor(colData(sca)$sample), ref = "ct")
colData(sca)$condition <- cond

#### everything

background:  
`zlmCond_all <- zlm(formula = ~condition + leiden +n_genes, sca=sca) # this runs the model`

a formula with the measurement variable (gene expression) on the LHS (left hand side) and 
predictors present in colData on the RHS
expression of genes controlling for cluster, condition + n_genes (each sex is fit separately, so the model itself has no sex term)
questions I can ask:
sex differences controlling for treatments
sex differences controlling for clusters - not necessary analyze all the clusters
overall gene expression changes in treatment


In [8]:
%%R 
# Fit the MAST hurdle model: treatment condition is the effect of interest;
# the scaled gene-detection count and the leiden cluster are covariates
zlmCond_all <- zlm(~ condition + n_genes + leiden, sca = sca)

# Summarise the fit; doLRT = TRUE also extracts the likelihood ratio test p-values
summaryCond_all <- summary(zlmCond_all, doLRT = TRUE)

# Pull the results table (one row per gene / component / contrast) out of the summary
summaryDt_all <- summaryCond_all[["datatable"]]

In [9]:
%%R
# Peek at the first rows of the MAST results table
head(summaryDt_all)

       primerid component        contrast  Pr..Chisq.      ci.hi        ci.lo
1 0610009B22Rik         C   conditionGCSF 0.234644760 0.02895305 -0.007088412
2 0610009B22Rik         C conditiondmPGE2 0.276327345 0.05406436 -0.189282061
3 0610009B22Rik         C   conditionindo 0.674138687 0.01925699 -0.012452206
4 0610009B22Rik         C    conditionpIC 0.456503882 0.02241984 -0.049907953
5 0610009B22Rik         C         leiden1 0.009309932 0.04360971  0.006147095
6 0610009B22Rik         C         leiden2 0.029073026 0.03707664  0.002006930
          coef          z
1  0.010932321  1.1890169
2 -0.067608849 -1.0890722
3  0.003402393  0.4206078
4 -0.013744057 -0.7448826
5  0.024878401  2.6031698
6  0.019541783  2.1842892


In [10]:
%%R -o GCSF_all -o dmPGE2_all -o indo_all -o pIC_all

# Build the per-gene result table for one treatment contrast.
#   summary_dt    : MAST summary datatable (columns primerid, component,
#                   contrast, `Pr(>Chisq)`, coef, ...)
#   contrast_name : value of the `contrast` column to extract,
#                   e.g. "conditionGCSF"
# Returns a data.table with one row per gene: the hurdle ("H") p-value, the
# logFC coefficient and an FDR column.
merge_contrast_stats <- function(summary_dt, contrast_name) {
  pvals <- summary_dt[contrast == contrast_name & component == "H",
                      .(primerid, `Pr(>Chisq)`)]
  logfc <- summary_dt[contrast == contrast_name & component == "logFC",
                      .(primerid, coef)]
  merged <- merge(pvals, logfc, by = "primerid")
  # Correct for multiple testing across genes ("fdr" = Benjamini-Hochberg)
  merged[, FDR := p.adjust(`Pr(>Chisq)`, "fdr")]
  merged
}

# Keep only the genes whose FDR is below the cutoff, most significant first.
#   result_dt  : table produced by merge_contrast_stats()
#   fdr_cutoff : significance threshold on the FDR column (default 0.01)
filter_significant <- function(result_dt, fdr_cutoff = 0.01) {
  hits <- result_dt[FDR < fdr_cutoff]
  hits[order(FDR)]
}

# Full (result_all_*) and significant-only (*_all) tables for each treatment;
# the *_all tables are the ones exported to Python.
result_all_GCSF <- merge_contrast_stats(summaryDt_all, "conditionGCSF")
GCSF_all <- filter_significant(result_all_GCSF)

result_all_dmPGE2 <- merge_contrast_stats(summaryDt_all, "conditiondmPGE2")
dmPGE2_all <- filter_significant(result_all_dmPGE2)

result_all_indo <- merge_contrast_stats(summaryDt_all, "conditionindo")
indo_all <- filter_significant(result_all_indo)

result_all_pIC <- merge_contrast_stats(summaryDt_all, "conditionpIC")
pIC_all <- filter_significant(result_all_pIC)

In [11]:
%%R -o MAST_raw_all

# Copy the complete, unfiltered MAST results table under the name that this
# cell exports to Python
MAST_raw_all <- summaryDt_all

In [12]:
# Write the MAST tables to CSV: the unfiltered results first, then the
# FDR-filtered table of each treatment.
csv_exports = [
    (MAST_raw_all, './write/MAST_raw_LT_leiden_female.csv'),
    (GCSF_all, './write/MAST_GCSF_LT_leiden_female.csv'),
    (pIC_all, './write/MAST_pIC_LT_leiden_female.csv'),
    (dmPGE2_all, './write/MAST_dmPGE2_LT_leiden_female.csv'),
    (indo_all, './write/MAST_indo_LT_leiden_female.csv'),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path)

In [13]:
%%R
# Remove this run's model fit, its summary and the result tables from the R session
rm(zlmCond_all, summaryDt_all, summaryCond_all, MAST_raw_all)

## Male LT

In [14]:
# Load the AnnData object for the male LT cells from its .h5ad file.
# NOTE(review): `cache = True` presumably only matters for slow text formats - confirm against the scanpy docs.

adata = sc.read('./sc_objects/LT_male.h5ad', cache = True)

In [15]:
# Build the AnnData object handed to MAST: a copy of `adata` whose expression
# matrix is replaced by the one stored in `adata.raw` (the non-batch-corrected
# data, as before).
adata_raw = adata.copy()
adata_raw.X = adata.raw.X
# Recompute the number of genes detected (value > 0) per cell on this matrix.
# If X is a scipy sparse matrix, `.sum(1)` returns an (n_cells, 1) np.matrix,
# so flatten it to a plain 1-D array before storing it as an obs column.
adata_raw.obs['n_genes'] = np.asarray((adata_raw.X > 0).sum(1)).ravel()
# Drop the reference to the original object so its memory can be reclaimed.
adata = None

In [16]:
# Preview the first rows of the per-cell metadata (obs) table.
adata_raw.obs.head()

Unnamed: 0,sample,n_counts,log_counts,n_genes,percent_mito,Female,Female_cat,Female_str,sex_sample,batch,rXist,leiden,umap_density_sample
AAACCCACACAGAGCA,ct,7698.0,8.948846,2663,0.049227,False,False,False,ct_false,batch1,0.078504,2,0.769843
AAACCCAGTATCGTGT,ct,8031.0,8.991189,2538,0.054656,False,False,False,ct_false,batch1,0.078504,1,0.97859
AAACCCATCTATCGTT,ct,10312.0,9.241161,3250,0.05527,False,False,False,ct_false,batch1,0.078504,1,0.936704
AAACGAAAGACCAAAT,ct,7681.0,8.946506,2578,0.059367,False,False,False,ct_false,batch1,0.078504,1,0.972613
AAACGAATCTGCGGAC,ct,8571.0,9.056257,2762,0.047247,False,False,False,ct_false,batch1,0.078504,0,0.729113


### Run MAST on total cells - Select genes expressed in >5% of cells (no adaptive thresholding)

In [17]:
%%R -i adata_raw

# Wrap the incoming SingleCellExperiment in the SingleCellAssay class MAST works on
sca <- SceToSingleCellAssay(adata_raw, class = "SingleCellAssay")

# Centre and scale the per-cell gene detection count (a covariate of the model)
colData(sca)$n_genes <- scale(colData(sca)$n_genes)

# Hard expression cutoff: keep a gene only if the fraction of cells expressing
# it exceeds 5% (no adaptive thresholding)
freq_expressed <- 0.05
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes, ]

# Expose `sample` as a factor named `condition`, with "ct" (control) as the
# reference level
cond <- relevel(factor(colData(sca)$sample), ref = "ct")
colData(sca)$condition <- cond

#### everything

background:  
`zlmCond_all <- zlm(formula = ~condition + leiden +n_genes, sca=sca) # this runs the model`

a formula with the measurement variable (gene expression) on the LHS (left hand side) and 
predictors present in colData on the RHS
expression of genes controlling for cluster, condition + n_genes (each sex is fit separately, so the model itself has no sex term)
questions I can ask:
sex differences controlling for treatments
sex differences controlling for clusters - not necessary analyze all the clusters
overall gene expression changes in treatment


In [18]:
%%R 
# Fit the MAST hurdle model: treatment condition is the effect of interest;
# the scaled gene-detection count and the leiden cluster are covariates
zlmCond_all <- zlm(~ condition + n_genes + leiden, sca = sca)

# Summarise the fit; doLRT = TRUE also extracts the likelihood ratio test p-values
summaryCond_all <- summary(zlmCond_all, doLRT = TRUE)

# Pull the results table (one row per gene / component / contrast) out of the summary
summaryDt_all <- summaryCond_all[["datatable"]]

In [19]:
%%R
# Peek at the first rows of the MAST results table
head(summaryDt_all)

       primerid component        contrast  Pr..Chisq.      ci.hi         ci.lo
1 0610009B22Rik         C   conditionGCSF 0.005230632 0.05674600  0.0099746937
2 0610009B22Rik         C conditiondmPGE2 0.048857123 0.09660119  0.0002764647
3 0610009B22Rik         C   conditionindo 0.033929354 0.03332900  0.0013275314
4 0610009B22Rik         C    conditionpIC 0.457744867 0.02350899 -0.0521991095
5 0610009B22Rik         C         leiden1 0.003471650 0.04852048  0.0095972637
6 0610009B22Rik         C         leiden2 0.804042956 0.02092118 -0.0162187225
          coef          z
1  0.033360346  2.7959484
2  0.048438829  1.9712147
3  0.017328268  2.1225761
4 -0.014345059 -0.7427421
5  0.029058871  2.9264973
6  0.002351231  0.2481604


In [20]:
%%R -o GCSF_all -o dmPGE2_all -o indo_all -o pIC_all

# Build the per-gene result table for one treatment contrast.
#   summary_dt    : MAST summary datatable (columns primerid, component,
#                   contrast, `Pr(>Chisq)`, coef, ...)
#   contrast_name : value of the `contrast` column to extract,
#                   e.g. "conditionGCSF"
# Returns a data.table with one row per gene: the hurdle ("H") p-value, the
# logFC coefficient and an FDR column.
merge_contrast_stats <- function(summary_dt, contrast_name) {
  pvals <- summary_dt[contrast == contrast_name & component == "H",
                      .(primerid, `Pr(>Chisq)`)]
  logfc <- summary_dt[contrast == contrast_name & component == "logFC",
                      .(primerid, coef)]
  merged <- merge(pvals, logfc, by = "primerid")
  # Correct for multiple testing across genes ("fdr" = Benjamini-Hochberg)
  merged[, FDR := p.adjust(`Pr(>Chisq)`, "fdr")]
  merged
}

# Keep only the genes whose FDR is below the cutoff, most significant first.
#   result_dt  : table produced by merge_contrast_stats()
#   fdr_cutoff : significance threshold on the FDR column (default 0.01)
filter_significant <- function(result_dt, fdr_cutoff = 0.01) {
  hits <- result_dt[FDR < fdr_cutoff]
  hits[order(FDR)]
}

# Full (result_all_*) and significant-only (*_all) tables for each treatment;
# the *_all tables are the ones exported to Python.
result_all_GCSF <- merge_contrast_stats(summaryDt_all, "conditionGCSF")
GCSF_all <- filter_significant(result_all_GCSF)

result_all_dmPGE2 <- merge_contrast_stats(summaryDt_all, "conditiondmPGE2")
dmPGE2_all <- filter_significant(result_all_dmPGE2)

result_all_indo <- merge_contrast_stats(summaryDt_all, "conditionindo")
indo_all <- filter_significant(result_all_indo)

result_all_pIC <- merge_contrast_stats(summaryDt_all, "conditionpIC")
pIC_all <- filter_significant(result_all_pIC)

In [21]:
%%R -o MAST_raw_all

# Copy the complete, unfiltered MAST results table under the name that this
# cell exports to Python
MAST_raw_all <- summaryDt_all

In [22]:
# Write the MAST tables to CSV: the unfiltered results first, then the
# FDR-filtered table of each treatment.
csv_exports = [
    (MAST_raw_all, './write/MAST_raw_LT_leiden_male.csv'),
    (GCSF_all, './write/MAST_GCSF_LT_leiden_male.csv'),
    (pIC_all, './write/MAST_pIC_LT_leiden_male.csv'),
    (dmPGE2_all, './write/MAST_dmPGE2_LT_leiden_male.csv'),
    (indo_all, './write/MAST_indo_LT_leiden_male.csv'),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path)

In [23]:
%%R
# Remove this run's model fit, its summary and the result tables from the R session
rm(zlmCond_all, summaryDt_all, summaryCond_all, MAST_raw_all)

## Female MPP

In [24]:
# Load the AnnData object for the female MPP cells from its .h5ad file.
# NOTE(review): `cache = True` presumably only matters for slow text formats - confirm against the scanpy docs.

adata = sc.read('./sc_objects/MPP_female.h5ad', cache = True)

In [25]:
# Build the AnnData object handed to MAST: a copy of `adata` whose expression
# matrix is replaced by the one stored in `adata.raw` (the non-batch-corrected
# data, as before).
adata_raw = adata.copy()
adata_raw.X = adata.raw.X
# Recompute the number of genes detected (value > 0) per cell on this matrix.
# If X is a scipy sparse matrix, `.sum(1)` returns an (n_cells, 1) np.matrix,
# so flatten it to a plain 1-D array before storing it as an obs column.
adata_raw.obs['n_genes'] = np.asarray((adata_raw.X > 0).sum(1)).ravel()
# Drop the reference to the original object so its memory can be reclaimed.
adata = None

In [26]:
# Preview the first rows of the per-cell metadata (obs) table.
adata_raw.obs.head()

Unnamed: 0,assignment,batch,counts,demux_type,hto_type,rna_type,sample,select_cells,n_counts,log_counts,n_genes,percent_mito,Female,Female_cat,Female_str,sex_sample,rXist,leiden,umap_density_sample,umap_density_assignment
AAACGAAGTTGGACCC-0,MPP3/4,batch1,902.0,singlet,background,signal,ct,1.0,13508.0,9.511259,3988,0.057805,True,True,True,ct_true,3.158928,1,0.637315,0.847134
AAAGGATCACGCTGAC-0,MPP,batch1,869.0,singlet,background,signal,ct,1.0,18172.0,9.807637,4622,0.049802,True,True,True,ct_true,3.078445,0,0.783623,0.335968
AAAGGATGTAGTCTGT-0,MPP3/4,batch1,694.0,singlet,background,signal,ct,1.0,8688.0,9.070044,3151,0.059602,True,True,True,ct_true,3.20668,0,0.866588,0.514442
AAAGGTATCTTCGACC-0,MPP3/4,batch1,3446.0,singlet,signal,signal,ct,1.0,15875.0,9.672815,4297,0.046851,True,True,True,ct_true,2.978796,3,0.668953,0.709803
AACAAAGTCGGCTGTG-0,MPP3/4,batch1,951.0,singlet,background,signal,ct,1.0,17804.0,9.787459,4636,0.057387,True,True,True,ct_true,3.003566,0,0.702339,0.667352


### Run MAST on total cells - Select genes expressed in >5% of cells (no adaptive thresholding)

In [27]:
%%R -i adata_raw

# Wrap the incoming SingleCellExperiment in the SingleCellAssay class MAST works on
sca <- SceToSingleCellAssay(adata_raw, class = "SingleCellAssay")

# Centre and scale the per-cell gene detection count (a covariate of the model)
colData(sca)$n_genes <- scale(colData(sca)$n_genes)

# Hard expression cutoff: keep a gene only if the fraction of cells expressing
# it exceeds 5% (no adaptive thresholding)
freq_expressed <- 0.05
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes, ]

# Expose `sample` as a factor named `condition`, with "ct" (control) as the
# reference level
cond <- relevel(factor(colData(sca)$sample), ref = "ct")
colData(sca)$condition <- cond

#### everything

background:  
`zlmCond_all <- zlm(formula = ~condition + leiden +n_genes, sca=sca) # this runs the model`

a formula with the measurement variable (gene expression) on the LHS (left hand side) and 
predictors present in colData on the RHS
expression of genes controlling for cluster, condition + n_genes (each sex is fit separately, so the model itself has no sex term)
questions I can ask:
sex differences controlling for treatments
sex differences controlling for clusters - not necessary analyze all the clusters
overall gene expression changes in treatment


In [28]:
%%R 
# Fit the MAST hurdle model: treatment condition is the effect of interest;
# the scaled gene-detection count and the leiden cluster are covariates
zlmCond_all <- zlm(~ condition + n_genes + leiden, sca = sca)

# Summarise the fit; doLRT = TRUE also extracts the likelihood ratio test p-values
summaryCond_all <- summary(zlmCond_all, doLRT = TRUE)

# Pull the results table (one row per gene / component / contrast) out of the summary
summaryDt_all <- summaryCond_all[["datatable"]]

In [29]:
%%R
# Peek at the first rows of the MAST results table
head(summaryDt_all)

       primerid component        contrast  Pr..Chisq.       ci.hi        ci.lo
1 0610009B22Rik         C   conditionGCSF 0.990551543 0.029809790 -0.029451896
2 0610009B22Rik         C conditiondmPGE2 0.059949452 0.001488856 -0.072678050
3 0610009B22Rik         C   conditionindo 0.660529254 0.035731993 -0.022654611
4 0610009B22Rik         C    conditionpIC 0.876636694 0.030077352 -0.035248981
5 0610009B22Rik         C         leiden1 0.612252432 0.021029130 -0.035691146
6 0610009B22Rik         C         leiden2 0.009177687 0.069439654  0.009834957
           coef           z
1  0.0001789473  0.01183666
2 -0.0355945967 -1.88127378
3  0.0065386909  0.43899106
4 -0.0025858143 -0.15516264
5 -0.0073310083 -0.50664465
6  0.0396373053  2.60676406


In [30]:
%%R -o GCSF_all -o dmPGE2_all -o indo_all -o pIC_all

# Build the per-gene result table for one treatment contrast.
#   summary_dt    : MAST summary datatable (columns primerid, component,
#                   contrast, `Pr(>Chisq)`, coef, ...)
#   contrast_name : value of the `contrast` column to extract,
#                   e.g. "conditionGCSF"
# Returns a data.table with one row per gene: the hurdle ("H") p-value, the
# logFC coefficient and an FDR column.
merge_contrast_stats <- function(summary_dt, contrast_name) {
  pvals <- summary_dt[contrast == contrast_name & component == "H",
                      .(primerid, `Pr(>Chisq)`)]
  logfc <- summary_dt[contrast == contrast_name & component == "logFC",
                      .(primerid, coef)]
  merged <- merge(pvals, logfc, by = "primerid")
  # Correct for multiple testing across genes ("fdr" = Benjamini-Hochberg)
  merged[, FDR := p.adjust(`Pr(>Chisq)`, "fdr")]
  merged
}

# Keep only the genes whose FDR is below the cutoff, most significant first.
#   result_dt  : table produced by merge_contrast_stats()
#   fdr_cutoff : significance threshold on the FDR column (default 0.01)
filter_significant <- function(result_dt, fdr_cutoff = 0.01) {
  hits <- result_dt[FDR < fdr_cutoff]
  hits[order(FDR)]
}

# Full (result_all_*) and significant-only (*_all) tables for each treatment;
# the *_all tables are the ones exported to Python.
result_all_GCSF <- merge_contrast_stats(summaryDt_all, "conditionGCSF")
GCSF_all <- filter_significant(result_all_GCSF)

result_all_dmPGE2 <- merge_contrast_stats(summaryDt_all, "conditiondmPGE2")
dmPGE2_all <- filter_significant(result_all_dmPGE2)

result_all_indo <- merge_contrast_stats(summaryDt_all, "conditionindo")
indo_all <- filter_significant(result_all_indo)

result_all_pIC <- merge_contrast_stats(summaryDt_all, "conditionpIC")
pIC_all <- filter_significant(result_all_pIC)

In [31]:
%%R -o MAST_raw_all

# Copy the complete, unfiltered MAST results table under the name that this
# cell exports to Python
MAST_raw_all <- summaryDt_all

In [32]:
# Write the MAST tables to CSV: the unfiltered results first, then the
# FDR-filtered table of each treatment.
csv_exports = [
    (MAST_raw_all, './write/MAST_raw_MPP_leiden_female.csv'),
    (GCSF_all, './write/MAST_GCSF_MPP_leiden_female.csv'),
    (pIC_all, './write/MAST_pIC_MPP_leiden_female.csv'),
    (dmPGE2_all, './write/MAST_dmPGE2_MPP_leiden_female.csv'),
    (indo_all, './write/MAST_indo_MPP_leiden_female.csv'),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path)

In [33]:
%%R
# Remove this run's model fit, its summary and the result tables from the R session
rm(zlmCond_all, summaryDt_all, summaryCond_all, MAST_raw_all)

## Male MPP

In [34]:
# Load the AnnData object for the male MPP cells from its .h5ad file.
# NOTE(review): `cache = True` presumably only matters for slow text formats - confirm against the scanpy docs.

adata = sc.read('./sc_objects/MPP_male.h5ad', cache = True)

In [35]:
# Build the AnnData object handed to MAST: a copy of `adata` whose expression
# matrix is replaced by the one stored in `adata.raw` (the non-batch-corrected
# data, as before).
adata_raw = adata.copy()
adata_raw.X = adata.raw.X
# Recompute the number of genes detected (value > 0) per cell on this matrix.
# If X is a scipy sparse matrix, `.sum(1)` returns an (n_cells, 1) np.matrix,
# so flatten it to a plain 1-D array before storing it as an obs column.
adata_raw.obs['n_genes'] = np.asarray((adata_raw.X > 0).sum(1)).ravel()
# Drop the reference to the original object so its memory can be reclaimed.
adata = None

In [36]:
# Preview the first rows of the per-cell metadata (obs) table.
adata_raw.obs.head()

Unnamed: 0,assignment,batch,counts,demux_type,hto_type,rna_type,sample,select_cells,n_counts,log_counts,n_genes,percent_mito,Female,Female_cat,Female_str,sex_sample,rXist,leiden,umap_density_sample,umap_density_assignment
AAAGGGCAGCAGCGAT-0,MPP,batch1,848.0,singlet,background,signal,ct,1.0,8510.0,9.049468,2986,0.056965,False,False,False,ct_false,-0.049406,0,0.675819,0.257006
AAAGTGATCTAAGGAA-0,MPP3/4,batch1,783.0,singlet,background,signal,ct,1.0,16489.0,9.710449,4119,0.034508,False,False,False,ct_false,-0.049406,1,0.36422,0.586264
AAATGGAAGGGTGAGG-0,MPP,batch1,728.0,singlet,background,signal,ct,1.0,6484.0,8.777247,2336,0.054125,False,False,False,ct_false,-0.049406,4,0.402604,0.705514
AACACACCATGTGACT-0,MPP,batch1,794.0,singlet,background,signal,ct,1.0,7545.0,8.928773,2842,0.058707,False,False,False,ct_false,-0.049406,2,0.813763,0.772966
AACACACTCGAGAATA-0,MPP3/4,batch1,749.0,singlet,background,signal,ct,1.0,11126.0,9.317221,3362,0.045651,False,False,False,ct_false,-0.049406,3,0.371694,0.221309


### Run MAST on total cells - Select genes expressed in >5% of cells (no adaptive thresholding)

In [37]:
%%R -i adata_raw

# Wrap the incoming SingleCellExperiment in the SingleCellAssay class MAST works on
sca <- SceToSingleCellAssay(adata_raw, class = "SingleCellAssay")

# Centre and scale the per-cell gene detection count (a covariate of the model)
colData(sca)$n_genes <- scale(colData(sca)$n_genes)

# Hard expression cutoff: keep a gene only if the fraction of cells expressing
# it exceeds 5% (no adaptive thresholding)
freq_expressed <- 0.05
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes, ]

# Expose `sample` as a factor named `condition`, with "ct" (control) as the
# reference level
cond <- relevel(factor(colData(sca)$sample), ref = "ct")
colData(sca)$condition <- cond

#### everything

background:  
`zlmCond_all <- zlm(formula = ~condition + leiden +n_genes, sca=sca) # this runs the model`

a formula with the measurement variable (gene expression) on the LHS (left hand side) and 
predictors present in colData on the RHS
expression of genes controlling for cluster, condition + n_genes (each sex is fit separately, so the model itself has no sex term)
questions I can ask:
sex differences controlling for treatments
sex differences controlling for clusters - not necessary analyze all the clusters
overall gene expression changes in treatment


In [38]:
%%R 
# Fit the MAST hurdle model: treatment condition is the effect of interest;
# the scaled gene-detection count and the leiden cluster are covariates
zlmCond_all <- zlm(~ condition + n_genes + leiden, sca = sca)

# Summarise the fit; doLRT = TRUE also extracts the likelihood ratio test p-values
summaryCond_all <- summary(zlmCond_all, doLRT = TRUE)

# Pull the results table (one row per gene / component / contrast) out of the summary
summaryDt_all <- summaryCond_all[["datatable"]]

In [39]:
%%R
# Peek at the first rows of the MAST results table
head(summaryDt_all)

       primerid component        contrast Pr..Chisq.       ci.hi       ci.lo
1 0610009B22Rik         C   conditionGCSF  0.3722662 0.019001636 -0.05074711
2 0610009B22Rik         C conditiondmPGE2  0.1406208 0.009401848 -0.06636773
3 0610009B22Rik         C   conditionindo  0.5994578 0.024936869 -0.04318505
4 0610009B22Rik         C    conditionpIC  0.3644513 0.018346365 -0.04993579
5 0610009B22Rik         C         leiden1  0.4688659 0.020990993 -0.04559093
6 0610009B22Rik         C         leiden2  0.2405993 0.055366328 -0.01389894
          coef          z
1 -0.015872737 -0.8920588
2 -0.028482943 -1.4735608
3 -0.009124088 -0.5250259
4 -0.015794715 -0.9067397
5 -0.012299969 -0.7241454
6  0.020733693  1.1733815


In [40]:
%%R -o GCSF_all -o dmPGE2_all -o indo_all -o pIC_all

# Build the per-gene result table for one treatment contrast.
#   summary_dt    : MAST summary datatable (columns primerid, component,
#                   contrast, `Pr(>Chisq)`, coef, ...)
#   contrast_name : value of the `contrast` column to extract,
#                   e.g. "conditionGCSF"
# Returns a data.table with one row per gene: the hurdle ("H") p-value, the
# logFC coefficient and an FDR column.
merge_contrast_stats <- function(summary_dt, contrast_name) {
  pvals <- summary_dt[contrast == contrast_name & component == "H",
                      .(primerid, `Pr(>Chisq)`)]
  logfc <- summary_dt[contrast == contrast_name & component == "logFC",
                      .(primerid, coef)]
  merged <- merge(pvals, logfc, by = "primerid")
  # Correct for multiple testing across genes ("fdr" = Benjamini-Hochberg)
  merged[, FDR := p.adjust(`Pr(>Chisq)`, "fdr")]
  merged
}

# Keep only the genes whose FDR is below the cutoff, most significant first.
#   result_dt  : table produced by merge_contrast_stats()
#   fdr_cutoff : significance threshold on the FDR column (default 0.01)
filter_significant <- function(result_dt, fdr_cutoff = 0.01) {
  hits <- result_dt[FDR < fdr_cutoff]
  hits[order(FDR)]
}

# Full (result_all_*) and significant-only (*_all) tables for each treatment;
# the *_all tables are the ones exported to Python.
result_all_GCSF <- merge_contrast_stats(summaryDt_all, "conditionGCSF")
GCSF_all <- filter_significant(result_all_GCSF)

result_all_dmPGE2 <- merge_contrast_stats(summaryDt_all, "conditiondmPGE2")
dmPGE2_all <- filter_significant(result_all_dmPGE2)

result_all_indo <- merge_contrast_stats(summaryDt_all, "conditionindo")
indo_all <- filter_significant(result_all_indo)

result_all_pIC <- merge_contrast_stats(summaryDt_all, "conditionpIC")
pIC_all <- filter_significant(result_all_pIC)

In [41]:
%%R -o MAST_raw_all

# Copy the complete, unfiltered MAST results table under the name that this
# cell exports to Python
MAST_raw_all <- summaryDt_all

In [42]:
# Write the MAST tables to CSV: the unfiltered results first, then the
# FDR-filtered table of each treatment.
csv_exports = [
    (MAST_raw_all, './write/MAST_raw_MPP_leiden_male.csv'),
    (GCSF_all, './write/MAST_GCSF_MPP_leiden_male.csv'),
    (pIC_all, './write/MAST_pIC_MPP_leiden_male.csv'),
    (dmPGE2_all, './write/MAST_dmPGE2_MPP_leiden_male.csv'),
    (indo_all, './write/MAST_indo_MPP_leiden_male.csv'),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path)

In [43]:
%%R
# Remove this run's model fit, its summary and the result tables from the R session
rm(zlmCond_all, summaryDt_all, summaryCond_all, MAST_raw_all)

In [44]:
!pip list

Package                Version            
---------------------- -------------------
alembic                1.3.0              
anndata                0.7.1              
anndata2ri             1.0.2              
async-generator        1.10               
attrs                  19.3.0             
backcall               0.1.0              
bleach                 3.1.0              
blinker                1.4                
certifi                2019.11.28         
certipy                0.1.3              
cffi                   1.13.2             
chardet                3.0.4              
conda                  4.7.12             
conda-package-handling 1.6.0              
cryptography           2.8                
cycler                 0.10.0             
decorator              4.4.1              
defusedxml             0.6.0              
entrypoints            0.3                
get-version            2.1                
gprofiler-official     1.0.0              
h5py       