In [2]:
%load_ext rpy2.ipython

In [3]:
%%R
workDir = '/home/chantal/Chazy/Bulk/515_806/data/'
figDir = '/home/chantal/Chazy/Bulk/515_806/data/figs/'

physeqDir = '/home/chantal/Chazy/data/phyloseq/'
physeqBulk = 'bulk-core-rare'

physeqAll = 'Full'
#physeqBulk = 'bulk-core' 

l2fcResFile.TillMonth = file.path(workDir, 'l2fc_sparse-maxRejH_BULK_0.25_.txt')
l2fcResFile.Till = file.path(workDir, 'l2fc_Till.txt')
l2fcResFile.OM = file.path(workDir, 'l2fc_sparse-maxRejH_bulk_OM.txt')




## NOTEBOOK RUN WITH 0.25 SPARSITY FILTERED DATASET...

nprocs = 20

In [32]:
%%R
library(phyloseq)
library(ape)
library(dplyr)
library(gdata)
library(parallel)

In [5]:
%%R 

F = file.path(physeqDir, physeqBulk)
physeq.Bulk = readRDS(F)
physeq.Bulk.m = physeq.Bulk %>% sample_data
physeq.Bulk = prune_samples((physeq.Bulk.m$Land_Management != "PAS"),
                           physeq.Bulk)
physeq.Bulk.m = physeq.Bulk %>% sample_data


physeq.Bulk

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19331 taxa and 175 samples ]
sample_data() Sample Data:       [ 175 samples by 25 sample variables ]
tax_table()   Taxonomy Table:    [ 19331 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19331 tips and 19330 internal nodes ]


In [6]:
%%R 

F = file.path(physeqDir, physeqAll)
physeq.All = readRDS(F)
physeq.All.m = physeq.All %>% sample_data

physeq.All.m = physeq.All %>% sample_data


physeq.All

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 25178 taxa and 1348 samples ]
sample_data() Sample Data:       [ 1348 samples by 25 sample variables ]
tax_table()   Taxonomy Table:    [ 25178 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 25178 tips and 25177 internal nodes ]


In [6]:
%%R
# Full Bulk Tree
Bulk_tree = phy_tree(physeq.Bulk)

In [7]:
%%R
write.tree(Bulk_tree, file = "data/Bulk_tree.nwk")

In [7]:
%%R
#Subset to OTUs in 25% sparsity filtered dataset used for tillage and OM deseq2 w/o time

physeq.sparse = filter_taxa(physeq.Bulk, function(x) sum(x > 0) > (0.25 * length(x)), TRUE)


physeq.sparse

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 3964 taxa and 175 samples ]
sample_data() Sample Data:       [ 175 samples by 25 sample variables ]
tax_table()   Taxonomy Table:    [ 3964 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 3964 tips and 3963 internal nodes ]


In [8]:
%%R
OTUs.sparse = otu_table(physeq.sparse)
OTUs.sparse.names = as.data.frame(rownames(OTUs.sparse))
colnames(OTUs.sparse.names) = 'OTU'

In [9]:
%%R
head(OTUs.sparse.names)
length(OTUs.sparse.names$OTU)

[1] 3964


In [10]:
%%R
OTUs = otu_table(physeq.All)
OTUs.names = as.data.frame(rownames(OTUs))
colnames(OTUs.names) = 'OTU'

In [11]:
%%R
head(OTUs.sparse.names)

        OTU
1  OTU.1739
2  OTU.1813
3   OTU.778
4  OTU.2180
5 OTU.12365
6  OTU.2343


In [12]:
%%R
head(OTUs.names)

        OTU
1  OTU.4940
2 OTU.28525
3  OTU.4163
4  OTU.5493
5 OTU.14642
6  OTU.1739


In [13]:
%%R
OTUs.to.remove = filter(OTUs.names, !(OTU %in% OTUs.sparse.names$OTU))

In [14]:
%%R
head(OTUs.to.remove)
length(OTUs.to.remove$OTU)

[1] 21214


In [15]:
%%R
OTUs.remove = as.vector(OTUs.to.remove$OTU)

In [16]:
%%R
full_tree = read.tree('/home/chantal/Chazy/otusn_pick.tree')

In [17]:
%%R
full_tree_sparse = drop.tip(full_tree, OTUs.remove, trim.internal = TRUE, subtree = FALSE,
         root.edge = 0, rooted = is.rooted(full_tree))

In [18]:
%%R
str(full_tree_sparse)

List of 5
 $ edge       : int [1:7928, 1:2] 3966 3966 3967 3968 3969 3970 3970 3969 3971 3972 ...
 $ Nnode      : int 3964
 $ tip.label  : chr [1:3965] "X90478" "OTU.1739" "OTU.1813" "OTU.778" ...
 $ edge.length: num [1:7928] 0.4798 0.1263 0.0106 0.0229 0.1192 ...
 $ node.label : chr [1:3964] "" "0.599.8" "0.736.11" "0.883" ...
 - attr(*, "class")= chr "phylo"
 - attr(*, "order")= chr "cladewise"


In [19]:
%%R
write.tree(full_tree_sparse, file = "data/Bulk_tree_sparse.nwk")

In [23]:
%%R
# log2 fold change analysis results file
df.l2fc.TillMonth = read.delim(l2fcResFile.TillMonth, sep='\t')
df.l2fc.Till = read.delim(l2fcResFile.Till, sep='\t')
df.l2fc.OM = read.delim(l2fcResFile.OM, sep='\t')


print(df.l2fc.TillMonth %>% head(n=3))
print(df.l2fc.Till %>% head(n=3))
print(df.l2fc.OM %>% head(n=3))

  Month Year sparsity_threshold      OTU log2FoldChange     lfcSE      padj
1  July 2014               0.35  OTU.778    -0.61597423 0.5124907 0.5899482
2  July 2014               0.35 OTU.2180     0.08880429 0.7413839 0.9671297
3  July 2014               0.35 OTU.1158     1.00383561 0.8009562 0.5635003
     Rank1          Rank2        Rank3                Rank4 Rank5 Rank6 Rank7
1 Bacteria Planctomycetes Pla4_lineage uncultured_bacterium  <NA>  <NA>  <NA>
2 Bacteria Planctomycetes Pla4_lineage uncultured_bacterium  <NA>  <NA>  <NA>
3 Bacteria Planctomycetes Pla4_lineage uncultured_bacterium  <NA>  <NA>  <NA>
  Rank8 padj2 Till
1  <NA>     1   NT
2  <NA>     1   PT
3  <NA>     1   PT
  sparsity_threshold      OTU log2FoldChange     lfcSE         padj    Rank1
1               0.25 OTU.1739      0.4012126 0.4523957 0.4684148169 Bacteria
2               0.25 OTU.1813     -1.4802581 0.4326507 0.0018267946 Bacteria
3               0.25  OTU.778     -0.5498601 0.1379296 0.0002483877 Bacteria


In [63]:
%%R
df.Till = df.l2fc.Till %>% select(OTU, padj2, Till)
df.Till$C[df.Till$Till == 'T' & df.Till$padj2 <=0.10] = '#ff0000'
df.Till$C[df.Till$Till == 'N' & df.Till$padj2 <=0.10] = '#0000ff'
df.Till$C[df.Till$padj2 > 0.10] = '#D3D3D3'



#df.Till$R[df.Till$padj >0.10] = -1

head(df.Till)


        OTU        padj2 Till       C
1  OTU.1739 0.6010558483    T #D3D3D3
2  OTU.1813 0.0056520888    N #0000ff
3   OTU.778 0.0009556926    N #0000ff
4  OTU.2180 0.5116160300    N #D3D3D3
5 OTU.12365 0.7656246065    N #D3D3D3
6  OTU.2343 0.0002535853    T #ff0000


In [64]:
%%R
df.Till$label = paste(df.Till$OTU, df.Till$C, sep = ',')
head(df.Till)

        OTU        padj2 Till       C             label
1  OTU.1739 0.6010558483    T #D3D3D3  OTU.1739,#D3D3D3
2  OTU.1813 0.0056520888    N #0000ff  OTU.1813,#0000ff
3   OTU.778 0.0009556926    N #0000ff   OTU.778,#0000ff
4  OTU.2180 0.5116160300    N #D3D3D3  OTU.2180,#D3D3D3
5 OTU.12365 0.7656246065    N #D3D3D3 OTU.12365,#D3D3D3
6  OTU.2343 0.0002535853    T #ff0000  OTU.2343,#ff0000


In [65]:
%%R
df.Till.select = select(df.Till, label) 
write.csv(df.Till.select, 'incorp_binary_Bulk_Till.csv')

In [178]:
%%R
head(df.l2fc.OM)

  sparsity_threshold       OTU log2FoldChange     lfcSE       padj    Rank1
1               0.25  OTU.1739    -0.99492092 0.3368987 0.03078444 Bacteria
2               0.25  OTU.1813    -0.34779469 0.3307678 0.55980382 Bacteria
3               0.25   OTU.778     0.08123953 0.1389644 0.77642167 Bacteria
4               0.25  OTU.2180     0.01465938 0.2546136 0.98421287 Bacteria
5               0.25 OTU.12365    -0.52013058 0.2320783 0.12348165 Bacteria
6               0.25  OTU.2343    -0.66780501 0.2743011 0.08786400 Bacteria
             Rank2          Rank3                  Rank4 Rank5 Rank6 Rank7
1 __Planctomycetes        __OM190 __uncultured_bacterium  <NA>  <NA>  <NA>
2 __Planctomycetes        __OM190 __uncultured_bacterium  <NA>  <NA>  <NA>
3 __Planctomycetes __Pla4_lineage __uncultured_bacterium  <NA>  <NA>  <NA>
4 __Planctomycetes __Pla4_lineage __uncultured_bacterium  <NA>  <NA>  <NA>
5 __Planctomycetes __Pla4_lineage __uncultured_bacterium  <NA>  <NA>  <NA>
6 __Planctomycetes

In [55]:
%%R
df.OM = df.l2fc.OM %>% select(OTU, padj2, OM) 
df.OM$C[df.OM$OM == 'Returned' & df.OM$padj2 <=0.10] = '#76A94C'
df.OM$C[df.OM$OM == 'Harvested' & df.OM$padj2 <=0.10] = '#FF00FF'
df.OM$C[df.OM$padj2 > 0.10] = '#000000'

In [57]:
%%R
df.OM$label = paste(df.OM$OTU, df.OM$C, sep = ',')
head(df.OM)

        OTU     padj2        OM       C             label
1  OTU.1739 0.2826947 Harvested #000000  OTU.1739,#000000
2  OTU.1813 0.9999966 Harvested #000000  OTU.1813,#000000
3   OTU.778 0.9999966  Returned #000000   OTU.778,#000000
4  OTU.2180 0.9999966  Returned #000000  OTU.2180,#000000
5 OTU.12365 0.6026827 Harvested #000000 OTU.12365,#000000
6  OTU.2343 0.5075497 Harvested #000000  OTU.2343,#000000


In [58]:
%%R
df.OM.select = select(df.OM, label) 
write.csv(df.OM.select, 'incorp_binary_Bulk_OM.csv')

In [48]:
%%R
#Response by tillage and by month
#Creating physeq object with responders only -
#(but including their status at every time point, even when not responding)

# adding incorporator status to tax_table
padj.cutoff = 0.10

physeq.Bulk.inc = physeq.Bulk
df.l2fc.s = df.l2fc.TillMonth %>%
    select(-Rank1, -Rank2, -Rank3, -Rank4, -Rank5, -Rank6, -Rank7, -Rank8) %>%
    group_by(OTU) %>%
    summarize(Incorporator = any(padj2 <= padj.cutoff) %>% as.numeric) %>%
    ungroup() 

tax = physeq.Bulk.inc %>% tax_table %>% as.data.frame
tax$OTU = rownames(tax)
tax.j = left_join(df.l2fc.s, tax, c('OTU' = 'OTU')) %>% 
    mutate(Incorporator = ifelse(is.na(Incorporator), 0, Incorporator)) %>%
    as.data.frame() 
rownames(tax.j) = tax.j$OTU %>% as.character
tax.j$OTU = NULL

# making new phyloseq object with incorporator info
physeq.Bulk.inc = phyloseq(physeq.Bulk.inc %>% otu_table, 
                             tax.j %>% as.matrix %>% tax_table, 
                             physeq.Bulk.inc %>% sample_data,
                             physeq.Bulk.inc %>% phy_tree)
physeq.Bulk.inc = subset_taxa(physeq.Bulk.inc, Incorporator==1)
physeq.Bulk.inc

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 285 taxa and 175 samples ]
sample_data() Sample Data:       [ 175 samples by 25 sample variables ]
tax_table()   Taxonomy Table:    [ 285 taxa by 9 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 285 tips and 284 internal nodes ]


In [49]:
%%R
Bulk_tree_inc_TillMonth = phy_tree(physeq.Bulk.inc)

In [51]:
%%R
write.tree(Bulk_tree_inc_TillMonth, file = "data/Bulk_tree_inc.nwk")

In [54]:
%%R
inc = df.l2fc.TillMonth %>% filter(padj2 <=0.10) %>% select(OTU,log2FoldChange, Till)
head(inc)

       OTU log2FoldChange Till
1 OTU.5964       3.418800   PT
2  OTU.432       2.915847   PT
3  OTU.193      -3.303977   NT
4  OTU.930       3.515477   PT
5  OTU.180      -4.682644   NT
6  OTU.677       4.850230   PT


In [55]:
%%R
#if Till = PT, produce 1; if Till = NT, produce 0. 
inc$R[inc$Till == 'PT'] = 1
inc$R[inc$Till == 'NT'] = 0
head(inc)


       OTU log2FoldChange Till R
1 OTU.5964       3.418800   PT 1
2  OTU.432       2.915847   PT 1
3  OTU.193      -3.303977   NT 0
4  OTU.930       3.515477   PT 1
5  OTU.180      -4.682644   NT 0
6  OTU.677       4.850230   PT 1


In [56]:
%%R
inc.select = select(inc, OTU, R) %>% group_by(OTU) %>% summarise(Rtot = max(R))
head(inc.select)
write.csv(inc.select, 'incorp_binary_Bulk.csv')

In [57]:
%%R
length(unique(inc.select$OTU))

[1] 285


In [59]:
%%R
color = c("#85A214",
"#967DFA",
"#1DCE23",
"#F376ED",
"#01C78B",
"#E63BA7",
"#41D5ED",
"#F12B42",
"#2BB0FE",
"#F971B1",
"#197D9A",
"#BB4CC1",
"#735A76",
"#849EF9")


Rank2 = unique(df.l2fc.TillMonth$Rank2)

taxcol = data.frame(color, Rank2)

head(taxcol)


Error in data.frame(color, Rank2) : 
  arguments imply differing number of rows: 14, 30


  arguments imply differing number of rows: 14, 30

  res = super(Function, self).__call__(*new_args, **new_kwargs)


In [60]:
%%R
tax = df.l2fc.TillMonth %>% filter(padj2 <=0.10) %>% select(OTU, Rank2) %>% unique()
tax$type = 'label'
length(tax$OTU)

[1] 285


In [61]:
%%R
tax2 = full_join(tax, taxcol)
head(tax2)
write.csv(tax2, 'incorp_tax_Bulk.csv')


Error in tbl_vars(y) : object 'taxcol' not found



  res = super(Function, self).__call__(*new_args, **new_kwargs)


## ConsenTRAIT

In [20]:
%%R
bulkTree = read.tree('data/Bulk_tree_sparse.nwk')

In [21]:
%%R
str(bulkTree)

List of 5
 $ edge       : int [1:7928, 1:2] 3966 3966 3967 3968 3969 3970 3970 3969 3971 3972 ...
 $ Nnode      : int 3964
 $ tip.label  : chr [1:3965] "X90478" "OTU.1739" "OTU.1813" "OTU.778" ...
 $ edge.length: num [1:7928] 0.4798 0.1263 0.0106 0.0229 0.1192 ...
 $ node.label : chr [1:3964] "" "0.599.8" "0.736.11" "0.883" ...
 - attr(*, "class")= chr "phylo"
 - attr(*, "order")= chr "cladewise"


In [180]:
%%R
df.Till = df.l2fc.Till %>% select(OTU, padj2, Till) 
df.Till$pTill = df.Till$padj2
df.Till$padj2 = NULL
head(df.Till)

        OTU Till        pTill
1  OTU.1739    T 0.6010558483
2  OTU.1813    N 0.0056520888
3   OTU.778    N 0.0009556926
4  OTU.2180    N 0.5116160300
5 OTU.12365    N 0.7656246065
6  OTU.2343    T 0.0002535853


In [182]:
%%R
df.OM = df.l2fc.OM %>% select(OTU, padj2, OM) 
df.OM$pOM = df.OM$padj2
df.OM$padj2 = NULL
head(df.OM)

        OTU        OM       pOM
1  OTU.1739 Harvested 0.2826947
2  OTU.1813 Harvested 0.9999966
3   OTU.778  Returned 0.9999966
4  OTU.2180  Returned 0.9999966
5 OTU.12365 Harvested 0.6026827
6  OTU.2343 Harvested 0.5075497


In [184]:
%%R
OTUs = as.data.frame(bulkTree$tip.label)
colnames(OTUs) = 'OTU'
head(OTUs)


        OTU
1    X90478
2  OTU.1739
3  OTU.1813
4   OTU.778
5  OTU.2180
6 OTU.12365


In [188]:
%%R
All = full_join(OTUs, df.Till)
All = full_join(All, df.OM)

In [190]:
%%R
head(All)

        OTU Till        pTill        OM       pOM
1    X90478 <NA>           NA      <NA>        NA
2  OTU.1739    T 0.6010558483 Harvested 0.2826947
3  OTU.1813    N 0.0056520888 Harvested 0.9999966
4   OTU.778    N 0.0009556926  Returned 0.9999966
5  OTU.2180    N 0.5116160300  Returned 0.9999966
6 OTU.12365    N 0.7656246065 Harvested 0.6026827


In [203]:
%%R
All.Enrich = All %>% select(OTU, pOM, pTill, OM, Till)
All.Enrich$PT[All.Enrich$Till == 'T' & All.Enrich$pTill <=0.10] = 1
All.Enrich$PT[All.Enrich$Till == 'N' & All.Enrich$pTill <=0.10] = 0
All.Enrich$PT[All.Enrich$pTill > 0.10] = 0
All.Enrich$PT[is.na(All.Enrich$pTill)] = 0

All.Enrich$NoTill[All.Enrich$Till == 'N' & All.Enrich$pTill <=0.10] = 1
All.Enrich$NoTill[All.Enrich$Till == 'T' & All.Enrich$pTill <=0.10] = 0
All.Enrich$NoTill[All.Enrich$pTill > 0.10] = 0
All.Enrich$NoTill[is.na(All.Enrich$pTill)] = 0

All.Enrich$H[All.Enrich$OM == 'Harvested' & All.Enrich$pOM <=0.10] = 1
All.Enrich$H[All.Enrich$OM == 'Returned' & All.Enrich$pOM <=0.10] = 0
All.Enrich$H[All.Enrich$pOM > 0.10] = 0
All.Enrich$H[is.na(All.Enrich$pOM)] = 0

All.Enrich$R[All.Enrich$OM == 'Returned' & All.Enrich$pOM <=0.10] = 1
All.Enrich$R[All.Enrich$OM == 'Harvested' & All.Enrich$pOM <=0.10] = 0
All.Enrich$R[All.Enrich$pOM > 0.10] = 0
All.Enrich$R[is.na(All.Enrich$pOM)] = 0

All.Enrich = All.Enrich  %>% select(OTU,PT, NoTill, H, R) #%>% filter(OTU != 'X90478')



write.table(All.Enrich, 'data/Trait_TEnriched.txt', col.names = FALSE, row.names = FALSE,sep = '\t')

head(All.Enrich)

        OTU PT NoTill H R
1    X90478  0      0 0 0
2  OTU.1739  0      0 0 0
3  OTU.1813  0      1 0 0
4   OTU.778  0      1 0 0
5  OTU.2180  0      0 0 0
6 OTU.12365  0      0 0 0


In [204]:
%%R
test = read.table('data/Trait_TEnriched.txt', sep = '\t')
head(test)

         V1 V2 V3 V4 V5
1    X90478  0  0  0  0
2  OTU.1739  0  0  0  0
3  OTU.1813  0  1  0  0
4   OTU.778  0  1  0  0
5  OTU.2180  0  0  0  0
6 OTU.12365  0  0  0  0


In [None]:
%%bash
/home/chantal/tree_edit/bin/concentrait.r /home/chantal/Chazy/Bulk/515_806/aln_for_tree_sparse_boot.nwk \
    X90478 data/Trait_TEnriched.txt -p 10 > data/consenTRAITtable.txt

In [240]:
%%R
tau = read.table('data/consenTRAITtable.txt')

In [243]:
%%R
head(tau)

     V1                  V2      V3
1 Trait               tau_D p-value
2    t1  0.0131340091891306    0.69
3    t2  0.0113040223831905       1
4    t3 0.00968437116372465       1
5    t4 0.00919891527694024       1


In [None]:

Trait	tau_D	p-value 
t1	0.017174231524428	1 
t2	0.0142315037878788	1 
t3	0.0123093129770992	1 
t4	0.0119394210526316	1 