In [1]:
library(phylofactor)
library("treeio")
library("ggtree")
library("ggplot2")
library(readr)
library(randomForest)

Loading required package: ape
Loading required package: magrittr
Loading required package: data.table
Loading required package: Matrix

Attaching package: ‘treeio’

The following object is masked from ‘package:ape’:

    drop.tip

ggtree v1.14.6  For help: https://guangchuangyu.github.io/software/ggtree

If you use ggtree in published research, please cite the most appropriate paper(s):

- Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628

- Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, accepted. doi: 10.1093/molbev/msy194

Attaching package: ‘ggtree’

The following object is masked from ‘package:Matrix’:

    expand

The following ob

# Load Data

Before running, set `ROOT` in the next cell to the local path of your Microbiome-Factorization directory.

In [None]:
# Base directory of the Microbiome-Factorization project; every input and
# output path in this notebook is built by appending to it.
# Note: the value ends with "/" and the suffixes appended later start with "/",
# so the resulting paths contain a doubled slash ("//").
ROOT <- "/Users/earmingol/Dropbox/Universidad/UCSanDiego/Lab_Knight/Microbiome-Factorization/"

Load Data

In [2]:
# Paths of the filtered train/test inputs, all relative to ROOT.
# OTU count tables (tab-separated).
train_OTUs_filename <- paste0(ROOT, "/outputs/train_filtered_OTUs_table.tsv")
test_OTUs_filename <- paste0(ROOT, "/outputs/test_filtered_OTUs_table.tsv")
# Sample metadata tables (tab-separated).
train_MetaData_filename <- paste0(ROOT, "/outputs/train_filtered_metadata.txt")
test_MetaData_filename <- paste0(ROOT, "/outputs/test_filtered_metadata.txt")

# GreenGenes reference tree and taxonomy.
Tree_filename <- paste0(ROOT, "/data/GreenGenes/97_otus.tree")
Taxonomy_filename <- paste0(ROOT, "/data/GreenGenes/97_otu_taxonomy.txt")

In [3]:
# Load the train and test OTU tables from their tab-separated files.
# The first column supplies the row names; column names are kept verbatim
# (check.names = FALSE) rather than being made syntactically valid.
train_OTUs_ <- read.delim(train_OTUs_filename, row.names = 1, check.names = FALSE)
test_OTUs_ <- read.delim(test_OTUs_filename, row.names = 1, check.names = FALSE)

In [4]:
# Load the train and test metadata tables from their tab-separated files.
# The first column supplies the row names; column names are kept verbatim.
train_MetaData_ <- read.delim(train_MetaData_filename, check.names = FALSE, row.names = 1)
test_MetaData_ <- read.delim(test_MetaData_filename, check.names = FALSE, row.names = 1)

In [5]:
# Load the tab-separated taxonomy table and label its columns.
# NOTE(review): the first line of the file is consumed as a header and the
# resulting names are then overwritten just below. If the file has no header
# row, its first record is silently lost -- confirm against the file.
taxonomy <- read.delim(Taxonomy_filename, check.names = FALSE)
names(taxonomy) <- c("id", "taxonomy")

In [5]:
# Parse the reference phylogeny from its Newick file.
tree <- read.tree(file = Tree_filename, tree.names = 1)

# Analysis

Parameters

In [7]:
# Metadata column(s) to model; the analysis cell loops over this vector.
phenotypes <- c("PhClasses")

# Prefix used when naming the result files.
data_type <- "16S_"
# Forwarded to PhyloFactor as `ncores`.
ncores <- 3
# Directory that receives the result files.
OUTPUTS <- paste0(ROOT, "/outputs/PhyloFactor/")

# Settings forwarded to PhyloFactor: number of factors, `family` and `choice`.
factors <- 40
fam <- "binomial"
max_var <- "F"
# Declared here but not referenced by the analysis loop in this notebook.
phenotype_type <- "Categorical" # Categorical or Continuous

Analysis

In [8]:
# For each phenotype:
#   1. align samples between the metadata and the OTU tables,
#   2. restrict the OTU tables and the tree to a common set of OTUs,
#   3. run PhyloFactor on the training set,
#   4. project train and test data onto the learned basis,
#   5. fit a random forest on the projected training data and predict the
#      test set,
#   6. write the predictions and the workspace image to OUTPUTS.

# Make sure the output directory exists so the run cannot fail at write time
# after all the computation has been done.
dir.create(OUTPUTS, recursive = TRUE, showWarnings = FALSE)

for (i in seq_along(phenotypes)) {
    phenotype <- phenotypes[i]
    model <- paste0(phenotype, "~Data")

    # Drop samples whose phenotype is missing, and OTU rows containing NA.
    # drop = FALSE keeps a data.frame even when the metadata has one column.
    train_MetaData <- train_MetaData_[!is.na(train_MetaData_[, phenotype]), , drop = FALSE]
    test_MetaData <- test_MetaData_[!is.na(test_MetaData_[, phenotype]), , drop = FALSE]
    train_OTUs <- na.omit(train_OTUs_)
    test_OTUs <- na.omit(test_OTUs_)

    # Samples present in both the metadata (rows) and the OTU table (columns),
    # in metadata order.
    train_samples <- rownames(train_MetaData)[rownames(train_MetaData) %in% colnames(train_OTUs)]
    test_samples <- rownames(test_MetaData)[rownames(test_MetaData) %in% colnames(test_OTUs)]

    # Restrict BOTH tables to the shared samples. Subsetting only the OTU table
    # would leave metadata rows without a matching OTU column, so the response
    # and the data handed to PhyloFactor would no longer line up.
    train_OTUs <- train_OTUs[, train_samples, drop = FALSE]
    test_OTUs <- test_OTUs[, test_samples, drop = FALSE]
    train_MetaData <- train_MetaData[train_samples, , drop = FALSE]
    test_MetaData <- test_MetaData[test_samples, , drop = FALSE]

    # OTUs must be present in the training table, the test table and the tree:
    # the basis learned on the training set is applied to the test set, so an
    # OTU missing from the test table would produce an all-NA row there.
    shared_OTUs <- intersect(rownames(train_OTUs), rownames(test_OTUs))
    n_train_only <- sum(rownames(train_OTUs) %in% tree$tip.label &
                        !rownames(train_OTUs) %in% shared_OTUs)
    if (n_train_only > 0) {
        warning(n_train_only, " training OTU(s) found in the tree are absent from ",
                "the test table and were excluded.", call. = FALSE)
    }

    # Prune the tree down to the shared OTUs.
    tips_not_in_OTUs <- tree$tip.label[!tree$tip.label %in% shared_OTUs]
    filtered_tree <- drop.tip(tree, tips_not_in_OTUs)
    if (is.null(filtered_tree) || length(filtered_tree$tip.label) < 2) {
        stop("Fewer than two OTUs are shared by the train table, the test table ",
             "and the tree for phenotype '", phenotype, "'.", call. = FALSE)
    }

    # Keep only the OTUs left in the tree, ordered like its tips.
    train_OTUs <- train_OTUs[filtered_tree$tip.label, , drop = FALSE]
    test_OTUs <- test_OTUs[filtered_tree$tip.label, , drop = FALSE]

    # Run PhyloFactor on the training set.
    pf_PhyloFactor <- PhyloFactor(train_OTUs, filtered_tree, train_MetaData, frmla = model, nfactors=factors,
                                  choice=max_var,ncores=ncores, family=fam)

    # Single-factor run on the test set; only its $Data and $X are used below,
    # not its factor.
    pf_test <- PhyloFactor(test_OTUs, filtered_tree, test_MetaData, frmla = model, nfactors=1,
                              choice=max_var,ncores=ncores, family=fam)

    # Use at most the number of basis columns actually returned, so indexing
    # cannot run out of bounds if fewer than `factors` factors were produced.
    n_basis <- min(factors, ncol(pf_PhyloFactor$basis))
    basis <- pf_PhyloFactor$basis[, seq_len(n_basis), drop = FALSE]

    # Train model on the projected training data (samples x factors).
    X <- train_MetaData
    X$Data <- t(t(basis) %*% log(pf_PhyloFactor$Data))
    #logit <- glm(model, data = X, family = fam)
    # NOTE(review): the recorded run emitted "The response has five or fewer
    # unique values. Are you sure you want to do regression?", i.e. a
    # regression forest was fit. If class predictions are intended, the
    # response should be a factor -- confirm the intended behaviour.
    logit <- randomForest(x = X$Data, y = X[, phenotype], ntree = 1000)

    # Predict on test data projected onto the same (training) basis.
    Y <- test_MetaData
    Y$Data <- t(t(basis) %*% log(pf_test$Data))
    #predicted_y <- predict(logit, newdata = Y, type = "response")
    predicted_y <- predict(logit, newdata = Y$Data, type = "response")
    values <- data.frame(pf_test$X[, phenotype], predicted_y)
    colnames(values) <- c("True", "Predicted")

    # Save results: predictions as CSV, plus the whole workspace.
    write.csv(values, file = paste0(OUTPUTS, data_type, phenotype, ".csv"))
    save.image(file = paste0(OUTPUTS, data_type, phenotype, factors, ".Rdata"))
}

“Data has zeros and will receive default modification of zeros. Zeros will be replaced column wise with delta*min(x[x>0]), default delta=0.65”

 3 factors completed in 0.0221 minutes.     Estimated time of completion: 2019-06-10 16:31:48   

“glm.fit: fitted probabilities numerically 0 or 1 occurred”

 4 factors completed in 0.029 minutes.     Estimated time of completion: 2019-06-10 16:31:48   

“glm.fit: fitted probabilities numerically 0 or 1 occurred”

 6 factors completed in 0.0423 minutes.     Estimated time of completion: 2019-06-10 16:31:47   

“Objective function produced 2 identical groups. Will choose group at random.”

 37 factors completed in 0.288 minutes.     Estimated time of completion: 2019-06-10 16:31:49    

“Objective function produced 2 identical groups. Will choose group at random.”

 40 factors completed in 0.315 minutes.     Estimated time of completion: 2019-06-10 16:31:49   

“Data has zeros and will receive default modification of zeros. Zeros will be replaced column wise with delta*min(x[x>0]), default delta=0.65”

 1 factor completed in 0.00944 minutes.    Estimated time of completion: 2019-06-10 16:31:53   

“The response has five or fewer unique values.  Are you sure you want to do regression?”