In [1]:
library(plyr)
library(boot)

In [3]:
## Use the same data reading procedure as in metabolome heatmap script.

# Load integrals, peak ranges, and sample data
path <- "~/Documents/Projects/asf_interactions/data/"
integral_file <- 'integrals_20170323.txt'
integral_range_file <- 'coculture_peak_ranges.txt'
integrals = read.table(paste0(path,integral_file),header=FALSE,
                         sep = ",")
integral_ranges = read.table(paste0(path,integral_range_file),header=TRUE,
                             sep = ",")
master <- read.table(file=paste0(path,"merged_metadata.txt"),header=TRUE,sep='\t')

# set column names of integrals using integral_ranges met names
colnames(integrals) <- integral_ranges$met_name
mets = colnames(integrals)
# Set rownames using experiment_id. This should be ordered correctly.
integrals$experiment_id <- master$experiment_id
# merge so that classes and integrals are in one dataframe
all_data <- join(master,integrals,by="experiment_id")

# save the merged data file for metabolights submission and sharing
write.table(all_data,file=paste0(path,paste0("integrals_with_metadata.txt")),row.names=FALSE,sep='\t')

# remove non-growing samples from the data
# 500 and 492 did not grow in experiment 3
# 502 did not grow in experiment 2 or 3
all_data = all_data[!(all_data$Run == 2 & grepl('502',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('502',all_data$species)),]
all_data = all_data[!(grepl('502',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('500',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('492',all_data$species)),]
# remove experiment 4 (3-species subset including only 356,361,519)
all_data = all_data[all_data$Run != "4",]

Below, we test the DNA abundance data from qPCR for normality in all conditions, and for differential abundance between mono- and co-culture. We use the two-sided Mann-Whitney test and control the false discovery rate using the Benjamini-Hochberg (BH) procedure. For the group size for correction with BH, we use only samples containing the species being tested (e.g. only co-cultures that contain that species). 

In [3]:
# Test for differential DNA abundance
# positive t-statistic means the MONOCULTURE had HIGHER abundance than the co-culture being tested.
# (i.e. coculture had negative impact on growth)

# Test all groups for normality in DNA abundance
# For Shapiro-wilks test, the null hypothesis is that the population is normally distributed.
# Therefore, if p < threshold, the population in NOT normally distributed
all_normality_results = data.frame()
path <- "~/Documents/Projects/asf_interactions/results/"
species_list = c('356','360','361','492','500','519')
for (species_tested in species_list) {
  # subset the master dataframe by cultures that included species and are from experiments 1, 2, and 3
  contains_species = all_data[grepl(species_tested,all_data$species),]
  contains_species = contains_species[contains_species$Run %in% c(1,2,3),]
  contains_species = droplevels(contains_species)
    
  # get the correct qPCR probe for the species being tested
  qPCR_column = colnames(contains_species)[grepl(species_tested,colnames(contains_species))]
  unique_combos = as.character(unique(contains_species$species)[!(unique(contains_species$species) %in% c(species_tested))])
  
  # for each unique combo, perform the test
  x = contains_species[contains_species$species == species_tested,][,qPCR_column]
  x_shapiro_result = shapiro.test(x)
  
  results = lapply(seq_along(unique_combos), function (n) {
    y = contains_species[contains_species$species == unique_combos[n],][,qPCR_column]
    result = wilcox.test(x,y)
    return(result)})
    
  # also calculate difference between monoculture mean and co-culture
  group_diffs = lapply(seq_along(unique_combos), function (n) {
    y = contains_species[contains_species$species == unique_combos[n],][,qPCR_column]
    result = mean(x) - mean(y)
    return(result)})
    
    
  y_shapiro_results = lapply(seq_along(unique_combos), function (n) {
    y = contains_species[contains_species$species == unique_combos[n],][,qPCR_column]
    result = shapiro.test(y)
    return(result)})
                             
  names(results) <- paste(matrix(species_tested, ncol = 2, byrow = TRUE)[,1], matrix(unlist(unique_combos), ncol = 1, byrow = TRUE), sep = " vs. ")
  names(group_diffs) <- paste(matrix(species_tested, ncol = 2, byrow = TRUE)[,1], matrix(unlist(unique_combos), ncol = 1, byrow = TRUE), sep = " vs. ")
  names(y_shapiro_results) <- paste(matrix(species_tested, ncol = 2, byrow = TRUE)[,1], matrix(unlist(unique_combos), ncol = 1, byrow = TRUE), sep = " vs. ")
  
  # generate vector of p values for multiple testing correction
  p_init = numeric()
  p_vector = c(p_init)
  w_stat_init = numeric()
  w_stat_vector = c(w_stat_init)
  diff_init = numeric()
  diff_vector = c(diff_init)
  for (i in 1:length(names(results))) {
    p_vector = c(p_vector,results[[names(results)[i]]]$p.value)
    w_stat_vector = c(w_stat_vector,results[[names(results)[i]]]$statistic)
    diff_vector = c(diff_vector,group_diffs[[names(results)[i]]])
  }
  names(p_vector) = names(results)
  names(w_stat_vector) = names(results)
  
  # perform the correction
  final_p = p.adjust(p_vector,method="BH")
  print(w_stat_vector)
  print(final_p)
  resultframe = data.frame(final_p,w_stat_vector,diff_vector)
  
  # save the results
  write.table(resultframe,file=paste0(path,paste0(species_tested,"_diff_abundance.txt")),row.names=TRUE,sep='\t')
  
  ## compile the normality testing results
  # make placeholder vectors, then add the monoculture result
  p_init = numeric()
  p_vector = c(p_init)
  shapiro_stat_init = numeric()
  shapiro_stat_vector = c(w_stat_init)
  for (i in 1:length(names(y_shapiro_results))) {
    p_vector = c(p_vector,results[[names(y_shapiro_results)[i]]]$p.value)
    shapiro_stat_vector = c(shapiro_stat_vector,y_shapiro_results[[names(y_shapiro_results)[i]]]$statistic)
  }
  names(p_vector) = names(y_shapiro_results)
  names(shapiro_stat_vector) = names(y_shapiro_results)
  resultframe = data.frame(p_vector,shapiro_stat_vector)

  # add the monoculture test result
  single = data.frame(x_shapiro_result$p.value,x_shapiro_result$statistic)
  rownames(single) = species_tested
  colnames(single) = colnames(resultframe)
  resultframe = rbind(resultframe,single)
  
  # combine with the existing normality test results
  all_normality_results = rbind(all_normality_results,resultframe)
  
}
write.table(resultframe,file=paste0(path,paste0("DNA_abundance_normality_tests.txt")),row.names=TRUE,sep='\t')


356 vs. 356,360 356 vs. 356,500 356 vs. 356,492 356 vs. 356,361 356 vs. 356,519 
             45              34              24              81              15 
356 vs. 356,360 356 vs. 356,500 356 vs. 356,492 356 vs. 356,361 356 vs. 356,519 
   0.7756243756    0.7599067599    0.7756243756    0.0002056767    0.0610859729 
360 vs. 356,360 360 vs. 360,500 360 vs. 360,492 360 vs. 360,361 360 vs. 360,519 
             81              36              44              81              74 
360 vs. 356,360 360 vs. 360,500 360 vs. 360,492 360 vs. 360,361 360 vs. 360,519 
   0.0001028383    0.3276723277    0.0619380619    0.0001028383    0.0030851501 
361 vs. 360,361 361 vs. 361,492 361 vs. 361,500 361 vs. 356,361 361 vs. 361,519 
             22               4               0              47              14 
361 vs. 360,361 361 vs. 361,492 361 vs. 361,500 361 vs. 356,361 361 vs. 361,519 
    0.141865487     0.011988012     0.001998002     0.604812834     0.031262855 
492 vs. 360,492 492 vs. 492,

In [4]:
# How many conditions had normal distributions?
print(dim(all_normality_results))
print(dim(all_normality_results[all_normality_results$p_vector < 0.05,]))

[1] 36  2
[1] 22  2


The only monoculture in which non-normality appears is for ASF492, but >50% of the co-culture conditions appear to be non-normally distributed. Thus, we will not assume normality is present in the population for both groups in each comparison, thus a t-test is not appropriate.

In [5]:
# Test for normality in the metabolomics data for monocultures
species_list = c('356','360','361','492','500','519')
# get metabolite abundances for blanks
blanks = all_data[all_data$species == "0",]

# first, test for normality in the blanks.
contains_species = all_data[all_data$species=="0",]
contains_species = droplevels(contains_species)

# for each metabolite, perform a shapiro-wilks test of normality
met_p_init = numeric()
met_ps = (met_p_init)
met_t_init = numeric()
met_ts = c(met_t_init)
met_order_init = character()
met_order = c(met_order_init)
for (met in mets) {
  result = shapiro.test(contains_species[,met])
  met_ps = c(met_ps,result$p.value)
  met_ts = c(met_ts,result$statistic)
  met_order = c(met_order,met)
}

# save the results and print values to get the number of metabolites for which the null was rejected
blank_normality_test = data.frame(met_ps,met_ts,met_order)
write.table(blank_normality_test,file=paste0(path,paste0("blank_metabolite_normality_tests.txt")),row.names=TRUE,sep='\t')
print(dim(blank_normality_test))
print(dim(blank_normality_test[blank_normality_test$met_ps < 0.05,]))

# initialize empty variable to bind new results dataframes to
all_met_tests = NULL
for (species_tested in species_list) {
  # subset the master dataframe by cultures that were monocultures of the species
  contains_species = all_data[grepl(species_tested,all_data$species),]
  contains_species = droplevels(contains_species)
    
  # for each metabolite, perform a shapiro-wilks test of normality
  met_p_init = numeric()
  met_ps = (met_p_init)
  met_shapiro_init = numeric()
  met_shapiros = c(met_t_init)
  met_order_init = character()
  met_order = c(met_order_init)
  for (met in mets) {
    result = shapiro.test(contains_species[,met])
    met_ps = c(met_ps,result$p.value)
    met_shapiros = c(met_shapiros,result$statistic)
    met_order = c(met_order,met)
  }
  
  # make new df from test results
  collapse = data.frame(met_ps,met_shapiros,met_order,species_tested)
  all_met_tests = rbind(collapse,all_met_tests)
  
}

write.table(all_met_tests,file=paste0(path,paste0("all_metabolite_normality_tests.txt")),row.names=TRUE,sep='\t')
print(dim(all_met_tests))
print(dim(all_met_tests[all_met_tests$met_ps < 0.05,]))

[1] 86  3
[1] 3 3
[1] 516   4
[1] 431   4


In [6]:
# test for differential metabolite abundance in each culture group vs. blank
# get metabolite abundances for blanks
blanks = all_data[all_data$species == "0",]
species = unique(all_data$species)[!unique(all_data$species) %in% c('0')]
# initialize empty variable to bind new results dataframes to
all_met_tests = NULL
for (species_tested in species) {
  # subset the master dataframe by cultures that contain the species of interest
  contains_species = all_data[all_data$species ==species_tested,]
  contains_species = droplevels(contains_species)
  
  # for each metabolite, perform a t-test of monoculture vs. blank
  met_p_init = numeric()
  met_ps = (met_p_init)
  met_t_init = numeric()
  met_ts = c(met_t_init)
  met_order_init = character()
  met_order = c(met_order_init)
  diff_avg_init = numeric()
  diff_avgs = c(diff_avg_init)
  blank_avg_init = numeric()
  blank_avgs = c(blank_avg_init)
  group_avg_init = numeric()
  group_avgs = c(group_avg_init)
  for (met in mets) {
    result = wilcox.test(contains_species[,met],blanks[,met],exact=FALSE)
    met_ps = c(met_ps,result$p.value)
    met_ts = c(met_ts,result$statistic)
    diff_avgs = c(diff_avgs,mean(contains_species[,met]) - mean(blanks[,met]))
    blank_avgs = c(blank_avgs,mean(blanks[,met]))
    group_avgs = c(group_avgs,mean(contains_species[,met]))
    met_order = c(met_order,met)
  }
  
  # make new df from p,t,met, and combo
  collapse = data.frame(met_ps,met_ts,diff_avgs,blank_avgs,group_avgs,met_order,species_tested)
  all_met_tests = rbind(collapse,all_met_tests)
  
}

# FDR correct the p values
all_met_tests$met_ps = p.adjust(all_met_tests$met_ps,method="BH")
# save the result
write.table(all_met_tests,file=paste0(path,"all_vs_blank_diff_metabolites.txt"),row.names=FALSE,sep='\t')

# How many metabolites were differentially abundant?
dim(all_met_tests)
dim(all_met_tests[all_met_tests$met_ps < 0.05,])

In [7]:
# Generate the metrics for the co-culture resource allocation model (CRAM)
# center using the mean of blanks
blanks = subset(all_data,species == '0')
blank_means = colMeans(blanks[,names(blanks) %in% mets])
subtracted_integrals = sweep(all_data[,mets], 2, blank_means, `-`)
all_data[,mets] = subtracted_integrals

In [8]:
# calculate metabolite yields for each mono-culture sample by dividing the z-score by species abundance.
species_list = c('356','360','361','492','500','519')

single_species_all = all_data[all_data$species %in% species_list,]
qpcr_cols = colnames(single_species_all)[grep('qpcr',colnames(single_species_all))]
# match the species names to the qpcr column and get the per-biomass yield
for (species_name in species_list) {
  qpcr_name = qpcr_cols[grep(species_name,qpcr_cols)]
  single_species_all[single_species_all$species == species_name,][,mets] = single_species_all[single_species_all$species == species_name,][,mets]/single_species_all[single_species_all$species == species_name,][,qpcr_name]
}

# take the mean yield for each metabolite for each species
avg_single_yield = aggregate(single_species_all[,mets], list(single_species_all$species), mean)
rownames(avg_single_yield) = avg_single_yield$Group.1
avg_single_yield$Group.1 = NULL
copy_all_data = all_data
# replace NA with 0
copy_all_data[is.na(copy_all_data)] = 0
copy_all_data[,mets] = 0
# derive expected metabolite values using the per-biomass yields and observed biomass in co-culture
for (species_name in species_list) {
  qpcr_name = qpcr_cols[grep(species_name,qpcr_cols)]
  copy_all_data[,mets] = copy_all_data[,mets] + as.matrix(copy_all_data[,qpcr_name])%*%as.matrix(avg_single_yield[species_name,])
}

# save the calculated expected metabolite values
write.table(copy_all_data,file=paste0(path,'expected_metabolite_abundances.tsv'),row.names=FALSE,sep='\t')


In [9]:

# for each co-culture, generate a predicted metabolite abundance using the following, where
# all yields are the average across all monoculture samples:
# predicted = yield[species1]*abundance[species1] + yield[species2]*abundance[species2]

# now compare each co-culture group in copy_all_data (predictions) to all_data (actual values)
# for each metabolite, and report the p adj and t stat.
all_combos = unique(copy_all_data$species)
all_combos = all_combos[!(all_combos %in% species_list)]
all_combos = all_combos[!(all_combos %in% c('0'))]
all_combos = droplevels(all_combos)

#only test known metabolites
#known_mets = mets[-grep("unknown", mets)]


all_met_tests = NULL
for (combo in all_combos) {
  subset_predicted = copy_all_data[copy_all_data$species == combo,mets]
  subset_observed = all_data[all_data$species == combo,mets]
  combo_p_init = numeric()
  combo_ps = (combo_p_init)
  combo_t_init = numeric()
  combo_ts = c(combo_t_init)
  met_order_init = character()
  met_order = c(met_order_init)
  diff_avg_init = numeric()
  diff_avgs = c(diff_avg_init)
  observed_avg_init = numeric()
  observed_avgs = c(observed_avg_init)
  expected_avg_init = numeric()
  expected_avgs = c(expected_avg_init)
  for (met in mets) {
    result = wilcox.test(subset_observed[,met],subset_predicted[,met],exact=FALSE)
    combo_ps = c(combo_ps,result$p.value)
    combo_ts = c(combo_ts,result$statistic)
    diff_avgs = c(diff_avgs,mean(subset_observed[,met]) - mean(subset_predicted[,met]))
    observed_avgs = c(observed_avgs,mean(subset_observed[,met]))
    expected_avgs = c(expected_avgs,mean(subset_predicted[,met]))
    met_order = c(met_order,met)
  }
  
  # make new df from p,t,met, and combo
  collapse = data.frame(combo_ps,combo_ts,diff_avgs,observed_avgs,expected_avgs,met_order,combo)
  all_met_tests = rbind(collapse,all_met_tests)
  
    
}

# FDR correct the p values
all_met_tests$combo_ps = p.adjust(all_met_tests$combo_ps,method="BH")
# save the result
write.table(all_met_tests,file=paste0(path,"CRAM_all_diff_metabolites.txt"),row.names=FALSE,sep='\t')

In [10]:
# perform the expectation calculation again, this time with leave-one-out resampling of yield
# calculate metabolite yields for each mono-culture sample by dividing the z-score by species abundance.
species_list = c('356','360','361','492','500','519')

single_species_all = all_data[all_data$species %in% species_list,]
qpcr_cols = colnames(single_species_all)[grep('qpcr',colnames(single_species_all))]
# match the species names to the qpcr column and get the per-biomass yield
for (species_name in species_list) {
  qpcr_name = qpcr_cols[grep(species_name,qpcr_cols)]
  single_species_all[single_species_all$species == species_name,][,mets] = single_species_all[single_species_all$species == species_name,][,mets]/single_species_all[single_species_all$species == species_name,][,qpcr_name]
}

bootdata = data.frame(single_species_all)
# take the mean yield for each metabolite for each species, this time with leave-one-out resampling
# 
bootmean = function(species,bootsize){
    xboot = sample(1:(sum(single_species_all$species==species)),size = sum(single_species_all$species==species)-bootsize,replace=FALSE)
    print(xboot)
    avg = apply(single_species_all[single_species_all$species==species,mets][xboot,],2,mean)
    return(avg)
}
master_boot = list()
iters = 50
for (i in 1:iters){
    # get a single bootstrap mean for each species
    avg_single_yield = data.frame(matrix(ncol=length(mets),nrow=0))
    colnames(avg_single_yield) = mets
    for(species in unique(single_species_all$species)) {
        print(species)
        tempdata = t(data.frame(bootmean(species,2)))
        row.names(tempdata) = species
        avg_single_yield = rbind(avg_single_yield,tempdata)    
    }

    
    copy_all_data = data.frame(all_data)
    # replace NA with 0
    copy_all_data[is.na(copy_all_data)] = 0
    copy_all_data[,mets] = 0
    # derive expected metabolite values using the per-biomass yields and observed biomass in co-culture
    for (species_name in species_list) {
      qpcr_name = qpcr_cols[grep(species_name,qpcr_cols)]
      copy_all_data[,mets] = copy_all_data[,mets] + as.matrix(copy_all_data[,qpcr_name])%*%as.matrix(avg_single_yield[species_name,])
    }

    all_combos = unique(copy_all_data$species)
    all_combos = all_combos[!(all_combos %in% species_list)]
    all_combos = all_combos[!(all_combos %in% c('0'))]
    all_combos = droplevels(all_combos)

    #only test known metabolites
    #known_mets = mets[-grep("unknown", mets)]


    all_met_tests = NULL
    for (combo in all_combos) {
      subset_predicted = copy_all_data[copy_all_data$species == combo,mets]
      subset_observed = all_data[all_data$species == combo,mets]
      combo_p_init = numeric()
      combo_ps = (combo_p_init)
      combo_t_init = numeric()
      combo_ts = c(combo_t_init)
      met_order_init = character()
      met_order = c(met_order_init)
      diff_avg_init = numeric()
      diff_avgs = c(diff_avg_init)
      observed_avg_init = numeric()
      observed_avgs = c(observed_avg_init)
      expected_avg_init = numeric()
      expected_avgs = c(expected_avg_init)
      for (met in mets) {
        result = wilcox.test(subset_observed[,met],subset_predicted[,met],exact=FALSE)
        combo_ps = c(combo_ps,result$p.value)
        combo_ts = c(combo_ts,result$statistic)
        diff_avgs = c(diff_avgs,mean(subset_observed[,met]) - mean(subset_predicted[,met]))
        observed_avgs = c(observed_avgs,mean(subset_observed[,met]))
        expected_avgs = c(expected_avgs,mean(subset_predicted[,met]))
        met_order = c(met_order,met)
      }

      # make new df from p,t,met, and combo
      collapse = data.frame(combo_ps,combo_ts,diff_avgs,observed_avgs,expected_avgs,met_order,combo)
      all_met_tests = rbind(collapse,all_met_tests)


    }

    # FDR correct the p values
    all_met_tests$combo_ps = p.adjust(all_met_tests$combo_ps,method="BH")
    master_boot[[i]] = all_met_tests
}


[1] "356"
[1] 8 5 4 7 2 3 1
[1] "361"
[1] 3 5 8 4 2 7 6
[1] "360"
[1] 4 2 9 6 1 3 5
[1] "492"
[1] 4 6 1 3
[1] "519"
[1] 5 3 1 6 9 2 8
[1] "500"
[1] 5 2 4 3
[1] "356"
[1] 6 4 8 9 2 5 3
[1] "361"
[1] 4 9 1 2 7 6 8
[1] "360"
[1] 8 7 1 3 2 6 9
[1] "492"
[1] 3 1 5 4
[1] "519"
[1] 4 7 5 9 3 1 8
[1] "500"
[1] 6 2 3 5
[1] "356"
[1] 5 7 9 2 1 8 4
[1] "361"
[1] 8 1 2 9 4 5 7
[1] "360"
[1] 5 4 8 6 9 7 1
[1] "492"
[1] 6 5 1 4
[1] "519"
[1] 1 6 9 5 7 2 4
[1] "500"
[1] 6 4 3 5
[1] "356"
[1] 2 5 3 4 8 1 9
[1] "361"
[1] 6 3 4 7 9 1 8
[1] "360"
[1] 7 9 2 5 1 3 4
[1] "492"
[1] 1 4 3 5
[1] "519"
[1] 4 7 3 2 5 8 1
[1] "500"
[1] 5 4 2 6
[1] "356"
[1] 2 8 5 3 9 4 7
[1] "361"
[1] 7 6 1 8 3 9 4
[1] "360"
[1] 3 4 8 9 5 2 6
[1] "492"
[1] 5 3 6 1
[1] "519"
[1] 1 2 6 9 3 5 8
[1] "500"
[1] 6 2 5 4
[1] "356"
[1] 9 4 6 7 1 8 3
[1] "361"
[1] 1 9 6 8 5 3 2
[1] "360"
[1] 1 6 8 7 2 9 5
[1] "492"
[1] 2 1 6 3
[1] "519"
[1] 2 9 6 3 5 1 4
[1] "500"
[1] 2 6 5 4
[1] "356"
[1] 6 9 5 2 3 7 4
[1] "361"
[1] 4 2 7 1 6 3 8
[1] "360

In [11]:
library(dplyr)
master_boot_frame = master_boot %>% Reduce(function(dtf1,dtf2) left_join(dtf1,dtf2,by=c("met_order","combo")), .)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [12]:
master_boot_frame

combo_ps.x,combo_ts.x,diff_avgs.x,observed_avgs.x,expected_avgs.x,met_order,combo,combo_ps.y,combo_ts.y,diff_avgs.y,⋯,combo_ps.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x,combo_ts.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x,diff_avgs.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x,observed_avgs.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x,expected_avgs.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x.x,combo_ps.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y,combo_ts.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y,diff_avgs.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y,observed_avgs.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y,expected_avgs.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y.y
0.266629960,57,0.0008416837,-0.002996192,-3.837875e-03,unknown1,360519,0.044165058,68,0.0013034670,⋯,0.060248707,66,0.0012475459,-0.002996192,-0.004243738,0.167803621,60,0.0008986643,-0.002996192,-3.894856e-03
0.147898235,20,-0.0006636677,0.001895789,2.559457e-03,unknown2,360519,0.308478622,25,-0.0004242387,⋯,0.286655794,25,-0.0004142868,0.001895789,0.002310076,0.092087055,17,-0.0007995933,0.001895789,2.695382e-03
0.377375022,54,0.1099962365,-0.488759444,-5.987557e-01,unknown3,360519,0.064917426,66,0.1901739220,⋯,0.100986183,63,0.1770853267,-0.488759444,-0.665844771,0.373888879,54,0.1186072770,-0.488759444,-6.073667e-01
0.266629960,57,0.3371289537,-0.969844167,-1.306973e+00,Butyrate,360519,0.029417270,70,0.4731846215,⋯,0.027552372,70,0.4572322132,-0.969844167,-1.427076380,0.124907368,62,0.3666762095,-0.969844167,-1.336520e+00
0.470803253,52,0.0853552694,-0.291411111,-3.767664e-01,Isovalerate,360519,0.029417270,70,0.2194423631,⋯,0.041198749,68,0.2026515124,-0.291411111,-0.494062624,0.428984062,53,0.1059622559,-0.291411111,-3.973734e-01
0.427509890,53,0.0566044377,-0.816431944,-8.730364e-01,Leucine,360519,0.205156459,59,0.2988778469,⋯,0.188060087,59,0.2795233565,-0.816431944,-1.095955301,0.472363927,52,0.0335973077,-0.816431944,-8.500293e-01
0.035111701,12,-0.3808020988,0.620036111,1.000838e+00,Valine,360519,0.111740119,18,-0.3378910026,⋯,0.100986183,18,-0.3353761127,0.620036111,0.955412224,0.022758746,10,-0.4278042434,0.620036111,1.047840e+00
0.051897038,14,-0.2409283466,0.329054167,5.699825e-01,Isoleucine,360519,0.064917426,15,-0.2266626052,⋯,0.086946630,17,-0.2236037227,0.329054167,0.552657889,0.028364442,11,-0.2757569423,0.329054167,6.048111e-01
0.051897038,14,-0.4599352109,1.127489306,1.587425e+00,Propionate,360519,0.035690863,12,-0.5065271971,⋯,0.041198749,13,-0.4980347933,1.127489306,1.625524099,0.043129940,13,-0.4970985036,1.127489306,1.624588e+00
0.018887424,72,0.0057933095,0.005836694,4.338495e-05,unknown4,360519,0.019581815,72,0.0063089131,⋯,0.018408711,72,0.0070188185,0.005836694,-0.001182124,0.018408711,72,0.0058310145,0.005836694,5.679902e-06


In [13]:
met_subset = c('Proline','Lactose','Alanine','Valine','Isoleucine')
new = master_boot_frame %>% filter(combo=='356,519', met_order %in% met_subset) %>% select(starts_with("combo_ps"), "met_order")

In [14]:
row.names(new) = new$met_order
new = new[,!(names(new) == 'met_order')]

In [15]:
library(ggplot2)
p_dist = data.frame(t(new))
p = ggplot(data=p_dist,aes(p_dist$Proline)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/proline_p.svg',p,width=6,height=6)

p = ggplot(data=p_dist,aes(p_dist$Lactose)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/lactose_p.svg',p,width=6,height=6)

p = ggplot(data=p_dist,aes(p_dist$Alanine)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/alanine_p.svg',p,width=6,height=6)

p = ggplot(data=p_dist,aes(p_dist$Valine)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/Valine_p.svg',p,width=6,height=6)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


In [16]:
new = master_boot_frame %>% filter(combo=='356,519', met_order %in% met_subset) %>% select(starts_with("diff_avgs"), "met_order")
row.names(new) = new$met_order
new = new[,!(names(new) == 'met_order')]

In [17]:
diff_dist = data.frame(t(new))
p = ggplot(data=diff_dist,aes(diff_dist$Proline)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/proline_diff.svg',p,width=6,height=6)

p = ggplot(data=diff_dist,aes(diff_dist$Lactose)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/lactose_diff.svg',p,width=6,height=6)

p = ggplot(data=diff_dist,aes(diff_dist$Alanine)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/alanine_diff.svg',p,width=6,height=6)

p = ggplot(data=diff_dist,aes(diff_dist$Valine)) + geom_histogram() + theme(axis.title.x = element_text(size=20),
                                                                  axis.title.y = element_text(size=20),
                                                                      axis.text.x = element_text(size=12),
                                                                      axis.text.y = element_text(size=12))
ggsave('../results/bootstrap/Valine_diff.svg',p,width=6,height=6)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
