In [1]:
library(plyr)

In [2]:
## Use the same data reading procedure as in metabolome heatmap script.

# Load integrals, peak ranges, and sample data
path <- "~/Documents/Projects/asf_interactions/data/"
integral_file <- 'integrals_20170323.txt'
integral_range_file <- 'coculture_peak_ranges.txt'
integrals = read.table(paste0(path,integral_file),header=FALSE,
                         sep = ",")
integral_ranges = read.table(paste0(path,integral_range_file),header=TRUE,
                             sep = ",")
master <- read.table(file=paste0(path,"merged_metadata.txt"),header=TRUE,sep='\t')

# set column names of integrals using integral_ranges met names
colnames(integrals) <- integral_ranges$met_name
mets = colnames(integrals)
# Set rownames using experiment_id. This should be ordered correctly.
integrals$experiment_id <- master$experiment_id
# merge so that classes and integrals are in one dataframe
all_data <- join(master,integrals,by="experiment_id")

# remove non-growing samples from the data
# 500 and 492 did not grow in experiment 3
# 502 did not grow in experiment 2 or 3
all_data = all_data[!(all_data$Run == 2 & grepl('502',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('502',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('500',all_data$species)),]
all_data = all_data[!(all_data$Run == 3 & grepl('492',all_data$species)),]
# remove experiment 4 (3-species subset including only 356,361,519)
all_data = all_data[all_data$Run != "4",]

Below, we test the DNA abundance data from qPCR for normality in all conditions, and for differential abundance between mono- and co-culture. We use the two-sided Student's t-test and control the false discovery rate using the Benjamini-Hochberg (BH) procedure. For the group size for correction with BH, we use only samples containing the species being tested (e.g. only co-cultures that contain that species). 

In [3]:
# Test for differential DNA abundance
# positive t-statistic means the MONOCULTURE had HIGHER abundance than the co-culture being tested.
# (i.e. coculture had negative impact on growth)

# Test all groups for normality in DNA abundance
# For Shapiro-wilks test, the null hypothesis is that the population is normally distributed.
# Therefore, if p < threshold, the population in NOT normally distributed
all_normality_results = data.frame()
path <- "~/Documents/Projects/asf_interactions/results/"
species_list = c('356','360','361','492','500','502','519')
for (species_tested in species_list) {
  # subset the master dataframe by cultures that included species and are from experiments 1, 2, and 3
  contains_species = all_data[grepl(species_tested,all_data$species),]
  contains_species = contains_species[contains_species$Run %in% c(1,2,3),]
  contains_species = droplevels(contains_species)
  # get the correct qPCR probe for the species being tested
  qPCR_column = colnames(contains_species)[grepl(species_tested,colnames(contains_species))]
  unique_combos = as.character(unique(contains_species$species)[!(unique(contains_species$species) %in% c(species_tested))])
  # for each unique combo, perform the t test
  x = contains_species[contains_species$species == species_tested,][,qPCR_column]
  x_shapiro_result = shapiro.test(x)
  
  results = lapply(seq_along(unique_combos), function (n) {
    y = contains_species[contains_species$species == unique_combos[n],][,qPCR_column]
    result = t.test(x,y)
    return(result)})
 
  y_shapiro_results = lapply(seq_along(unique_combos), function (n) {
    y = contains_species[contains_species$species == unique_combos[n],][,qPCR_column]
    result = shapiro.test(y)
    return(result)})
                             
  names(results) <- paste(matrix(species_tested, ncol = 2, byrow = TRUE)[,1], matrix(unlist(unique_combos), ncol = 1, byrow = TRUE), sep = " vs. ")
  names(y_shapiro_results) <- paste(matrix(species_tested, ncol = 2, byrow = TRUE)[,1], matrix(unlist(unique_combos), ncol = 1, byrow = TRUE), sep = " vs. ")
      # generate vector of p values for multiple testing correction
  p_init = numeric()
  p_vector = c(p_init)
  t_stat_init = numeric()
  t_stat_vector = c(t_stat_init)
  for (i in 1:length(names(results))) {
    p_vector = c(p_vector,results[[names(results)[i]]]$p.value)
    t_stat_vector = c(t_stat_vector,results[[names(results)[i]]]$statistic)
  }
  names(p_vector) = names(results)
  names(t_stat_vector) = names(results)
  # perform the correction
  final_p = p.adjust(p_vector,method="BH")
  print(t_stat_vector)
  print(final_p)
  resultframe = data.frame(final_p,t_stat_vector)
  # save the results
  write.table(resultframe,file=paste0(path,paste0(species_tested,"_diff_abundance.txt")),row.names=TRUE,sep='\t')
  
  ## compile the normality testing results
  # make placeholder vectors, then add the monoculture result
  p_init = numeric()
  p_vector = c(p_init)
  shapiro_stat_init = numeric()
  shapiro_stat_vector = c(t_stat_init)
  for (i in 1:length(names(y_shapiro_results))) {
    p_vector = c(p_vector,results[[names(y_shapiro_results)[i]]]$p.value)
    shapiro_stat_vector = c(shapiro_stat_vector,y_shapiro_results[[names(y_shapiro_results)[i]]]$statistic)
  }
  names(p_vector) = names(y_shapiro_results)
  names(shapiro_stat_vector) = names(y_shapiro_results)
  resultframe = data.frame(p_vector,shapiro_stat_vector)
  # add the monoculture test result
  single = data.frame(x_shapiro_result$p.value,x_shapiro_result$statistic)
  rownames(single) = species_tested
  colnames(single) = colnames(resultframe)
  resultframe = rbind(resultframe,single)
  
  # combine with the existing normality test results
  all_normality_results = rbind(all_normality_results,resultframe)
  
}
write.table(resultframe,file=paste0(path,paste0("DNA_abundance_normality_tests.txt")),row.names=TRUE,sep='\t')


356 vs. 356,360 356 vs. 356,500 356 vs. 356,502 356 vs. 356,492 356 vs. 356,361 
      0.9915253       1.0800073       2.4123659      -0.1559613       5.0577760 
356 vs. 356,519 
     -2.7590187 
356 vs. 356,360 356 vs. 356,500 356 vs. 356,502 356 vs. 356,492 356 vs. 356,361 
    0.410631538     0.410631538     0.083526545     0.879027953     0.005803032 
356 vs. 356,519 
    0.046879041 
360 vs. 356,360 360 vs. 360,500 360 vs. 360,492 360 vs. 360,502 360 vs. 360,361 
       7.561391        1.726090        2.098821       -3.839930        7.270433 
360 vs. 360,519 
       4.075094 
360 vs. 356,360 360 vs. 360,500 360 vs. 360,492 360 vs. 360,502 360 vs. 360,361 
   0.0001448592    0.1164800324    0.0702615629    0.0078204394    0.0001448592 
360 vs. 360,519 
   0.0029332625 
361 vs. 360,361 361 vs. 361,492 361 vs. 361,500 361 vs. 356,361 361 vs. 361,519 
     -1.9180279      -2.7076050      -5.9723892       0.1897398      -2.7802716 
361 vs. 361,502 
     -3.6326063 
361 vs. 360,361 361 

In [4]:
print(dim(all_normality_results))
print(dim(all_normality_results[all_normality_results$p_vector < 0.05,]))
all_normality_results[all_normality_results$p_vector < 0.05,]

[1] 49  2
[1] 25  2


Unnamed: 0,p_vector,shapiro_stat_vector
"356 vs. 356,502",0.04176327,0.9905363
"356 vs. 356,361",0.0009671721,0.8838843
"356 vs. 356,519",0.01562635,0.7712456
"360 vs. 356,360",4.82864e-05,0.9560665
"360 vs. 360,502",0.005213626,0.8197986
"360 vs. 360,361",4.055348e-05,0.8523739
"360 vs. 360,519",0.001466631,0.8198887
"361 vs. 361,492",0.03449032,0.8032618
"361 vs. 361,500",0.0002116914,0.7953701
"361 vs. 361,519",0.01836927,0.9355257


The only monoculture in which non-normality appears is for ASF492--although ~50% of the co-culture conditions appear to be non-normally distributed, we will assume the Student's t-test is more appropriate than the non-parametric equivalent (since the monoculture population in the test is always normally-distributed, except for tests involving ASF492).