In [1]:
library(Boruta)
library(dplyr)
library(readr)

"package 'Boruta' was built under R version 4.4.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# Load the full table from disk (metadata columns followed by feature columns).
whole_data <- read.csv("C8_Heart_featuremeta.csv")

# Restrict the analysis to cohort 1. To analyse every cohort instead, replace
# this assignment with `subset_data <- whole_data`.
# `%in%` is used rather than `==` because `==` returns NA for rows whose cohort
# value is missing, and indexing a data frame with NA inserts all-NA rows.
subset_data <- whole_data[whole_data$ATTRIBUTE_Cohort %in% 1, , drop = FALSE]

# Display the subset to verify which samples were selected.
subset_data

Unnamed: 0_level_0,sampleid,ATTRIBUTE_TYPE_tissue,ATTRIBUTE_Type,Tube_Number,ATTRIBUTE_Cohort,Mouse.ID,ATTRIBUTE_Group_number,ATTRIBUTE_Euthanasia_date,ATTRIBUTE_DPI,ATTRIBUTE_organ,⋯,X749.5031_3.04_5615,X611.053_2.47_5616,X502.8139_2.39_5617,X550.8507_2.49_5618,X539.8441_2.48_5619,X581.7023_2.46_5620,X232.1907_2.41_5621,X724.4499_3.17_5622,X204.0863_6.12_5623,X760.5834_2.86_5624
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<int>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Upper_right_1_434_905_1_44943_210_heart_P268_Y_A1,Sample_heart,Sample,434,1,905,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
2,Upper_right_1_435_906_1_44943_210_heart_P268_Y_A2,Sample_heart,Sample,435,1,906,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
3,Upper_right_1_436_907_1_44943_210_heart_P268_Y_A3,Sample_heart,Sample,436,1,907,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
4,Upper_right_1_437_908_1_44943_210_heart_P268_Y_A4,Sample_heart,Sample,437,1,908,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
5,Upper_right_1_438_930_2_44943_210_heart_P268_Y_A5,Sample_heart,Sample,438,1,930,Group2,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
6,Upper_right_1_439_931_2_44943_210_heart_P268_Y_A6,Sample_heart,Sample,439,1,931,Group2,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
7,Upper_right_1_440_932_2_44943_210_heart_P268_Y_A7,Sample_heart,Sample,440,1,932,Group2,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
8,Upper_right_1_441_793_3_44943_210_heart_P268_Y_A8,Sample_heart,Sample,441,1,793,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
9,Upper_right_1_442_794_3_44943_210_heart_P268_Y_A9,Sample_heart,Sample,442,1,794,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
10,Upper_right_1_443_795_3_44943_210_heart_P268_Y_A10,Sample_heart,Sample,443,1,795,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0


In [3]:
# Control groups that every other group is compared against. Change if needed.
reference_groups <- c("Group1", "Group3")

# Every distinct group label present in the selected cohort.
all_groups <- unique(subset_data[["ATTRIBUTE_Group_number"]])

# Accumulates one result data frame per comparison, keyed by comparison label.
boruta_results <- list()

In [4]:
# Pairwise Boruta feature selection: each reference (control) group is compared
# against every other group present in `all_groups`.
# NOTE(review): when two reference groups are both present in `all_groups`
# (e.g. Group1 and Group3), that pair is run twice, once under each label.
for (ref_group in reference_groups) {
  for (target_group in setdiff(all_groups, ref_group)) {
    comp_label <- paste0(ref_group, "-", target_group)
    message("Running Boruta for: ", comp_label)

    # Keep only the samples that belong to the two groups being compared.
    pair_data <- filter(
      subset_data,
      ATTRIBUTE_Group_number %in% c(ref_group, target_group)
    )

    # Predictor columns and class labels. The column range is hard-coded;
    # adjust it if the number of leading metadata columns changes.
    X <- pair_data[, 14:3414]
    y <- as.factor(pair_data$ATTRIBUTE_Group_number)

    # Re-seed immediately before every run so each comparison is reproducible
    # on its own, regardless of which comparisons ran before it.
    set.seed(123)
    # maxRuns is only an upper bound; Boruta stops earlier once every
    # attribute has been decided.
    boruta_model <- Boruta(X, y, doTrace = 1, maxRuns = 100)

    # Per-attribute importance statistics and final decisions.
    boruta_importance <- attStats(boruta_model)

    # Keep attributes that were either confirmed or left tentative.
    is_selected <- boruta_importance$decision %in% c("Confirmed", "Tentative")
    confirmed_df <- boruta_importance[is_selected, , drop = FALSE]

    # Two-column result: feature name plus its mean importance, with the
    # importance column named after the comparison. When nothing was selected
    # this is a zero-row data frame with the same two columns.
    result_df <- data.frame(
      feature = rownames(confirmed_df),
      MDA = confirmed_df$meanImp
    )
    names(result_df)[2] <- comp_label

    # Store the result under the comparison label.
    boruta_results[[comp_label]] <- result_df
  }
}

Running Boruta for: Group1-Group2

After 19 iterations, +7.6 secs: 

 rejected 3400 attributes: X100.0759_0.73_4272, X100.076_2.55_128, X1001.6537_2.97_1138, X1005.6845_3.07_5596, X1005.685_3.05_2120 and 3395 more;

 still have 1 attribute left.


After 84 iterations, +8.6 secs: 

 confirmed 1 attribute: X846.4406_2.86_355;

 no more attributes left.


Running Boruta for: Group1-Group3

After 19 iterations, +6.3 secs: 

 rejected 3385 attributes: X100.0759_0.73_4272, X1001.6537_2.97_1138, X1005.6845_3.07_5596, X1005.685_3.05_2120, X1013.6537_2.87_2185 and 3380 more;

 still have 16 attributes left.


After 23 iterations, +6.4 secs: 

 rejected 2 attributes: X100.076_2.55_128, X570.4566_3.15_385;

 still have 14 attributes left.


After 73 iterations, +7.1 secs: 

 rejected 1 attribute: X511.374_3.41_423;

 still have 13 attributes left.


Running Boruta for: Group1-Group4

After 19 iterations, +6 secs: 

 rejected 3373 attributes: X100.0759_0.73_4272, X100.076_2.55_128, X1001.6537_2.97

In [5]:
# Full outer join of two per-comparison tables on the feature name.
merge_on_feature <- function(left, right) {
  merge(left, right, by = "feature", all = TRUE)
}

# Fold all per-comparison tables into one wide table: one row per feature,
# one importance column per comparison.
combined_boruta <- Reduce(merge_on_feature, boruta_results)

# A feature that was not selected in a comparison comes out of the join as NA;
# record it as 0 importance instead.
missing_cells <- is.na(combined_boruta)
combined_boruta[missing_cells] <- 0

In [6]:
# Export the merged table of confirmed/tentative features.
# Manual follow-up (not done in code): a new CSV was assembled by hand from
# this output, copying the m/z feature data for the selected features next to
# the existing metadata. That hand-built file (named
# 'Significant_Features_Preliminary_....') is the one used from here on.
write.csv(
  x = combined_boruta,
  file = "Combined_Boruta_SelectedFeatures_C8_HEART.csv",
  row.names = FALSE
)