In [1]:
##Load required libraries
library(randomForest)
library(dplyr)
library(ggplot2)
library(reshape2)
library(pROC) 

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: 'dplyr'


The following object is masked from 'package:randomForest':

    combine


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'ggplot2' was built under R version 4.4.3"

Attaching package: 'ggplot2'


The following object is masked from 'package:randomForest':

    margin




In [3]:
## Read the feature table (sample metadata columns followed by feature columns)
whole_data <- read.csv("C8_Heart_featuremeta.csv")

# Keep cohort 1 only. which() drops rows whose cohort value is NA; plain
# logical indexing would turn each of those rows into an all-NA row.
subset_data <- whole_data[which(whole_data$ATTRIBUTE_Cohort == 1), ]

## Print the dimensions (rows, columns) of the cohort subset
dim(subset_data)

# G1-G3 Random Forest inputs (the forests themselves use 2000 trees).

subset_data_G1_G3 <- subset_data %>%
  filter(ATTRIBUTE_Group_number %in% c("Group1", "Group3"))

subset_data_G1_G3

# Predictors: columns 14 to 3414 of the filtered frame.
# NOTE(review): these positions are hard-coded; confirm they still match the
# layout of the CSV whenever the input file changes.
X_G1_G3 <- subset_data_G1_G3[, 14:3414]

# Labels are taken from the same filtered frame as the predictors, so the rows
# of X and the entries of y cannot drift apart.
y_G1_G3 <- as.factor(subset_data_G1_G3$ATTRIBUTE_Group_number)
stopifnot(nrow(X_G1_G3) == length(y_G1_G3))

# The G3_G1 objects hold exactly the same samples, features and labels; they
# are kept under their own names for the cells that refer to them.
X_G3_G1 <- X_G1_G3
y_G3_G1 <- y_G1_G3

sampleid,ATTRIBUTE_TYPE_tissue,ATTRIBUTE_Type,Tube_Number,ATTRIBUTE_Cohort,Mouse.ID,ATTRIBUTE_Group_number,ATTRIBUTE_Euthanasia_date,ATTRIBUTE_DPI,ATTRIBUTE_organ,⋯,X749.5031_3.04_5615,X611.053_2.47_5616,X502.8139_2.39_5617,X550.8507_2.49_5618,X539.8441_2.48_5619,X581.7023_2.46_5620,X232.1907_2.41_5621,X724.4499_3.17_5622,X204.0863_6.12_5623,X760.5834_2.86_5624
<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<int>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Upper_right_1_434_905_1_44943_210_heart_P268_Y_A1,Sample_heart,Sample,434,1,905,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_435_906_1_44943_210_heart_P268_Y_A2,Sample_heart,Sample,435,1,906,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_436_907_1_44943_210_heart_P268_Y_A3,Sample_heart,Sample,436,1,907,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_437_908_1_44943_210_heart_P268_Y_A4,Sample_heart,Sample,437,1,908,Group1,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_441_793_3_44943_210_heart_P268_Y_A8,Sample_heart,Sample,441,1,793,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_442_794_3_44943_210_heart_P268_Y_A9,Sample_heart,Sample,442,1,794,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_443_795_3_44943_210_heart_P268_Y_A10,Sample_heart,Sample,443,1,795,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
Upper_right_1_444_796_3_44943_210_heart_P268_Y_A11,Sample_heart,Sample,444,1,796,Group3,44943,210,heart,⋯,0,0,0,0,0,0,0,0,0,0
middle_top_1_380_897_1_44936_203_heart_P266_B_A1,Sample_heart,Sample,380,1,897,Group1,44936,203,heart,⋯,0,0,0,0,0,0,0,0,0,0
middle_top_1_381_898_1_44936_203_heart_P266_B_A2,Sample_heart,Sample,381,1,898,Group1,44936,203,heart,⋯,0,0,0,0,0,0,0,0,0,0


In [4]:
# Settings shared by the repeated random-forest cells
set.seed(123)                          # fixed seed for reproducibility
n_runs <- 100                          # number of forests to fit
top_n <- 50                            # top-ranked features kept from each run
feature_counts <- vector("list", 0)    # per-run top-feature names, starts empty

In [None]:
# G1 vs G3: fit n_runs forests and record the top_n features of each run.
# The list is rebuilt here (preallocated) so entries left behind by an earlier
# cell, or by a previous execution with a larger n_runs, cannot be counted.
feature_counts <- vector("list", n_runs)
for (i in seq_len(n_runs)) {
  rf_model <- randomForest(X_G1_G3, y_G1_G3, ntree = 2000, importance = TRUE)
  # type = 1 requests the mean-decrease-in-accuracy importance measure
  importance_vals <- importance(rf_model, type = 1)
  ranked_names <- names(sort(importance_vals[, 1], decreasing = TRUE))
  # head() returns at most top_n names and never pads the result with NA
  top_features <- head(ranked_names, top_n)
  feature_counts[[i]] <- top_features
}

# Combine all top feature lists
all_top_features <- unlist(feature_counts)

# Count how many runs each feature appeared in, most frequent first
feature_freq <- table(all_top_features)
feature_freq <- sort(feature_freq, decreasing = TRUE)

# Convert to a data frame: one row per feature, with the raw count and the
# fraction of runs in which the feature reached the top_n
consensus_df <- data.frame(
  feature = names(feature_freq),
  count = as.integer(feature_freq),
  frequency = as.integer(feature_freq) / n_runs
)

# Filter by stability threshold: feature appears in 70% or more of the runs
stable_features <- subset(consensus_df, frequency >= 0.7)

print(stable_features)

In [None]:
# G3-G1 Random Forest. 2000 trees.
# Uses the X_G3_G1 / y_G3_G1 objects that were prepared for this comparison.
# NOTE(review): X_G3_G1 / y_G3_G1 are built from the same rows and columns as
# X_G1_G3 / y_G1_G3, so this cell is a replicate of the G1-G3 analysis under a
# different RNG state rather than a distinct comparison - confirm the intent.

# Fit n_runs forests and record the top_n features of each run. The list is
# rebuilt here (preallocated) so entries left behind by an earlier cell, or by
# a previous execution with a larger n_runs, cannot be counted.
feature_counts <- vector("list", n_runs)
for (i in seq_len(n_runs)) {
  rf_model <- randomForest(X_G3_G1, y_G3_G1, ntree = 2000, importance = TRUE)
  # type = 1 requests the mean-decrease-in-accuracy importance measure
  importance_vals <- importance(rf_model, type = 1)
  ranked_names <- names(sort(importance_vals[, 1], decreasing = TRUE))
  # head() returns at most top_n names and never pads the result with NA
  top_features <- head(ranked_names, top_n)
  feature_counts[[i]] <- top_features
}

# Combine all top feature lists
all_top_features <- unlist(feature_counts)

# Count how many runs each feature appeared in, most frequent first
feature_freq <- table(all_top_features)
feature_freq <- sort(feature_freq, decreasing = TRUE)

# Convert to a data frame: one row per feature, with the raw count and the
# fraction of runs in which the feature reached the top_n
consensus_df <- data.frame(
  feature = names(feature_freq),
  count = as.integer(feature_freq),
  frequency = as.integer(feature_freq) / n_runs
)

# Filter by stability threshold: feature appears in 70% or more of the runs
stable_features <- subset(consensus_df, frequency >= 0.7)

print(stable_features)

In [6]:
# Empty container for the per-run importance vectors
importance_list <- vector("list", 0)

In [8]:
# G1 vs G3: consensus features filtered by both selection frequency and mean
# importance (experimenting with the importance cutoff).
# The forests are random, so the output depends on the RNG state at the time
# this cell runs; set.seed(123) is only called once, in an earlier cell.

# Rebuild both containers (preallocated) so results left behind by an earlier
# cell, or by a previous execution with a larger n_runs, cannot leak into the
# frequency counts or the importance means.
feature_counts <- vector("list", n_runs)
importance_list <- vector("list", n_runs)

# Store top features and importance values from each run
for (i in seq_len(n_runs)) {
  rf_model <- randomForest(X_G1_G3, y_G1_G3, ntree = 2000, importance = TRUE)
  importance_vals <- importance(rf_model, type = 1)[, 1]  # MDA
  importance_list[[i]] <- importance_vals

  ranked_names <- names(sort(importance_vals, decreasing = TRUE))
  # head() returns at most top_n names and never pads the result with NA
  top_features <- head(ranked_names, top_n)
  feature_counts[[i]] <- top_features
}

# Count how many runs each feature reached the top_n in, most frequent first
all_top_features <- unlist(feature_counts)
feature_freq <- table(all_top_features)
feature_freq <- sort(feature_freq, decreasing = TRUE)

# Convert frequency table to a data frame
consensus_df <- data.frame(
  feature = names(feature_freq),
  count = as.integer(feature_freq),
  frequency = as.integer(feature_freq) / n_runs
)

# Combine importance values across runs into a features x runs matrix.
# cbind() over the per-run vectors always yields a matrix, whereas sapply()
# changes its return shape with the input. A feature missing from a run is NA.
all_features <- unique(unlist(lapply(importance_list, names)))
importance_matrix <- do.call(
  cbind,
  lapply(importance_list, function(x) unname(x[all_features]))
)
rownames(importance_matrix) <- all_features

# Compute mean importance per feature across runs
mean_importance <- rowMeans(importance_matrix, na.rm = TRUE)
importance_df <- data.frame(
  feature = names(mean_importance),
  mean_importance = mean_importance
)

# Merge with consensus_df; only features that reached the top_n at least once
# have a row in consensus_df, so only those survive the merge
merged_df <- merge(consensus_df, importance_df, by = "feature")

# Filter: at least 70% frequency and at least the designated mean importance
filtered <- subset(merged_df, frequency >= 0.7 & mean_importance >= 1.3)

print(filtered)

# The file name encodes the two thresholds used in the filter above
write.csv(filtered, "1.3_cutoff_70freq_G1_G3.csv", row.names = FALSE)

                feature count frequency mean_importance
4    X119.0853_2.86_358    92      0.92        3.156983
15   X153.0405_0.36_315   100      1.00        5.275954
21   X161.0919_0.32_127   100      1.00        4.360428
24   X166.0861_0.31_265    83      0.83        2.523439
27   X179.0622_2.25_378   100      1.00        3.706793
33   X199.0478_0.3_1479    84      0.84        2.385700
36   X211.0785_2.88_399    93      0.93        3.071341
42   X220.9677_0.24_366    95      0.95        3.181573
45  X227.1098_2.61_1216    96      0.96        3.225876
52   X256.1902_2.85_154    99      0.99        3.610782
53   X256.2632_3.24_320    81      0.81        2.433481
56    X268.1036_0.34_74   100      1.00        3.898986
62    X285.0206_2.41_49    83      0.83        2.892156
65   X287.0174_2.41_138    92      0.92        3.127872
72   X307.1721_2.83_268    97      0.97        3.741026
83   X325.2191_3.04_186   100      1.00        4.112880
91    X336.253_3.31_422    82      0.82        2

In [9]:
## G3 vs G1: consensus features filtered by both selection frequency and mean
## importance (experimenting with the importance cutoff).
# Uses the X_G3_G1 / y_G3_G1 objects that were prepared for this comparison.
# NOTE(review): X_G3_G1 / y_G3_G1 are built from the same rows and columns as
# X_G1_G3 / y_G1_G3, so this cell is a replicate of the G1 vs G3 analysis under
# a different RNG state rather than a distinct comparison - confirm the intent.
# The forests are random, so the output depends on the RNG state at the time
# this cell runs; set.seed(123) is only called once, in an earlier cell.

# Rebuild both containers (preallocated) so results left behind by an earlier
# cell, or by a previous execution with a larger n_runs, cannot leak into the
# frequency counts or the importance means.
feature_counts <- vector("list", n_runs)
importance_list <- vector("list", n_runs)

# Store top features and importance values from each run
for (i in seq_len(n_runs)) {
  rf_model <- randomForest(X_G3_G1, y_G3_G1, ntree = 2000, importance = TRUE)
  importance_vals <- importance(rf_model, type = 1)[, 1]  # MDA
  importance_list[[i]] <- importance_vals

  ranked_names <- names(sort(importance_vals, decreasing = TRUE))
  # head() returns at most top_n names and never pads the result with NA
  top_features <- head(ranked_names, top_n)
  feature_counts[[i]] <- top_features
}

# Count how many runs each feature reached the top_n in, most frequent first
all_top_features <- unlist(feature_counts)
feature_freq <- table(all_top_features)
feature_freq <- sort(feature_freq, decreasing = TRUE)

# Convert frequency table to a data frame
consensus_df <- data.frame(
  feature = names(feature_freq),
  count = as.integer(feature_freq),
  frequency = as.integer(feature_freq) / n_runs
)

# Combine importance values across runs into a features x runs matrix.
# cbind() over the per-run vectors always yields a matrix, whereas sapply()
# changes its return shape with the input. A feature missing from a run is NA.
all_features <- unique(unlist(lapply(importance_list, names)))
importance_matrix <- do.call(
  cbind,
  lapply(importance_list, function(x) unname(x[all_features]))
)
rownames(importance_matrix) <- all_features

# Compute mean importance per feature across runs
mean_importance <- rowMeans(importance_matrix, na.rm = TRUE)
importance_df <- data.frame(
  feature = names(mean_importance),
  mean_importance = mean_importance
)

# Merge with consensus_df; only features that reached the top_n at least once
# have a row in consensus_df, so only those survive the merge
merged_df <- merge(consensus_df, importance_df, by = "feature")

# Filter: at least 70% frequency and at least the designated mean importance
filtered <- subset(merged_df, frequency >= 0.7 & mean_importance >= 1.3)

print(filtered)

# The file name encodes the two thresholds used in the filter above
write.csv(filtered, "1.3_cutoff_70freq_G3_G1.csv", row.names = FALSE)

                feature count frequency mean_importance
4    X119.0853_2.86_358    94      0.94        3.205083
16   X153.0405_0.36_315   100      1.00        5.109915
22   X161.0919_0.32_127   100      1.00        4.447827
25   X166.0861_0.31_265    71      0.71        2.441099
27   X179.0622_2.25_378   100      1.00        3.794690
32   X199.0478_0.3_1479    73      0.73        2.279072
34   X211.0785_2.88_399    92      0.92        3.161953
40   X220.9677_0.24_366    97      0.97        3.324612
43  X227.1098_2.61_1216    99      0.99        3.237973
49   X256.1902_2.85_154    99      0.99        3.602505
55    X268.1036_0.34_74   100      1.00        3.829959
60    X285.0206_2.41_49    85      0.85        2.795370
62   X287.0174_2.41_138    89      0.89        3.044039
70   X307.1721_2.83_268    99      0.99        3.804052
79   X325.2191_3.04_186    99      0.99        4.157928
86    X336.253_3.31_422    79      0.79        2.244896
87     X338.3412_3.58_5    86      0.86        2