In [4]:
#02_finding_differentially_abundant_proteins.R         Sep.12.2023
#Kevin

#objective: for each of the ten folds, read in the training set and fit a
#           per-protein linear model comparing GCA (Active) against healthy controls.
#caution [1]: Don't use the columns that aren't proteins
#caution [2]: Only do linear modeling using the training data
             

#input: "/Users/m197371/vasculitis_project_quantile_normalization/analysis/classifying_active_vs_control_v2/"


In [5]:
library("dplyr")
get_sigfeatures = function(num_sigfigs, sigfeature_list, data_df, results_df){
  #Copy the first `num_sigfigs` significant features out of `data_df`.
  #
  #Args:
  #  num_sigfigs:     number of features to copy (0 copies nothing).
  #  sigfeature_list: table whose FIRST column holds the feature (column) names,
  #                   one per row, in the order they should be copied.
  #  data_df:         data frame that contains a column for every listed feature.
  #  results_df:      pre-allocated data frame; column i is overwritten with the
  #                   values of the i-th listed feature (existing column names
  #                   of results_df are left untouched).
  #
  #Returns:
  #  results_df with its first `num_sigfigs` columns filled in.

  #seq_len() yields an empty sequence for 0, whereas 1:0 would iterate over
  #c(1, 0) and fail on a non-existent feature.
  for (i in seq_len(num_sigfigs)) {
    #as.character() guards against factor-typed names: indexing a data frame
    #with a factor uses the integer level codes, silently picking the wrong column.
    feature_name <- as.character(sigfeature_list[i,1])
    #single-bracket indexing errors on an unknown column instead of returning NULL
    results_df[,i] <- data_df[feature_name]
  }
  return(results_df)
}

linear_modeling_of_train_and_test_sets = function(train1,test1,nfold,
                                                  n_metadata_cols=5,
                                                  output_root="../../analysis/classifying_active_vs_control_v2/"){
    #Per-protein linear modeling of Active vs Healthy_Control on ONE training fold.
    #
    #For every protein column of `train1` this fits `protein ~ Study_group` using
    #only the "Active" and "Healthy_Control" rows, records the nominal p-value of
    #the group coefficient, the two group means, log2(Active_mean/Control_mean),
    #and the combined score log2FC * -log10(p). The table is sorted by that score
    #(largest first) and written to
    #  <output_root><nfold>fold/linear_modeling_results.csv
    #
    #Args:
    #  train1:          training data frame; must have a `Study_group` column and
    #                   its first `n_metadata_cols` columns must be non-protein columns.
    #  test1:           accepted for interface compatibility only. It is
    #                   deliberately NOT used: the modeling must see training data only.
    #  nfold:           fold identifier used to build the output path.
    #  n_metadata_cols: number of leading non-protein columns (default 5).
    #  output_root:     directory prefix (with trailing slash) holding the
    #                   "<nfold>fold/" directories; the fold directory must exist.
    #
    #Returns:
    #  (invisibly) the sorted results data frame that was written to disk.
    #
    #NOTE(review): log2 of a ratio of means is NaN when the means differ in sign
    #and infinite when a mean is zero; confirm the normalized values are positive.

    if(ncol(train1) <= n_metadata_cols){
        stop("train1 has no protein columns after the first ", n_metadata_cols, " columns")
    }
    protein_cols = (n_metadata_cols+1):ncol(train1)
    protein_names = colnames(train1)[protein_cols]

    #%in% treats NA group labels as FALSE, so rows with a missing label are dropped
    group_labels = as.character(train1$Study_group)
    is_active = group_labels %in% "Active"
    is_control = group_labels %in% "Healthy_Control"
    if(!any(is_active) || !any(is_control)){
        stop("train1 must contain both 'Active' and 'Healthy_Control' rows in Study_group")
    }

    #restrict the model to the two groups being compared, so that rows from any
    #other study group cannot influence the residual variance or the coefficient order
    in_comparison = is_active | is_control
    group_factor = factor(group_labels[in_comparison],levels=c("Active","Healthy_Control"))

    #nominal p-value of the Study_group coefficient, one per protein.
    #The column vector is passed directly (no pasted formula) so protein names
    #that are not syntactic R names cannot break the model call.
    pvalues = vapply(protein_cols, function(j){
        protein_values = train1[[j]][in_comparison]
        linear_model = lm(protein_values ~ group_factor)
        summary(linear_model)$coefficients[2,4]
    }, numeric(1))

    #group means; na.rm=TRUE matches lm(), which drops missing values by default
    active_means = vapply(protein_cols, function(j) mean(train1[[j]][is_active],na.rm=TRUE), numeric(1))
    control_means = vapply(protein_cols, function(j) mean(train1[[j]][is_control],na.rm=TRUE), numeric(1))

    log_fold_change = log2(active_means/control_means)
    pval_and_fold_change = log_fold_change*(-log10(pvalues))

    linear_modeling_results = data.frame(protein_names,pvalues,active_means,control_means,
                                         log_fold_change,pval_and_fold_change,
                                         stringsAsFactors=FALSE)
    colnames(linear_modeling_results) = c("Protein","Nominal_pvalue","Active_mean","Control_mean","the_log(fold_change)","Nominal_pvalue_and_fold_change")

    #order the results based on the last column (largest score first, NA/NaN last)
    linear_modeling_results = linear_modeling_results[order(-linear_modeling_results$Nominal_pvalue_and_fold_change),]

    #save linear modeling results
    write_file = paste0(output_root,nfold,"fold/linear_modeling_results.csv")
    write.csv(linear_modeling_results,write_file,row.names=FALSE)

    #a bare `return` would hand back the `return` primitive itself
    invisible(linear_modeling_results)
}

In [6]:
#run the per-protein linear modeling once for each of the ten folds
for(i in seq_len(10)){
    #fold i keeps its training and test CSVs side by side in "<i>fold/"
    train_file = paste0("../../analysis/classifying_active_vs_control_v2/", i, "fold/training_data.csv")
    test_file = paste0("../../analysis/classifying_active_vs_control_v2/", i, "fold/test_data.csv")

    train1 = read.csv(train_file)
    test1 = read.csv(test_file)

    #writes <i>fold/linear_modeling_results.csv for this fold
    linear_modeling_of_train_and_test_sets(train1, test1, nfold = i)
}
print("done")

[1] "done"
