In [1]:
library("dplyr")
#' Copy named feature columns, in a given order, into a results data frame.
#'
#' @param num_sigfigs Number of features to copy; the first `num_sigfigs`
#'   rows of `sigfeature_list` are used. Zero copies nothing.
#' @param sigfeature_list Data frame whose first column holds the names of
#'   the columns to pull from `data_df`, in the desired output order.
#' @param data_df Data frame containing the feature columns.
#' @param results_df Pre-allocated data frame; its i-th column is overwritten
#'   with the i-th listed feature (existing column names are kept).
#' @return `results_df` with its first `num_sigfigs` columns replaced.
get_sigfeatures = function(num_sigfigs, sigfeature_list, data_df, results_df){
  # seq_len() yields an empty sequence for 0, whereas 1:0 would iterate
  # over c(1, 0).
  for (i in seq_len(num_sigfigs)) {
    # Coerce to character so a factor name column (e.g. from read.csv with
    # stringsAsFactors = TRUE) selects by name rather than by its integer
    # level code.
    feature_name <- as.character(sigfeature_list[i, 1])
    results_df[, i] <- data_df[feature_name]
  }
  return(results_df)
}

# Build the path of a per-fold CSV:
# <base_dir><nfold>fold/<subdir>/<stem><nfold>.csv
.fold_csv_path <- function(base_dir, nfold, subdir, stem) {
    paste0(base_dir, nfold, "fold/", subdir, "/", stem, nfold, ".csv")
}

# Select the samples of one study group and return their identifying columns,
# a numeric Classifier label, and the protein columns reordered to follow
# `ordered_protein_names` (a one-column data frame of protein names).
.ordered_group_samples <- function(samples, group_label, classifier,
                                   protein_columns, ordered_protein_names) {
    group_df <- dplyr::filter(samples, samples$Study_group == group_label)
    group_proteins <- data.frame(group_df[, protein_columns, drop = FALSE])

    # Size the frames from the data instead of assuming a fixed group size.
    n_samples <- nrow(group_df)
    n_proteins <- nrow(ordered_protein_names)

    ordered_proteins <- data.frame(
        matrix(data = 0, nrow = n_samples, ncol = n_proteins)
    )
    colnames(ordered_proteins) <- ordered_protein_names[, 1]
    ordered_proteins <- get_sigfeatures(
        n_proteins, ordered_protein_names, group_proteins, ordered_proteins
    )

    details <- data.frame(
        Study_group = group_df$Study_group,
        maskID = group_df$maskID,
        sample_ID = group_df$sample_ID,
        Classifier = rep(classifier, n_samples),
        stringsAsFactors = FALSE
    )
    cbind(details, ordered_proteins)
}

#' Write ranked-protein train and test tables for one cross-validation fold.
#'
#' Reads `<base_dir><nfold>fold/linear_modeling_results.csv`, orders the
#' proteins by decreasing `Nominal_pvalue_and_fold_change`, and rearranges the
#' protein columns of the "Inactive" and "Healthy_Control" samples of `train1`
#' and `test1` into that order. Each output table has the columns
#' Study_group, maskID, sample_ID, Classifier (1 for "Inactive", 0 for
#' "Healthy_Control") followed by the ordered proteins. The full tables and
#' one top-N subset per entry of `feature_sizes` are written as CSV files
#' under `<base_dir><nfold>fold/train/` and `<base_dir><nfold>fold/test/`.
#'
#' @param train1 Training data frame; columns 6 onward are treated as
#'   protein columns, and it must contain Study_group, maskID and sample_ID.
#' @param test1 Test data frame with the same column layout as `train1`.
#' @param nfold Fold identifier used in the directory and file names.
#' @param feature_sizes Numbers of top-ranked proteins to export as separate
#'   files. Selecting more proteins than are available raises R's
#'   "undefined columns selected" error.
#' @param base_dir Directory prefix (with trailing slash) holding the
#'   per-fold folders.
#' @return `NULL`, invisibly; the function is called for its file output.
linear_modeling_of_train_and_test_sets <- function(
        train1, test1, nfold,
        feature_sizes = c(10, 25, 50, 100, 150, 200, 250),
        base_dir = "../../analysis/classifying_inactive_vs_control_v2/") {
    stopifnot(ncol(train1) >= 6)
    # Protein columns are addressed by position, taken from the training set.
    protein_columns <- 6:ncol(train1)

    # Read the linear modeling results and rank the proteins, best first.
    linear_modeling_file <- paste0(
        base_dir, nfold, "fold/linear_modeling_results.csv"
    )
    linear_modeling_results <- read.csv(linear_modeling_file)
    ranking <- order(-linear_modeling_results$Nominal_pvalue_and_fold_change)
    linear_modeling_results <- linear_modeling_results[ranking, ]

    # Keep the names as character so they always select columns by name.
    ordered_protein_names <- data.frame(
        Protein = as.character(linear_modeling_results[, 1]),
        stringsAsFactors = FALSE
    )

    # Training samples: "Inactive" rows first, then "Healthy_Control".
    all_ordered_training_samples_df <- rbind(
        .ordered_group_samples(train1, "Inactive", 1,
                               protein_columns, ordered_protein_names),
        .ordered_group_samples(train1, "Healthy_Control", 0,
                               protein_columns, ordered_protein_names)
    )

    # Test samples, with proteins in the same order as the training set.
    all_ordered_test_data_df <- rbind(
        .ordered_group_samples(test1, "Inactive", 1,
                               protein_columns, ordered_protein_names),
        .ordered_group_samples(test1, "Healthy_Control", 0,
                               protein_columns, ordered_protein_names)
    )

    # Save all proteins for train and test.
    write.csv(
        all_ordered_training_samples_df,
        .fold_csv_path(base_dir, nfold, "train", "all_proteins_train"),
        row.names = FALSE
    )
    write.csv(
        all_ordered_test_data_df,
        .fold_csv_path(base_dir, nfold, "test", "all_proteins_test"),
        row.names = FALSE
    )

    # Save the top-N protein subsets; the first four columns are the sample
    # details, hence the +4 offset.
    for (feature_size in feature_sizes) {
        top_columns <- seq_len(feature_size + 4)

        top_training_df <- data.frame(
            all_ordered_training_samples_df[, top_columns]
        )
        top_training_file <- .fold_csv_path(
            base_dir, nfold, "train",
            paste0("top_", feature_size, "_proteins_train")
        )
        write.csv(top_training_df, top_training_file, row.names = FALSE)

        top_testing_df <- data.frame(all_ordered_test_data_df[, top_columns])
        top_testing_file <- .fold_csv_path(
            base_dir, nfold, "test",
            paste0("top_", feature_size, "_proteins_test")
        )
        write.csv(top_testing_df, top_testing_file, row.names = FALSE)
    }

    invisible(NULL)
}


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# For each of the 10 folds, load that fold's train and test tables and
# write out the ranked-protein versions.
for (i in seq_len(10)) {
    # Training table for fold i.
    train_file <- paste0(
        "../../analysis/classifying_inactive_vs_control_v2/",
        i,
        "fold/training_data.csv"
    )
    train1 <- read.csv(train_file)

    # Test table for fold i.
    test_file <- paste0(
        "../../analysis/classifying_inactive_vs_control_v2/",
        i,
        "fold/test_data.csv"
    )
    test1 <- read.csv(test_file)

    # Signature: linear_modeling_of_train_and_test_sets(train1, test1, nfold)
    linear_modeling_of_train_and_test_sets(train1, test1, i)
}
print("done")

[1] "done"
