This script uses the quantile-normalized protein data together with the clinical and demographic data to build linear regression models comparing Inactive GCA patients with Healthy controls. For each protein, marginal (single-covariate) linear regression models were fit for six clinical and demographic variables (age, smoking status, sex, prednisone use, aspirin use, and methotrexate use) using the combined Inactive GCA and Healthy control samples. Variables whose marginal model P-value was < 0.05 were treated as significant confounders for that protein and included in its full multiple linear regression model. In the full multiple linear regression models, the P-value of the study group variable was used to identify differentially abundant proteins between Inactive GCA and Healthy controls. A significance threshold of P < 0.01 was applied to all plasma proteins.

In [None]:
# Build the right-hand side of the adjusted model for one protein.
# Full layout: Protein ~ Study_group [+ Age] [+ Sex] [+ Smoking] [+ Prednisone] [+ Methotrexate] [+ Aspirin]
#
# binary_results: a single row of the binarised marginal results. Column 1 is
#   not read here; columns 2-7 hold the 0/1 flags for Age, Sex, Smoking,
#   Prednisone, Methotrexate and Aspirin, in that order.
# Returns: a string that always starts with "~ Study_group" and has
#   " + <covariate>" appended for every flag equal to 1.
make_linear_model <- function(binary_results){
    # covariate name -> column position of its flag in binary_results
    flag_columns <- c(Age = 2, Sex = 3, Smoking = 4,
                      Prednisone = 5, Methotrexate = 6, Aspirin = 7)
    rhs <- "~ Study_group"
    for (covariate in names(flag_columns)) {
        flag <- binary_results[1, flag_columns[[covariate]]]
        if (flag == 1) {
            rhs <- paste0(rhs, " + ", covariate)
        }
    }
    rhs
}


In [17]:
library("dplyr")
library("ggpubr")
library("lme4")
library("lmerTest")
library("ggplot2")
# Load the patient demographics and the quantile-normalised abundance table.
#vasculitis_df <- read.csv("../../data/vasculitis_patients_final.csv")
vasculitis_demographics_df <- read.csv("../../data/vasculitis_patient_info.csv")
all_quantile_df <- read.csv("../../data/all_quantile_data.csv")
controls_demographics_df <- read.csv("../../data/controls_demographics.csv")
#head(all_quantile_df)

# Table dimensions. Protein abundances start at column 4, so the protein
# count is the column count minus the three leading non-protein columns.
num_columns <- ncol(all_quantile_df)
num_proteins <- num_columns - 3
num_samples <- nrow(all_quantile_df)

# Active GCA patients: abundance rows, demographics rows, protein columns only.
active_df <- filter(all_quantile_df, Study_group == "Active")
active_demographics_df <- filter(vasculitis_demographics_df, Study_group == "Active")
active_proteins_only <- active_df[, 4:num_columns]

# Inactive GCA patients.
inactive_df <- filter(all_quantile_df, Study_group == "Inactive")
inactive_demographics_df <- filter(vasculitis_demographics_df, Study_group == "Inactive")
inactive_proteins_only <- inactive_df[, 4:num_columns]

# Healthy controls (their demographics come from a separate file, read above).
controls_df <- filter(all_quantile_df, Study_group == "Healthy_Control")
controls_proteins_only <- controls_df[, 4:num_columns]

# Protein column names, kept as a one-column data frame ("Protein") so the
# later cells can index them as protein_names[i, 1] when building formulas.
proteins_only <- data.frame(active_df[, 4:num_columns])
protein_names <- data.frame(Protein = colnames(proteins_only))
#protein_names
#protein_names



In [18]:
# Gather the covariates needed for the inactive GCA patients
# (age, sex, smoking, prednisone, methotrexate and aspirin) and attach them
# to the protein abundances.

# Row-by-row ID comparison before sorting (informational only).
print(inactive_demographics_df$maskid == inactive_df$maskID)

# Sort both tables by patient ID so that their rows line up.
inactive_demographics_df <- inactive_demographics_df[order(inactive_demographics_df$maskid), ]
inactive_df <- inactive_df[order(inactive_df$maskID), ]
print(inactive_demographics_df$maskid == inactive_df$maskID)

# cbind() below pairs rows purely by position, so a mismatch here would
# silently attach one patient's covariates to another patient's proteins.
# Stop instead of relying on someone reading the printed TRUE/FALSE vector.
stopifnot(
    nrow(inactive_demographics_df) == nrow(inactive_df),
    all(inactive_demographics_df$maskid == inactive_df$maskID)
)

# Covariates under the common column names used by the model formulas.
inactive_gca_demographics_and_proteins_df <- data.frame(
    Age = inactive_demographics_df$Age,
    Sex = inactive_demographics_df$Sex,
    Smoking = inactive_demographics_df$Smoking,
    Prednisone = inactive_demographics_df$PrednisoneCurrentlyReceiving,
    Methotrexate = inactive_demographics_df$MethotrexateWasTaken,
    Aspirin = inactive_demographics_df$Aspirin
)
inactive_gca_demographics_and_proteins_df2 <- cbind(inactive_gca_demographics_and_proteins_df, inactive_df)

dim(inactive_gca_demographics_and_proteins_df2)
head(inactive_gca_demographics_and_proteins_df2)
# The inactive demographics and protein abundances are now in one table;
# the same is done for the controls next.
#now I have all the active demographics and protein abundance together, do the same for controls

 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE


Unnamed: 0_level_0,Age,Sex,Smoking,Prednisone,Methotrexate,Aspirin,Study_group,maskID,sample_ID,CRBB2_10000.28,⋯,YIPF6_9984.12,Neuropeptide.W_9986.14,LRC25.CD_9987.30,LRC24_9989.12,EMIL3.region.2_9991.112,ZN264_9993.11,ATP4B_9994.217,DUT_9995.6,UBXN4.CD.1_9997.12,IRF6_9999.1
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,69.3,Male,No,Yes,No,No,Inactive,GCA-1,KP5W,689.0178,⋯,666.6633,1980.68,538.4122,507.4411,586.0383,1256.4778,1651.703,1692.672,8440.326,1184.1156
10,72.2,Female,No,No,No,Yes,Inactive,GCA-10,KP8Z,509.9244,⋯,579.5956,1294.9889,579.0083,407.8878,595.7067,1157.93,1769.919,6646.468,26205.253,3649.0967
11,80.0,Male,No,Yes,No,No,Inactive,GCA-11,KPAN,1503.5411,⋯,571.3867,1581.3544,481.5956,420.6511,567.4211,877.5811,1278.554,5780.831,26205.253,3216.15
12,63.6,Female,Yes,Yes,No,Yes,Inactive,GCA-12,KNZD,567.8378,⋯,853.2322,16046.8056,597.7856,474.4144,620.9267,1292.6711,1801.653,1110.298,4930.646,964.1144
13,65.6,Female,Yes,No,No,Yes,Inactive,GCA-13,KNVL,551.0106,⋯,601.7589,719.4522,638.4144,412.5933,496.0989,1421.49,1750.598,2370.842,11896.302,1195.5356
14,72.8,Female,No,No,No,No,Inactive,GCA-14,KP7R,818.7789,⋯,611.1356,6525.2078,963.4044,475.0489,996.1478,1645.2478,1745.823,1413.522,6016.766,833.8333


In [19]:
# Build the covariate + protein table for the healthy controls.

# Row-by-row ID comparison before sorting (informational only).
print(controls_demographics_df$case_id == controls_df$maskID)

# Put both tables in the same order: the demographics are keyed by case_id,
# the protein abundances by maskID.
controls_demographics_df <- controls_demographics_df[order(controls_demographics_df$case_id), ]
controls_df <- controls_df[order(controls_df$maskID), ]
print(controls_demographics_df$case_id == controls_df$maskID)

# cbind() below pairs rows purely by position, so a mismatch here would
# silently attach one subject's covariates to another subject's proteins.
# Stop instead of relying on someone reading the printed TRUE/FALSE vector.
stopifnot(
    nrow(controls_demographics_df) == nrow(controls_df),
    all(controls_demographics_df$case_id == controls_df$maskID)
)

# Covariates under the same column names as the inactive GCA table so the
# two tables can be stacked with rbind().
# NOTE(review): the smoking column is read as `SLmoking`; presumably that is
# the literal header in controls_demographics.csv - confirm against the file.
control_demographics_and_proteins_df <- data.frame(
    Age = controls_demographics_df$Age,
    Sex = controls_demographics_df$control_gender,
    Smoking = controls_demographics_df$SLmoking,
    Prednisone = controls_demographics_df$Prednisone,
    Methotrexate = controls_demographics_df$Methotrexate,
    Aspirin = controls_demographics_df$Aspirin
)
control_demographics_and_proteins_df2 <- cbind(control_demographics_and_proteins_df, controls_df)
dim(control_demographics_and_proteins_df2)

 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE


In [20]:
# Stack the inactive GCA rows on top of the control rows; this combined table
# is the data set every linear model below is fit on.
inactive_and_controls_demographics_and_proteins <- rbind(
    inactive_gca_demographics_and_proteins_df2,
    control_demographics_and_proteins_df2
)
dim(inactive_and_controls_demographics_and_proteins)
#inactive_and_controls_demographics_and_proteins

In [21]:
# Results table for the marginal models: one row per protein, the protein
# name in column 1 and one P-value column per covariate (zero-filled for now).
inactive_linear_modeling_results <- data.frame(
    matrix(data = 0, nrow = num_proteins, ncol = 7)
)
colnames(inactive_linear_modeling_results) <- c(
    "Proteins", "Age", "Sex", "Smoking", "Prednisone", "Methotrexate", "Aspirin"
)
inactive_linear_modeling_results[, 1] <- protein_names


In [22]:
# Marginal models: for every protein, fit one single-covariate linear model
# per clinical/demographic variable and store that covariate's P-value.
marginal_covariates <- c("Age", "Sex", "Smoking", "Prednisone", "Methotrexate", "Aspirin")
for (i in seq_len(num_proteins)) {
    protein_name <- protein_names[i, 1]
    for (covariate in marginal_covariates) {
        marginal_model <- lm(
            as.formula(paste0(protein_name, " ~ ", covariate)),
            data = inactive_and_controls_demographics_and_proteins
        )
        # Row 1 of the coefficient table is the intercept; row 2 is the
        # covariate term and column 4 is its P-value.
        # NOTE(review): for a categorical covariate with more than two levels
        # only the first non-reference level is read here - confirm that all
        # categorical covariates are two-level.
        # The results table has one column named after each covariate.
        inactive_linear_modeling_results[i, covariate] <-
            summary(marginal_model)$coefficients[2, 4]
    }
}
print("Done with linear models.")

[1] "Done with linear models."


In [23]:
# Preview the first rows of the marginal (single-covariate) P-value table.
head(inactive_linear_modeling_results)

Unnamed: 0_level_0,Proteins,Age,Sex,Smoking,Prednisone,Methotrexate,Aspirin
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,CRBB2_10000.28,0.204116564,0.79801017,0.4563687,0.0012441261,0.890268714,0.64202651
2,c.Raf_10001.7,0.107937637,0.63701055,0.4317544,0.0001893987,0.039013734,0.01059678
3,ZNF41_10003.15,0.688196428,0.63642831,0.7767302,0.1102232044,0.001235063,0.14366297
4,ELK1_10006.25,0.230443909,0.05225988,0.8239646,0.1807871635,0.746208106,0.66369455
5,GUC1A_10008.43,0.320817191,0.38977785,0.6877522,0.1231599756,0.848377925,0.10447903
6,BECN1_10010.10,0.009791296,0.73966883,0.8125307,0.8861102836,0.818486385,0.9176022


In [92]:
# Write the marginal P-value table to disk (no row-name column).
write.csv(
    inactive_linear_modeling_results,
    file = "../../analysis/linear_modeling_inactive_and_controls/Inactive_vs_Control_demographic_variables_linear_modeling_results.csv",
    row.names = FALSE
)

In [24]:
# Binarise the marginal results: 1 where a covariate's marginal P-value is
# below 0.05 (it will enter that protein's adjusted model), 0 otherwise.
inactive_binary_modeling_results <- data.frame(matrix(data = 0, nrow = num_proteins, ncol = 7))
colnames(inactive_binary_modeling_results) <- colnames(inactive_linear_modeling_results)
inactive_binary_modeling_results[, 1] <- protein_names

# Columns 2-7 hold the six covariate P-values; compare them all at once
# instead of visiting every cell of the data frame in a double loop.
covariate_pvalues <- as.matrix(inactive_linear_modeling_results[, 2:7])
if (anyNA(covariate_pvalues)) {
    # A missing P-value cannot flag a confounder; say so rather than crash.
    warning("Some marginal P-values are NA; those covariates are treated as not significant.",
            call. = FALSE)
}
is_significant <- !is.na(covariate_pvalues) & covariate_pvalues < 0.05
# Multiplying by 1 turns TRUE/FALSE into numeric 1/0 and keeps the matrix shape.
inactive_binary_modeling_results[, 2:7] <- is_significant * 1

In [91]:
# Write the 0/1 confounder flags to disk (no row-name column).
write.csv(
    inactive_binary_modeling_results,
    file = "../../analysis/linear_modeling_inactive_and_controls/inactive_binary_modeling_results.csv",
    row.names = FALSE
)

In [25]:
# Adjusted models: for every protein, build the model string from its 0/1
# confounder flags (make_linear_model), fit the model, and record the
# study-group P-value and coefficient.
# Columns 4 (adjusted_pvalues) and 5 (fold_change) stay 0 in this cell.
inactive_adjusted_linear_modeling_results <- data.frame(matrix(data = 0, nrow = num_proteins, ncol = 6))
colnames(inactive_adjusted_linear_modeling_results) <- c(
    "Protein", "Adjusted_linear_model", "linear_modeling_pvalue",
    "adjusted_pvalues", "fold_change", "coefficient"
)
inactive_adjusted_linear_modeling_results[, 1] <- protein_names
for (i in seq_len(num_proteins)) {
    current_protein <- protein_names[i, 1]

    # Right-hand side of the formula, e.g. "~ Study_group + Age".
    model_string <- make_linear_model(inactive_binary_modeling_results[i, ])
    inactive_adjusted_linear_modeling_results[i, 2] <- model_string

    adjusted_linear_model <- lm(
        as.formula(paste0(current_protein, model_string)),
        data = inactive_and_controls_demographics_and_proteins
    )
    coefficient_table <- summary(adjusted_linear_model)$coefficients

    # Locate the study-group term by name rather than assuming it is row 2,
    # so a dropped (non-estimable) term can never make us read a covariate's
    # P-value by mistake. The first match is used; if there is none the
    # index is NA and both stored values become NA.
    study_group_row <- grep("^Study_group", rownames(coefficient_table))[1]
    inactive_adjusted_linear_modeling_results[i, 3] <- coefficient_table[study_group_row, 4]
    inactive_adjusted_linear_modeling_results[i, 6] <- coefficient_table[study_group_row, 1]
}
print("Done with models.")

[1] "Done with models."


In [26]:
# Preview the first rows of the adjusted-model results
# (adjusted_pvalues and fold_change are filled in by later cells).
head(inactive_adjusted_linear_modeling_results)


Unnamed: 0_level_0,Protein,Adjusted_linear_model,linear_modeling_pvalue,adjusted_pvalues,fold_change,coefficient
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,CRBB2_10000.28,~ Study_group + Prednisone,0.460432058,0,0,51.48781
2,c.Raf_10001.7,~ Study_group + Prednisone + Methotrexate + Aspirin,0.001965581,0,0,-301.48136
3,ZNF41_10003.15,~ Study_group + Methotrexate,0.844371015,0,0,2.83606
4,ELK1_10006.25,~ Study_group,0.284338811,0,0,1023.15169
5,GUC1A_10008.43,~ Study_group,0.165504267,0,0,55.93024
6,BECN1_10010.10,~ Study_group + Age,0.058162557,0,0,70.36173


In [27]:
# Add the Benjamini-Hochberg (FDR) adjusted P-values.
# In p.adjust(), method = "hochberg" is Hochberg's family-wise error
# procedure, which is a different correction; Benjamini-Hochberg is "BH".
inactive_adjusted_linear_modeling_results[, 4] <- p.adjust(
    inactive_adjusted_linear_modeling_results$linear_modeling_pvalue,
    method = "BH"
)


In [28]:
# Mean abundance of every protein in the inactive GCA group and in the
# controls, plus the log2 ratio of the two means (inactive over control).
fold_changes <- data.frame(matrix(data = 0, nrow = num_proteins, ncol = 4))
colnames(fold_changes) <- c("Proteins", "Inactive_mean", "Control_mean", "fold_change")
fold_changes[, 1] <- protein_names
# One mean() per protein column; names are dropped so plain vectors are stored.
fold_changes[, 2] <- unname(vapply(inactive_proteins_only, mean, numeric(1)))
fold_changes[, 3] <- unname(vapply(controls_proteins_only, mean, numeric(1)))
fold_changes[, 4] <- log2(fold_changes[, 2] / fold_changes[, 3])

In [29]:
# Copy the log2 fold changes into the results table (column 5, fold_change).
inactive_adjusted_linear_modeling_results[, 5] <- fold_changes[, 4]

# Add the Target, Entrez and Uniprot annotations from the protein key.
protein_key <- read.csv("../../data/key_for_protein_names.csv")
# `$<-` on a data frame silently recycles a shorter vector whose length
# divides the row count, so require one key row per protein.
stopifnot(nrow(protein_key) == nrow(inactive_adjusted_linear_modeling_results))
# NOTE(review): the key is attached by position, which assumes its rows are in
# the same order as the protein columns of the abundance table, and that its
# columns 2 and 3 are the target name and Entrez ID - confirm against the file.
inactive_adjusted_linear_modeling_results$Target <- protein_key[, 2]
inactive_adjusted_linear_modeling_results$Entrez <- protein_key[, 3]
inactive_adjusted_linear_modeling_results$Uniprot <- protein_key$Uniprot

# Save the full results table (no row-name column).
write.csv(
    inactive_adjusted_linear_modeling_results,
    "../../analysis/linear_modeling_inactive_and_controls/inactive_adjusted_linear_modeling_results.csv",
    row.names = FALSE
)

In [30]:
# Keep the proteins whose study-group P-value is below 0.01, report how many
# there are, then sort them from most to least significant.
significant_adjusted_linear_modeling_results <- filter(
    inactive_adjusted_linear_modeling_results,
    linear_modeling_pvalue < 0.01
)
dim(significant_adjusted_linear_modeling_results)
significant_adjusted_linear_modeling_results <- significant_adjusted_linear_modeling_results[
    order(significant_adjusted_linear_modeling_results$linear_modeling_pvalue),
]
#head(significant_adjusted_linear_modeling_results)


In [31]:
# Split the significant proteins by direction of change and save each set:
# positive log2 fold change = higher in inactive GCA than in controls,
# negative log2 fold change = lower in inactive GCA than in controls.
higher_in_inactive_proteins <- filter(
    significant_adjusted_linear_modeling_results,
    fold_change > 0
)
dim(higher_in_inactive_proteins)
write.csv(
    higher_in_inactive_proteins,
    file = "../../analysis/linear_modeling_inactive_and_controls/higher_in_inactive_proteins.csv",
    row.names = FALSE
)

lower_in_inactive_proteins <- filter(
    significant_adjusted_linear_modeling_results,
    fold_change < 0
)
dim(lower_in_inactive_proteins)
write.csv(
    lower_in_inactive_proteins,
    file = "../../analysis/linear_modeling_inactive_and_controls/lower_in_inactive_proteins.csv",
    row.names = FALSE
)
