# Description

In [1]:
# libraries
library(Seurat)
library(tidyverse)
library(igraph)
require(circlize)
library(R.utils)
library(data.table) #to read gz file

The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)

Attaching SeuratObject

── [1mAttaching core tidyverse packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[

### Read in the expression data of interacting cells:
The dataset used here is publicly available single-cell data from XXX. The data was processed, and filtered by applying XXX. 

In [2]:
input_dir <- "../../../../../results/data_preprocessing/vanGalen_Hourigan/batch_corrected/"
output_dir <- "../../../../../results/method_comparison/compare_algorithms/vanGalen/CPDB/"
final_out <- "../../../../../results/method_comparison/compare_algorithms/vanGalen/CPDB/"

In [3]:
# # load counts
# print("load counts")
# counts <- read.table(gzfile(paste0(path_in,"/counts_corr.csv.gz")
#                             )
#                      ,sep = ","
#                      ,row.names = 1
#                      ,header = TRUE
#                      )
# # load counts

counts <- fread(paste0(input_dir,"counts_corr.csv.gz"), header = TRUE,check.names=FALSE)
counts <- as.data.frame(counts)
rownames(counts) <- counts$gene_symbol
counts <- counts[,-1]
# head(str(counts))
# print(str(counts))

In [11]:
# load cell annotation
print("load cell annotation")
anno_cells <- read.table(paste0(input_dir,"anno_cells_corr.txt")
                         ,sep = "\t"
                         ,row.names = 1
                         ,header = TRUE
                         #,check.names=FALSE
                         )
# print(str(anno_cells))

[1] "load cell annotation"


In [12]:
#set rownames of annotation to cell_ids
rownames(anno_cells) <- anno_cells$cell_ID

In [13]:
#set colnames of counts to cell_ids
colnames(counts) <- rownames(anno_cells)

In [17]:
#create a Seurat object
srt=CreateSeuratObject(counts=counts, meta.data=anno_cells)

In [18]:
#peek into the number of cells for case/control
srt@meta.data$health_status %>% table()

.
    AML healthy 
  11897   62686 

In [19]:
#peek into the number of cell types
srt@meta.data$cell_type %>% table()

.
    B    DC   Ery  HSPC  Mono     T 
 6662  3929  4973  7608 13558 37853 

In [20]:
#set the indent to cell_type
Idents(srt) <- "cell_type"

In [21]:
# initialize empty vector for storing DEGs
DEGs <- c()

# iterate over each unique cell type 
for (cell in unique(srt@meta.data$cell_type)) {
  
  # subset Seurat object to only include cells of current cell type
  seurat_obj_receiver <- subset(srt, idents = cell)
  
  # set cell identity using the "health_status" feature
  seurat_obj_receiver <- SetIdent(seurat_obj_receiver, value = seurat_obj_receiver[["health_status"]])
  
  # specify the two conditions to compare
  condition_oi <- "AML"
  condition_reference <- "healthy" 
  
  # find differentially expressed genes between the two conditions
  DE_table_receiver <- FindMarkers(object = seurat_obj_receiver, 
                                   ident.1 = condition_oi, 
                                   ident.2 = condition_reference, 
                                   min.pct = 0.10) %>%
    # convert row names to a separate "gene" column
    rownames_to_column("gene")
  
    
    
    if(nrow(DE_table_receiver) == 0){
        cluster <- cell
        gene <- c("None")
        avg_log2FC <- c(0)
        p_val_adj <- c(0)
        p_val <- c(0)
        pct.1<- c(0)
        pct.2<- c(0)
        DE_table_receiver <- data.frame(cluster,gene,p_val, avg_log2FC,pct.1,pct.2,p_val_adj)
        DEGs <- rbind(DEGs, DE_table_receiver)
        print(nrow(DE_table_receiver))
    }
    
    
    
  # add cell type information to the DEG table
  DE_table_receiver <- data.frame(cluster = cell, DE_table_receiver)
  
  # filter DEGs based on statistical significance and fold change threshold
  DE_table_receiver <- DE_table_receiver %>% 
    filter(p_val_adj <= 0.05 & abs(avg_log2FC) >= 0.25)
  
  # print cell type and number of DEGs found
  print(cell)
  print(nrow(DE_table_receiver))
  
  # append DEGs to the vector of all DEGs
  DEGs <- rbind(DEGs, DE_table_receiver)
}


[1] "HSPC"
[1] 8


“No features pass logfc.threshold threshold; returning empty data.frame”


[1] 1
[1] "Mono"
[1] 0
[1] "DC"
[1] 8
[1] "Ery"
[1] 17


“No features pass logfc.threshold threshold; returning empty data.frame”


[1] 1
[1] "T"
[1] 0
[1] "B"
[1] 12


In [22]:
# write.table(DEGs, file =paste0(output_dir,"samples_DEGs/DEGs.tsv"), sep = '\t', quote = F, row.names = F)

In [23]:
meta <- anno_cells["cell_type"] %>% rownames_to_column("Cell")

Below code takes an expression counts matrix (counts) and an annotation data frame (anno_cells) and writes out a separate counts file and metadata file for each sample ID in the sample_ID column of anno_cells. Each metadata file contains a single column (cell_type) and a row for each cell in the sample (required by CellPhoneDB), while the counts file contains the expression counts for each gene in each cell.

In [24]:
# create a directory "samples_DEGs" to save the subsetted counts and annotation files. 
dir.create(file.path(output_dir, "samples_DEGs"))

# loop over each unique sample ID in the "sample_ID" column of the "anno_cells" data frame
for (sample in unique(anno_cells$sample_ID)) {
  
  # filter the annotation data frame to include only cells from the current sample
  anno_filtered <- filter(anno_cells, sample_ID == sample)
  
  # subset the expression counts matrix to the current sample
  subset_counts <- counts[, rownames(anno_filtered)]
  
  # subset the annotation data frame (required by CellPhoneDB)
  subset_meta <- anno_filtered["cell_type"] %>% rownames_to_column("Cell")
    
  # subset DEGs
  subset_DEGs <- DEGs %>% filter(cluster %in% unique(subset_meta$cell_type))
  
  # write the subsetted annotation data frame to a tab-separated value (TSV) file
  write.table(subset_meta, paste0(output_dir,"samples_DEGs/", sample, "_meta.tsv"), sep = '\t', quote = F, row.names = F)
  
  # write the subsetted counts matrix to a TSV file
  write.table(subset_counts, paste0(output_dir,"samples_DEGs/", sample, "_counts.tsv"), sep = '\t', quote = F)

  write.table(subset_DEGs, paste0(output_dir,"samples_DEGs/", sample, "_DEGs.tsv"), sep = '\t', quote = F)



}


“'../../../../../results/method_comparison/compare_algorithms/vanGalen/CPDB//samples_DEGs' already exists”


Below is the content of shell script (`./runCPDB.sh`) that performs CellPhoneDB using DEG analysis method for each sample in the /samples_DEGs/ directory.

For each sample, the script creates a new directory `(${sample}_results)` to store the results of the CellPhoneDB analysis. The cellphonedb method degs_analysis command runs the DEG analysis method on the metadata and counts files for the current sample, using the `../DEGs.tsv` file as input for the list of differentially expressed genes. The `--database` option specifies the path to the CellPhoneDB database to use for the analysis, while the `--counts-data` option specifies the type of gene identifier used in the counts file (in this case, `hgnc_symbol`). The `--output-path` option specifies the directory where the analysis results will be saved.

`'./runCPDB.sh'`

```bash
# Set the directory path to the directory containing the DEG samples
samples_dir=../../../../../results/method_comparison/compare_algorithms/CPDB/samples_DEGs/

# Get a list of sample names
my_vars=$(ls "$samples_dir" | cut -d_ -f1 | uniq)

# Set the path to the custom database file
custom_db=../../../../../results/method_comparison/build_customDB/CPDB/custom_cellphone.db

# Loop over each sample variable name
for sample in $my_vars;
do
  # Create a subdirectory for the sample results
  mkdir ${samples_dir}${sample}_results;

  # Run CellPhoneDB's DEG analysis method on the sample using the custom database, with input files in the sample directory and output files in the sample results subdirectory
  cellphonedb method degs_analysis ${samples_dir}${sample}_meta.tsv ${samples_dir}${sample}_counts.tsv ${samples_dir}${sample}_DEGs.tsv --database $custom_db --counts-data hgnc_symbol --output-path ${samples_dir}${sample}_results/;
done;
```

In [15]:
#make sure cpdb is installed in the env.
run_CPDB <- './runCPDB.sh'

In [25]:
system('conda run -n cpdb ./runCPDB_vanGalen.sh')

### Restructure CellPhoneDB's outputs

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# !!!!!!!!!!!!!!!!!!!we gotta explain why we are using means file not the significant ones

In [26]:
results_dir <- list.dirs(path = paste0(output_dir,"samples_DEGs/"), full.names = TRUE)

In [27]:
results_dir <- results_dir[grepl("_results", results_dir, fixed = TRUE)]

In [28]:
# Define a function called 'restructure_result' that takes one argument, 'cpdb_means'
restructure_result <- function(cpdb_means) {
  
  # Subset the columns of 'cpdb_means' that contain 'interacting_pair' or '|'
  cpdb_means <- cpdb_means[, grepl('interacting_pair|\\|', colnames(cpdb_means))]
  
  # Pivot the data to long format and split the 'interacting_pair' column into 'sending_protein' and 'receiving_protein' columns
  # Split the 'cell_types' column into 'sending_celltype' and 'receiving_celltype' columns
  # Unite the 'sending_celltype' and 'sending_protein' columns into a single column called 'sender'
  # Unite the 'receiving_celltype' and 'receiving_protein' columns into a single column called 'receiver'
  # Unite the 'sender' and 'receiver' columns into a single column called 'interacting_pairs'
  # Select the 'interacting_pairs' and 'value' columns
  conversion <- cpdb_means %>%
    pivot_longer(cols = -interacting_pair, names_to = "cell_types", values_to = "value") %>%
    separate(interacting_pair, c("sending_protein", "receiving_protein"), sep = "_") %>%
    separate(cell_types, c("sending_celltype", "receiving_celltype"), sep = "\\|") %>%
    unite(sender, c("sending_celltype", "sending_protein"), sep = ":", remove = FALSE) %>%
    unite(receiver, c("receiving_celltype", "receiving_protein"), sep = ":", remove = FALSE) %>%
    unite(interacting_pairs, c("sender", "receiver"), sep = "_", remove = FALSE) %>%
    select(interacting_pairs, value)
  
  # Return the processed data
  return(conversion)
}


In [29]:
results=list()
for (sample in results_dir){
    
    file <- paste0(sample,"/relevant_interactions.txt")
    
    sample_id <- basename(sample)
    sample_id <- strsplit(sample_id, '_')[[1]][1]
    
    
    if (file.exists(file)){
        
        cpdb_means <- read.csv(file, sep = "\t",  check.names = FALSE)
        
        
        sample_result <- restructure_result(cpdb_means)
        colnames(sample_result) <- c("interaction_ID",sample_id)
        results[[sample_id]] <- sample_result
        
    }
    
}

In [30]:
means=list()
for (sample in results_dir){
    
    file <- paste0(sample,"/means.txt")
    
    sample_id <- basename(sample)
    sample_id <- strsplit(sample_id, '_')[[1]][1]
    
    
    if (file.exists(file)){
        
        cpdb_means <- read.csv(file, sep = "\t",  check.names = FALSE)
        
        
        sample_result <- restructure_result(cpdb_means)
        colnames(sample_result) <- c("interaction_ID",sample_id)
        means[[sample_id]] <- sample_result
        
    }
    
}

In [31]:
# Define a variable called `result` that will hold the output of the Reduce function
means <- Reduce(
  
  # The `Reduce()` function takes two arguments: a function and a list.
  # In this case, the function is an anonymous function defined using the `function()` keyword.
  # This function takes two arguments `x` and `y` and performs a full join between them using the `full_join()` function from the `dplyr` package.
  # The `by = "interaction"` argument specifies that the join should be performed on the "interaction" column.
  function(x, y) full_join(x, y, by = "interaction_ID"), 
  
  # The second argument to the `Reduce()` function is a list called `results`.
  # This list contains data frames that need to be joined together.
  means
)

In [32]:
# Define a variable called `result` that will hold the output of the Reduce function
matrix_result <- Reduce(
  
  # The `Reduce()` function takes two arguments: a function and a list.
  # In this case, the function is an anonymous function defined using the `function()` keyword.
  # This function takes two arguments `x` and `y` and performs a full join between them using the `full_join()` function from the `dplyr` package.
  # The `by = "interaction"` argument specifies that the join should be performed on the "interaction" column.
  function(x, y) full_join(x, y, by = "interaction_ID"), 
  
  # The second argument to the `Reduce()` function is a list called `results`.
  # This list contains data frames that need to be joined together.
  results
)

In [33]:
matrix_result[is.na(matrix_result)] <- 0

In [34]:
head(matrix_result)

interaction_ID,AML1012-D0,AML210A-D0,AML328-D0,AML419A-D0,AML420B-D0,AML556-D0,AML707B-D0,AML916-D0,AML921A-D0,⋯,BM-Q,BM-R,BM-Sk2,BM-T,BM-U,BM-W,BM1,BM2,BM3,BM4
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
DC:FN1_DC:VCAN,1,1,1,1,0,1,1,0,1,⋯,1,1,1,1,1,1,1,1,1,1
DC:FN1_Ery:VCAN,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
DC:FN1_HSPC:VCAN,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
DC:FN1_Mono:VCAN,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
DC:FN1_T:VCAN,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Ery:FN1_DC:VCAN,1,1,1,1,0,1,1,0,0,⋯,1,1,1,1,1,1,1,1,1,1


In [35]:
str(matrix_result)

tibble [1,824 × 34] (S3: tbl_df/tbl/data.frame)
 $ interaction_ID: chr [1:1824] "DC:FN1_DC:VCAN" "DC:FN1_Ery:VCAN" "DC:FN1_HSPC:VCAN" "DC:FN1_Mono:VCAN" ...
 $ AML1012-D0    : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML210A-D0    : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML328-D0     : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML419A-D0    : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML420B-D0    : int [1:1824] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML556-D0     : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML707B-D0    : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ AML916-D0     : int [1:1824] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML921A-D0    : int [1:1824] 1 0 0 0 0 0 0 0 0 0 ...
 $ BM-A          : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ BM-B          : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ BM-C1         : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ BM-E          : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ BM-F          : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ BM-G          : int [1:1824] 1 0 0 0 0 1 0 0 0 0 ...
 $ 

In [36]:
# str(matrix_result[rowSums(matrix_result[, -1] != 0, na.rm = TRUE) > 0, ])

In [37]:
str(matrix_result %>%
  filter(rowSums(. == 1) > 0))

tibble [405 × 34] (S3: tbl_df/tbl/data.frame)
 $ interaction_ID: chr [1:405] "DC:FN1_DC:VCAN" "Ery:FN1_DC:VCAN" "HSPC:FN1_DC:VCAN" "Mono:FN1_DC:VCAN" ...
 $ AML1012-D0    : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML210A-D0    : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML328-D0     : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML419A-D0    : int [1:405] 1 1 1 1 0 1 1 1 1 0 ...
 $ AML420B-D0    : int [1:405] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML556-D0     : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML707B-D0    : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML916-D0     : int [1:405] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML921A-D0    : int [1:405] 1 0 1 1 1 1 0 1 1 1 ...
 $ BM-A          : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-B          : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-C1         : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-E          : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-F          : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-G          : int [1:405] 1 1 1 1 1 1 1 1 1 1 ...
 $ BM-H          : in

In [38]:
matrix_result <- matrix_result[rowSums(matrix_result[, -1] != 0, na.rm = TRUE) > 0, ]

In [39]:
# Example list of strings
strings <- matrix_result$interaction_ID

# Initialize empty vectors for each column
sender_celltype <- c()
sender_gene <- c()
receiver_celltype <- c()
receiver_gene <- c()

# Loop through each string and split it
for (string in strings) {
  parts <- strsplit(string, "_")
  
  # Split the sender part
  sender_parts <- strsplit(parts[[1]][1], ":")
  sender_celltype <- c(sender_celltype, sender_parts[[1]][1])
  sender_gene <- c(sender_gene, sender_parts[[1]][2])
  
  # Split the receiver part
  receiver_parts <- strsplit(parts[[1]][2], ":")
  receiver_celltype <- c(receiver_celltype, receiver_parts[[1]][1])
  receiver_gene <- c(receiver_gene, receiver_parts[[1]][2])
}

# Create a dataframe with the splitted values
df <- data.frame(
  sender_celltype = sender_celltype,
  sender_gene = sender_gene,
  receiver_celltype = receiver_celltype,
  receiver_gene = receiver_gene
)



In [40]:
# Create interaction annotation df with log2FC values
for (row in 1:nrow(df)){
    each_row <- df[row,]
    sender_cell <- each_row$sender_celltype
    sender_gene <- each_row$sender_gene
    receiver_cell <- each_row$receiver_celltype
    receiver_gene <- each_row$receiver_gene
    
    ligand_log2FC <- subset(DEGs, cluster == sender_cell & gene == sender_gene)$avg_log2FC
    
    ligand_p_val_adj <- subset(DEGs, cluster == sender_cell & gene == sender_gene)$p_val_adj
    
    receptor_log2FC <- subset(DEGs, cluster == receiver_cell & gene == receiver_gene)$avg_log2FC
    
    receptor_p_val_adj <- subset(DEGs, cluster == receiver_cell & gene == receiver_gene)$p_val_adj

    if (length(ligand_log2FC) == 0) {
        df[row,"ligand_log2FC"] <- NA
        df[row,"ligand_p_val_adj"] <- NA
    } else {
        df[row,"ligand_log2FC"] <- ligand_log2FC
        df[row,"ligand_p_val_adj"] <- ligand_p_val_adj
    }
    
    if (length(receptor_log2FC) == 0) {
        df[row,"receptor_log2FC"] <- NA
        df[row,"receptor_p_val_adj"] <- NA
    } else {
        df[row,"receptor_log2FC"] <- receptor_log2FC
        df[row,"receptor_p_val_adj"] <- receptor_p_val_adj
    }
}


In [41]:
#see where both components (sender/receiever) have log2FC value
df[complete.cases(df$ligand_log2FC, df$receptor_log2FC), ]

Unnamed: 0_level_0,sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
111,DC,VCAN,DC,SELL,-0.2515238,2.4912720000000002e-40,-0.3316847,0.0
113,DC,VCAN,HSPC,SELL,-0.2515238,2.4912720000000002e-40,-0.3060212,0.0
352,DC,S100A8,B,CD69,-0.2773294,2.632622e-28,-0.2599869,6.425401000000001e-22


In [42]:
df["interaction_ID"] <- paste0(df$sender_celltype, ":",df$sender_gene , "_", df$receiver_celltype, ":", df$receiver_gene)

In [43]:
df[is.na(df$ligand_log2FC) & is.na(df$receptor_log2FC),]

sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj,interaction_ID
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>


### get log2FC_weights

In [44]:
#subset means
means <- filter(means, interaction_ID %in% df$interaction_ID)

In [45]:
cols <- c("interaction_ID", grep("healthy", names(means), value = TRUE))
control_means <- means[, cols, drop = FALSE]

In [46]:
cols <- c("interaction_ID", grep("AML", names(means), value = TRUE))
case_means <- means[, cols, drop = FALSE]

In [47]:
numeric_cols <- control_means[, !names(control_means) %in% c("interaction_ID")]
control_means$row_means <- rowMeans(numeric_cols, na.rm = TRUE)

In [48]:
numeric_cols <- case_means[, !names(case_means) %in% c("interaction_ID")]
case_means$row_means <- rowMeans(numeric_cols, na.rm = TRUE)

In [49]:
df["mean_weigth_case"]=NA
df["mean_weight_control"]=NA

In [50]:
for (row in 1:nrow(df)){
    int_ID <- df[row,]$interaction_ID
    case_mean <- filter(case_means, interaction_ID==int_ID)$row_means
    control_mean <- filter(control_means, interaction_ID==int_ID)$row_means
    
    df[row,"mean_weigth_case"] <- case_mean
    df[row,"mean_weight_control"] <- control_mean
}

In [51]:
df["log2FC_weights"] = log2(df$mean_weigth_case/df$mean_weight_control)

In [52]:
names(df)

In [53]:
df <- df %>% select(interaction_ID, sender_celltype, sender_gene, receiver_celltype,receiver_gene,
             ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj,mean_weigth_case,
                   mean_weight_control,log2FC_weights)

In [54]:
threshold_log2FC <- 1

In [55]:
# Creating a new column named 'direction' based on the conditions

df$direction <- ifelse(df$log2FC_weights > threshold_log2FC, "up",
                             ifelse(df$log2FC_weights < threshold_log2FC, "down",
                                    ifelse(df$log2FC_weights == threshold_log2FC, "unchanged", NA)))


### Significant weights
We possess a set of weights (referred to as "means" by CPDB) and a binary matrix containing values of 0 and 1 to indicate significance of DEGs (relevant_interactions.txt). In order to extract the significant weights, we perform a multiplication operation between the weight matrix and the significance matrix. By multiplying the two matrices element-wise, we retain only the significant weights while setting the non-significant weights to 0. Consequently, we obtain a matrix solely consisting of the significant weights.

In [56]:
# The purpose of using a for loop in this code snippet is to handle the mismatched order of rows between 
# the means dataframe and the binary matrix dataframe.

In [57]:
# Create an empty list to store the multiplied rows
multiplied_rows <- list()

# Iterate over the rows
for (i in 1:nrow(means)) {
  interaction_ID <- means$interaction_ID[i]
  
  # Find the matching row in the 'significant' dataframe based on 'interaction_ID'
  matching_row <- matrix_result[matrix_result$interaction_ID == interaction_ID, ]
  
  # Perform element-wise multiplication
  multiplied_values <- means[i, -1] * matching_row[, -1]
  
  # Create a row with interaction_ID and multiplied values
  row <- c(interaction_ID, multiplied_values)
  
  # Add the row to the list
  multiplied_rows[[i]] <- row
}

# Convert the list of rows into a dataframe
multiplied_df <- do.call(rbind, multiplied_rows)

colnames(multiplied_df) <- c("interaction_ID", colnames(means)[-1])

multiplied_df <- as.data.frame(multiplied_df)

In [58]:
# Convert columns to double data type
multiplied_df <- as.data.frame(multiplied_df) %>%
  mutate(across(-interaction_ID, as.double))

In [59]:
multiplied_df$interaction_ID <- as.character(multiplied_df$interaction_ID)

In [60]:
multiplied_df[is.na(multiplied_df)] <- 0

# fixing direction

Due to the lack of directions for pairs in CellPhoneDB, some of the pairs are swapped order (i.e: L1_R1 appears as R1_L1). To address this issue, we determine the interactions that contain swapped pairs and reorganize them in the correct order. This enables us to compare the interactions accurately.

In [61]:
library(community)

In [62]:
data(LR_database)

In [63]:
LR_DB <- LR_database

In [64]:
# LR_DB <- LR_DB %>% 
#         rename("Ligand" = "protein_name_a",
#                "Receptor" = "protein_name_b")

# LR_DB <- LR_DB[,-1]

In [65]:
df["pair"] <- paste0(df$sender_gene, "_", df$receiver_gene)
df["dup"] <- paste0(df$receiver_gene, "_", df$sender_gene)

In [66]:
# check if we have any duplicated swaps
df[df$pair %in% df$dup,]

interaction_ID,sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj,mean_weigth_case,mean_weight_control,log2FC_weights,direction,pair,dup
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>


In [67]:
# check if all the items that are not present in the original database exist as swapped pairs in 
#the original database.
identical(df[!df$pair %in% LR_DB$Pair.Name,]$interaction_ID,df[df$dup %in% LR_DB$Pair.Name,]$interaction_ID)

In [69]:
nrow(df)

In [70]:
fix_df = df[!df$pair %in% LR_DB$Pair.Name,]

In [71]:
df = df[df$pair %in% LR_DB$Pair.Name,]

In [72]:
nrow(df) + nrow(fix_df)

In [73]:
all(fix_df$dup %in% LR_DB$Pair.Name)

In [74]:
# create a pair column, makes it easier to check
multiplied_df$pair <- sapply(strsplit(multiplied_df$interaction_ID, "_"), function(x) {
  genes <- gsub(".*:", "", x)
  paste(genes, collapse = "_")
})

In [75]:
fix_multiplied_df = multiplied_df[!multiplied_df$pair %in% LR_DB$Pair.Name,]

In [76]:
multiplied_df = multiplied_df[multiplied_df$pair %in% LR_DB$Pair.Name,]

In [77]:
head(fix_multiplied_df)

Unnamed: 0_level_0,interaction_ID,AML1012-D0,AML210A-D0,AML328-D0,AML419A-D0,AML420B-D0,AML556-D0,AML707B-D0,AML916-D0,AML921A-D0,⋯,BM-R,BM-Sk2,BM-T,BM-U,BM-W,BM1,BM2,BM3,BM4,pair
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,DC:FN1_DC:VCAN,0.3,0.286,0.504,0.467,0,0.485,0.372,0,0.098,⋯,0.326,0.225,0.33,0.316,0.158,0.196,0.289,0.264,0.369,FN1_VCAN
2,Ery:FN1_DC:VCAN,0.3,0.287,0.437,0.458,0,0.484,0.355,0,0.0,⋯,0.323,0.223,0.324,0.308,0.157,0.196,0.29,0.264,0.369,FN1_VCAN
3,HSPC:FN1_DC:VCAN,0.0,0.288,0.445,0.458,0,0.485,0.359,0,0.098,⋯,0.325,0.223,0.327,0.311,0.157,0.196,0.29,0.264,0.369,FN1_VCAN
4,Mono:FN1_DC:VCAN,0.301,0.289,0.452,0.461,0,0.483,0.36,0,0.098,⋯,0.324,0.224,0.324,0.307,0.157,0.195,0.29,0.263,0.367,FN1_VCAN
5,T:FN1_DC:VCAN,0.3,0.289,0.437,0.0,0,0.485,0.355,0,0.096,⋯,0.326,0.225,0.325,0.308,0.159,0.197,0.291,0.264,0.369,FN1_VCAN
6,DC:LDLR_DC:VCAN,0.316,0.303,0.458,0.478,0,0.514,0.383,0,0.11,⋯,0.339,0.229,0.342,0.336,0.174,0.205,0.312,0.287,0.398,LDLR_VCAN


In [78]:
# # Split values by underscore and swap
fix_multiplied_df$interaction_ID <- sapply(strsplit(fix_multiplied_df$interaction_ID, "_"), function(x) paste(rev(x), collapse = "_"))

In [79]:
colnames(fix_df)

In [80]:
new_df <- data.frame(
  interaction_ID = fix_df$interaction_ID,
  sender_celltype = fix_df$receiver_celltype,
  sender_gene = fix_df$receiver_gene,
  receiver_celltype = fix_df$sender_celltype,
  receiver_gene = fix_df$sender_gene,
  ligand_log2FC = fix_df$receptor_log2FC,
  ligand_p_val_adj = fix_df$receptor_log2FC,
  receptor_log2FC = fix_df$ligand_log2FC,
  receptor_p_val_adj = fix_df$ligand_p_val_adj,
  mean_weigth_case = fix_df$mean_weigth_case,
  mean_weight_control = fix_df$mean_weight_control,
  log2FC_weights = fix_df$log2FC_weights,
  direction = fix_df$direction
  )

In [81]:
new_df["interaction_ID"] <- paste0(new_df$sender_celltype, ":",new_df$sender_gene , "_", new_df$receiver_celltype, ":", new_df$receiver_gene)

In [82]:
multiplied_df <- rbind(fix_multiplied_df,multiplied_df)

In [83]:
df <- df[, !(names(df) %in% c("pair","dup"))]

In [84]:
df <- rbind(new_df,df)

In [85]:
all(multiplied_df$interaction_ID %in% df$interaction_ID)

In [86]:
multiplied_df <- multiplied_df[, !(names(multiplied_df) %in% c("pair","dup"))]

In [87]:
head(multiplied_df)

Unnamed: 0_level_0,interaction_ID,AML1012-D0,AML210A-D0,AML328-D0,AML419A-D0,AML420B-D0,AML556-D0,AML707B-D0,AML916-D0,AML921A-D0,⋯,BM-Q,BM-R,BM-Sk2,BM-T,BM-U,BM-W,BM1,BM2,BM3,BM4
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,DC:VCAN_DC:FN1,0.3,0.286,0.504,0.467,0,0.485,0.372,0,0.098,⋯,0.325,0.326,0.225,0.33,0.316,0.158,0.196,0.289,0.264,0.369
2,DC:VCAN_Ery:FN1,0.3,0.287,0.437,0.458,0,0.484,0.355,0,0.0,⋯,0.0,0.323,0.223,0.324,0.308,0.157,0.196,0.29,0.264,0.369
3,DC:VCAN_HSPC:FN1,0.0,0.288,0.445,0.458,0,0.485,0.359,0,0.098,⋯,0.323,0.325,0.223,0.327,0.311,0.157,0.196,0.29,0.264,0.369
4,DC:VCAN_Mono:FN1,0.301,0.289,0.452,0.461,0,0.483,0.36,0,0.098,⋯,0.322,0.324,0.224,0.324,0.307,0.157,0.195,0.29,0.263,0.367
5,DC:VCAN_T:FN1,0.3,0.289,0.437,0.0,0,0.485,0.355,0,0.096,⋯,0.323,0.326,0.225,0.325,0.308,0.159,0.197,0.291,0.264,0.369
6,DC:VCAN_DC:LDLR,0.316,0.303,0.458,0.478,0,0.514,0.383,0,0.11,⋯,0.34,0.339,0.229,0.342,0.336,0.174,0.205,0.312,0.287,0.398


In [88]:
write.csv(multiplied_df, paste0(final_out,"CPDB_significant_weights.csv"))

In [89]:
write.csv(df, paste0(final_out,"CPDB_anno_interactions.csv"))

In [None]:
write.csv(matrix_result, paste0(final_out,"CPDB_weights.csv"))

In [46]:
anno <- read.csv(paste0(final_out,"CPDB_anno_interaction.csv"))

In [47]:
results <- read.csv(paste0(final_out,"CPDB_results.csv"))

In [48]:
threshold_log2FC <- 1

In [49]:
upregulated_anno <- anno[anno$log2>1,]

In [50]:
downregulated_anno <- anno[anno$log2<1,]

In [51]:
upregulated <- filter(results, interaction_ID %in% upregulated_anno$interaction_ID)

In [52]:
downregulated <- filter(results, interaction_ID %in% downregulated_anno$interaction_ID)

In [59]:
write.csv(upregulated_anno, paste0(final_out,"upregulated_anno.csv"))
write.csv(downregulated_anno, paste0(final_out,"downregulated_anno.csv"))
write.csv(upregulated, paste0(final_out,"upregulated.csv"))
write.csv(downregulated, paste0(final_out,"downregulated.csv"))

In [221]:
getwd()