# Description

In [1]:
# libraries
library(Seurat)
library(tidyverse)
library(igraph)
require(circlize)
library(R.utils)
library(data.table) #to read gz file

Attaching SeuratObject

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.2.1      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.5.0 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘igraph’


The following objects are masked from ‘package:dpl

### Read in the expression data of interacting cells:
The dataset used here is publicly available single-cell data from XXX. The data were processed and filtered by applying XXX.

In [2]:
# Locations of the preprocessed inputs and of the CellPhoneDB outputs,
# given relative to the notebook's working directory.
input_dir <- "../../../../../results/data_preprocessing/Lasry/preprocessed/"
output_dir <- "../../../../../results/method_comparison/compare_algorithms/CPDB/"
# The final tables are written to the same directory as the CellPhoneDB outputs.
final_out <- output_dir

In [3]:
# Load the corrected count matrix from the gzipped CSV with data.table::fread()
# (used here instead of the earlier read.table(gzfile(...)) approach).
counts_path <- paste0(input_dir, "counts_corr.csv.gz")
counts <- as.data.frame(fread(counts_path, header = TRUE, check.names = FALSE))

# Gene symbols become the row names; the first column is then dropped
# (assumes `gene_symbol` is the first column -- TODO confirm against the file).
rownames(counts) <- counts$gene_symbol
counts <- counts[, -1]

In [4]:
# Read the tab-separated cell annotation table; its first column is used as
# row names and column names are kept exactly as written in the file.
print("load cell annotation")
anno_path <- paste0(input_dir, "anno_cells_corr.txt")
anno_cells <- read.table(
  anno_path,
  sep = "\t",
  row.names = 1,
  header = TRUE,
  check.names = FALSE
)

[1] "load cell annotation"


In [5]:
# Index the annotation table by cell ID (taken from its `cell` column).
row.names(anno_cells) <- anno_cells$cell

In [6]:
# Label the count-matrix columns with the cell IDs from the annotation.
# The assignment is positional, so the two tables must at least describe the
# same number of cells; stop early if they do not.
# NOTE(review): the cell *order* cannot be verified here -- confirm that
# counts_corr.csv.gz and anno_cells_corr.txt list the cells in the same order.
stopifnot(ncol(counts) == nrow(anno_cells))
colnames(counts) <- rownames(anno_cells)

In [7]:
# Build the Seurat object from the count matrix, attaching the per-cell
# annotation as metadata.
srt <- CreateSeuratObject(
  counts = counts,
  meta.data = anno_cells
)

In [8]:
# Count the cells per `health_status` value (sanity check of group sizes).
srt@meta.data$health_status %>% table()

.
    AML healthy 
  21311   25391 

In [9]:
# Count the cells per `cell_type` value.
srt@meta.data$cell_type %>% table()

.
    B    DC   Ery  Gran  HSPC  Mono    NK     T 
 4765  1634  1674  2332  3169 18004  3078 12046 

In [10]:
# Use the `cell_type` metadata column as the active cell identity, so that
# subset(srt, idents = ...) selects cells by cell type.
Idents(srt) <- "cell_type"

In [11]:
# Differential expression between conditions, computed separately within each
# cell type. FindMarkers() compares ident.1 (condition of interest) against
# ident.2 (reference).
condition_oi <- "AML"
condition_reference <- "healthy"

# one filtered DEG table per cell type, stacked after the loop
deg_tables <- list()

for (cell in unique(srt@meta.data$cell_type)) {

  # cells of the current cell type, re-labelled by their health status
  cells_of_type <- subset(srt, idents = cell)
  cells_of_type <- SetIdent(cells_of_type, value = cells_of_type[["health_status"]])

  markers <- FindMarkers(
    object = cells_of_type,
    ident.1 = condition_oi,
    ident.2 = condition_reference,
    min.pct = 0.10
  )

  # gene names arrive as row names: move them into a `gene` column and
  # prepend the cell type as `cluster`
  markers <- data.frame(cluster = cell, rownames_to_column(markers, "gene"))

  # keep genes with adjusted p-value <= 0.05 and |avg_log2FC| >= 0.25
  markers <- filter(markers, p_val_adj <= 0.05 & abs(avg_log2FC) >= 0.25)

  # report the cell type and how many DEGs were kept
  print(cell)
  print(nrow(markers))

  deg_tables[[length(deg_tables) + 1]] <- markers
}

# stack the per-cell-type tables (NULL when there were no cell types)
DEGs <- do.call(rbind, deg_tables)


[1] "Mono"
[1] 178
[1] "Gran"
[1] 103
[1] "T"
[1] 48
[1] "NK"
[1] 164
[1] "B"
[1] 74
[1] "HSPC"
[1] 90
[1] "Ery"
[1] 645
[1] "DC"
[1] 56


In [12]:
# write.table(DEGs, file =paste0(output_dir,"samples_DEGs/DEGs.tsv"), sep = '\t', quote = F, row.names = F)

In [13]:
# Two-column metadata table: cell ID (from the row names) and cell type.
meta <- rownames_to_column(anno_cells["cell_type"], var = "Cell")

The code below takes the expression counts matrix (`counts`), the annotation data frame (`anno_cells`) and the DEG table (`DEGs`), and writes a separate counts file, metadata file and DEG file for each sample ID in the `sample_ID` column of `anno_cells`. Each metadata file contains the cell ID and its `cell_type`, with one row per cell in the sample (as required by CellPhoneDB); the counts file contains the expression counts for each gene in each cell of the sample; and the DEG file contains the DEGs of the cell types present in the sample.

In [14]:
# Write the CellPhoneDB input files (metadata, counts, DEGs) of every sample
# into <output_dir>/samples_DEGs.
# showWarnings = FALSE keeps re-runs quiet when the directory already exists;
# recursive = TRUE also creates missing parent directories.
dir.create(
  file.path(output_dir, "samples_DEGs"),
  showWarnings = FALSE,
  recursive = TRUE
)

# loop over each unique sample ID in the "sample_ID" column of `anno_cells`
for (sample in unique(anno_cells$sample_ID)) {

  # annotation rows (cells) that belong to the current sample
  anno_filtered <- filter(anno_cells, sample_ID == sample)

  # counts restricted to those cells (count columns are named by cell ID)
  subset_counts <- counts[, rownames(anno_filtered)]

  # metadata for CellPhoneDB: cell ID and cell type
  subset_meta <- anno_filtered["cell_type"] %>% rownames_to_column("Cell")

  # DEGs of the cell types that are present in this sample
  subset_DEGs <- DEGs %>% filter(cluster %in% unique(subset_meta$cell_type))

  # common file-name prefix of the three files of this sample
  sample_prefix <- paste0(output_dir, "samples_DEGs/", sample)

  write.table(subset_meta, paste0(sample_prefix, "_meta.tsv"),
              sep = "\t", quote = FALSE, row.names = FALSE)

  # row names (gene symbols) are written on purpose: they identify the genes
  write.table(subset_counts, paste0(sample_prefix, "_counts.tsv"),
              sep = "\t", quote = FALSE)

  # row.names = FALSE so that `cluster` and `gene` are the first two columns of
  # the file instead of being preceded by an unnamed column of row numbers.
  # NOTE(review): CellPhoneDB is documented to read cluster/gene from the first
  # two columns -- confirm against the CellPhoneDB version in use.
  write.table(subset_DEGs, paste0(sample_prefix, "_DEGs.tsv"),
              sep = "\t", quote = FALSE, row.names = FALSE)
}


“'../../../../../results/method_comparison/compare_algorithms/CPDB//samples_DEGs' already exists”


Below is the content of the shell script (`./runCPDB.sh`) that runs CellPhoneDB's DEG analysis method for each sample in the `samples_DEGs/` directory.

For each sample, the script creates a new directory (`${sample}_results`) to store the results of the CellPhoneDB analysis. The `cellphonedb method degs_analysis` command runs the DEG analysis method on the metadata and counts files of the current sample, using that sample's `${sample}_DEGs.tsv` file as the list of differentially expressed genes. The `--database` option specifies the path to the CellPhoneDB database to use for the analysis, while the `--counts-data` option specifies the type of gene identifier used in the counts file (in this case, `hgnc_symbol`). The `--output-path` option specifies the directory where the analysis results will be saved.

`'./runCPDB.sh'`

```bash
# Directory containing the per-sample input files
# (<sample>_meta.tsv, <sample>_counts.tsv, <sample>_DEGs.tsv)
samples_dir=../../../../../results/method_comparison/compare_algorithms/CPDB/samples_DEGs/

# Sample names: the part of each entry name before the first "_", de-duplicated.
# NOTE(review): this assumes sample IDs do not contain "_" themselves.
my_vars=$(ls "$samples_dir" | cut -d_ -f1 | uniq)

# Path to the custom CellPhoneDB database file
custom_db=../../../../../results/method_comparison/build_customDB/CPDB/custom_cellphone.db

# Run CellPhoneDB once per sample name
for sample in $my_vars;
do
  # Results directory of this sample (plain mkdir reports an error if it already exists)
  mkdir ${samples_dir}${sample}_results;

  # DEG analysis with the sample's metadata, counts and DEG files as inputs, the custom
  # database, HGNC symbols as gene identifiers, and the sample's results directory as output
  cellphonedb method degs_analysis ${samples_dir}${sample}_meta.tsv ${samples_dir}${sample}_counts.tsv ${samples_dir}${sample}_DEGs.tsv --database $custom_db --counts-data hgnc_symbol --output-path ${samples_dir}${sample}_results/;
done;
```

In [15]:
# Shell script that runs CellPhoneDB for every sample (listed above).
run_CPDB <- "./runCPDB.sh"

In [16]:
# Execute the CellPhoneDB script through the system shell.
system(command = run_CPDB)

“error in running command”


### Restructure CellPhoneDB's outputs

**TODO:** explain why we are using the means file and not the significant ones.

In [79]:
# Full paths of all directories found under samples_DEGs/.
samples_path <- paste0(output_dir, "samples_DEGs/")
results_dir <- list.dirs(path = samples_path, full.names = TRUE)

In [80]:
# Keep only the directories whose path contains the literal text "_results".
results_dir <- grep("_results", results_dir, fixed = TRUE, value = TRUE)

In [81]:
# Reshape a wide CellPhoneDB table into a two-column long table.
#
# cpdb_means: data frame with an `interacting_pair` column and one column per
#   cell-type pair, named "<sending_celltype>|<receiving_celltype>".
# Returns: a table with the columns
#   interacting_pairs - "<sending_celltype>:<sending_protein>_<receiving_celltype>:<receiving_protein>"
#   value             - the entry of the input table for that combination
restructure_result <- function(cpdb_means) {

  # keep `interacting_pair` and every column whose name contains "|"
  wanted <- grepl('interacting_pair|\\|', colnames(cpdb_means))
  long <- cpdb_means[, wanted]

  # one row per (interacting pair, cell-type pair)
  long <- pivot_longer(long, cols = -interacting_pair,
                       names_to = "cell_types", values_to = "value")

  # split the pair name into its two partners and the column name into its two cell types
  # NOTE(review): splitting on "_" assumes partner names contain no underscore -- confirm
  long <- separate(long, interacting_pair,
                   c("sending_protein", "receiving_protein"), sep = "_")
  long <- separate(long, cell_types,
                   c("sending_celltype", "receiving_celltype"), sep = "\\|")

  # "<celltype>:<protein>" for each side, then "<sender>_<receiver>"
  long <- unite(long, sender, c("sending_celltype", "sending_protein"),
                sep = ":", remove = FALSE)
  long <- unite(long, receiver, c("receiving_celltype", "receiving_protein"),
                sep = ":", remove = FALSE)
  long <- unite(long, interacting_pairs, c("sender", "receiver"),
                sep = "_", remove = FALSE)

  select(long, interacting_pairs, value)
}


In [82]:
# Collect the restructured relevant_interactions.txt table of every sample,
# keyed by sample ID.
results <- list()
for (sample in results_dir) {

  file <- paste0(sample, "/relevant_interactions.txt")

  # result directories without this file are skipped
  if (!file.exists(file)) {
    next
  }

  # sample ID = directory name up to the first "_"
  sample_id <- strsplit(basename(sample), "_", fixed = TRUE)[[1]][1]

  sample_result <- restructure_result(
    read.csv(file, sep = "\t", check.names = FALSE)
  )
  # the value column is named after the sample
  colnames(sample_result) <- c("interaction_ID", sample_id)
  results[[sample_id]] <- sample_result
}

In [83]:
# Collect the restructured means.txt table of every sample, keyed by sample ID.
means <- list()
for (sample in results_dir) {

  file <- paste0(sample, "/means.txt")

  # result directories without this file are skipped
  if (!file.exists(file)) {
    next
  }

  # sample ID = directory name up to the first "_"
  sample_id <- strsplit(basename(sample), "_", fixed = TRUE)[[1]][1]

  sample_result <- restructure_result(
    read.csv(file, sep = "\t", check.names = FALSE)
  )
  # the value column is named after the sample
  colnames(sample_result) <- c("interaction_ID", sample_id)
  means[[sample_id]] <- sample_result
}

In [84]:
# Merge the per-sample tables in `means` into one wide table with one row per
# interaction_ID and one column per sample. Reduce() applies the join pairwise
# over the list; a full join keeps interactions that are absent from some
# samples (their entries become NA).
join_by_interaction <- function(x, y) full_join(x, y, by = "interaction_ID")
means <- Reduce(join_by_interaction, means)

In [85]:
# Merge the per-sample tables in `results` into one wide table
# (`matrix_result`) with one row per interaction_ID and one column per sample.
# Reduce() applies the join pairwise over the list; a full join keeps
# interactions that are absent from some samples (their entries become NA).
join_by_interaction <- function(x, y) full_join(x, y, by = "interaction_ID")
matrix_result <- Reduce(join_by_interaction, results)

In [86]:
# Entries missing after the full join are set to 0.
matrix_result <- replace(matrix_result, is.na(matrix_result), 0)

In [87]:
# Preview the first rows of the merged per-sample matrix.
head(matrix_result)

interaction_ID,AML-0024,AML-0160,AML-0693,AML-1371,AML-2123,AML-3133,AML-4340,healthy-1,healthy-2,healthy-3,healthy-4,healthy-4003,healthy-5
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
B:GNAS_B:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0
B:GNAS_DC:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0
B:GNAS_Ery:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0
B:GNAS_Gran:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0
B:GNAS_HSPC:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0
B:GNAS_Mono:ADRB2,0,0,0,0,0,0,0,0,0,0,0,0,0


In [88]:
# Show dimensions and column types of the merged per-sample matrix.
str(matrix_result)

tibble [22,502 × 14] (S3: tbl_df/tbl/data.frame)
 $ interaction_ID: chr [1:22502] "B:GNAS_B:ADRB2" "B:GNAS_DC:ADRB2" "B:GNAS_Ery:ADRB2" "B:GNAS_Gran:ADRB2" ...
 $ AML-0024      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-0160      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-0693      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-1371      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-2123      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-3133      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-4340      : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-1     : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-2     : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-3     : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-4     : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-4003  : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-5     : int [1:22502] 0 0 0 0 0 0 0 0 0 0 ...


In [89]:
# str(matrix_result[rowSums(matrix_result[, -1] != 0, na.rm = TRUE) > 0, ])

In [90]:
# Inspect only the rows in which at least one entry equals 1.
str(matrix_result %>%
  filter(rowSums(. == 1) > 0))

tibble [2,421 × 14] (S3: tbl_df/tbl/data.frame)
 $ interaction_ID: chr [1:2421] "Ery:GNAS_DC:ADRB2" "Ery:GNAS_Gran:ADRB2" "Ery:GNAS_HSPC:ADRB2" "Ery:GNAS_Mono:ADRB2" ...
 $ AML-0024      : int [1:2421] 0 0 0 0 0 1 0 0 0 0 ...
 $ AML-0160      : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-0693      : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-1371      : int [1:2421] 1 1 1 1 1 1 1 1 1 1 ...
 $ AML-2123      : int [1:2421] 0 0 0 0 1 0 0 0 0 0 ...
 $ AML-3133      : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ AML-4340      : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-1     : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-2     : int [1:2421] 0 1 0 0 0 0 0 1 0 0 ...
 $ healthy-3     : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...
 $ healthy-4     : int [1:2421] 1 0 0 0 1 0 1 0 0 0 ...
 $ healthy-4003  : int [1:2421] 0 1 0 0 0 0 0 1 0 0 ...
 $ healthy-5     : int [1:2421] 0 0 0 0 0 0 0 0 0 0 ...


In [91]:
# Drop interactions whose sample columns (everything but the first column)
# are all zero.
has_nonzero <- rowSums(matrix_result[, -1] != 0, na.rm = TRUE) > 0
matrix_result <- matrix_result[has_nonzero, ]

In [92]:
# Split every interaction ID of the form
#   "<sender_celltype>:<sender_gene>_<receiver_celltype>:<receiver_gene>"
# into its four components. The split is vectorised instead of growing four
# vectors inside a loop, and an empty input yields a 0-row data frame that
# still has all four columns.
strings <- matrix_result$interaction_ID

# Return the `index`-th piece of each element of `x` after splitting on the
# literal separator `sep` (NA where an element has fewer pieces).
nth_piece <- function(x, sep, index) {
  pieces <- strsplit(as.character(x), sep, fixed = TRUE)
  vapply(pieces, function(p) p[index], character(1))
}

# "<celltype>:<gene>" of the sender (before "_") and the receiver (after "_")
sender_part <- nth_piece(strings, "_", 1L)
receiver_part <- nth_piece(strings, "_", 2L)

sender_celltype <- nth_piece(sender_part, ":", 1L)
sender_gene <- nth_piece(sender_part, ":", 2L)
receiver_celltype <- nth_piece(receiver_part, ":", 1L)
receiver_gene <- nth_piece(receiver_part, ":", 2L)

# One row per interaction, in the order of matrix_result
df <- data.frame(
  sender_celltype = sender_celltype,
  sender_gene = sender_gene,
  receiver_celltype = receiver_celltype,
  receiver_gene = receiver_gene
)



In [93]:
# Annotate every interaction with the DEG statistics of its ligand
# (sender cell type + sender gene) and of its receptor (receiver cell type +
# receiver gene). Entries without a DEG record for that cell type/gene are NA.
#
# The lookup is done once for all rows with match() on a combined
# "<cluster>\t<gene>" key, replacing four subset() scans of DEGs per row; it
# also works for a 0-row `df`, where `1:nrow(df)` would iterate over c(1, 0).
deg_key <- paste(DEGs$cluster, DEGs$gene, sep = "\t")

ligand_row <- match(
  paste(df$sender_celltype, df$sender_gene, sep = "\t"),
  deg_key
)
receptor_row <- match(
  paste(df$receiver_celltype, df$receiver_gene, sep = "\t"),
  deg_key
)

# indexing with NA (no match) yields NA in the new columns
df$ligand_log2FC <- DEGs$avg_log2FC[ligand_row]
df$ligand_p_val_adj <- DEGs$p_val_adj[ligand_row]
df$receptor_log2FC <- DEGs$avg_log2FC[receptor_row]
df$receptor_p_val_adj <- DEGs$p_val_adj[receptor_row]


In [94]:
# Show the interactions for which both the ligand and the receptor have a
# log2FC value.
df[!is.na(df$ligand_log2FC) & !is.na(df$receptor_log2FC), ]

Unnamed: 0_level_0,sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
103,Ery,MIF,Mono,ACKR3,-0.4807752,5.612281e-82,0.2595831,0.000000e+00
216,Ery,MIF,HSPC,CD44,-0.4807752,5.612281e-82,0.3099310,2.781776e-82
219,Ery,MIF,T,CD44,-0.4807752,5.612281e-82,0.2663509,7.130473e-170
290,NK,VIM,HSPC,CD44,0.2885934,8.814570e-40,0.3099310,2.781776e-82
293,NK,VIM,T,CD44,0.2885934,8.814570e-40,0.2663509,7.130473e-170
314,T,SRGN,HSPC,CD44,0.3019970,0.000000e+00,0.3099310,2.781776e-82
317,T,SRGN,T,CD44,0.3019970,0.000000e+00,0.2663509,7.130473e-170
427,Mono,ACKR3,Mono,ADM,0.2595831,0.000000e+00,0.4564719,0.000000e+00
441,Ery,CALM1,B,SELL,-0.4362474,1.540198e-89,-0.2740404,1.736327e-53
553,Gran,ITGA4,B,CD81,0.2693447,9.548232e-49,-0.2748321,2.954843e-56


In [95]:
# Rebuild the interaction ID "<sender_celltype>:<sender_gene>_<receiver_celltype>:<receiver_gene>"
# from the four component columns.
df$interaction_ID <- with(
  df,
  paste0(sender_celltype, ":", sender_gene, "_", receiver_celltype, ":", receiver_gene)
)

In [96]:
# Show the interactions for which neither the ligand nor the receptor has a
# log2FC value (the run recorded below returned no such rows).
df[is.na(df$ligand_log2FC) & is.na(df$receptor_log2FC),]

sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj,interaction_ID
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>


### get log2FC_weights

In [97]:
# Restrict the means table to the interactions annotated in `df`.
means <- means %>%
  filter(interaction_ID %in% df$interaction_ID)

In [98]:
# Control table: interaction_ID plus every column whose name contains "healthy".
healthy_cols <- grep("healthy", names(means), value = TRUE)
control_means <- means[, c("interaction_ID", healthy_cols), drop = FALSE]

In [99]:
# Case table: interaction_ID plus every column whose name contains "AML".
aml_cols <- grep("AML", names(means), value = TRUE)
case_means <- means[, c("interaction_ID", aml_cols), drop = FALSE]

In [100]:
# Mean per interaction over all control columns except interaction_ID
# (NA entries are ignored).
control_value_cols <- setdiff(names(control_means), "interaction_ID")
control_means$row_means <- rowMeans(control_means[, control_value_cols], na.rm = TRUE)

In [101]:
# Mean per interaction over all case columns except interaction_ID
# (NA entries are ignored).
case_value_cols <- setdiff(names(case_means), "interaction_ID")
case_means$row_means <- rowMeans(case_means[, case_value_cols], na.rm = TRUE)

In [102]:
# Placeholder columns for the mean weights of case and control samples.
# (The spelling "mean_weigth_case" is kept as the column name used downstream.)
df["mean_weigth_case"] <- NA
df["mean_weight_control"] <- NA

In [103]:
# Copy the per-interaction mean weight of the case and control samples into
# `df`. The rows are aligned by interaction_ID with match(), replacing a
# per-row filter(): this is a single pass, it works for a 0-row `df`
# (`1:nrow(df)` would iterate over c(1, 0)), and an interaction missing from
# the means tables gets NA instead of raising a zero-length replacement error.
# (The spelling "mean_weigth_case" is kept as the column name used downstream.)
case_row <- match(df$interaction_ID, case_means$interaction_ID)
control_row <- match(df$interaction_ID, control_means$interaction_ID)

df$mean_weigth_case <- case_means$row_means[case_row]
df$mean_weight_control <- control_means$row_means[control_row]

In [104]:
# log2 ratio of the mean case weight to the mean control weight.
df$log2FC_weights <- log2(df$mean_weigth_case / df$mean_weight_control)

In [105]:
# List the current column names of the annotation table.
names(df)

In [106]:
# Keep the annotation columns in a fixed order, interaction_ID first.
annotation_cols <- c(
  "interaction_ID", "sender_celltype", "sender_gene",
  "receiver_celltype", "receiver_gene",
  "ligand_log2FC", "ligand_p_val_adj",
  "receptor_log2FC", "receptor_p_val_adj",
  "mean_weigth_case", "mean_weight_control", "log2FC_weights"
)
df <- select(df, all_of(annotation_cols))

In [107]:
# Cutoff applied to log2FC_weights when assigning the `direction` label.
threshold_log2FC <- 1

In [108]:
# Label every interaction by the direction of its weight change:
#   "up"        log2FC_weights >  threshold_log2FC
#   "down"      log2FC_weights < -threshold_log2FC
#   "unchanged" anything in between (bounds included)
#   NA          log2FC_weights is NA/NaN (ifelse() propagates the missing test)
# The lower bound is the negated threshold; comparing against the positive
# threshold labelled every value below +threshold_log2FC as "down".
df$direction <- ifelse(
  df$log2FC_weights > threshold_log2FC, "up",
  ifelse(df$log2FC_weights < -threshold_log2FC, "down", "unchanged")
)


### Significant weights
We possess a set of weights (referred to as "means" by CPDB) and a binary matrix containing values of 0 and 1 to indicate significance of DEGs (relevant_interactions.txt). In order to extract the significant weights, we perform a multiplication operation between the weight matrix and the significance matrix. By multiplying the two matrices element-wise, we retain only the significant weights while setting the non-significant weights to 0. Consequently, we obtain a matrix solely consisting of the significant weights.

In [109]:
# The purpose of using a for loop in this code snippet is to handle the mismatched order of rows between 
# the means dataframe and the binary matrix dataframe.

In [110]:
# Keep only the significant weights: multiply every value of `means` by the
# corresponding entry of `matrix_result`.
# Rows are aligned by interaction_ID and columns by sample name, so neither
# the row order nor the column order of the two tables has to agree. A
# row-by-row loop with positional columns would mis-assign values if the
# sample columns ever came in a different order, and it builds a list-matrix.
sample_cols <- setdiff(colnames(means), "interaction_ID")

# every sample column of `means` must also exist in `matrix_result`
stopifnot(all(sample_cols %in% colnames(matrix_result)))

# row of matrix_result that belongs to each row of means (NA if absent)
matching_rows <- match(means$interaction_ID, matrix_result$interaction_ID)

weight_values <- as.matrix(means[, sample_cols, drop = FALSE])
significance_values <- as.matrix(matrix_result[matching_rows, sample_cols, drop = FALSE])

# element-wise product; check.names = FALSE keeps sample names such as "AML-0024"
multiplied_df <- data.frame(
  interaction_ID = means$interaction_ID,
  weight_values * significance_values,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

In [111]:
# Make sure every column except interaction_ID is stored as double.
multiplied_df <- as.data.frame(multiplied_df)
multiplied_df <- mutate(multiplied_df, across(-interaction_ID, as.double))

In [112]:
# Store the interaction IDs as a plain character vector.
multiplied_df[["interaction_ID"]] <- as.character(multiplied_df[["interaction_ID"]])

In [113]:
# Missing products are treated as 0.
multiplied_df <- replace(multiplied_df, is.na(multiplied_df), 0)

# fixing direction

Because CellPhoneDB does not record a direction for its pairs, some pairs appear in swapped order (i.e. L1_R1 appears as R1_L1). To address this, we identify the interactions that contain swapped pairs and put them back in the correct order, which allows the interactions to be compared accurately.

In [114]:
# Ligand-receptor reference table; its `Pair.Name` column is used below to
# check the orientation of each pair.
LR_DB <- read.csv(file = "../build_customDB/LR_database.csv")

In [115]:
# LR_DB <- LR_DB %>% 
#         rename("Ligand" = "protein_name_a",
#                "Receptor" = "protein_name_b")

# LR_DB <- LR_DB[,-1]

In [116]:
# `pair` is the gene pair as reported ("<sender_gene>_<receiver_gene>");
# `dup` is the same pair in reversed order.
df$pair <- paste(df$sender_gene, df$receiver_gene, sep = "_")
df$dup <- paste(df$receiver_gene, df$sender_gene, sep = "_")

In [117]:
# Show the interactions whose gene pair also occurs in reversed order
# somewhere in `df` (the run recorded below returned no such rows).
df[df$pair %in% df$dup,]

interaction_ID,sender_celltype,sender_gene,receiver_celltype,receiver_gene,ligand_log2FC,ligand_p_val_adj,receptor_log2FC,receptor_p_val_adj,mean_weigth_case,mean_weight_control,log2FC_weights,direction,pair,dup
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>


In [118]:
# Check that the interactions whose pair is absent from the reference
# database are exactly the interactions whose reversed pair is present in it.
not_in_db <- df[!df$pair %in% LR_DB$Pair.Name, ]$interaction_ID
reversed_in_db <- df[df$dup %in% LR_DB$Pair.Name, ]$interaction_ID
identical(not_in_db, reversed_in_db)

In [119]:
# Interactions whose pair is not listed in the reference database (to be re-oriented).
fix_df <- df[!(df$pair %in% LR_DB$Pair.Name), ]

In [120]:
# Interactions whose pair is listed in the reference database as reported.
df <- df[df$pair %in% LR_DB$Pair.Name, ]

In [121]:
# Add a `pair` column ("<gene>_<gene>") to make the database check easier:
# split each interaction ID on "_", strip everything up to the last ":" of
# each half (the cell type), and join the remaining gene names with "_".
# vapply() guarantees a character vector even when there are no rows, where
# sapply() would return an empty list.
multiplied_df$pair <- vapply(
  strsplit(multiplied_df$interaction_ID, "_"),
  function(x) {
    genes <- gsub(".*:", "", x)
    paste(genes, collapse = "_")
  },
  character(1)
)

In [122]:
# Weight rows whose pair is not listed in the reference database (to be re-oriented).
fix_multiplied_df <- multiplied_df[!(multiplied_df$pair %in% LR_DB$Pair.Name), ]

In [123]:
# Weight rows whose pair is listed in the reference database as reported.
multiplied_df <- multiplied_df[multiplied_df$pair %in% LR_DB$Pair.Name, ]

In [124]:
# Preview the weight rows that need to be re-oriented.
head(fix_multiplied_df)

Unnamed: 0_level_0,interaction_ID,AML-0024,AML-0160,AML-0693,AML-1371,AML-2123,AML-3133,AML-4340,healthy-1,healthy-2,healthy-3,healthy-4,healthy-4003,healthy-5,pair
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
43,NK:CD3D_B:HLA-C,0,0,0,1.099,0.808,0,0,0,0,0,0,0,0,CD3D_HLA-C
44,NK:CD3D_DC:HLA-C,0,0,0,0.99,0.0,0,0,0,0,0,0,0,0,CD3D_HLA-C
45,NK:CD3D_Ery:HLA-C,0,0,0,0.598,0.355,0,0,0,0,0,0,0,0,CD3D_HLA-C
46,NK:CD3D_Gran:HLA-C,0,0,0,0.919,0.769,0,0,0,0,0,0,0,0,CD3D_HLA-C
47,NK:CD3D_HSPC:HLA-C,0,0,0,0.758,0.598,0,0,0,0,0,0,0,0,CD3D_HLA-C
48,NK:CD3D_Mono:HLA-C,0,0,0,1.041,0.757,0,0,0,0,0,0,0,0,CD3D_HLA-C


In [125]:
# Swap the two halves of every interaction ID ("A_B" becomes "B_A") by
# splitting on "_" and re-joining the pieces in reverse order.
# vapply() guarantees a character vector even when there are no rows to fix,
# where sapply() would return an empty list.
fix_multiplied_df$interaction_ID <- vapply(
  strsplit(fix_multiplied_df$interaction_ID, "_"),
  function(x) paste(rev(x), collapse = "_"),
  character(1)
)

In [126]:
# List the columns of the annotation rows that need to be re-oriented.
colnames(fix_df)

In [127]:
# Re-orient the swapped interactions: the reported receiver becomes the sender
# (ligand side) and the reported sender becomes the receiver (receptor side),
# together with their DEG statistics. Weight columns and direction are copied
# unchanged.
# ligand_p_val_adj is taken from receptor_p_val_adj; it was previously filled
# with receptor_log2FC, which put fold changes into the p-value column.
new_df <- data.frame(
  interaction_ID = fix_df$interaction_ID,
  sender_celltype = fix_df$receiver_celltype,
  sender_gene = fix_df$receiver_gene,
  receiver_celltype = fix_df$sender_celltype,
  receiver_gene = fix_df$sender_gene,
  ligand_log2FC = fix_df$receptor_log2FC,
  ligand_p_val_adj = fix_df$receptor_p_val_adj,
  receptor_log2FC = fix_df$ligand_log2FC,
  receptor_p_val_adj = fix_df$ligand_p_val_adj,
  mean_weigth_case = fix_df$mean_weigth_case,
  mean_weight_control = fix_df$mean_weight_control,
  log2FC_weights = fix_df$log2FC_weights,
  direction = fix_df$direction
)

In [128]:
# Rebuild the interaction ID from the re-oriented sender/receiver columns.
new_df$interaction_ID <- with(
  new_df,
  paste0(sender_celltype, ":", sender_gene, "_", receiver_celltype, ":", receiver_gene)
)

In [129]:
# Stack the re-oriented weight rows on top of the rows that were already oriented.
multiplied_df <- rbind(
  fix_multiplied_df,
  multiplied_df
)

In [130]:
# Remove the helper columns `pair` and `dup` from the annotation table.
df <- df[, setdiff(names(df), c("pair", "dup"))]

In [131]:
# Stack the re-oriented annotation rows on top of the rows that were already oriented.
df <- rbind(
  new_df,
  df
)

In [132]:
# Remove the helper columns `pair` and `dup` (where present) from the weight table.
multiplied_df <- multiplied_df[, setdiff(names(multiplied_df), c("pair", "dup"))]

In [133]:
# Preview the final table of significant weights.
head(multiplied_df)

Unnamed: 0_level_0,interaction_ID,AML-0024,AML-0160,AML-0693,AML-1371,AML-2123,AML-3133,AML-4340,healthy-1,healthy-2,healthy-3,healthy-4,healthy-4003,healthy-5
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
43,B:HLA-C_NK:CD3D,0,0,0,1.099,0.808,0,0,0,0,0,0,0,0
44,DC:HLA-C_NK:CD3D,0,0,0,0.99,0.0,0,0,0,0,0,0,0,0
45,Ery:HLA-C_NK:CD3D,0,0,0,0.598,0.355,0,0,0,0,0,0,0,0
46,Gran:HLA-C_NK:CD3D,0,0,0,0.919,0.769,0,0,0,0,0,0,0,0
47,HSPC:HLA-C_NK:CD3D,0,0,0,0.758,0.598,0,0,0,0,0,0,0,0
48,Mono:HLA-C_NK:CD3D,0,0,0,1.041,0.757,0,0,0,0,0,0,0,0


In [134]:
# Save the significant weights (one row per interaction, one column per sample).
write.csv(multiplied_df, file = paste0(final_out, "CPDB_significant_weights.csv"))

In [135]:
# Save the interaction annotation table.
write.csv(df, file = paste0(final_out, "CPDB_anno_interaction.csv"))

In [None]:
# Save `matrix_result` as CPDB_weights.csv.
# NOTE(review): `matrix_result` holds the 0/1 entries read from
# relevant_interactions.txt, while the file name says "weights" -- confirm
# that this is the intended table for this file.
write.csv(matrix_result, file = paste0(final_out, "CPDB_weights.csv"))

In [46]:
# Re-load the interaction annotation table written above.
anno <- read.csv(file = paste0(final_out, "CPDB_anno_interaction.csv"))

In [47]:
# Load the CellPhoneDB results table.
# NOTE(review): no cell in this notebook writes CPDB_results.csv -- confirm
# which step produces this file.
results <- read.csv(file = paste0(final_out, "CPDB_results.csv"))

In [48]:
# log2 fold-change cutoff (same value as used for the `direction` column).
threshold_log2FC <- 1

In [49]:
# Interactions whose weight increases in the case samples:
# log2FC_weights above +threshold_log2FC.
# The column is addressed by its full name (not the partial match `$log2`),
# the cutoff comes from threshold_log2FC, and which() drops rows with a
# missing log2FC instead of returning them as all-NA rows.
upregulated_anno <- anno[which(anno$log2FC_weights > threshold_log2FC), ]

In [50]:
# Interactions whose weight decreases in the case samples:
# log2FC_weights below -threshold_log2FC. Comparing against +1 would also
# select unchanged and mildly increased interactions.
# The column is addressed by its full name (not the partial match `$log2`),
# and which() drops rows with a missing log2FC instead of returning them as
# all-NA rows.
downregulated_anno <- anno[which(anno$log2FC_weights < -threshold_log2FC), ]

In [51]:
# Result rows of the up-regulated interactions.
upregulated <- results %>%
  filter(interaction_ID %in% upregulated_anno$interaction_ID)

In [52]:
# Result rows of the down-regulated interactions.
downregulated <- results %>%
  filter(interaction_ID %in% downregulated_anno$interaction_ID)

In [59]:
# Save the annotation and result tables of the up- and down-regulated interactions.
write.csv(upregulated_anno, file = paste0(final_out, "upregulated_anno.csv"))
write.csv(downregulated_anno, file = paste0(final_out, "downregulated_anno.csv"))
write.csv(upregulated, file = paste0(final_out, "upregulated.csv"))
write.csv(downregulated, file = paste0(final_out, "downregulated.csv"))

In [221]:
# Show the working directory that the relative paths above are resolved against.
getwd()