In [3]:
import csv
import numpy as np
import math
from numpy import array
import pandas as pd
import time

## Auxiliary functions

In [4]:
def read_tabular_file(file_path):
    """Parse a tab-delimited text file into ``(first_field, other_fields)`` tuples.

    Every line is stripped of surrounding whitespace and split on tabs.  The
    first field is kept as a string; the remaining fields, *except the final
    one*, are kept as a list of strings.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    list of tuple
        One ``(str, list[str])`` tuple per line, in file order.  If the file
        does not exist, "File not found." is printed and an empty list is
        returned instead of raising.
    """
    records = []

    try:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                # str.split always yields at least one item, so the unpacking
                # below cannot fail, even on a blank line.
                first_field, *other_fields = raw_line.strip().split('\t')
                # NOTE(review): the last field of each line is discarded here
                # (same as slicing the split line with [1:-1]) - confirm that
                # this matches the layout of the gene-set file.
                records.append((first_field, other_fields[:-1]))
    except FileNotFoundError:
        print("File not found.")

    return records

## File paths

* gene_sets_file: txt file with gene set IDs and gene lists (tab delimited). For example see `files/gene_sets_example.txt`.
* full_human_ppin_file: full human PPIN data; the file is `files/edges_2021.dat`.
* exp_mat_file: expression matrix with gene_id as row names and cell_id as column names (tab delimited).

In [5]:
# Input locations, given relative to the notebook's working directory.
gene_sets_file = "./files/gene_sets.txt"  # tab-delimited gene sets (ID followed by genes)
exp_mat_file = './files/expression_matrix.txt'  # tab-delimited expression matrix
full_human_ppin_file = './files/edges_2021.dat'  # full human PPIN data

## Read expression matrix

In [6]:
genes = []       # gene IDs (first column), in file order
exp_matrix = []  # Ngenes x Ncells expression values

# The first row of the file holds the column headers; each following row is a
# gene ID followed by one expression value per column.
with open(exp_mat_file, 'r') as ff:
    readon = csv.reader(ff, delimiter='\t')
    headers = next(readon)
    for row in readon:
        gene_id, raw_values = row[0], row[1:]
        genes.append(gene_id)
        exp_matrix.append(list(map(float, raw_values)))

# Ncells x Ngenes view of the same values.
exp_matrix_t = np.asarray(exp_matrix).T

# Independent copy of the gene IDs in their original (file) order.
genes_original = list(genes)

Read gene sets to build the associated PPINs

In [7]:
# Parse the gene-set file into (ID, gene list) tuples, one per line of the file.
gene_sets_tab = read_tabular_file(gene_sets_file)
# One row per gene set: its identifier in "ID" and its list of genes in "gene_list".
gene_sets_df = pd.DataFrame(gene_sets_tab, columns=["ID", "gene_list"])

In [8]:
# Number of distinct genes listed for each gene set, as read from the file.
n_processes = [len(set(listed_genes)) for listed_genes in gene_sets_df['gene_list']]

# Copy of gene_sets_df carrying the counts in an extra 'N_processes' column;
# gene_sets_df itself is left untouched.
genes_subsets_df = gene_sets_df.assign(N_processes=n_processes)

Restrict the gene list of each process to the genes that are present in the expression matrix

In [9]:
# Build the lookup set of expression-matrix genes once; it does not change
# between gene sets, so there is no need to rebuild it inside the loop.
expressed_genes = set(genes)

# For every gene set keep only the genes that also appear in the expression
# matrix.  The order inside each resulting list is arbitrary (set order).
new_gene_list = [
    list(set(gene_set) & expressed_genes)
    for gene_set in gene_sets_df['gene_list']
]

genes_subsets_df['gene_list'] = new_gene_list
# Remember each gene set's row label before any rows are filtered out.
genes_subsets_df['gs_index'] = list(genes_subsets_df.index)

Remove processes with 5 or fewer genes

In [10]:
# Keep only the gene sets whose gene list has more than 5 entries.
has_enough_genes = genes_subsets_df['gene_list'].apply(len) > 5
genes_subsets_df = genes_subsets_df.loc[has_enough_genes]

# Column order with the last column moved to the front.
column_names = list(genes_subsets_df.columns)
index = column_names[-1:] + column_names[:-1]
final_df = genes_subsets_df[index]

# Plain Python lists of genes, one per retained gene set, each sorted by name.
s = pd.Series(final_df['gene_list'])
gene_list_final = s.tolist()
sorted_gene_list = list(map(sorted, gene_list_final))

# Sorts `genes` in place.
genes.sort()

# Identifier and original row label of every retained gene set.
go_bp_id_df = final_df[['ID', 'gs_index']]

## Load full PPIN

In [11]:
# Read the PPIN file without a header row.  The separator is the two-character
# pattern backslash + t, which the python engine treats as a regular
# expression matching a tab.
full_human_ppin = pd.read_csv(
    full_human_ppin_file,
    sep='\\t',
    header=None,
    engine='python',
)

# Same table as nested Python lists, with every field converted to str.
ppin_as_strings = full_human_ppin.to_numpy(dtype=str)
mat = ppin_as_strings.tolist()

# Number of genes in each retained gene set, and the matching gene-set IDs.
lengths_list = list(map(len, sorted_gene_list))
go_bp = final_df['ID'].tolist()

# Compute activity

In [None]:
# Compute, for every retained gene set, its PPIN-based activity in every cell
# and append the result to ./activities.txt (tab-delimited).
#
# For one gene set and one cell the activity is
#     sum over edges (i, j) of expr[i] * expr[j]  /  (n_edges * mean(expr))
# where the edges are the PPIN interactions whose two genes both belong to
# the gene set, and expr is the cell's expression vector.
#
# Notes on behaviour:
# * Gene names are matched exactly.  A substring test would also accept
#   interactions between genes whose names merely contain a set member.
# * A gene set is reported only if it has more matching PPIN rows than genes;
#   nothing at all (not even its ID) is written for the other gene sets, so
#   every line of the output is a complete row.
# * Values are written with 4 significant digits; slicing str(value) would
#   cut the exponent off numbers printed in scientific notation.


def _first_positions(names):
    """Map each name to the index of its first occurrence in ``names``."""
    positions = {}
    for position, name in enumerate(names):
        positions.setdefault(name, position)
    return positions


def _index_ppin_partners(ppin_rows, gene_position):
    """Group interaction partners by source gene.

    ``ppin_rows`` is an iterable of rows whose first two fields are the names
    of the two interacting genes.  Only rows whose two genes are both keys of
    ``gene_position`` are kept.  Returns ``{source: [target, ...]}`` with the
    targets in file order.
    """
    partners = {}
    for row in ppin_rows:
        source, target = row[0], row[1]
        if source in gene_position and target in gene_position:
            partners.setdefault(source, []).append(target)
    return partners


def _gene_set_edges(gene_set, partners, gene_position):
    """Collect the PPIN edges that connect two members of ``gene_set``.

    Returns ``(n_directed, edges)``:

    * ``n_directed`` - number of matching PPIN rows, counted before any
      de-duplication (an interaction listed in both directions counts twice);
    * ``edges`` - list of ``(i, j)`` index pairs taken from ``gene_position``,
      with each unordered pair kept only once.
    """
    members = set(gene_set)
    n_directed = 0
    seen_pairs = set()
    edges = []
    for source in dict.fromkeys(gene_set):  # unique genes, original order
        for target in partners.get(source, ()):
            if target not in members:
                continue
            n_directed += 1
            i, j = gene_position[source], gene_position[target]
            pair = (i, j) if i <= j else (j, i)
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                edges.append((i, j))
    return n_directed, edges


def _cell_activities(edges, cells, cell_means):
    """Return the normalised activity of one gene set in every cell.

    ``cells`` yields one expression vector per cell and ``cell_means`` the
    mean of each of those vectors.  A cell whose mean is zero produces
    ``nan`` (or ``inf``) rather than raising.
    """
    sources = np.array([i for i, _ in edges], dtype=np.intp)
    targets = np.array([j for _, j in edges], dtype=np.intp)
    n_edges = len(edges)
    activities = []
    with np.errstate(divide='ignore', invalid='ignore'):
        for cell, cell_mean in zip(cells, cell_means):
            cell = np.asarray(cell, dtype=float)
            raw_activity = np.dot(cell[sources], cell[targets])
            activities.append(raw_activity / np.float64(n_edges * cell_mean))
    return activities


# Work shared by all gene sets is done once, up front.
gene_position = _first_positions(genes_original)
ppin_partners = _index_ppin_partners(mat, gene_position)
cell_means = [np.mean(celu) for celu in exp_matrix_t]

# NOTE: the file is opened in append mode, so running this cell again adds a
# second header line and a second copy of the rows to an existing file.
with open('./activities.txt', 'a') as wfile:
    wfile.write('\t'.join(headers) + '\n')

    for process_id, gene_set in zip(go_bp, sorted_gene_list):
        n_directed, edges = _gene_set_edges(gene_set, ppin_partners, gene_position)

        # Skip gene sets that do not have more matching PPIN rows than genes.
        if n_directed <= len(gene_set):
            continue

        activities = _cell_activities(edges, exp_matrix_t, cell_means)

        # Each data row keeps its historical layout: the ID, then every value
        # followed by a tab (so the row ends with a tab before the newline).
        values_text = ''.join('{:.4g}\t'.format(value) for value in activities)
        wfile.write(process_id + '\t' + values_text + '\n')