# Imports, Initialization, and Hyperparameters

In [1]:
import pandas as pd
import torch
import numpy as np
import math
import scipy.linalg
import operator as op
import copy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
## Hyperparameters (every tunable value of the pipeline lives in this cell)

# K: maximum hop distance; get_s_features accumulates one degree histogram per hop k = 1..K
K = 2

# gamma: hop discount factor; the histogram of hop k is weighted by gamma**(k-1)
gamma = 0.01

# eta: node degree threshold; only proteins with degree strictly greater than eta enter the guiding list
eta = 15

# phi: target compression ratio; compression stops once 1 - (compressed edge count / original edge count) >= phi
phi = 0.2

In [3]:
# Load the PPI edge list (tab-separated, columns Protein1 / Protein2), discard
# self-interactions, and collect every protein that still appears in an edge.
# NOTE(review): the path below is absolute and machine-specific; consider a configurable data directory.
ppi_path = r"C:\Users\colef\OneDrive - University of Miami\Documents\College\Research\Wuchty Lab\Data\HI-union.tsv"
data = pd.read_csv(ppi_path, sep='\t')

# A row is a self-loop when both endpoints are the same protein.
is_self_loop = data['Protein1'] == data['Protein2']
data = data.loc[~is_self_loop].reset_index(drop=True)

# Same insertion order as before: all Protein1 entries first, then all Protein2 entries.
protein_set = set(data['Protein1'].tolist() + data['Protein2'].tolist())

## Data Processing

### Helper Functions

In [4]:
def get_neighbors(protein,data):
    """Return the set of proteins that share at least one edge with `protein`.

    Parameters
    ----------
    protein : label looked up in the 'Protein1' and 'Protein2' columns.
    data : DataFrame with columns 'Protein1' and 'Protein2', one edge per row.

    Returns
    -------
    set
        Every endpoint of a row touching `protein`, excluding `protein` itself.
        Empty when `protein` does not occur in `data`.
    """
    touching = data.loc[(data['Protein1'] == protein) | (data['Protein2'] == protein)]
    neighbors = set(touching['Protein1']) | set(touching['Protein2'])
    # discard (not remove): a protein with no rows must yield an empty set
    # instead of raising KeyError.
    neighbors.discard(protein)
    return neighbors

In [5]:
def get_degree(protein,neighbor_dict):
    """Return how many neighbors `neighbor_dict` records for `protein`."""
    recorded_neighbors = neighbor_dict[protein]
    return len(recorded_neighbors)

In [6]:
def get_k_neighbors(protein,neighbor_dict,k):
    """Return the nodes reachable from `protein` in at most `k` hops.

    Starts from a copy of the direct neighbors and, k-1 times, unions in the
    neighbors of every node collected so far. For k >= 2 the start node itself
    is part of the result whenever it has a neighbor (it is a neighbor of its
    own neighbors). The sets stored in `neighbor_dict` are not modified.
    """
    reached = neighbor_dict[protein].copy()
    for _ in range(k - 1):
        # The expansion is computed from the set as it stood at the start of the round.
        reached = reached.union(*(neighbor_dict[node] for node in reached))
    return reached

### Data Processing

In [7]:
# Map every protein to the set of proteins it shares an edge with.
neighbor_dict = {protein: get_neighbors(protein, data) for protein in protein_set}

In [8]:
# Degree of every protein, and the largest degree in the graph
# (max_degree fixes the width of the structural feature vectors).
degree_dict = {protein: get_degree(protein, neighbor_dict) for protein in protein_set}
max_degree = max(degree_dict.values())

# Sanity check (handshake lemma): the degrees must add up to twice the number of edge rows.
total_degree = sum(degree_dict.values())
if total_degree != 2*len(data):
    print("ERROR")

# Feature Extraction

## Helper Functions

In [9]:
def get_s_features(protein):
    """Build the hop-discounted degree histogram of `protein`.

    For every hop distance k = 1..K the nodes returned by
    get_k_neighbors(protein, neighbor_dict, k), plus `protein` itself, are
    binned by round(log2(degree)); the histogram of hop k is weighted by
    gamma**(k-1) and all hops are summed.

    Reads the notebook globals K, gamma, max_degree, neighbor_dict and
    degree_dict.

    Returns
    -------
    torch.Tensor
        float64 tensor of shape (1, round(log2(max_degree)) + 1).
    """
    n_bins = round(math.log2(max_degree)) + 1

    # Accumulate in a single NumPy array and convert once at the end, rather
    # than adding NumPy arrays onto a torch tensor with `+=`.
    totals = np.zeros([1, n_bins])
    for k in range(1, K + 1):
        # Nodes within k hops, plus the protein itself
        neighbors = get_k_neighbors(protein, neighbor_dict, k)
        neighbors.add(protein)

        counts = np.zeros([1, n_bins])
        for neighbor in neighbors:
            counts[0][round(math.log2(degree_dict[neighbor]))] += 1

        totals += pow(gamma, k - 1) * counts
    return torch.from_numpy(totals)

## Feature Extraction

In [10]:
# One structural feature row per protein, in protein_set iteration order.
feature_width = round(math.log2(max_degree)) + 1
s_features = torch.zeros(len(protein_set), feature_width, dtype=torch.float64)
for row, protein in enumerate(protein_set):
    s_features[row] = get_s_features(protein)

# Embedding Learning

## Helper Functions

In [11]:
def GCN_layer(adjacency,features,weights):
    """Apply one graph-convolution step: tanh((adjacency @ features) @ weights)."""
    propagated = adjacency @ features
    return torch.tanh(propagated @ weights)

In [12]:
# Weight matrices are randomly initialized based on Glorot and Bengio approach
def initialize_weights(previous_size,current_size,rng=None):
    """Draw a weight matrix uniformly from [-limit, limit], limit = sqrt(6)/sqrt(fan_in + fan_out).

    Parameters
    ----------
    previous_size : sequence whose element [1] is the input width (fan_in).
    current_size : sequence whose element [1] is the output width (fan_out).
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global NumPy random state is
        used (np.random.rand), exactly as before; pass
        np.random.default_rng(seed) for a reproducible draw.

    Returns
    -------
    torch.Tensor
        float64 tensor of shape (fan_in, fan_out).
    """
    fan_in, fan_out = previous_size[1], current_size[1]
    limit = math.sqrt(6)/math.sqrt(fan_out + fan_in)
    if rng is None:
        unit_samples = np.random.rand(fan_in, fan_out)
    else:
        unit_samples = rng.random((fan_in, fan_out))
    # Rescale samples from [0, 1) to [-limit, limit)
    weights = (2 * limit) * unit_samples - limit
    return torch.tensor(weights).double()

## Embedding Learning

In [13]:
# Build the dense adjacency matrix with self-loops (Ajoin = A + I) and the
# diagonal degree matrix D, both indexed by position in protein_list.
protein_list = list(protein_set)
n_proteins = len(protein_list)

# Position lookup: replaces protein_list.index(...), which rescans the list for every edge.
protein_index = {protein: i for i, protein in enumerate(protein_list)}

Ajoin = torch.zeros(n_proteins, n_proteins, dtype=torch.float64)
D = torch.zeros(n_proteins, n_proteins, dtype=torch.float64)
for i, protein in enumerate(protein_list):
    for neighbor in neighbor_dict[protein]:
        Ajoin[i, protein_index[neighbor]] = 1
    Ajoin[i, i] = 1
    # NOTE(review): D holds the plain degree, without the +1 of the added self-loop - confirm this is intended.
    D[i, i] = degree_dict[protein]

In [14]:
# Compute ^Ajoin = D^(-1/2) * Ajoin * D^(-1/2).
# D is diagonal, so D^(-1/2) is just the element-wise inverse square root of its
# diagonal; no general matrix power or dense matrix product is needed.
# NOTE: this cell rebinds Ajoin; re-run the cell that builds Ajoin before re-running this one.
degrees = torch.diagonal(D)
# Zero-degree rows/columns are scaled by 0 rather than by an infinite factor.
inv_sqrt_degrees = torch.where(degrees > 0, degrees.pow(-0.5), torch.zeros_like(degrees))
D2 = torch.diag(inv_sqrt_degrees)

# Left-multiplying by D2 scales row i by inv_sqrt_degrees[i] ...
temp_result = inv_sqrt_degrees.unsqueeze(1) * Ajoin
# ... and right-multiplying scales column j by inv_sqrt_degrees[j].
Ajoin = temp_result * inv_sqrt_degrees.unsqueeze(0)

In [15]:
# GCN layer 1: structural features -> hidden representation of the same width
n_bins = round(math.log2(max_degree)) + 1
features_size = s_features.size()
hidden_size = [n_bins, n_bins]
hidden_weights = initialize_weights(features_size, hidden_size)
H1 = GCN_layer(Ajoin, s_features, hidden_weights)

# GCN layer 2: hidden representation -> embedding twice as wide
H1_size = H1.size()
embedding_size = [2 * hidden_size[0], 2 * hidden_size[0]]
embedding_weights = initialize_weights(H1_size, embedding_size)
output = GCN_layer(Ajoin, H1, embedding_weights)

In [16]:
# Map each protein to its embedding row (row i of output belongs to protein_list[i]).
feature_dict = dict(zip(protein_list, output))

# Guiding List

In [17]:
# Keep only proteins whose degree exceeds the threshold eta, preserving protein_list order.
reduced_protein_list = [protein for protein in protein_list if degree_dict[protein] > eta]
reduced_feature_dict = {protein: feature_dict[protein] for protein in reduced_protein_list}

In [18]:
# Vector norm of every retained embedding; entry i belongs to reduced_protein_list[i]
# (reduced_feature_dict was filled in that same order).
norm_list = torch.zeros(len(reduced_protein_list))
for position, embedding in enumerate(reduced_feature_dict.values()):
    norm_list[position] = torch.linalg.vector_norm(embedding)

In [19]:
# Sort proteins by embedding norm, largest first.
# arglist[j] is the position in norm_list (and therefore in reduced_protein_list)
# of the j-th largest norm, so the j-th entry of the sorted list is
# reduced_protein_list[arglist[j]]. Writing reduced_protein_list[j] into slot
# arglist[j] instead would apply the inverse permutation and leave the list unsorted.
(sorted_norm_list,arglist) = norm_list.sort(descending=True)
sorted_protein_list = [reduced_protein_list[position] for position in arglist.tolist()]

# Graph Compression

## Helper Functions

In [20]:
def remove_from_dicts(subnode,compressed_data,compressed_neighbor_dict,compressed_degree_dict,guided_list):
    """Delete `subnode` from the working graph; every argument is modified in place.

    - drops every row of `compressed_data` that has `subnode` as an endpoint
      and renumbers the index from 0;
    - removes the `subnode` entry from both dictionaries (KeyError if absent);
    - removes `subnode` from every remaining neighbor set and lowers that
      node's stored degree by one;
    - removes the first occurrence of `subnode` from `guided_list`, if any.

    Returns None.
    """
    touches_subnode = (compressed_data['Protein1'] == subnode) | (compressed_data['Protein2'] == subnode)
    compressed_data.drop(compressed_data.loc[touches_subnode].index, inplace=True)
    compressed_data.reset_index(drop=True, inplace=True)

    compressed_neighbor_dict.pop(subnode)
    compressed_degree_dict.pop(subnode)

    for node, node_neighbors in compressed_neighbor_dict.items():
        if subnode in node_neighbors:
            node_neighbors.remove(subnode)
            compressed_degree_dict[node] -= 1

    if subnode in guided_list:
        guided_list.remove(subnode)

## Graph Compression

In [21]:
#Initializations
compress_rate = 0
compressed_data = data.copy()
# dict.copy() is shallow: the neighbor sets themselves would stay shared with
# neighbor_dict and be mutated by remove_from_dicts. Give the working graph its
# own set per node so the original graph is left untouched.
compressed_neighbor_dict = {node: set(node_neighbors) for node, node_neighbors in neighbor_dict.items()}
compressed_degree_dict = degree_dict.copy()
guided_list = copy.copy(sorted_protein_list)

#Compress loop
# guided_list is consumed as a queue: the front entry is merged with its
# minimum-degree neighbors into a supernode, the merged nodes leave the queue
# (inside remove_from_dicts) and the supernode is appended at the back.
# Reading the front entry each round, instead of running a for-loop over a list
# that is being shortened and extended, guarantees that no entry is skipped.
# The loop ends when the target ratio phi is reached or the queue runs empty.
while compress_rate < phi and guided_list:

    protein = guided_list[0]
    neighbor_list = compressed_neighbor_dict[protein]

    #A node without neighbors has nothing to merge with; drop it from the queue
    if not neighbor_list:
        guided_list.pop(0)
        continue

    #Get minimum degree of neighbors
    neighbor_degree_dict = {neighbor: compressed_degree_dict[neighbor] for neighbor in neighbor_list}
    min_degree = min(neighbor_degree_dict.values())

    #Make list of nodes for compression based on min_degree
    #(at least one neighbor attains the minimum, so there is always something to merge)
    compress_list = [protein]
    for neighbor in neighbor_list:
        if neighbor_degree_dict[neighbor] == min_degree:
            compress_list.append(neighbor)

    supernode = '+'.join(compress_list)
    supernode_neighbors = set()

    #Collect subnode neighbors, then remove subnodes from graph, guided_list and dicts
    for subnode in compress_list:
        supernode_neighbors = supernode_neighbors | compressed_neighbor_dict[subnode]
        remove_from_dicts(subnode,compressed_data,compressed_neighbor_dict,compressed_degree_dict,guided_list)

    #Remove neighbors that will be a part of the supernode
    supernode_neighbors = supernode_neighbors - set(compress_list)

    #Add supernode connections to the edge list, update dicts, and append to guided list
    for neighbor in supernode_neighbors:
        compressed_data.loc[len(compressed_data.index)] = [supernode,neighbor]
        compressed_neighbor_dict[neighbor] = compressed_neighbor_dict[neighbor] | set([supernode])
        compressed_degree_dict[neighbor] = compressed_degree_dict[neighbor] + 1
    guided_list.append(supernode)

    #Update dicts
    compressed_neighbor_dict[supernode] = supernode_neighbors
    compressed_degree_dict[supernode] = get_degree(supernode,compressed_neighbor_dict)

    # Validate with degree-edge law
    if sum(compressed_degree_dict.values()) != 2*len(compressed_data):
        print('ERROR')

    #Adjust compress_rate (fraction of edge rows removed so far); checked against phi by the loop condition
    compress_rate = 1 - len(compressed_data.index)/len(data.index)

In [22]:
# Collect the supernodes that appear in the compressed edge list and split each
# name back into its member labels (supernode names are '+'-joined).
# NOTE(review): a supernode left without any edge has no row in compressed_data and is therefore not listed here.
compressed_protein_set = set(compressed_data['Protein1'].tolist() + compressed_data['Protein2'].tolist())
supernode_dict = {
    protein: set(protein.split('+'))
    for protein in compressed_protein_set
    if '+' in protein
}

In [23]:
# Save the compressed edge list as CSV in the current working directory.
# With to_csv's default arguments the row index is written as the first, unnamed column.
compressed_data.to_csv('HI-union_compressed.csv')