In [26]:
import numpy as np
import random
import igraph
import networkx as nx
import csv
import json
import pandas as pd
import time
from tqdm import tqdm
from collections import Counter
import matplotlib as plt

In [15]:
def create_graph(path, filename, weighted = False, directed = False):
    """
    Function to build an igraph graph from a CSV edge list.

    The file is opened at ``path + filename`` (plain string concatenation, so
    the separator has to be part of one of the two arguments). The first row
    is skipped as a header and the first two columns of every other row are
    parsed as integer vertex ids.

    :param path: Leading part of the file location.
    :param filename: Trailing part of the file location, appended verbatim.
    :param weighted: If truthy, edge weights are the Jaccard similarities of
        the edge endpoints; otherwise every edge gets the weight 1.
    :param directed: If truthy, a directed graph is created.
    :return: Graph with ``largest id + 1`` vertices and a "weight" edge attribute.
    :raises ValueError: If the file holds no edge rows or an id is not an integer.
    """
    # newline="" is what the csv module asks for when it is handed a file object.
    with open(path + filename, "r", newline="") as f:
        rows = list(csv.reader(f))[1:]
    # csv.reader yields [] for blank lines (e.g. a trailing empty line);
    # skip those instead of failing with an IndexError.
    edges = [(int(row[0]), int(row[1])) for row in rows if row]
    if not edges:
        raise ValueError("No edges found in " + path + filename)
    # Vertex ids are used as 0-based indices, hence the +1.
    Nb_nodes = max(max(edge) for edge in edges) + 1
    g = igraph.Graph(directed = bool(directed))
    g.add_vertices(Nb_nodes)
    g.add_edges(edges)
    if weighted:
        g.es["weight"] = g.similarity_jaccard(pairs = edges)
    else:
        g.es["weight"] = 1
    return g

In [16]:
# Forward slashes are accepted on Windows and POSIX alike and avoid the
# invalid "\d" / "\H" escape sequences of backslash-separated literals.
# create_graph concatenates the two strings, so the file name keeps its
# leading separator.
path = '../data'
g_dir = create_graph(path, '/HR_edges.csv', weighted = False, directed = True)

In [59]:
def overlap_generator(overlap_weighting, graph):
    """
    Function to build an edge -> weight lookup for both edge orientations.

    :param overlap_weighting: "normalized_overlap" selects the Jaccard
        similarity of the edge endpoints; any other value gives weight 1.
    :param graph: Graph whose edges (``graph.es``) are weighted.
    :return: Dict keyed by (source, target) tuples; every edge is also
        present under its reversed key.
    """
    pairs = [edge.tuple for edge in graph.es]
    if overlap_weighting == "normalized_overlap":
        scores = graph.similarity_jaccard(pairs = pairs)
    else:
        scores = [1] * len(pairs)
    lookup = {pair: scores[position] for position, pair in enumerate(pairs)}
    # The mirrored entries are derived from the lookup itself and written on
    # top of it, so a reversed key overrides an existing one.
    mirrored = {(target, source): score for (source, target), score in lookup.items()}
    lookup.update(mirrored)
    return lookup

In [60]:
# Edge -> weight lookup for the directed graph, using the "normalized_overlap" scheme.
weights = overlap_generator(overlap_weighting = "normalized_overlap", graph = g_dir)

In [18]:
class RandomWalker:
    """
    Class to generate vertex sequences.

    After ``do_walks`` the instance holds ``walks`` (one node sequence per
    start node and repetition) and ``degrees`` (how often every node index
    occurs across all walks).
    """
    def __init__(self, graph, repetitions, length):
        """
        :param graph: Graph providing ``vs`` and ``random_walk``.
        :param repetitions: Number of passes over all start nodes.
        :param length: Length argument handed to ``graph.random_walk``.
        """
        print("Model initialization started.")
        self.graph = graph
        self.nodes = [node.index for node in self.graph.vs]
        self.repetitions = repetitions
        self.length = length
        # Defined up front so the object is usable before do_walks() has run.
        self.walks = []
        self.degrees = [0] * len(self.nodes)

    def small_walk(self, start_node):
        """
        Generate a node sequence from a start node.

        NOTE(review): the recorded notebook output shows walks shorter than
        ``length`` (8 nodes for length 80) - presumably the walk stops at
        vertices without outgoing edges; confirm against the igraph docs.
        """
        return self.graph.random_walk(start_node, self.length)

    def count_frequency_values(self):
        """
        Count how often each node index occurs in the stored walks.

        The result is written to ``self.degrees``, indexed by node index.
        """
        counts = Counter(node for walk in self.walks for node in walk)
        self.degrees = [counts[i] for i in range(len(self.nodes))]

    def do_walks(self):
        """
        Do a series of random walks.

        Every repetition shuffles the start nodes in place and runs one walk
        from each of them. Previously stored walks are discarded.

        :return: Tuple ``(degrees, walks)``.
        """
        self.walks = []
        for rep in range(self.repetitions):
            random.shuffle(self.nodes)
            print(" ")
            print("Random walk series " + str(rep+1) + ". initiated.")
            print(" ")
            for node in tqdm(self.nodes):
                self.walks.append(self.small_walk(node))
        self.count_frequency_values()
        return self.degrees, self.walks

In [21]:
# The keyword has to match the constructor parameter name `repetitions`.
RW = RandomWalker(graph = g_dir, repetitions = 5, length = 80)

Model initialization started.


In [27]:
# Seed the stochastic step so that a fresh "Restart & Run All" gives the same
# walks. RandomWalker shuffles with Python's `random`; python-igraph is
# documented to draw from the same module by default - TODO confirm for the
# installed igraph version.
random.seed(42)
degrees, walks = RW.do_walks()

 
Random walk series 1. initiated.
 


100%|██████████| 54573/54573 [00:05<00:00, 9862.91it/s] 


 
Random walk series 2. initiated.
 


100%|██████████| 54573/54573 [00:05<00:00, 9721.87it/s] 


 
Random walk series 3. initiated.
 


100%|██████████| 54573/54573 [00:05<00:00, 9533.10it/s] 


 
Random walk series 4. initiated.
 


100%|██████████| 54573/54573 [00:05<00:00, 9918.27it/s] 


 
Random walk series 5. initiated.
 


100%|██████████| 54573/54573 [00:06<00:00, 8815.88it/s] 


In [87]:
# Visit count of node 11, then the 12th generated walk, one per line.
print(degrees[11], walks[11], sep = "\n")

5
[34416, 36072, 40704, 9071, 48911, 50052, 53678, 17571]


In [65]:
def index_generation(weights, a_walk):
    """
    Function to generate overlaps and indices.

    :param weights: Mapping from (node, next_node) tuples to a weight.
    :param a_walk: Node sequence.
    :return: Tuple ``(edge_set_1, edge_set_2, overlaps)``: the positions
        0..n-2 and 1..n-1 of consecutive walk entries, and a column vector
        with the weight of every consecutive node pair.
    :raises KeyError: If a consecutive pair is missing from ``weights``.
    """
    n_steps = max(len(a_walk) - 1, 0)
    edges = [(a_walk[i], a_walk[i+1]) for i in range(n_steps)]
    # np.arange keeps an integer dtype even for walks with fewer than two
    # nodes; np.array(range(0)) would produce an empty float64 array.
    edge_set_1 = np.arange(0, n_steps)
    edge_set_2 = np.arange(1, n_steps + 1)
    overlaps = np.array([weights[edge] for edge in edges]).reshape((-1, 1))
    return edge_set_1, edge_set_2, overlaps

In [84]:
# Positions and pair weights for the 12th walk, printed one result per line.
index_1, index_2, overlaps = index_generation(a_walk = walks[11], weights = weights)
print(index_1, index_2, overlaps, sep = "\n")

[0 1 2 3 4 5 6]
[1 2 3 4 5 6 7]
[[0.07692308]
 [0.04761905]
 [0.05681818]
 [0.05434783]
 [0.125     ]
 [0.06521739]
 [0.11764706]]


In [62]:
def batch_input_generator(a_walk, random_walk_length, window_size):
    """
    Function to generate features from a node sequence.

    Collects the walk entries at positions 0..length-window-1 followed by the
    entries at positions window..length-1 into a single array.

    :param a_walk: Node sequence, indexed by position.
    :param random_walk_length: Number of walk positions to consider.
    :param window_size: Offset between the two position ranges.
    :return: 1-D numpy array with both runs concatenated.
    """
    leading = range(random_walk_length - window_size)
    trailing = range(window_size, random_walk_length)
    return np.array([a_walk[position] for position in [*leading, *trailing]])

In [86]:
# Input nodes for the 12th walk with a window of 5.
batch_inputs = batch_input_generator(
    a_walk = walks[11],
    window_size = 5,
    random_walk_length = len(walks[11]),
)
print(batch_inputs)

[34416 36072 40704 50052 53678 17571]


In [89]:
def batch_label_generator(a_walk, random_walk_length, window_size):
    """
    Function to generate labels from a node sequence.

    For the first length-window positions the label row is the window of
    entries that follows the position; for the positions window..length-1 it
    is the window of entries that precedes the position.

    :param a_walk: Node sequence supporting slicing.
    :param random_walk_length: Number of walk positions to consider.
    :param window_size: Number of entries per label row.
    :return: numpy array built from the forward rows followed by the backward rows.
    """
    forward_starts = range(1, random_walk_length - window_size + 1)
    backward_ends = range(window_size, random_walk_length)
    following = [a_walk[start:start + window_size] for start in forward_starts]
    preceding = [a_walk[end - window_size:end] for end in backward_ends]
    return np.array(following + preceding)

In [90]:
# Label windows for the 12th walk with a window of 5.
batch_labels = batch_label_generator(
    a_walk = walks[11],
    window_size = 5,
    random_walk_length = len(walks[11]),
)
print(batch_labels)

[[36072 40704  9071 48911 50052]
 [40704  9071 48911 50052 53678]
 [ 9071 48911 50052 53678 17571]
 [34416 36072 40704  9071 48911]
 [36072 40704  9071 48911 50052]
 [40704  9071 48911 50052 53678]]


In [None]:
def gamma_incrementer(step, gamma_0, gamma_final, current_gamma, num_steps):
    """
    Function to compute the gamma value for a training step.

    For ``step`` values that are not greater than 1 the current gamma is
    returned unchanged. Otherwise the current gamma is multiplied by
    ``10 ** (-log10(gamma_0) / num_steps)`` and by ``gamma_final - gamma_0``,
    and ``gamma_0`` is added to the product.

    :param step: Current step number.
    :param gamma_0: Initial gamma (argument of the logarithm and the offset).
    :param gamma_final: Final gamma, used in the ``gamma_final - gamma_0`` factor.
    :param current_gamma: Gamma value before the update.
    :param num_steps: Divisor of the exponent.
    :return: The updated (or unchanged) gamma.
    """
    if not step > 1:
        return current_gamma
    growth = 10 ** ((0 - np.log10(gamma_0)) / float(num_steps))
    return current_gamma * growth * (gamma_final - gamma_0) + gamma_0

In [None]:
def neural_modularity_calculator(graph, embedding, means):
    """
    Function to calculate the GEMSEC cluster assignments.

    Every vertex is assigned to the row of ``means`` with the smallest
    squared Euclidean distance to the vertex's embedding row, and the
    modularity of that assignment is computed on ``graph``.

    :param graph: Graph providing ``vs`` and ``modularity``.
    :param embedding: 2-D array indexed as ``embedding[vertex_index, :]``.
    :param means: 2-D array with one candidate cluster centre per row
        (assumed to share the embedding's column count - confirm with caller).
    :return: Tuple ``(modularity, assignments)`` where ``assignments`` maps
        vertex index -> cluster index.
    """
    assignments = {}
    # igraph's modularity() takes the membership as a list ordered by vertex
    # index; a dict would not be read as an index -> cluster mapping.
    membership = []
    for node in graph.vs:
        offsets = means - embedding[node.index, :]
        squared_distances = np.sum(np.square(offsets), axis=1)
        cluster = int(np.argmin(squared_distances))
        assignments[int(node.index)] = cluster
        membership.append(cluster)
    modularity = graph.modularity(membership = membership)
    return modularity, assignments