In [13]:
import numpy as np
from copy import deepcopy
import sys
sys.setrecursionlimit(5000)

In [4]:
class Node:
    def __init__(self, url):
        self.url = url
        self.title = None
        self.text = None

        self.inlinks = []
        self.outlinks = []

        self.labels = []
        self.label = None
        self.keyword = None
        self.tags = None
        self.features = None

In [30]:
def node_classifier(graph):
    not_labeled = set([i for i in range(len(graph)) if graph[i].label==None])
    prev_labels = [deepcopy(node.labels) for node in graph]

    iterations = 0
    while len(not_labeled)>0 and iterations<len(graph):  # Iterating until all nodes are classified
        iterations+=1

        max_i, max_v = -1,-2
        for i in not_labeled:
            node = graph[i]
            cnt_labels = np.zeros(4)

            # Counting type of labels in neighbouring nodes and storing them in a vector
            adj = node.inlinks + node.outlinks
            for u in adj:
                cnt_labels+=u.labels

            # Normalizing vector
            avg_labels = cnt_labels/len(adj)
            avg_labels /= max(1, np.linalg.norm(avg_labels))

            # Taking a dot product with same vector calculated in previous iteration to see its similarity
            score = np.dot(avg_labels, prev_labels[i])/(np.linalg.norm(avg_labels)*np.linalg.norm(prev_labels[i]))
            prev_labels[i] = avg_labels

            # Most similar vector with its previous value is most likely to have been converged
            if score>max_v:
                max_v=score
                max_i=i

        # Assigning value of index with most value in vector as label of the node
        graph[max_i].label = np.argmax(prev_labels[i])
        not_labeled.remove(max_i)

        for i in range(len(graph)):
            graph[i].labels = prev_labels[i]

In [31]:
import pickle
with open('editedGraph.txt','rb') as file:
    graph_dict = pickle.load(file)

graph = [*graph_dict.values()]

In [32]:
from collections import Counter
Counter([x.label for x in graph])

Counter({1: 34, None: 1095, 2: 19, 0: 4, 3: 8})

In [33]:
node_classifier(graph)

In [34]:
Counter([x.label for x in graph])

Counter({1: 1128, 0: 5, 2: 19, 3: 8})

In [14]:
graph_dict = {k:v for k,v in zip([*graph_dict.keys()], graph)}

with open('editedGraph.txt','wb') as file:
    pickle.dump(graph_dict, file)