In [7]:
import networkx as nx
from collections import defaultdict
import numpy as np
import pickle
import collections
from tqdm import tqdm, tnrange

### [RQ1]
**Creating Graph**

In [8]:
DG = nx.DiGraph()

In [9]:
path = "C:/Users/Asia/"

# i.e. our_edges = {node1:[node2,node4,node5], ..., 1:[2, 3, 5], 2:[5], 3:[6] ....}
# our_edges = defaultdict(list)

with open(path + "wiki-topcats-reduced.txt", "r") as f:
    #create graph
    for line in f.readlines():
        article1, article2 = line.split()
        DG.add_weighted_edges_from([(int(article1), int(article2), 1)])
#         our_edges[int(article1)].append(int(article2))

In [10]:
# Is graph directed?
print("Is graph directed? ", nx.is_directed(DG), "\n") # check whether graph is directed or not

# example;
print("Since the neighbors of 52nd node are:", *list(DG.neighbors(52)),
      ",52nd node is not neighbor for 1069112nd node which has connection with", *list(DG.neighbors(1069112))," nodes. \n")


Is graph directed?  True 

Since the neighbors of 52nd node are: 401135 1069112 1163551 ,52nd node is not neighbor for 1069112nd node which has connection with 1060396 1061304 1062611 1066969 1069008 1069113 1069258 1069275 1656982  nodes. 



In [11]:
# The number of nodes
nodes_num = DG.number_of_nodes()
print("The number of nodes:", nodes_num, "\n") # also len(DG) works

# The number of edges
edges_num = DG.number_of_edges()
print("The number of edges:", edges_num)

The number of nodes: 461193 

The number of edges: 2645247


**Graph density**
In mathematics, a dense graph is a graph in which the number of edges is close to the maximal number of edges. The opposite, a graph with only a few edges, is a sparse graph. The distinction between sparse and dense graphs is rather vague, and depends on the context.

For directed graphs, the graph density is defined as:
$$D = \frac{|E|}{|V|(|V|-1)}$$

where E is the number of edges and V is the number of vertices in the graph. The maximum number of edges for an directed graph is $|V|(|V|-1).$

In [5]:
density = nodes_num / (edges_num*(edges_num-1))
print(density);

6.590986317548208e-08


### [RQ2] 
Given a category $C_0 = \{article_1, article_2, \dots \}$ as input we want to rank all of the nodes in V according to the block-ranking, where the blocks are represented by the categories:
$$block_{RANKING} =\begin{bmatrix} C_0 \\ C_1 \\ \dots \\ C_c\\ \end{bmatrix}$$

Each category  corresponds to a list of nodes.

The first category of the rank, $C_0$, always corresponds to the input category. The order of the remaining categories is given by:

$$distance(C_0, C_i) = median(ShortestPath(C_0, C_i))$$

The lower is the distance from $C_0$, the higher is the $C_i$ position in the rank. $ShortestPath(C_0, C_i)$ is the set of all the possible shortest paths between the nodes of $C_0$ and $C_i$. Moreover, the length of a path is given by the sum of the weights of the edges it is composed by.

##### Reading the file with categories

In [14]:
with open(path + "wiki-topcats-categories.txt", "r") as f2:
    categories = {} # {category0 : [article1, article2, ...], ...., 5: [23, 45, 6]}
    categories_names = {} # {category_name : index, ...}
    for cat_indx, line in enumerate(f2.readlines()):
        line_content = line.split(";")
        categories[cat_indx] = list(map(int, line_content[1].split()))
        categories_names[line_content[0].split(":")[1]] = cat_indx

#### Provide the name of category $C0$

In [23]:
C0_name = input("Please, choose the name of category: \n\n")
C0_idx = categories_names[C0_name]
print("The index of selected category: ", C0_idx)

Please, choose the name of category: 

Fellows_of_the_Royal_Society
The index of selected category:  10839


#### Filtering categories which exist in our reduced graph:

In [24]:
tmp_selected_category_indx = []
# filtering categories with nodes more than 3500
for i in range(len(categories)):
    if len(categories[i]) > 3500:
        tmp_selected_category_indx.append(i)

#### Preparing categories for further analysis.
As one article might belong to a single category or multiple ones.
If the article belongs to the input categor it belongs to that one.

In [8]:
grouped_categories_nodes = [] # nodes grouped per category -- without C0 category
categories_nodes = set() # all nodes together -- without C0 category

# chose the category C0 with nodes only included in the DG graph:
C0 = set(categories[C0_idx]).intersection(DG.nodes)

final_selected_category_indx = []
# chose categories with nodes only included in the DG graph:
for idx in tmp_selected_category_indx[1:]:
    tmp_categ = set(categories[idx]).intersection(DG.nodes)
    # if C_i contains different nodes than C0:
    C_i = tmp_categ - C0
    if len(C_i) != 0 and len(C_i) < 100000:
        final_selected_category_indx.append(idx)
        grouped_categories_nodes.append(C_i)
        categories_nodes = categories_nodes.union(C_i)

## Algorithm for searching the Shortest path  -  BFS

In [None]:
def bfs_shortest_path(graph, start, categories_nodes):
    visited_dict = defaultdict(lambda:[False])
    queue = [start]
    visited_dict[start] = 0
    
    while queue:
        node = queue.pop(0)
        distance = visited_dict[node]
        try:
            for neighbour in graph.neighbors(node):
                if visited_dict[neighbour]==[False]:
                    visited_dict[neighbour] = distance + 1
                    queue.append(neighbour)
        except KeyError: pass
    return {node:visited_dict[node] for node in categories_nodes}

article_distances = {}
counter = 0

for idx, node in enumerate(C0):
    article_distances[node] = bfs_shortest_path(DG, node, categories_nodes)
    if (idx+1)%100==0:
        with open('distance_' + str(counter) + '.pkl', 'wb') as file:
            pickle.dump(article_distances, file, pickle.HIGHEST_PROTOCOL)
        article_distances = dict(); counter+=1

with open('distance_' + str(counter) + '.pkl', 'wb') as file:
    pickle.dump(article_distances, file, pickle.HIGHEST_PROTOCOL)

# Grouping categories with distances

In [11]:
path = "C:/Users/guilh/Desktop/ADM/HW5/distances_files/"

In [18]:
for j in tnrange(35):
    
    with open(path + 'distance_' + str(j) + '.pkl', 'rb') as file:
        distance_dict = pickle.load(file)
    
    distances_categories = []
    
    #for each category selected previously excluding C0
    for i in final_selected_category_indx:
        distances = []
        #for each node in this selected category underanalysis
        for node in categories[i]:
            #for the starting node of C0 into our distance file
            for starting_node in distance_dict:
                #try to find the distances from C0 node to the node inside the category under analysis
                try:
                    d = distance_dict[starting_node][node]
                    if d != [False]:
                        distances.append(d)
                        #if distance is false append 9999
                    else: distances.append(9999)
                        
                except: pass
        #append the results of distances to this category
        distances_categories.append(distances)
    
    
    #save this distances and after each iteration we concatenate to it
    if j == 0:
        distances_concatenated = np.array([distances_categories])
    else:
        np.concatenate((distances_concatenated, np.array([distances_categories])), axis = 1)


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))

Getting the median and making a dictionary to the category index

In [24]:
medians_dict = {}
for i in tnrange(len(final_selected_category_indx)):
    medians_dict[final_selected_category_indx[i]] = np.median(distances_concatenated[0][i])

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))

In [25]:
medians_dict

{869: 8.0,
 876: 12.0,
 898: 9999.0,
 900: 9999.0,
 949: 9999.0,
 1881: 9999.0,
 2791: 8.0,
 4160: 9.0,
 5619: 6.0,
 6067: 7.0,
 6795: 9999.0,
 6893: 10.0,
 7628: 8.0,
 7889: 7.0,
 8732: 9999.0,
 8733: 9999.0,
 10062: 7.0,
 10139: 6.0,
 10251: 6.0,
 10555: 6.0,
 10564: 6.0,
 11640: 6.0,
 11641: 6.0,
 11648: 6.0,
 11649: 6.0,
 11992: 7.0,
 12631: 6.0,
 12941: 9999.0,
 12966: 8.0,
 13838: 6.0,
 13996: 8.0,
 16956: 8.0}

## Block Ranging Algorithm - Step 1 , 2 , 3

In [None]:
import networkx as nx
G = nx.DiGraph()

Genergic Example

In [None]:
G.add_weighted_edges_from([('A','B',1)])
G.add_weighted_edges_from([('A','C',1)])
G.add_weighted_edges_from([('A','D',0)])
G.add_weighted_edges_from([('B','C',1)])
G.add_weighted_edges_from([('B','E',1)])
G.add_weighted_edges_from([('D','B',100)]) #its weight isnt important
G.add_weighted_edges_from([('D','E',1)])
G.add_weighted_edges_from([('D','G',1)])
G.add_weighted_edges_from([('E','C',100)]) #its weight isnt important
G.add_weighted_edges_from([('E','G',2)])
G.add_weighted_edges_from([('F','C',100)]) #its weight isnt important
G.add_weighted_edges_from([('G','F',1)])
G.add_weighted_edges_from([('H','G',1)])

In [None]:
categories = {1: ['A','B','C'], 2: ['D','E'], 3: ['F','G','H']}

## Step 1

In [None]:
C0 = categories[1]
induced_graph = G.subgraph(C0) # create sub graph for only category zero

In [None]:
def C0_sum_weights_inedges(induced_graph):
    # Iterate to get sum of weights of in-edges
    all_weights = {} # it will look like {node1:sum_of_weights, node2:sum_of_weights, ...}
    for (node1,node2,data) in induced_graph.edges(data=True):
        if node2 not in all_weights.keys(): # we consider node2 because we're checking how many "incoming" neighboors
            all_weights[node2] = data['weight'] # if node2 doesn't exist in all_weights, just add initial weight
        else:
            all_weights[node2] += data['weight'] # if node2 already exists in all_weights, add weight up like cumulate weight
    
    # if there is no incoming neigboors, detect these nodes and give their values as zero
    for zero_node in list(set(induced_graph.nodes()) - set(all_weights.keys())):
        all_weights[zero_node]=0
        
    all_weights = sorted(all_weights.items(), key=lambda kv: kv[1], reverse=True) # sort with descending order by values in dictionary 

    return all_weights

In [None]:
C0_score = C0_sum_weights_inedges(induced_graph)
C0_score # score for just category 0

## Step 2

In [None]:
C1 = categories[2] 
sub_graph = G.subgraph(list(C0) + C1) # create sub graph for only category 0 and category 1

In [None]:
C1_score = {} # it is the score for only category 1

""" apply same steps as step 1. The only difference is we use C0 as induced category. That means Since C0 is the first 
    ordered category, we need to take into account if there incoming arrows(directions) from catefory 0. We have to count
    also. So thats why we create sub graph with C0 and C1"""
for (node1,node2,data) in sub_graph.edges(data=True):
    if node2 in C1:
        if node2 not in C1_score.keys():
            C1_score[node2] = data['weight']
        else:
            C1_score[node2] += data['weight']
            
for zero_node in list(set(C1) - set(C1_score.keys())):
    C1_score[zero_node]=0
        
C1_score = sorted(C1_score.items(), key=lambda kv: kv[1], reverse=True) # sort by values
C1_score # for only category 1

## Step 3


In [None]:
def score(DG, ordered_category_indx, C0_score):
    C_0 = list(categories[ordered_category_indx[0]])
    cum_nodes_list = C_0 # cumulative nodes list
    all_weights = {}
    all_weights.update(C0_score) # append induced category rank and sum of weights

    for cat_idx in ordered_category_indx:
        C_i = list(categories[cat_idx])
        cum_nodes_list = cum_nodes_list + C_i # we need to build sub graph cumulatively
        sub_graph = DG.subgraph(cum_nodes_list)
        
        cat_weights = {} # weights for only category C_i
        for (node1,node2,data) in sub_graph.edges(data=True):
            if node2 in C_i:
                if node2 not in cat_weights.keys():
                    cat_weights[node2] = data['weight']
                else:
                    cat_weights[node2] += data['weight']
    
        for zero_node in list(set(C_i) - set(cat_weights.keys())):
            cat_weights[zero_node] = 0
            
        cat_weights = sorted(cat_weights.items(), key=lambda kv: kv[1], reverse=True) # before adding to all_weights sort weights just inside the category C_i
        all_weights.update(cat_weights) # add cat_weights to all_weights
    
    return all_weights

In [None]:
import time
start_time = time.time()

ordered_category_indx = [1,2,3] # in hw example order: C0,C1,C2
nodes_rank = score(G, ordered_category_indx, C0_score)
print("nodes rank and their weights:", nodes_rank)
print("nodes rank:", nodes_rank.keys())

delta = time.time() - start_time
print("running time:", delta)