In [None]:
import numpy as np
import time
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import collections

### Test graph

In [None]:
# Load the edge list: two delimited columns per row, no header line in the file.
# The column labels are supplied directly to the reader.
data = pd.read_csv(
    'wiki-topcats-reduced.txt',
    sep="	",
    header=None,
    names=["from", "to"],
)
data.head()

In [None]:
#get the list of starting nodes 
list1 = data["from"].values

In [None]:
# get the list of ending nodes
list2 = data["to"].values

In [None]:
# Deduplicate both edge endpoints; their union is every node that takes part
# in at least one edge.
set1 = set(list1)
set2 = set(list2)
fin_set = set1 | set2

In [None]:
#number of connected nodes
len(fin_set)

In [None]:
#number of edges
len(data)

In [None]:
# using networkx library to create the graph
G = nx.DiGraph()

In [None]:
#first step of filling the empty graph: adding all the connected nodes
#also initializing the categories feature
# Each node is given its OWN empty list. Passing `categories=[]` as a keyword
# would store one and the same list object on every node, so any later
# in-place append on one node would silently show up on all of them.
G.add_nodes_from((node, {'categories': []}) for node in fin_set)

In [None]:
# Insert every (from, to) pair as a directed edge in a single bulk call.
G.add_edges_from(zip(list1, list2))

In [None]:
#using number_of_nodes() method to re-calculate the total number of connected nodes
G.number_of_nodes()

In [None]:
# using G.number_of_edges() method to re-calculate the total number of edges
G.number_of_edges()

In [None]:
# Parse the categories file into {line_number: (category_name, [article ids])}.
# Each line is split on ';': the first field is the category name (with the
# 'Category:' prefix stripped), the second a whitespace-separated id list.
categoriesdata = {}
with open('wiki-topcats-categories.txt') as f:
    for idx, raw in enumerate(f):
        fields = raw.split(';')
        name = fields[0].replace('Category:', '')
        articles = list(map(int, fields[1].split()))
        categoriesdata[idx] = (name, articles)
# the `with` block has already closed the file at this point

In [None]:
categoriesdata

In [None]:
# Attach to every node the names of the large categories (more than 3500
# listed articles) it belongs to.
#
# The per-node lists are built first and then assigned in one go, so that
#  * every node ends up with its own list object (never a list shared with
#    other nodes),
#  * re-running this cell gives the same result instead of appending the
#    categories a second time.
# `G.nodes[...]` is used because the older `G.node` alias no longer exists in
# current networkx releases.
node_categories = collections.defaultdict(list)
for name, articles in categoriesdata.values():
    if len(articles) <= 3500:  # size check done once per category, not per node
        continue
    for node in articles:
        if node in fin_set:  # only nodes that are part of the graph
            node_categories[node].append(name)

for node, names in node_categories.items():
    G.nodes[node]['categories'] = names

In [None]:
# printing the graph's information
# Built by hand from the graph's own counters so it does not depend on
# `nx.info`, which is not available in recent networkx releases.
print(type(G).__name__, "with", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges")

In [None]:
# Map category name -> its articles that are nodes of the graph, keeping only
# categories that list more than 3500 articles.
categories = {
    name: [node for node in articles if node in fin_set]
    for name, articles in categoriesdata.values()
    if len(articles) > 3500
}

In [None]:
#choose the input category
input_category = "Year_of_birth_unknown"

In [None]:
#using multiprocessing
from multiprocessing import Pool

In [None]:
inf = float("inf")  # value used further down for a median that falls among unreachable pairs

# Create (or truncate) one empty output file per selected category, so the
# BFS workers can append distances to it.
for category in categories:
    with open("C:/Users/user-01/Desktop/ADM HW5/files/" + category, "w"):
        pass  # opening in "w" mode is enough to create/empty the file

print("Sources:\n")
#BFS algo
def compute(source, graph=None, output_dir="C:/Users/user-01/Desktop/ADM HW5/files/"):
    """Run a breadth-first search from ``source`` and log the distances per category.

    For every node reachable from ``source`` (including ``source`` itself, at
    distance 0) the shortest directed distance is appended, one value per
    line, to the file of each category listed in the node's ``'categories'``
    attribute. Unreachable nodes produce no line.

    Parameters
    ----------
    source : hashable
        Node the search starts from.
    graph : graph-like, optional
        Object exposing ``neighbors(node)`` and ``nodes[node]['categories']``.
        Defaults to the notebook-level graph ``G``.
    output_dir : str, optional
        Prefix (directory ending with a separator) that the category name is
        appended to in order to form the output file path.

    Returns
    -------
    None
    """
    if graph is None:
        graph = G

    print(source)

    # Level-by-level BFS. A node is recorded the moment it is first
    # discovered, so it can never be queued (and logged) a second time with a
    # larger distance via a sibling of the same level.
    distance = {source: 0}
    frontier = [source]
    level = 0
    while frontier:
        level += 1
        next_frontier = []
        for node in frontier:
            for link in graph.neighbors(node):
                if link not in distance:
                    distance[link] = level
                    next_frontier.append(link)
        frontier = next_frontier

    # Group the distances by category so each file is opened once per source
    # instead of once per (node, category) pair.
    lines_by_category = collections.defaultdict(list)
    for node, dist in distance.items():
        for category in graph.nodes[node]['categories']:
            lines_by_category[category].append(str(dist))

    # NOTE(review): several worker processes append to the same files; a
    # single write per file keeps the window for interleaving small but does
    # not by itself guarantee atomic appends - confirm on the target platform.
    for category, lines in lines_by_category.items():
        with open(output_dir + category, "a") as file:
            file.write("\n".join(lines) + "\n")

pool = Pool()  # worker pool; with no argument the size defaults to the CPU count
try:
    # One BFS per article of the input category; map() blocks until all are done.
    pool.map(compute, categories[input_category])
finally:
    # Always release the workers, even when a task raised.
    pool.close()  # no further tasks will be submitted
    pool.join()   # wait for the worker processes to exit

In [None]:
#calculating median of distances for each category from input category

def median_with_missing(finite_sorted, total_count):
    """Median of ``finite_sorted`` padded with +inf up to ``total_count`` values.

    ``finite_sorted`` holds the known (finite) distances in ascending order;
    the remaining ``total_count - len(finite_sorted)`` values are treated as
    infinite and therefore sort after all of them.

    With 0-based positions the median of N values sits at ``(N-1)//2`` for odd
    N and is the mean of positions ``N//2 - 1`` and ``N//2`` for even N; both
    cases are covered by averaging positions ``(N-1)//2`` and ``N//2``.
    Returns ``inf`` when the upper position falls into the infinite padding
    (or when there are no values at all).
    """
    lower = (total_count - 1) // 2
    upper = total_count // 2
    if total_count <= 0 or upper >= len(finite_sorted):
        return float("inf")
    return (finite_sorted[lower] + finite_sorted[upper]) / 2


category_dict = dict()  # category name -> median distance from the input category

for category in categories:
    print(category)

    if category == input_category:
        median = 0.  # the input category is at distance 0 from itself by definition
    else:
        # One finite distance per line, written by the BFS workers.
        with open("C:/Users/user-01/Desktop/ADM HW5/files/" + category, "r") as file:
            content = sorted(map(float, file.read().splitlines()))

        # Every (input article, category article) pair counts once; pairs
        # without a line in the file are unreachable, i.e. infinitely far.
        total_pairs = len(categories[input_category]) * len(categories[category])
        median = median_with_missing(content, total_pairs)

    print(median)
    category_dict[category] = median

In [None]:
#using namedtuple structure for convenience in terms of indexing
my_rank=collections.namedtuple('Category', ["name", "score"]) 

In [None]:
 
# One (name, score) record per category, in the dictionary's iteration order.
category_list = [
    my_rank(name=cat_name, score=cat_score)
    for cat_name, cat_score in category_dict.items()
]

print(category_list) 

In [None]:
#calculating block ranking
# Order the categories by ascending score, in place. list.sort is stable, so
# categories with equal scores keep their relative order - the same result
# the adjacent-swap loop produced, without the quadratic cost.
category_list.sort(key=lambda entry: entry.score)

print(category_list)