In [14]:
import csv 
import networkx as nx
import matplotlib.pyplot as plt
from graph import Graph
from article import Article
import random
import pandas as pd

In [15]:
graph = Graph()

**Part 1 : read the articles category file**
Populate the graph with articles, categories and categories 

In [16]:
graph.update_graph("data/categories.tsv", edges=False, verbose=True)

The graph has 4599 articles, 127 categories, and 15 topics.
The number of edges is :
0 in the articles graph,
0 in the categories graph,
0 in the topics graph.


**Part 2 : Add the edges between the vertices**

In [17]:
graph.update_graph("data/paths_finished.tsv", edges=True, verbose=True) 

The graph has 4599 articles, 127 categories, and 15 topics.
The number of edges is :
50354 in the articles graph,
6530 in the categories graph,
208 in the topics graph.


In [18]:
graph.update_graph("data/paths_unfinished.tsv", edges=True, verbose=True)

The graph has 4599 articles, 127 categories, and 15 topics.
The number of edges is :
57572 in the articles graph,
6931 in the categories graph,
208 in the topics graph.


**get top 10 categories**

In [19]:
def verteces_to_df(graph, level):
    assert level in graph.authorized_levels
    _, verteces = graph.levels_map[level]
    header =["Topic", "Category", "nb_unique_edges", "nb_unique_outgoing_edges", "nb_unique_incoming_edges", "total_weight" ,"total_weight_out", "total_weight_in","average_weight_per_edge", "average_weight_out", "average_weight_in"]  
    if level == "categories":
        header.remove("Category")
        header.append("nb_articles")
    if level == "topics":
        header.remove("Topic")
        header.remove("Category")
        header.append("nb_categories")
        header.append("nb_articles")
    
    df = pd.DataFrame(columns=header)
    for vertex in verteces.values():
        nb_unique_outgoing_edges = vertex.get_nb_different_out_neighbours()
        nb_unique_incoming_edges = vertex.get_nb_different_in_neighbours()
        nb_unique_edges = nb_unique_outgoing_edges + nb_unique_incoming_edges
        
        if (nb_unique_edges != 0) :
            total_weight_out = vertex.get_total_out_weight()
            total_weight_in = vertex.get_total_in_weight()
            total_weight = total_weight_out + total_weight_in
            average_weight_out = round(total_weight_out/nb_unique_outgoing_edges, 2) if nb_unique_outgoing_edges != 0 else 0
            average_weight_in = round(total_weight_in/nb_unique_incoming_edges,2) if nb_unique_incoming_edges != 0 else 0
            average_weight_per_edge = round(total_weight/nb_unique_edges, 2)

            if level == "articles":
                df.loc[vertex.title] = [vertex.topic, vertex.category, nb_unique_edges, nb_unique_outgoing_edges, nb_unique_incoming_edges, total_weight, total_weight_out, total_weight_in, average_weight_per_edge, average_weight_out, average_weight_in]
            elif level == "categories":
                df.loc[vertex.title]= [vertex.topic, nb_unique_edges, nb_unique_outgoing_edges, nb_unique_incoming_edges, total_weight, total_weight_out, total_weight_in, average_weight_per_edge, average_weight_out, average_weight_in, vertex.get_nb_different_articles()]
            else:
                df.loc[vertex.title]= [nb_unique_edges, nb_unique_outgoing_edges, nb_unique_incoming_edges, total_weight, total_weight_out, total_weight_in, average_weight_per_edge, average_weight_out, average_weight_in, vertex.get_nb_different_categories(), vertex.get_nb_different_articles()]
    return df

In [20]:
df_articles = verteces_to_df(graph, level="articles")
df_categrories = verteces_to_df(graph, level="categories")
df_topics = verteces_to_df(graph, level="topics")

**in Size**

In [21]:
df_articles.sort_values(by=["average_weight_out"], ascending=False).head(5)


Unnamed: 0,Topic,Category,nb_unique_edges,nb_unique_outgoing_edges,nb_unique_incoming_edges,total_weight,total_weight_out,total_weight_in,average_weight_per_edge,average_weight_out,average_weight_in
Batman,Everyday_life,Cartoons,14,7,7,909,864,45,64.93,123.43,6.43
Theatre,Language_and_literature,Theatre,45,17,28,1893,1749,144,42.07,102.88,5.14
Animation,Everyday_life,Films,16,3,13,647,305,342,40.44,101.67,26.31
Asteroid,Science,Space_Astronomy,52,19,33,1921,1789,132,36.94,94.16,4.0
Pyramid,Design_and_Technology,Architecture,24,17,7,1626,1551,75,67.75,91.24,10.71


In [22]:
df_categrories.sort_values(by=["average_weight_out"], ascending=False).head(5)

Unnamed: 0,Topic,nb_unique_edges,nb_unique_outgoing_edges,nb_unique_incoming_edges,total_weight,total_weight_out,total_weight_in,average_weight_per_edge,average_weight_out,average_weight_in,nb_articles
North_American_Geography,Geography,225,102,123,31483,15206,16277,139.92,149.08,132.33,103
General_Biology,Science,150,69,81,17247,9049,8198,114.98,131.14,101.21,51
Countries,Countries,219,101,118,24228,11650,12578,110.63,115.35,106.59,229
European_Countries,Geography,227,105,122,23472,11217,12255,103.4,106.83,100.45,56
Geography_of_Great_Britain,Geography,221,101,120,20778,10191,10587,94.02,100.9,88.22,124


In [23]:
df_topics.sort_values(by=["average_weight_out"], ascending=False).head(5)

Unnamed: 0,nb_unique_edges,nb_unique_outgoing_edges,nb_unique_incoming_edges,total_weight,total_weight_out,total_weight_in,average_weight_per_edge,average_weight_out,average_weight_in,nb_categories,nb_articles
Geography,28.0,14.0,14.0,98827.0,47351.0,51476.0,3529.54,3382.21,3676.86,21.0,1084.0
Science,28.0,14.0,14.0,53946.0,28269.0,25677.0,1926.64,2019.21,1834.07,18.0,1122.0
History,28.0,14.0,14.0,37171.0,17873.0,19298.0,1327.54,1276.64,1378.43,14.0,545.0
Everyday_life,28.0,14.0,14.0,32718.0,17246.0,15472.0,1168.5,1231.86,1105.14,12.0,374.0
People,28.0,14.0,14.0,33142.0,16852.0,16290.0,1183.64,1203.71,1163.57,20.0,708.0


In [24]:
#nx.draw_random(G1,node_color = colors,labels = labels,with_labels = True) 
n = len(df_topics)
#color = color_list(n)
color =['#445148', '#4a136c', '#502590', '#5636b3', '#5653b1',
        '#4e7a88', '#47a05f', '#40c736', '#ffbc3f', '#ffa426',
        '#ff8c0d', '#ff7504', '#ff5e0b', '#ff4812', '#ff3219']

labels = list(graph.topics.keys())
# map each category to a color
color_map = {category: color[i] for i, category in enumerate(graph.topics.keys())}

G1 = nx.DiGraph()
size = [df_topics()*25 for t in list(graph.topics.values())]
#loop on graph.matrixt
for key, value in graph.matrix_topics.items():
    for key2, value2 in value.items():
            G1.add_edge(key,key2,weight=value2, color=color_map[key])


color = []
for node in G1.nodes(): 
    color.append(color_map[node])


label_dict = {}
for i in range(len(labels)):
    label_dict[labels[i]] = labels[i]
plt.figure(figsize=(15,15))

edges = G1.edges()
weights = [G1[u][v]['weight']/500 for u,v in edges]
colors = [G1[u][v]['color'] for u,v in edges]
nx.draw_circular(G1,node_size = size,node_color = color,labels=label_dict, width= weights, edge_color=colors)
plt.show()

TypeError: 'DataFrame' object is not callable

In [None]:
nodePos = nx.circular_layout(G1)

In [None]:
nodePos

In [None]:
graph.matrix['Art']

In [None]:
#design1


#nx.draw_random(G1,node_color = colors,labels = labels,with_labels = True) 
n = graph.nb_categories
#color = color_list(n)
color = ['#445148', '#4a136c', '#502590', '#5636b3', '#5653b1',
        '#4e7a88', '#47a05f', '#40c736', '#ffbc3f', '#ffa426',
        '#ff8c0d', '#ff7504', '#ff5e0b', '#ff4812', '#ff3219']
#'#440148
labels = list(graph.categories.keys())
# map each category to a color
color_map = {category: color[i] for i, category in enumerate(graph.categories.keys())}
print(color_map)
G1 = nx.DiGraph()
size = [t.size*40 for t in list(graph.categories.values())]
#loop on graph.matrixt
for key, value in graph.matrix.items():
    for key2, value2 in value.items():
     
            G1.add_edge(key,key2,weight=value2, color=color_map[key])
      


color = []
for node in G1.nodes(): 
    color.append(color_map[node])
  



label_dict = {}
for i in range(len(labels)):
    label_dict[labels[i]] = labels[i]
plt.figure(figsize=(15,15))

edges = G1.edges()
weights = [G1[u][v]['weight']/500 for u,v in edges]
colors = [G1[u][v]['color'] for u,v in edges]

nodePos = nx.circular_layout(G1)
nodePos['People'] = [ -0.5, 0]
nodePos['Citizenship'] = [0.7,0]
nodePos['Geography'] = [0,0.1]
nodePos['Science'] = [-0.5,0.5]
nodePos['Everyday_life'] = [-0.75,0.25]
nodePos['History'] = [0,0.5]
nodePos['Countries'] = [0.3,0.3]
nodePos['Design_and_Technology'] = [0,-0.3]
nodePos['Language_and_literature'] = [-0.3,-0.3]
nodePos['Religion'] = [0.1,-0.1]

nodePos['Mathematics'] = [0,-0.4]
nodePos['Art'] = [-0.5,-0.4]
nodePos['IT'] = [-1,0]

nodePos['Music'] = [0.5,-0.4]
nodePos['Business_Studies'] = [0.3,-0.3]

#people,science citizenship geograpgy  everydaylife
nx.draw(G1,node_size = size,pos = nodePos,node_color = color,labels=label_dict, width= weights, edge_color=colors)
plt.show()