# Import

In [7]:
from helpers import *
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import csv
from urllib.parse import unquote

# Downloading the data

In [8]:
dfs = preprocessing()

In [9]:
# Function to split the list based on '.' prefix
def split_list_on_prefix(row):
    list_ = []
    row_ = []
    for word in row:
        if word.startswith('.'):
            row_.append(list_)
            list_ = [word[1:]]
        else:
            list_.append(word)  
    row_.append(list_)
    return row_

In [6]:
# Create a list with all the peopls name
list_people = dfs['categories'].copy()[dfs['categories']['category'].str.contains('subject.People.')].article

# Look for all the finished path that target a people from the above list
path_target_people = pd.DataFrame(dfs['paths_finished'].path.copy().str.split(';'))
path_target_people['target'] = path_target_people.path.apply(lambda x: x[-1])
path_target_people = path_target_people[path_target_people.target.isin(list_people)]

#Remove all the path of length 1 (directly on the correct page)
path_target_people = path_target_people[path_target_people['path'].apply(lambda x: len(x) >= 2)]

# Separate the case of returns into list of list
path_target_people.path = [split_list_on_prefix(r) for r in path_target_people.path]

# Count the occurence of each name in target
target_distribution = path_target_people.target.value_counts()
path_target_people = path_target_people.merge(target_distribution, on = 'target', how = 'left')
path_target_people

target_people_list = path_target_people.target.drop_duplicates()
print(len(list_people), len(target_people_list))
display(target_people_list)

689 454


0              John_F._Kennedy
2       Elizabeth_I_of_England
7          Henry_David_Thoreau
8           Buckminster_Fuller
9          James_II_of_England
                 ...          
4254            Witold_Pilecki
4296            Donald_Bradman
4686              Sandy_Koufax
4738            Hannibal_Barca
5011               Franz_Kafka
Name: target, Length: 454, dtype: object

### Not all the people articles are targeted in the game (454 target are people out of the 689 possible names)

In [10]:
import matplotlib._color_data as mcd

def get_cat(Node):
    try:
        return result_dict[Node]["main_subject"]
    except KeyError as e:
        return np.nan

def get_color(Node):
    try:
        return map_cat_to_color[result_dict[Node]["main_subject"]]
    except KeyError as e:
        return "#FFFFFF"


all_cat = ['Science', 'Geography', 'People', 'History', 'Everyday_life', 'Citizenship', 'Design_and_Technology', 'Countries', 'Language_and_literature', 'Religion', 'Music', 'IT', 'Business_Studies', 'Mathematics', 'Art', None]
colors = list(mcd.XKCD_COLORS.values())[::7]
map_cat_to_color = {}
for i in range(len(all_cat)):
    map_cat_to_color[all_cat[i]] = colors[i]

j = 0
source_list = []

for people in target_people_list.iloc[7:17]:
    graph = nx.MultiDiGraph()
    target_article = people
    print(target_article)
    source_list = []

    #Find all the path that this target have
    for i, path in path_target_people[path_target_people.target == people].iterrows():
        # Need to add the path of each nodes
        path_target = path.path
        # Handle return case
        for j in range(len(path_target)):
 
            #Iterate to each words of the (partial) list
            partial_path = path_target[j]
            for k in range(len(partial_path)-1):
                target = partial_path[k+1]
                if target != target_article:
    
                    target = target
                # Add edges to the graph
                cur_nod = partial_path[k]
                if k == 0: 
                    source_list.append(cur_nod)
                graph.add_edge(cur_nod, target)
                graph.add_node(cur_nod)


    plt.figure(figsize=(8, 4))
    pos = nx.nx_agraph.graphviz_layout(G = graph.subgraph(set(graph) - {target_article}), prog="twopi", root=0)
    pos[target_article] = np.array([0, 0])

    #node size
    nd = 100
    color_noded = 'skyblue'
    node_names = list(graph.nodes())

    node_colors = [get_color(node) for node in node_names]
    color_of_target = get_color(target_article)
    color_of_source = [get_color(node) for node in source_list]
    print(color_of_source)

    color_edge_target = 'red'
    color_edge_source = 'green'
    #nx.flow_hierarchy(graph)
    nx.draw(graph, pos, with_labels=False, node_size=nd, node_color= node_colors, nodelist=node_names,  font_size=5, font_color='black', font_weight='bold', arrowsize=8, alpha = 0.2)
    nx.draw_networkx_nodes(graph, pos, nodelist=[target_article], node_color= color_of_target,  edgecolors = color_edge_target, node_size=nd, linewidths=2)
    nx.draw_networkx_nodes(graph, pos, nodelist=source_list, node_color= color_of_source, edgecolors=color_edge_source, node_size=nd, linewidths=2, alpha = 0.5)
    
    # Create a custom legend using the color map
    legend_labels = [f'{cat}' for cat in all_cat]  # Unicode character for a filled circle
    legend_colors = [map_cat_to_color[cat] for cat in all_cat]
    legend_markers = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10) for color in legend_colors]
    plt.legend(legend_markers, legend_labels, loc='upper left', bbox_to_anchor=(1, 1))

    plt.show()


Martina_Navratilova


ImportError: requires pygraphviz http://pygraphviz.github.io/

<Figure size 800x400 with 0 Axes>

# Repartition of last linkage

Dans mes plots on prends pas en compte le count des paths dans l'épaisseur des edge, ça pourrait être qch à étudier

In [13]:

# Add the name of the last article before reaching the target in the path 
path_target_people['last_before_target'] = 'None'
for i, row in path_target_people.iterrows():
    if len(row.path[-1]) <2:
        print(row.path[-1])
        print(i)
    path_target_people.loc[i, 'last_before_target'] = row.path[-1][-2]
path_target_people

# Count duplicates of the target, last_before_target pairs 
last_link = path_target_people.loc[:,['target', 'last_before_target']]
last_link['count_duplicates'] = last_link.groupby(['target', 'last_before_target'])['last_before_target'].transform('count')
last_link.drop_duplicates(subset=['target', 'last_before_target'], inplace=True)

# Add the percentage that the name of the last article before reaching the target represent
last_link['repartition_perc'] = 1000.0
for people in last_link.target.unique():
    l = last_link[last_link.target == people].count_duplicates
    last_link.loc[last_link.target == people, 'repartition_perc'] = last_link.loc[last_link.target == people, 'count_duplicates']/last_link[last_link.target == people].count_duplicates.sum()*100

# Add the general count of value according to the category of the last article before reaching the target
last_link['category_last'] = [ ('people' if last in list_people.values else 'non_people') for last in last_link.last_before_target ]
target_cat_last = last_link.groupby(['target', 'category_last'])['count_duplicates'].sum().reset_index()
target_cat_last.rename(columns={'count_duplicates': 'count_by_categories'}, inplace = True)
last_link = last_link.merge(target_cat_last, on = ['target', 'category_last'], how = 'left')

display(last_link)

Unnamed: 0,target,last_before_target,count_duplicates,repartition_perc,category_last,count_by_categories
0,John_F._Kennedy,President_of_the_United_States,12,54.545455,non_people,21
1,Elizabeth_I_of_England,Francis_Drake,1,5.000000,people,8
2,Elizabeth_I_of_England,James_I_of_England,1,5.000000,people,8
3,Elizabeth_I_of_England,Henry_VIII_of_England,5,25.000000,people,8
4,Elizabeth_I_of_England,Windsor_Castle,1,5.000000,non_people,12
...,...,...,...,...,...,...
1459,Felix_Mendelssohn,Romanticism,1,25.000000,non_people,4
1460,Zoroaster,Persian_Empire,1,25.000000,non_people,4
1461,Christopher_Columbus,15th_century,1,6.666667,non_people,15
1462,Robert_Oppenheimer,Richard_Feynman,1,20.000000,people,2


In [14]:
last_link[last_link.target == 'Robert_Oppenheimer']

Unnamed: 0,target,last_before_target,count_duplicates,repartition_perc,category_last,count_by_categories
825,Robert_Oppenheimer,History_of_nuclear_weapons,3,60.0,non_people,3
1214,Robert_Oppenheimer,Linus_Pauling,1,20.0,people,2
1462,Robert_Oppenheimer,Richard_Feynman,1,20.0,people,2


In [15]:
last_people = last_link.loc[last_link.category_last == 'people',]
last_not_people = last_link.loc[last_link.category_last == 'non_people',]

In [16]:
print(last_not_people.count_duplicates.sum(), last_people.count_duplicates.sum())
print(len(last_not_people.count_duplicates), len(last_people.count_duplicates))

5129 781
1128 336
