In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# Python functions in .py file to read data
import data_readers

import ast

Study of the Game
- How many articles, links
- Distr of categories

Study of Man
- Shortest paths
- Categories of unfinished paths

Study of the Machine
- Compare algos

Study of Man vs Machine
- When does the man beat the machine?

In [128]:
finished_paths = data_readers.read_finished_paths()


In [4]:
# The links and edges
wikispeedia = data_readers.read_wikispeedia_graph()

# The finished paths
finished_paths = data_readers.read_finished_paths()

# The unfinished paths
unfinished_paths = data_readers.read_unfinished_paths()

# DF of all articles
articles = data_readers.read_articles()

# DF of all articles and their categories
categories = data_readers.read_categories()


# We found out later that the data contained in the shortest path matrix given to us seems to be wrong
# Here we also add a quick dictionary that properly shows that this is wrong, and give an example
shortest_path_df = data_readers.read_shortest_path_df()
shortest_path_dict = dict(nx.all_pairs_shortest_path(wikispeedia))

# Searching for the string of a given article. It has to be formatted like the article name
# Which shouldn't be a problem, as we'll probably usually retrieve them internally
obi_wan_text = data_readers.plaintext_article_finder('Obi-Wan_Kenobi')

In [None]:
machine_data = pd.read_csv('notebooks_final/machine_data_runs_0_269.csv')

In [None]:
def prepare_machine_data_shortest_paths(df) -> pd.DataFrame:
    df['Path_1'] = df['Path_1'].apply(lambda x: ast.literal_eval(x))
    df['len_shortest_path'] = machine_data['Path_1'].apply(len) 
    return df
machine_data = prepare_machine_data_shortest_paths(machine_data)


Study of the Game (Data)
- number articles, paths, etc
- Which categories are the articles in?

Study of Man
- Distribution of human paths
    - bubble chart. x-axis: category, y-axis: length to target node, size: count of paths

- Can we identify any patterns or recurring structures in the human paths (ie. going for a central hub)?

Study of the Machine
- Distribution of machine shortest paths.
- Which categories does machine have the most explorations?

Study of Man vs Machine
- When are human paths shorter than machine paths?


In [None]:
machine_data.head(1)

# Ch 1. Study of the Game

In [None]:
# wikispeedia
G = nx.Graph()
nx.drawing.nx_pydot.graphviz_layout(G)


In [None]:
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

In [None]:
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line_width=2))

In [None]:
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Network graph made with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="Python code: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()

In [None]:
# Create Plotly figure
fig = go.Figure()
G = wikispeedia
# Add nodes to the figure
for node in G.nodes():
    x, y = G.nodes[node]['pos'] if 'pos' in G.nodes[node] else (None, None)
    fig.add_trace(go.Scatter(
        x=[x],
        y=[y],
        mode="markers+text",
        marker=dict(size=20),
        text=str(node),
        name=str(node)
    ))
# Add edges to the figure
for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos'] if 'pos' in G.nodes[edge[0]] else (None, None)
    x1, y1 = G.nodes[edge[1]]['pos'] if 'pos' in G.nodes[edge[1]] else (None, None)
    fig.add_trace(go.Scatter(
        x=[x0, x1, None],
        y=[y0, y1, None],
        mode='lines',
        line=dict(width=1),
        name=f"Edge {edge[0]}-{edge[1]}"
    ))

    # Update layout
fig.update_layout(
    title="NetworkX Graph with Plotly",
    showlegend=True,
    hovermode='closest'
)

# Show the figure
fig.show()

### Which categories are the target nodes from?

In [134]:
print('There are', categories['article'].duplicated().sum(), 'articles with more than 1 category.')

print('Should we drop the duplicate categories, or doublecount them?')
print('This corresponds to', categories['article'].duplicated().sum() / len(wikispeedia.nodes), 'of the articles.')

# Let's drop them for now.
categories['article'] = categories['article'].drop_duplicates()
print('The new shape is: ',categories.shape)

# Why are there more articles here than nodes (# articles)?

There are 606 articles with more than 1 category.
Should we drop the duplicate categories, or doublecount them?
This corresponds to 0.13196864111498258 of the articles.
The new shape is:  (5204, 2)


In [135]:
# Let's use string manipulation to extract the highest level category for each article.

sub_categories = categories['categories'].str[8:].str.split('.')
category_depth_1 = sub_categories.apply(lambda x: x[0])
categories['depth_1'] = category_depth_1
# categories.head()

Let's find the category corresponding to each unfinished target.

@daniele

In [32]:
# Merging categories with unfinished paths.
unfinished_paths_with_categories = pd.merge(unfinished_paths, categories, left_on = 'target', right_on= 'article', how = 'left')

# Count the occurrences of each category
category_counts = unfinished_paths_with_categories['depth_1'].value_counts()

fig = go.Figure(go.Bar(
    x=category_counts.index,
    y=category_counts.values,
    marker=dict(color='blue')  # Adjust colors if needed
))

# Update layout
fig.update_layout(
    title='Frequency of Categories of Target Articles in Unfinished Paths',
    xaxis=dict(title='Categories', tickangle=45),  # Rotate x-axis labels for better readability
    yaxis=dict(title='Frequency'),
    height=500,  # Adjust the height of the plot as needed
    margin=dict(l=80, r=80, t=80, b=80),  # Adjust margins for better appearance
)

fig.show()

Notice that targets in the science category make up the largest proportion of unfinished games. In Deliverable 3, we will investigate this more. We'll discover if this is because most of the articles are from the science category, or if science articles are actually harder to find in the game.

# Ch 2. Study of Man

In [33]:
shortest_path_df

Unnamed: 0,"(%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,)","(%C3%85land,)","(%C3%89douard_Manet,)","(%C3%89ire,)","(%C3%93engus_I_of_the_Picts,)","(%E2%82%AC2_commemorative_coins,)","(10th_century,)","(11th_century,)","(12th_century,)","(13th_century,)",...,"(Ziad_Jarrah,)","(Zimbabwe,)","(Zinc,)","(Zinc_chloride,)","(Zion_National_Park,)","(Zionism,)","(Zirconium,)","(Zoroaster,)","(Zuid-Gelders,)","(Zulu,)"
"(%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,)",0,-1,-1,-1,-1,-1,3,3,3,3,...,4,3,3,4,4,3,4,4,4,2
"(%C3%85land,)",-1,0,-1,-1,-1,-1,2,2,2,2,...,4,2,3,4,4,3,4,3,3,3
"(%C3%89douard_Manet,)",-1,-1,0,-1,-1,-1,3,3,2,2,...,4,3,2,3,4,3,4,3,3,3
"(%C3%89ire,)",-1,-1,-1,0,-1,-1,3,3,3,3,...,4,2,2,3,4,3,4,4,3,3
"(%C3%93engus_I_of_the_Picts,)",-1,-1,-1,-1,0,-1,2,2,3,2,...,4,2,3,4,4,3,4,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(Zionism,)",-1,-1,-1,-1,-1,-1,2,2,2,2,...,3,2,2,3,3,0,3,3,3,2
"(Zirconium,)",-1,-1,-1,-1,-1,-1,3,3,3,3,...,3,3,2,2,3,3,0,3,4,3
"(Zoroaster,)",-1,-1,-1,-1,-1,-1,2,2,2,2,...,3,2,2,3,4,3,3,0,3,3
"(Zuid-Gelders,)",-1,-1,-1,-1,-1,-1,3,3,3,3,...,4,3,3,4,4,3,5,4,0,3


In [153]:
# Merging categories with finished paths.
finished_paths_with_category = pd.merge(finished_paths, categories, left_on = 'first_article', right_on= 'article', how = 'left')
finished_paths_with_category = finished_paths_with_category[['path', 'first_article', 'last_article', 'path_length', 'depth_1']]
finished_paths_with_category.rename(columns={'depth_1': 'Category'}, inplace=True)
finished_paths_with_category


Unnamed: 0,path,first_article,last_article,path_length,Category
0,14th_century;15th_century;16th_century;Pacific...,14th_century,African_slave_trade,9,History
1,14th_century;Europe;Africa;Atlantic_slave_trad...,14th_century,African_slave_trade,5,History
2,14th_century;Niger;Nigeria;British_Empire;Slav...,14th_century,African_slave_trade,8,History
3,14th_century;Renaissance;Ancient_Greece;Greece,14th_century,Greece,4,History
4,14th_century;Italy;Roman_Catholic_Church;HIV;R...,14th_century,John_F._Kennedy,7,History
...,...,...,...,...,...
51313,Yagan;Ancient_Egypt;Civilization,Yagan,Civilization,3,People
51314,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,Yagan,Fiction,9,People
51315,Yagan;Australia;England;France;United_States;T...,Yagan,U.S._Open_%28tennis%29,7,People
51316,Yarralumla%2C_Australian_Capital_Territory;Aus...,Yarralumla%2C_Australian_Capital_Territory,Abraham_Lincoln,4,Geography


In [138]:
# @daniele
px.histogram(finished_paths, x='path_length', title='Distribution of Finished Path Lengths taken by Humans')
# px.bar(data_frame=)

## 2.2 Study of Unique Paths / ## 2.2 Path Length Per Category
Here we study the unique source and target pairs. We will use the dataframes to compare the performance between humans and machines, as well as to know what paths to make machines complete.

**article_combinations**

This dataframe contains information on all the combination of source and target articles in the finished games (paths). It includes how many times it has been played, and the mean and std of the path length, duration of the game, and rating.

**unique_targets** and **unique_sources**


These dataframes include all the sources and targets that appears in the finished games

<br><br>
Note that we don't change to ASCII the name of the articles yet. We will do it at a later step if we need to.
<br><br>

In [39]:
# How many each pair of articles has been visited
article_combinations_count = finished_paths.groupby(['first_article', 'last_article']).size().reset_index(name='count')

# The mean and std of the path length for each pair of articles
article_combinations_stats = finished_paths.groupby(['first_article', 'last_article'])['path_length'].agg(['mean', 'std']).reset_index()
article_combinations_stats['std'] = article_combinations_stats['std'].fillna(0)
article_combinations_stats.rename(columns={'mean': 'mean_length', 'std': 'std_length'}, inplace=True)

# The mean and std of the rating for each pair of articles. 
    # Note that mean and std may be nan if there are nan ratings. We purposely leave them as nan, as we don't want to fill them with 0s or 1s.
    # Depending on the application, we could change this in the future if neeeded.
rating_combinations_stats_rating = finished_paths.groupby(['first_article', 'last_article'])['rating'].agg(['mean', 'std']).reset_index()
#rating_combinations_stats_rating['std'] = rating_combinations_stats_rating['std'].fillna(0)
mask = rating_combinations_stats_rating['mean'].notnull()
rating_combinations_stats_rating.loc[mask, 'std'] = rating_combinations_stats_rating.loc[mask, 'std'].fillna(0)
rating_combinations_stats_rating.rename(columns={'mean': 'mean_rating', 'std': 'std_rating'}, inplace=True)

# The mean and std of the time for each pair of articles.
rating_combinations_stats_time = finished_paths.groupby(['first_article', 'last_article'])['durationInSec'].agg(['mean', 'std']).reset_index()
rating_combinations_stats_time['std'] = rating_combinations_stats_time['std'].fillna(0)
rating_combinations_stats_time.rename(columns={'mean': 'mean_durationInSec', 'std': 'std_durationInSec'}, inplace=True)

# Merging all the dataframes
article_combinations = pd.merge(article_combinations_count, article_combinations_stats, on=['first_article', 'last_article'])
article_combinations = pd.merge(article_combinations, rating_combinations_stats_rating, on=['first_article', 'last_article'])
article_combinations = pd.merge(article_combinations, rating_combinations_stats_time, on=['first_article', 'last_article'])
article_combinations = pd.merge(article_combinations, categories, left_on= 'last_article', right_on = 'article')

# The number of unique sources and targets
unique_sources = finished_paths['first_article'].value_counts().reset_index()
unique_targets = finished_paths['last_article'].value_counts().reset_index()

@daniele

In [85]:
avg_length_per_category = article_combinations.groupby('depth_1')['mean_length'].mean()
x = avg_length_per_category.keys()
y = avg_length_per_category
size = np.log(article_combinations.groupby('depth_1')['count'].sum())

fig = go.Figure()

fig.add_trace(go.Scatter(
    x= x,
    y= y,
    mode='markers',
    marker=dict(
        size=size,
                # color=bubble_sizes,
                colorscale='Viridis',  # Adjust colorscale if needed
                showscale=True
                )
))

fig.update_layout(
    title='Average Path Length by Category of Target Node (from Finished Human Paths)',
    xaxis=dict(title='Categories'),
    yaxis=dict(title='Average Path Length'),
    showlegend=False,
    hovermode='closest'
)

fig.show()

In [37]:
unique_sources.sample(5)

Unnamed: 0,first_article,count
701,Geography_of_India,17
1397,Ammolite,12
2103,Batholith,9
4008,Anne_of_Great_Britain,2
3497,Anno_Domini,4


In [38]:
unique_targets.sample(5)

Unnamed: 0,last_article,count
2121,Hollandic,7
434,Shark,25
1897,Radish,8
1380,Arab-Israeli_conflict,11
2074,Culture,7


# Ch 3. Study of Machine

# Ch 4. Study of Man vs Machine

In [143]:
machine_data = pd.read_csv('notebooks_final/machine_data_runs_934_most_common.csv')

In [144]:
def prepare_machine_data_shortest_paths(df) -> pd.DataFrame:
    df['Path_1'] = df['Path_1'].apply(lambda x: ast.literal_eval(x))
    df['len_shortest_path'] = machine_data['Path_1'].apply(len) 

    # Drop useless columns
    df = df[['Source', 'Target', 'len_shortest_path']]
    return df
machine_data = prepare_machine_data_shortest_paths(machine_data)

In [145]:
machine_data

Unnamed: 0,Source,Target,len_shortest_path
0,Asteroid,Viking,4
1,Brain,Telephone,4
2,Theatre,Zebra,4
3,Pyramid,Bean,4
4,Batman,Wood,4
...,...,...,...
929,Flag of the Republic of China,Tanzania,4
930,Barcelona,Tower of London,4
931,New Zealand,Atheism,3
932,Barbados,Butter,4


## 4.1 When are human paths shorter than the machine paths?

In [187]:
def combine_human_and_machine(df_human, df_machine) -> pd.DataFrame:
    df = pd.merge(df_human, df_machine, how = 'inner', left_on= ['first_article', 'last_article'], right_on = ['Source', 'Target'])
    df.rename(columns= {'path_length': 'man_len', 'len_shortest_path': 'machine_len'}, inplace=True)
    return df
man_machine_df = combine_human_and_machine(finished_paths_with_category, machine_data)

In [188]:
man_slower = man_machine_df[man_machine_df['man_len'] > man_machine_df['machine_len']]
man_faster = man_machine_df[man_machine_df['man_len'] < man_machine_df['machine_len']]

In [189]:
man_avgs = man_machine_df.groupby('Category')[['man_len', 'machine_len']].mean()
man_avgs

Unnamed: 0_level_0,man_len,machine_len
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Art,8.693878,3.346939
Business_Studies,6.544304,3.392405
Citizenship,7.247525,3.930693
Countries,6.220339,3.288136
Design_and_Technology,8.020531,3.992754
Everyday_life,6.315053,3.736289
Geography,6.290997,3.310289
History,9.740741,4.074074
IT,7.609195,4.017241
Language_and_literature,7.74895,3.985294


In [211]:
fig = px.bar(man_avgs, title='Average Finished Path Length by Category of Machine vs Man')

fig.update_traces(
    showlegend=True,  # Show the legend
    legend_title='Player',  # Set legend title
    selector={'type': 'box'},  # Select traces of type 'box' for updating
)

# Update layout
fig.update_layout(
    # title='Average Finished Path Length by Category of Machine vs Man',
    xaxis=dict(title='Category'),
    yaxis=dict(title='Length'),
    yaxis2=dict(title='Length'),  # Add a separate y-axis label for the second box plot
)

In [185]:
man_machine_df = pd.melt(man_machine_df, id_vars=['Category'], var_name='Variable', value_name='Value')
man_machine_df
# Create a box plot with both 'man_len' and 'machine_len' on the same plot
# fig = px.box(man_machine_df, x='Category', y='Value', color='Variable', facet_col='Variable')


# px.box(man_machine_df, x = 'Category', y = ['man_len'])
# px.box(man_machine_df, x = 'Category', y = ['machine_len'])

ValueError: value_name (Value) cannot match an element in the DataFrame columns.