# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np

from ast import literal_eval

import plotly.express as px
import plotly.io as pio

from pyvis.network import Network
import plotly.graph_objects as go
from geopy.geocoders import Nominatim

from sklearn.preprocessing import MinMaxScaler

In [3]:
#!pip install pyvis
#!pip install jinja2
#!pip install geopy

## Questions of interest 

0) What are the most clicked articles?
1) What is the distribution of countries in the Wikipedia graph? 
2) What is the distribution of countries among start and stop articles? 
3) What is the distribution of in and out degrees per country? 

## 0. What are the most clicked articles? 

Let's first import the data `country_clicks_links.csv` with which we will be working.

In [4]:
data = pd.read_csv("data/country_clicks_links.csv", index_col=0)

Now we define the variables that we need.

In [5]:
data_sorted = data.sort_values(by='click_count', ascending=False)
data_sorted = data_sorted[data_sorted['click_count'] != 0]
articles = data_sorted.index.tolist()
click_count = data_sorted.click_count.tolist()
in_degree = data_sorted.num_links_in.tolist()
out_degree = data_sorted.num_links_out.tolist()

Let's construct a graph in which: 
- each node is an article
    - 20 most clicked articles 
    - 20 least clicked articles 

In [6]:
most_clicked_articles = articles[:20]
most_clicked_counts = click_count[:20]
most_in_degree = in_degree[:20]
most_out_degree = out_degree[:20]

least_clicked_articles = articles[-20:]
least_clicked_counts = click_count[-20:]
least_in_degree = in_degree[-20:]
least_out_degree = out_degree[-20:]

- the size and color of a node is proportional to the number of clicks associtated to that node
- each edge represents an out link

We are also interested in the in and out degree of each article
- in degree = num_links_in 
- out degree = num_links_out

Let's first look at the most clicked articles:

In [31]:
color_map = plt.cm.get_cmap('Reds')

# Normalize the log_counts to [0, 1]
norm = matplotlib.colors.Normalize(vmin=min(most_clicked_counts), vmax=max(most_clicked_counts))
most_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in most_clicked_counts]

least_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in least_clicked_counts]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut(gravity=-10000,  # Controls the strength of repulsion between nodes
               central_gravity=0.01,  # Weak central gravity so nodes spread out
               spring_length=300000,  # Increase distance between connected nodes
               spring_strength=0.05)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {most_clicked_articles[i]} \n click count: {most_clicked_counts[i]} \n in degree: {most_in_degree[i]} \n out degree: {most_out_degree[i]}" for i in range(len(most_clicked_articles))]

# define position of nodes 
num_nodes = len(most_clicked_articles)

# Define a circular layout for the nodes
radius = 500  # Adjust the radius of the circle
angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)  # Evenly spaced angles
x_positions = radius * np.cos(angles)  # x-coordinates
y_positions = radius * np.sin(angles)  # y-coordinates

net.add_nodes(most_clicked_articles, 
              title=titles, 
              color=most_colors_hex, 
              x=x_positions, 
              y=y_positions)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(most_clicked_articles):
    name_links_out = data_sorted.name_links_out.iloc[i]

    if pd.notna(name_links_out):
        for article2 in most_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)


net.show("graphs/topic_1/most_used_articles_graph.html")

graphs/topic_1/most_used_articles_graph.html



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



An now let's look at the least clicked articles: 

In [32]:
color_map = plt.cm.get_cmap('Reds')

# Normalize the log_counts to [0, 1]
norm = matplotlib.colors.Normalize(vmin=min(least_clicked_counts), vmax=max(least_clicked_counts))
least_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in least_clicked_counts]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut(overlap=0)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {least_clicked_articles[i]} \n click count: {least_clicked_counts[i]} \n in degree: {least_in_degree[i]} \n out degree: {least_out_degree[i]}" for i in range(len(least_clicked_articles))]

# define position of nodes 
num_nodes = len(least_clicked_articles)

# Define a circular layout for the nodes
radius = 500  # Adjust the radius of the circle
angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)  # Evenly spaced angles
x_positions = radius * np.cos(angles)  # x-coordinates
y_positions = radius * np.sin(angles)  # y-coordinates

net.add_nodes(least_clicked_articles, 
              title=titles, 
              color=least_colors_hex,
              x=x_positions, 
              y=y_positions)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(least_clicked_articles):
    name_links_out = data_sorted.name_links_out.iloc[i]

    if pd.notna(name_links_out):
        for article2 in least_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)


net.show("graphs/topic_1/least_used_articles_graph.html")

graphs/topic_1/least_used_articles_graph.html



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



You can separate all nodes to get an idea of which articles are well connected and which are isolated!! 

We see that a lot of western countries are present within the most clicked articles and many oriental subjects are in the least clicked countries. This motivates our project in which we will associate each article to it's corresponding country and analyse the distributions of those countries within the Wikipedia graph. In the end, we are ultimately interested in whether players of the Wikispeedia game really tend to click more on articles that are associated with western countries or if this feature is due to the properties of the Wikipedia graph itself. 

## 1. What is the distribution of countries in the Wikipedia graph?

Now that all (most) articles are associated to a country, we can look at the distribution of those countries. Is there a country that is associated to more articles (we have our little idea haha but let's check). 

In order to find the distribution of countries in the Wikipedia graph, we need the number of articles that are associated to each of the represented countries.

In [9]:
countries_occurrences = data.groupby("Top_1_name").size().reset_index()
data_countries = data.groupby("Top_1_name").sum().reset_index()
data_countries["occurrence"] = countries_occurrences.iloc[:,1].tolist()

# percent of articles that belong to each country
data_countries["percent"] = data_countries["occurrence"] / sum(data_countries["occurrence"])

In [10]:
data_countries

Unnamed: 0,Top_1_name,click_count,num_links_in,name_links_in,num_links_out,name_links_out,occurrence,percent
0,afghanistan,629,313.0,"['15th_Marine_Expeditionary_Unit', '18th_centu...",732.0,"['18th_century', 'Afghan_Hound', 'Alexander_th...",10,0.003858
1,albania,21,85.0,"['AK-47', 'Armenia', 'Atheism', 'Athens', 'Aze...",153.0,"['Afghanistan', 'Agriculture', 'Algeria', 'Anc...",2,0.000772
2,algeria,700,263.0,"['10th_century', '1973_oil_crisis', 'Africa', ...",201.0,"['Africa', 'Agriculture', 'Algerian_Civil_War'...",6,0.002315
3,andorra,6,41.0,"['Albania', 'Armenia', 'Azerbaijan', 'Biodiver...",54.0,"['Agriculture', 'Algeria', 'Argentina', 'Brazi...",1,0.000386
4,angola,28,80.0,"['13th_century', '2005_Lake_Tanganyika_earthqu...",21.0,"['Africa', 'Atlantic_Ocean', 'Bantu', 'Cold_Wa...",1,0.000386
...,...,...,...,...,...,...,...,...
190,venezuela,447,180.0,"['El_Hatillo_Municipality%2C_Miranda', 'Global...",150.0,"['Baroque', 'Baseball', 'Capital', 'Caribbean_...",5,0.001929
191,vietnam,287,281.0,"['Asia', 'Beijing', 'Global_city', 'Ho_Chi_Min...",171.0,"['Agriculture', 'Australia', 'Capital', 'Confu...",6,0.002315
192,yemen,19,68.0,"['Albania', 'Arab_League', 'Arabic_language', ...",140.0,"['11th_century', '16th_century', '19th_century...",1,0.000386
193,zambia,121,122.0,"['Zambezi']['Africa', 'Henry_Morton_Stanley', ...",206.0,"['AIDS', 'Copper', 'Mining', 'Zambezi', 'Zambi...",6,0.002315


In [33]:
fig = px.pie(data_countries, values='percent', names='Top_1_name', title='Distribution of countries in Wikipedia')
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

# Export the figure to an HTML file
pio.write_html(fig, file='graphs/topic_1/pie_plot_distribution_of_countries.html', auto_open=False)

We see that 8 countries make up for 1/2 of the articles in Wikipedia, namely the US, UK, Australia, France, Germany, Italy, India and China. Those are all in the top10 of countries that publish the most!!

Let's now look at the connectivity between countries. 

> **_NOTE:_** Two countries are said to be *connected* if at least one article from the first country contains a link to an article associated to the other country, and the other way around for the 2 same articles. 

We first need to prepare the data, so to find which countries are connected and how often those connections appear in the Wikipedia graph! 

In [12]:
data = pd.read_csv("data/country_clicks_links.csv", index_col=0)

data.dropna(subset=["Top_1_name"], inplace=True)
data["name_links_out"] = data["name_links_out"].fillna("[]")
data["num_links_in"] = data["num_links_in"].fillna(0)
data["num_links_out"] = data["num_links_out"].fillna(0)
data["name_links_out"] = data["name_links_out"].apply(literal_eval)

start_country = []
end_country = []

for idx, row in data.iterrows():
    links_out_list = list(row["name_links_out"])
    for out_link in links_out_list:
        try:
            start_country.append(row["Top_1_name"])
            end_country.append(data.loc[out_link]["Top_1_name"])
        except:
            pass

df = pd.DataFrame(list(zip(start_country, end_country)),
              columns=['start_country','end_country'])

In [13]:
country_connexions = df.groupby(["start_country", "end_country"]).size().sort_values(ascending=False).reset_index()
country_connexions

Unnamed: 0,start_country,end_country,0
0,united states,united states,1006
1,united states,united kingdom,619
2,united kingdom,united states,522
3,united states,france,454
4,united states,germany,397
...,...,...,...
8762,indonesia,puerto rico,1
8763,indonesia,philippines,1
8764,indonesia,peru,1
8765,indonesia,panama,1


Great, well done! 

Now we will add this information as edges on a graph representing the countries, their associated number of articles and their connections!
- each node is a country 
- the size and color of a node is proportional to the number of articles associtated to that node
- edges represent connnections between countries in the Wikipedia graph

We need the position (i.e. longitude and latitude) of each country on a world map, since each country will be a node on a world map.

In [58]:
# Get a set of coordinates (latitude, longitude) for each country for visualisation purposes
geolocator = Nominatim(user_agent="http")

def get_country_coordinates(country_name):
    location = geolocator.geocode(country_name)
    return (location.latitude, location.longitude)

countries = data_countries.Top_1_name.tolist()
coords = []
for country in countries: 
    # handle ambiguous cases (the geolocator function tends to place ambiguous countries in the US!)
    if country == "sudan": 
        print("before:", get_country_coordinates("sudan"))
        print("after", get_country_coordinates("Sudan, Africa"))
        coords.append(get_country_coordinates("Sudan, Africa"))
    elif country == "georgia": 
        coords.append(get_country_coordinates("Georgia, Caucasus"))
    elif country == "lebanon": 
        coords.append(get_country_coordinates("Lebanon, Asia"))
    elif country == "greenland": 
        coords.append(get_country_coordinates("Greenland, North America"))
    elif country == "jordan": 
        coords.append(get_country_coordinates("Jordan, Middle East"))
    else:
        coords.append(get_country_coordinates(country))

latitudes = [coord[0] for coord in coords]
longitudes = [coord[1] for coord in coords]

before: (34.0678644, -102.524362)
after (15.531178050000001, 32.5666937354618)


In [56]:
location_sudan = geolocator.geocode("Georgia, Caucasus")
print(location_sudan)

Caucasus Mountains, ჭალისოფელი, დუშეთის მუნიციპალიტეტი, მცხეთა-მთიანეთი, საქართველო


Now let's construct the graph!

In [78]:
color_map = plt.cm.get_cmap('Reds')
colors_hex = [matplotlib.colors.to_hex(color_map(num)) for num in data_countries.occurrence.tolist()]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut()
net.toggle_physics(False)

x_positions = [coord[1] * 50 for coord in coords]  # Scaling for better spacing
y_positions = [coord[0] * -50 for coord in coords] # Negate to align correctly

net.add_nodes(countries, 
              title=countries, 
              color=colors_hex, 
              size=data_countries.occurrence.tolist(), 
              x = x_positions, 
              y = y_positions)


# Let's add edges between articles that are connected in Wikipedia
for i in range(len(country_connexions)):
    start_country = country_connexions["start_country"][i]
    end_country = country_connexions["end_country"][i]
    
    if start_country == "turkey":
        # add edge
        net.add_edge(start_country, end_country)

net.show("graphs/topic_1/country_graph_turkey.html")

graphs/topic_1/country_graph_turkey.html



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



We can try to transform the pyvis visualization to plotly to add fancy toggle buttons later!

In [72]:
color_map = plt.cm.get_cmap('Reds')
colors_hex = [matplotlib.colors.to_hex(color_map(num)) for num in data_countries.occurrence.tolist()]

# Node positions (already calculated)
x_positions = [coord[1] * 50 for coord in coords]  # Scaling for better spacing
y_positions = [coord[0] * -50 for coord in coords]  # Negate to align correctly

# Create a list of edges
edge_x = []
edge_y = []
for i in range(len(country_connexions)):
    start_country = country_connexions["start_country"][i]
    end_country = country_connexions["end_country"][i]
    
    start_index = countries.index(start_country)
    end_index = countries.index(end_country)
    
    # Add the edge coordinates to the list
    edge_x.append(x_positions[start_index])
    edge_y.append(y_positions[start_index])
    edge_x.append(x_positions[end_index])
    edge_y.append(y_positions[end_index])

# Create node trace (Scatter plot for nodes)
node_trace = go.Scatter(
    x=x_positions,
    y=y_positions,
    mode='markers',
    hoverinfo='text',
    text=countries,  # Tooltip information on hover
    marker=dict(
        color=colors_hex,  # Color by occurrence
        size=[occurrence/4 for occurrence in data_countries.occurrence.tolist()],  # Size by occurrence
        line=dict(width=2, color='black')  # Black border for nodes
    )
)

# Create edge trace (Scatter plot for edges)
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Country Connections Network',
                    showlegend=False,
                    hovermode='closest',
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False),
                    plot_bgcolor='white'  # Background color for the plot
                ))

# Show the figure
fig.show()


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



## 3. What is the distribution of in and out degrees? 

For fun, we are also interested in the in and out degree of each country. 

> **_NOTE:_**  The in degree of a country is defined as the sum of the in degrees of its articles. Same for out degrees. The higher the in degree of a country, the more central it is meaning that the more it is accessible from other countries. 

In [34]:
# create bar plot for most occuring countries
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=False).iloc[:8:,]

trace_most_in = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_in"],
                        name="in degree",
                        marker_color="blue")

trace_most_out = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_out"],
                        name="out degree",
                        marker_color="red")


# create bar plot for most occuring countries
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=True).iloc[:8:,]

trace_least_in = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_in"],
                        name="in degree",
                        marker_color="blue")

trace_least_out = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_out"],
                        name="out degree",
                        marker_color="red")

# create figures
fig = go.Figure()

# add traces
fig.add_trace(trace_most_in)
fig.add_trace(trace_most_out)
fig.add_trace(trace_least_in)
fig.add_trace(trace_least_out)


# Create buttons to toggle between the two bar charts
fig.update_layout(
    barmode='stack',
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    label="Most occurring countries",
                    method="update",
                    args=[{"visible": [True, True, False, False]},  # Show first chart, hide second
                          {"title": "Connectivity of most occurring countries"}]
                ),
                dict(
                    label="Least occurring countries",
                    method="update",
                    args=[{"visible": [False, False, True, True]},  # Hide first chart, show second
                          {"title": "Connectivity of least occurring countries"}]
                ),
            ],
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=1.1,
            yanchor="middle"
        ),
    ]
)

# Show the figure
fig.show()

# Export the figure to an HTML file
pio.write_html(fig, file='graphs/topic_1/bar_plot_distribution_of_degrees.html', auto_open=False)

We observe that countries that occur more often in Wikipedia (i.e. many articles are associated with those countries), are immensely more connected, so have much more links that lead in and out of them. Those articles are so called "central hubs" of the Wikipedia graph. 