# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [94]:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

import plotly.express as px

from pyvis.network import Network
import plotly.graph_objects as go
from geopy.geocoders import Nominatim

from sklearn.preprocessing import MinMaxScaler

In [1]:
#!pip install pyvis
#!pip install jinja2
#!pip install geopy

## Questions of interest 

0) What are the most clicked articles?
1) What is the distribution of countries in the Wikipedia graph? 
2) What is the distribution of countries among start and stop articles? 
3) What is the distribution of in and out degrees per country? 

## 0. What are the most clicked articles? 

Let's first import the data `country_clicks_links.csv` with which we will be working.

In [25]:
data = pd.read_csv("data/country_clicks_links.csv", index_col=0)

Now we define the variables that we need.

In [96]:
data_sorted = data.sort_values(by='click_count', ascending=False)
data_sorted = data_sorted[data_sorted['click_count'] != 0]
articles = data_sorted.index.tolist()
click_count = data_sorted.click_count.tolist()
in_degree = data_sorted.num_links_in.tolist()
out_degree = data_sorted.num_links_out.tolist()

Let's construct a graph in which: 
- each node is an article
    - 20 most clicked articles 
    - 20 least clicked articles 

In [102]:
most_clicked_articles = articles[:20]
most_clicked_counts = click_count[:20]
most_in_degree = in_degree[:20]
most_out_degree = out_degree[:20]

least_clicked_articles = articles[-20:]
least_clicked_counts = click_count[-20:]
least_in_degree = in_degree[-20:]
least_out_degree = out_degree[-20:]

- the size and color of a node is proportional to the number of clicks associtated to that node
- each edge represents an out link

We are also interested in the in and out degree of each article
- in degree = num_links_in 
- out degree = num_links_out

Let's first look at the most clicked articles:

In [101]:
color_map = plt.cm.get_cmap('Reds')

# Normalize the log_counts to [0, 1]
norm = matplotlib.colors.Normalize(vmin=min(most_clicked_counts), vmax=max(most_clicked_counts))
most_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in most_clicked_counts]

least_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in least_clicked_counts]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut(gravity=-10000,  # Controls the strength of repulsion between nodes
               central_gravity=0.01,  # Weak central gravity so nodes spread out
               spring_length=300000,  # Increase distance between connected nodes
               spring_strength=0.05)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {most_clicked_articles[i]} \n click count: {most_clicked_counts[i]} \n in degree: {most_in_degree[i]} \n out degree: {most_out_degree[i]}" for i in range(len(most_clicked_articles))]

net.add_nodes(most_clicked_articles, 
              title=titles, 
              color=most_colors_hex)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(most_clicked_articles):
    print(article1)
    name_links_out = data_sorted.name_links_out.iloc[i]
    print(name_links_out)

    if pd.notna(name_links_out):
        for article2 in most_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)


net.show("most_used_articles_graph.html")

United_States
['Abraham_Lincoln', 'Advertising', 'Agriculture', 'American_Civil_War', 'American_English', 'American_Revolutionary_War', 'American_Samoa', 'American_football', 'American_popular_music', 'Amtrak', 'Anguilla', 'Antarctica', 'Antigua_and_Barbuda', 'Apollo_11', 'Arctic_Ocean', 'Argentina', 'Aruba', 'Atlanta%2C_Georgia', 'Atlantic_Ocean', 'Attack_on_Pearl_Harbor', 'Auto_racing', 'Bah%C3%A1%27%C3%AD_Faith', 'Baker_Island', 'Bald_Eagle', 'Baltimore%2C_Maryland', 'Barbados', 'Baseball', 'Basketball', 'Battle_of_Gettysburg', 'Belize', 'Benin', 'Bermuda', 'Boston%2C_Massachusetts', 'Brazil', 'British_Virgin_Islands', 'Broadcasting', 'Brunei', 'Buddhism', 'C%C3%B4te_d%27Ivoire', 'California', 'Cambodia', 'Cameroon', 'Canada', 'Cape_Verde', 'Capitalism', 'Caribbean_Sea', 'Cayman_Islands', 'Chicago', 'China', 'Chinese_language', 'Christopher_Columbus', 'Cinema_of_the_United_States', 'Coal', 'Cold_War', 'Colombia', 'Communism', 'Computer', 'Corporation', 'Costa_Rica', 'Country', 'Crim


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



In [99]:
titles

"article: ['United_States', 'Europe', 'United_Kingdom', 'England', 'Earth', 'Africa', 'World_War_II', 'North_America', 'Animal', 'Human', 'Mammal', 'France', 'Germany', 'English_language', 'Science', 'India', 'Atlantic_Ocean', 'Agriculture', 'Plant', 'Computer'] \n click count: [12370, 5553, 5278, 4304, 4134, 3464, 2846, 2503, 2396, 2334, 2292, 2201, 2121, 1956, 1855, 1722, 1706, 1698, 1648, 1640] \n in degree: [1551.0, 933.0, 972.0, 751.0, 269.0, 477.0, 751.0, 410.0, 492.0, 165.0, 199.0, 959.0, 743.0, 598.0, 135.0, 611.0, 250.0, 241.0, 185.0, 87.0] \n out degree: [294.0, 159.0, 168.0, 172.0, 118.0, 212.0, 119.0, 77.0, 29.0, 137.0, 42.0, 85.0, 169.0, 118.0, 40.0, 81.0, 125.0, 57.0, 70.0, 36.0]"

An now let's look at the least clicked articles: 

In [107]:
color_map = plt.cm.get_cmap('Reds')

# Normalize the log_counts to [0, 1]
norm = matplotlib.colors.Normalize(vmin=min(least_clicked_counts), vmax=max(least_clicked_counts))
least_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in least_clicked_counts]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut(overlap=1)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {least_clicked_articles[i]} \n click count: {least_clicked_counts[i]} \n in degree: {least_in_degree[i]} \n out degree: {least_out_degree[i]}" for i in range(len(least_clicked_articles))]

net.add_nodes(least_clicked_articles, 
              title=titles, 
              color=least_colors_hex)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(least_clicked_articles):
    print(article1)
    name_links_out = data_sorted.name_links_out.iloc[i]
    print(name_links_out)

    if pd.notna(name_links_out):
        for article2 in least_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)


net.show("least_used_articles_graph.html")

Magpie
['Abraham_Lincoln', 'Advertising', 'Agriculture', 'American_Civil_War', 'American_English', 'American_Revolutionary_War', 'American_Samoa', 'American_football', 'American_popular_music', 'Amtrak', 'Anguilla', 'Antarctica', 'Antigua_and_Barbuda', 'Apollo_11', 'Arctic_Ocean', 'Argentina', 'Aruba', 'Atlanta%2C_Georgia', 'Atlantic_Ocean', 'Attack_on_Pearl_Harbor', 'Auto_racing', 'Bah%C3%A1%27%C3%AD_Faith', 'Baker_Island', 'Bald_Eagle', 'Baltimore%2C_Maryland', 'Barbados', 'Baseball', 'Basketball', 'Battle_of_Gettysburg', 'Belize', 'Benin', 'Bermuda', 'Boston%2C_Massachusetts', 'Brazil', 'British_Virgin_Islands', 'Broadcasting', 'Brunei', 'Buddhism', 'C%C3%B4te_d%27Ivoire', 'California', 'Cambodia', 'Cameroon', 'Canada', 'Cape_Verde', 'Capitalism', 'Caribbean_Sea', 'Cayman_Islands', 'Chicago', 'China', 'Chinese_language', 'Christopher_Columbus', 'Cinema_of_the_United_States', 'Coal', 'Cold_War', 'Colombia', 'Communism', 'Computer', 'Corporation', 'Costa_Rica', 'Country', 'Crime', 'Cu


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



You can separate all nodes to get an idea of which articles are well connected and which are isolated!! 

We see that a lot of western countries are present within the most clicked articles and many oriental subjects are in the least clicked countries. This motivates our project in which we will associate each article to it's corresponding country and analyse the distributions of those countries within the Wikipedia graph. In the end, we are ultimately interested in whether players of the Wikispeedia game really tend to click more on articles that are associated with western countries or if this feature is due to the properties of the Wikipedia graph itself. 

## 1. What is the distribution of countries in the Wikipedia graph?

Now that all (most) articles are associated to a country, we can look at the distribution of those countries. Is there a country that is associated to more articles (we have our little idea haha but let's check). 

In order to find the distribution of countries in the Wikipedia graph, we need the number of articles that are associated to each of the represented countries.

In [124]:
countries_occurrences = data.groupby("Top_1_name").size().reset_index()
data_countries = data.groupby("Top_1_name").sum().reset_index()
data_countries["occurrence"] = countries_occurrences.iloc[:,1].tolist()

# percent of articles that belong to each country
data_countries["percent"] = data_countries["occurrence"] / sum(data_countries["occurrence"])

In [123]:
data_countries

Unnamed: 0,Top_1_name,click_count,num_links_in,name_links_in,num_links_out,name_links_out,occurrence,percent
0,afghanistan,629,313.0,"['15th_Marine_Expeditionary_Unit', '18th_centu...",732.0,"['18th_century', 'Afghan_Hound', 'Alexander_th...",10,0.003858
1,albania,21,85.0,"['AK-47', 'Armenia', 'Atheism', 'Athens', 'Aze...",153.0,"['Afghanistan', 'Agriculture', 'Algeria', 'Anc...",2,0.000772
2,algeria,700,263.0,"['10th_century', '1973_oil_crisis', 'Africa', ...",201.0,"['Africa', 'Agriculture', 'Algerian_Civil_War'...",6,0.002315
3,andorra,6,41.0,"['Albania', 'Armenia', 'Azerbaijan', 'Biodiver...",54.0,"['Agriculture', 'Algeria', 'Argentina', 'Brazi...",1,0.000386
4,angola,28,80.0,"['13th_century', '2005_Lake_Tanganyika_earthqu...",21.0,"['Africa', 'Atlantic_Ocean', 'Bantu', 'Cold_Wa...",1,0.000386
...,...,...,...,...,...,...,...,...
190,venezuela,447,180.0,"['El_Hatillo_Municipality%2C_Miranda', 'Global...",150.0,"['Baroque', 'Baseball', 'Capital', 'Caribbean_...",5,0.001929
191,vietnam,287,281.0,"['Asia', 'Beijing', 'Global_city', 'Ho_Chi_Min...",171.0,"['Agriculture', 'Australia', 'Capital', 'Confu...",6,0.002315
192,yemen,19,68.0,"['Albania', 'Arab_League', 'Arabic_language', ...",140.0,"['11th_century', '16th_century', '19th_century...",1,0.000386
193,zambia,121,122.0,"['Zambezi']['Africa', 'Henry_Morton_Stanley', ...",206.0,"['AIDS', 'Copper', 'Mining', 'Zambezi', 'Zambi...",6,0.002315


In [128]:
fig = px.pie(data_countries, values='percent', names='Top_1_name', title='Distribution of countries in Wikipedia')
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

We see that 8 countries make up for 1/2 of the articles in Wikipedia, namely the US, UK, Australia, France, Germany, Italy, India and China. Those are all in the top10 of countries that publish the most!!

## 3. What is the distribution of in and out degrees? 

For fun, we are also interested in the in and out degree of each country. 

> **_NOTE:_**  The in degree of a country is defined as the sum of the in degrees of its articles. Same for out degrees. 

In [140]:
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=False)
fig1 = px.bar(sorted_data_countries.iloc[:8,:], x="Top_1_name", y=["num_links_in", "num_links_out"], title="Connectivity of most occuring countries")

sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=True)
fig2 = px.bar(sorted_data_countries.iloc[:8,:], x="Top_1_name", y=["num_links_in", "num_links_out"], title="Connectivity of least occuring countries")

# Create buttons to toggle between the two bar charts
fig1.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    label="Most Occurring Countries",
                    method="update",
                    args=[{"visible": [True, False]},  # Show first chart, hide second
                          {"title": "Connectivity of most occurring countries"}]
                ),
                dict(
                    label="Least Occurring Countries",
                    method="update",
                    args=[{"visible": [False, True]},  # Hide first chart, show second
                          {"title": "Connectivity of least occurring countries"}]
                ),
            ],
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=1.1,
            yanchor="middle"
        ),
    ]
)

# Show the figure
fig1.show()

In [None]:
for idx, row in df.iterrows():
        links_out_list = list(row["name_links_out"])
        for out_link in links_out_list:
            try:
                l.append(f"{row['Top_1_name']} -> {df.loc[out_link]['Top_1_name']}")
            except:
                pass

We also need the position (i.e. longitude and latitude) of each country on a world map, since each country will be a node on a world map.

In [18]:
# Get a set of coordinates (latitude, longitude) for each country for visualisation purposes
geolocator = Nominatim(user_agent="my_app")

def get_country_coordinates(country_name):
    location = geolocator.geocode(country_name)
    return (location.latitude, location.longitude)

coords = []
for country in countries: 
    coords.append(get_country_coordinates(country))

len(coords)

195

In [19]:
latitudes = [coord[0] for coord in coords]
longitudes = [coord[1] for coord in coords]

In [None]:
# Create the nodes
node_trace = go.Scattergeo(
    lon=longitudes,
    lat=latitudes,
    text=countries,
    mode='markers',
    marker=dict(
        size=node_sizes/4,
        color=colors_hex,
        line=dict(width=0.5, color='rgb(40,40,40)')
    ),
    hovertemplate='<b>Country:</b> %{text}<br>' +
                  '<b>Articles:</b> %{customdata[0]}<br>' +
                  '<b>Clicks:</b> %{customdata[1]}<extra></extra>',
    customdata=np.column_stack((num_articles, clicks))
)

# Create edges (path between articles based on their associated country)
edges = all_pairs_countries_normalized.index
edge_traces = []
for edge, weight in all_pairs_countries_normalized.items():
    country_from, country_to = edge.split('-> ')
    
    # Get coordinates for both countries
    lon1, lat1 = longitudes[countries.index(country_from)], latitudes[countries.index(country_from)]
    lon2, lat2 = longitudes[countries.index(country_to)], latitudes[countries.index(country_to)]
    
    # Create edge trace
    edge_trace = go.Scattergeo(
        lon = [lon1, lon2, None],
        lat = [lat1, lat2, None],
        mode = 'lines',
        showlegend=False,
        line = dict(width = weight * 1000, color = 'rgba(0, 128, 0, 0.1)'),
        hoverinfo = 'none'
    )
    
    edge_traces.append(edge_trace)

# Create the world map
layout = go.Layout(
    title='World map of the number of articles and the click count per country before scaling',
    showlegend=False,
    geo=dict(
        scope='world',
        projection_type='equirectangular',
        showland=True,
        landcolor='rgb(243, 243, 243)',
    ),
)

# Create the figure
fig = go.Figure(data=[node_trace] + edge_traces, layout=layout)

# Implement toggling visibility
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                {
                    'label': 'Show Both Nodes and Edges',
                    'method': 'update',
                    'args': [{'visible': [True] + [True] * len(edge_traces)}]
                },
                {
                    'label': 'Show Nodes Only',
                    'method': 'update',
                    'args': [{'visible': [True] + [False] * len(edge_traces)}]
                },
                {
                    'label': 'Show Edges Only',
                    'method': 'update',
                    'args': [{'visible': [False] + [True] * len(edge_traces)}]
                },
                {
                    'label': 'Hide All',
                    'method': 'update',
                    'args': [{'visible': [False] * (1 + len(edge_traces))}]
                }
            ],
            'direction': 'down',
        }
    ]
)

# Show the plot
fig.show()
fig.write_html("world_counts_and_articles_before_scaling.html")

[Tutorial](https://pyvis.readthedocs.io/en/latest/tutorial.html#getting-started)

Let's construct a graph in which: 
- each node is a country 
- the size and color of a node is proportional to the number of articles associtated to that node

In [24]:
color_map = plt.cm.get_cmap('Reds')
colors_hex = [matplotlib.colors.to_hex(color_map(num)) for num in occurrences]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut()
net.toggle_physics(False)

x_positions = [coord[1] * 50 for coord in coords]  # Scaling for better spacing
y_positions = [coord[0] * -50 for coord in coords] # Negate to align correctly

net.add_nodes(countries, 
              title=countries, 
              color=colors_hex, 
              size=occurrences, 
              x = x_positions, 
              y = y_positions)

"""
# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(articles[:50]):
    print(article1)
    name_links_out = country_clicks.name_links_out.iloc[i]
    print(name_links_out)

    if pd.notna(name_links_out):
        for article2 in articles[:50]: 
            if article2 in name_links_out:
                print(article2, "is an out link from", article1)
                net.add_edge(article1, article2)
        print("---------")
"""

net.show("country_graph.html")

country_graph.html


  color_map = plt.cm.get_cmap('Reds')
