# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import os

import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np

from ast import literal_eval

import plotly.express as px
import plotly.io as pio

from pyvis.network import Network
import plotly.graph_objects as go
from geopy.geocoders import Nominatim

import folium

In [3]:
#!pip install pyvis
#!pip install jinja2
#!pip install geopy
#!pip install folium

In [26]:
project_path = os.getcwd()
project_path

'c:\\Users\\fricl\\OneDrive\\Documents\\Suisse\\EPFL\\Cours\\MA3\\ADA\\ada-2024-project-lesfraisestagada'

## Questions of interest 

0) What are the most clicked articles?
1) What is the distribution of countries in the Wikipedia graph? 
3) What is the distribution of in and out degrees per country? 

## 0. What are the most clicked articles? 

Let's first import the data `country_clicks_links.csv` with which we will be working.

In [4]:
data = pd.read_csv("data/country_clicks_links.csv", index_col=0)

Now we define the variables that we need.

In [5]:
data_sorted = data.sort_values(by='click_count', ascending=False)
data_sorted = data_sorted[data_sorted['click_count'] != 0]
articles = data_sorted.index.tolist()
click_count = data_sorted.click_count.tolist()
in_degree = data_sorted.num_links_in.tolist()
out_degree = data_sorted.num_links_out.tolist()

Let's construct a graph in which: 
- each node is an article
    - 20 most clicked articles 
    - 20 least clicked articles 

In [6]:
most_clicked_articles = articles[:20]
most_clicked_counts = click_count[:20]
most_in_degree = in_degree[:20]
most_out_degree = out_degree[:20]

least_clicked_articles = articles[-20:]
least_clicked_counts = click_count[-20:]
least_in_degree = in_degree[-20:]
least_out_degree = out_degree[-20:]

- the size and color of a node is proportional to the number of clicks associtated to that node
- each edge represents an out link

We are also interested in the in and out degree of each article
- in degree = num_links_in 
- out degree = num_links_out

Let's first look at the most clicked articles:

In [7]:
# define a global color map
color_map = plt.get_cmap('Reds')

In [8]:
# Define a color gradient to represent the number of clicks for each article
norm = matplotlib.colors.Normalize(vmin=min(most_clicked_counts), vmax=max(most_clicked_counts))
most_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in most_clicked_counts]

In [18]:
import webbrowser

In [28]:
net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000', 
              cdn_resources='in_line')

# Turn off physics so nodes stay fixed
net.barnes_hut(gravity=-10000,  # Controls the strength of repulsion between nodes
               central_gravity=0.01,  # Weak central gravity so nodes spread out
               spring_length=300000,  # Increase distance between connected nodes
               spring_strength=0.05)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {most_clicked_articles[i]} \n click count: {most_clicked_counts[i]} \n in degree: {most_in_degree[i]} \n out degree: {most_out_degree[i]}" for i in range(len(most_clicked_articles))]

# define position of nodes 
num_nodes = len(most_clicked_articles)

# Define a circular layout for the nodes
radius = 500  # Adjust the radius of the circle
angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)  # Evenly spaced angles
x_positions = radius * np.cos(angles)  # x-coordinates
y_positions = radius * np.sin(angles)  # y-coordinates

net.add_nodes(most_clicked_articles, 
              title=titles, 
              color=most_colors_hex, 
              x=x_positions, 
              y=y_positions)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(most_clicked_articles):
    name_links_out = data_sorted.name_links_out.iloc[i]

    if pd.notna(name_links_out):
        for article2 in most_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)

# save to html for visualization on our website
net.show("graphs/topic_1/most_used_articles_graph.html")

# open the graph in browser
webbrowser.open(f"{project_path}/graphs/topic_1/most_used_articles_graph.html")

graphs/topic_1/most_used_articles_graph.html


True

An now let's look at the least clicked articles: 

In [30]:
# Normalize the log_counts to [0, 1]
norm = matplotlib.colors.Normalize(vmin=min(least_clicked_counts), vmax=max(least_clicked_counts))
least_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in least_clicked_counts]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000', 
              cdn_resources='in_line')

# Turn off physics so nodes stay fixed
net.barnes_hut(overlap=0)  # Adjust spring tightness
net.toggle_physics(False)

# write the title of each node
titles = [f"article: {least_clicked_articles[i]} \n click count: {least_clicked_counts[i]} \n in degree: {least_in_degree[i]} \n out degree: {least_out_degree[i]}" for i in range(len(least_clicked_articles))]

# define position of nodes 
num_nodes = len(least_clicked_articles)

# Define a circular layout for the nodes
radius = 500  # Adjust the radius of the circle
angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)  # Evenly spaced angles
x_positions = radius * np.cos(angles)  # x-coordinates
y_positions = radius * np.sin(angles)  # y-coordinates

net.add_nodes(least_clicked_articles, 
              title=titles, 
              color=least_colors_hex,
              x=x_positions, 
              y=y_positions)


# Let's add edges between articles that are connected in Wikipedia
for i, article1 in enumerate(least_clicked_articles):
    name_links_out = data_sorted.name_links_out.iloc[i]

    if pd.notna(name_links_out):
        for article2 in least_clicked_articles: 
            if article2 in name_links_out:
                net.add_edge(article1, article2)


net.show("graphs/topic_1/least_used_articles_graph.html")

# open the graph in browser
webbrowser.open(f"{project_path}/graphs/topic_1/least_used_articles_graph.html")

graphs/topic_1/least_used_articles_graph.html


True

We see that a lot of western countries are present within the most clicked articles and many oriental subjects are in the least clicked countries. This motivates our project in which we will associate each article to it's corresponding country and analyse the distributions of those countries within the Wikipedia graph. In the end, we are ultimately interested in whether players of the Wikispeedia game really tend to click more on articles that are associated with western countries or if this feature is due to the properties of the Wikipedia graph itself. 

## 1. What is the distribution of countries in the Wikipedia graph?

Now that all (most) articles are associated to a country, we can look at the distribution of those countries. Is there a country that is associated to more articles (we have our little idea haha but let's check). 

In order to find the distribution of countries in the Wikipedia graph, we need the number of articles that are associated to each of the represented countries.

In [31]:
countries_occurrences = data.groupby("Top_1_name").size().reset_index()
data_countries = data.groupby("Top_1_name").sum().reset_index()
data_countries["occurrence"] = countries_occurrences.iloc[:,1].tolist()

# percent of articles that belong to each country
data_countries["percent"] = data_countries["occurrence"] / sum(data_countries["occurrence"])
data_countries

Unnamed: 0,Top_1_name,click_count,num_links_in,name_links_in,num_links_out,name_links_out,occurrence,percent
0,afghanistan,629,313.0,"['15th_Marine_Expeditionary_Unit', '18th_centu...",732.0,"[18th_century, Afghan_Hound, Alexander_the_Gre...",10,0.003858
1,albania,21,85.0,"['AK-47', 'Armenia', 'Atheism', 'Athens', 'Aze...",153.0,"[Afghanistan, Agriculture, Algeria, Ancient_Ro...",2,0.000772
2,algeria,700,263.0,"['10th_century', '1973_oil_crisis', 'Africa', ...",201.0,"[Africa, Agriculture, Algerian_Civil_War, Algi...",6,0.002315
3,andorra,6,41.0,"['Albania', 'Armenia', 'Azerbaijan', 'Biodiver...",54.0,"[Agriculture, Algeria, Argentina, Brazil, Cana...",1,0.000386
4,angola,28,80.0,"['13th_century', '2005_Lake_Tanganyika_earthqu...",21.0,"[Africa, Atlantic_Ocean, Bantu, Cold_War, Cuba...",1,0.000386
...,...,...,...,...,...,...,...,...
190,venezuela,447,180.0,"['El_Hatillo_Municipality%2C_Miranda', 'Global...",150.0,"[Baroque, Baseball, Capital, Caribbean_Sea, Ch...",5,0.001929
191,vietnam,287,281.0,"['Asia', 'Beijing', 'Global_city', 'Ho_Chi_Min...",171.0,"[Agriculture, Australia, Capital, Confucianism...",6,0.002315
192,yemen,19,68.0,"['Albania', 'Arab_League', 'Arabic_language', ...",140.0,"[11th_century, 16th_century, 19th_century, 7th...",1,0.000386
193,zambia,121,122.0,"['Zambezi']['Africa', 'Henry_Morton_Stanley', ...",206.0,"[AIDS, Copper, Mining, Zambezi, Zambia, Africa...",6,0.002315


In [14]:
fig = px.pie(data_countries, values='percent', names='Top_1_name', title='Distribution of countries in Wikipedia')
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

# Export the figure to an HTML file
pio.write_html(fig, file='graphs/topic_1/pie_plot_distribution_of_countries.html', auto_open=False)

We see that 8 countries make up for 1/2 of the articles in Wikipedia, namely the US, UK, Australia, France, Germany, Italy, India and China. Those are all in the top10 of countries that publish the most!!

Let's now look at the connectivity between countries. 

> **_NOTE:_** Two countries are said to be *connected* if at least one article from the first country contains a link to an article associated to the other country, and the other way around for the 2 same articles. 

We first need to prepare the data, so to find which countries are connected and how often those connections appear in the Wikipedia graph! 

In [15]:
data = pd.read_csv("data/country_clicks_links.csv", index_col=0)

data.dropna(subset=["Top_1_name"], inplace=True)
data["name_links_out"] = data["name_links_out"].fillna("[]")
data["num_links_in"] = data["num_links_in"].fillna(0)
data["num_links_out"] = data["num_links_out"].fillna(0)
data["name_links_out"] = data["name_links_out"].apply(literal_eval)

start_country = []
end_country = []

for idx, row in data.iterrows():
    links_out_list = list(row["name_links_out"])
    for out_link in links_out_list:
        try:
            start_country.append(row["Top_1_name"])
            end_country.append(data.loc[out_link]["Top_1_name"])
        except:
            pass

df = pd.DataFrame(list(zip(start_country, end_country)),
              columns=['start_country','end_country'])

Let's construct a DataFrame that contains the connections between countries and also a count variable that tract how often a connection occurs. 

In [16]:
country_connexions = df.groupby(["start_country", "end_country"]).size().sort_values(ascending=False).reset_index()
country_connexions = country_connexions.rename(columns={country_connexions.columns[-1]: 'count'})
country_connexions

Unnamed: 0,start_country,end_country,count
0,united states,united states,1006
1,united states,united kingdom,619
2,united kingdom,united states,522
3,united states,france,454
4,united states,germany,397
...,...,...,...
8762,indonesia,puerto rico,1
8763,indonesia,philippines,1
8764,indonesia,peru,1
8765,indonesia,panama,1


Great, well done! 

Now we will add this information as edges on a graph representing the countries, their associated number of articles and their connections!
- each node is a country 
- the size and color of a node is proportional to the number of articles associtated to that node
- edges represent connnections between countries in the Wikipedia graph

We need the position (i.e. longitude and latitude) of each country on a world map, since each country will be a node on a world map.

In [32]:
# Get a set of coordinates (latitude, longitude) for each country for visualisation purposes
geolocator = Nominatim(user_agent="my_app", timeout=10)

def get_country_coordinates(country_name):
    location = geolocator.geocode(country_name)
    return (location.latitude, location.longitude)

countries = data_countries.Top_1_name.tolist()
coords = []
for country in countries: 
    # handle ambiguous cases (the geolocator function tends to place ambiguous countries in the US!)
    if country == "sudan": 
        print("before:", get_country_coordinates("sudan"))
        print("after", get_country_coordinates("Sudan, Africa"))
        coords.append(get_country_coordinates("Sudan, Africa"))
    elif country == "georgia": 
        coords.append(get_country_coordinates("Georgia, Caucasus"))
    elif country == "lebanon": 
        coords.append(get_country_coordinates("Lebanon, Asia"))
    elif country == "greenland": 
        coords.append(get_country_coordinates("Greenland, North America"))
    elif country == "jordan": 
        coords.append(get_country_coordinates("Jordan, Middle East"))
    else:
        coords.append(get_country_coordinates(country))

latitudes = [coord[0] for coord in coords]
longitudes = [coord[1] for coord in coords]

KeyboardInterrupt: 

Now let's construct the graph!

In [209]:
color_map = plt.cm.get_cmap('Reds')
colors_hex = [matplotlib.colors.to_hex(color_map(num + 100)) for num in data_countries.occurrence.tolist()]

net = Network(directed=True, 
              notebook=True, 
              font_color='#10000000')

# Turn off physics so nodes stay fixed
net.barnes_hut()
net.toggle_physics(False)

x_positions = [coord[1] * 50 for coord in coords]  # Scaling for better spacing
y_positions = [coord[0] * -50 for coord in coords] # Negate to align correctly

net.add_nodes(countries, 
              title=countries, 
              color=colors_hex, 
              size=data_countries.occurrence.tolist(), 
              x = x_positions, 
              y = y_positions)


net.show("graphs/topic_1/country_graph_no_edge.html")

graphs/topic_1/country_graph_no_edge.html



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



The last step is to overlay this nice graph on a world map. For this we will use Leaflet and more preceisely Follium in Python! 

In [216]:
def overlap_world_map(out_path, edge=True):
    # Create a base map centered on an average coordinate (e.g., latitude 0, longitude 0)
    map_center = [0, 0]  # Adjust center to fit your data better if needed
    world_map = folium.Map(location=map_center, zoom_start=2, tiles='cartodbpositron')

    # Add each country node as a CircleMarker with scaled sizes
    for country, lat, lon, color, size in zip(countries, latitudes, longitudes, colors_hex, data_countries.occurrence.tolist()):
        folium.CircleMarker(
            location=[lat, lon],  # Use latitude and longitude
            radius=size/10,         # Scaled size based on occurrence
            color=color,           # Use the color from your PyVis graph
            fill=True,
            fill_opacity=0.7,
            popup=folium.Popup(
                f"<b>{country}</b><br>{size} articles",  # Popup content (HTML formatted for better display)
                max_width=100,  # Set the maximum width of the popup
                min_width=50    # Set a minimum width if desired
            )
        ).add_to(world_map)

    # Add a slider for threshold (this one will be for dynamically changing the threshold value)
    slider_html = '''
        <div style="position: absolute; top: 10px; left: 100px; z-index: 9999;">
            <label for="threshold_slider" style="font-size: 14px; color: black;">Edge Threshold:</label>
            <input type="range" id="threshold_slider" min="1" max="611" value="1" step="10" style="width: 200px;">
            <span id="threshold_value" style="font-size: 14px; color: black;">1</span>
        </div>
    '''
    
    # Add the slider to the map as an HTML element
    world_map.get_root().html.add_child(folium.Element(slider_html))

    # JavaScript to update edges based on the slider value
    slider_js = '''
        <script>
            document.getElementById("threshold_slider").oninput = function() {
                var threshold = parseInt(this.value);
                document.getElementById("threshold_value").innerText = threshold;
                // Filter edges based on the threshold value
                var lines = document.querySelectorAll('path[stroke="blue"]');
                lines.forEach(function(line) {
                    var opacity = parseFloat(line.getAttribute("stroke-opacity"));
                    if (opacity >= (threshold / 1006)) {
                        line.style.display = "block";  // Show the edge
                    } else {
                        line.style.display = "none";   // Hide the edge
                    }
                });
            };
        </script>
    '''
    
    # Add the JavaScript to the map
    world_map.get_root().html.add_child(folium.Element(slider_js))

    if edge:
        
        for i in range(len(country_connexions)):
            start_country = country_connexions["start_country"].iloc[i]
            end_country = country_connexions["end_country"].iloc[i]
            count = country_connexions["count"].iloc[i]
            
            # Get coordinates for both countries
            try:
                start_idx = countries.index(start_country)
                end_idx = countries.index(end_country)
                start_coords = [latitudes[start_idx], longitudes[start_idx]]
                end_coords = [latitudes[end_idx], longitudes[end_idx]]
                
                # Add a line (edge) between the two countries
                folium.PolyLine(
                    locations=[start_coords, end_coords],  # List of [lat, lon] pairs
                    color='blue',  # Line color
                    weight=2,      # Line thickness
                    opacity=count/1006,   # Line transparency
                    class_name='edge-line',  # CSS class to target in JavaScript
                    # Store the "count" in a data attribute for filtering
                ).add_to(world_map)
            except ValueError:
                print(f"Connection skipped: {start_country} -> {end_country} (one of them not found in countries list)")

    # Save the combined map to an HTML file
    world_map.save(out_path)

Let's finally create our world map without any edges because with the edges it is way too crowded!

In [217]:
# just nodes
overlap_world_map("graphs/topic_1/country_graph_no_edge.html", "graphs/topic_1/world_graph_map.html", edge=True)

## 3. What is the distribution of in and out degrees? 

For fun, we are also interested in the in and out degree of each country. 

> **_NOTE:_**  The in degree of a country is defined as the sum of the in degrees of its articles. Same for out degrees. The higher the in degree of a country, the more central it is meaning that the more it is accessible from other countries. 

In [113]:
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=False).iloc[:8:,]

# create bar plot for most occuring countries
trace_most_in = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_in"],
                        name="in degree",
                        marker_color="blue")

trace_most_out = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_out"],
                        name="out degree",
                        marker_color="red")


# create bar plot for least occuring countries
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=True).iloc[:8:,]

trace_least_in = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_in"],
                        name="in degree",
                        marker_color="blue")

trace_least_out = go.Bar(x=sorted_data_countries["Top_1_name"], 
                       y=sorted_data_countries["num_links_out"],
                        name="out degree",
                        marker_color="red")

# create figures
fig = go.Figure()

# add traces
fig.add_trace(trace_most_in)
fig.add_trace(trace_most_out)
fig.add_trace(trace_least_in)
fig.add_trace(trace_least_out)


# Create buttons to toggle between the two bar charts
fig.update_layout(
    barmode='stack',
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    label="Most occurring countries",
                    method="update",
                    args=[{"visible": [True, True, False, False]},  # Show first chart, hide second
                          {"title": "Connectivity of most occurring countries"}]
                ),
                dict(
                    label="Least occurring countries",
                    method="update",
                    args=[{"visible": [False, False, True, True]},  # Hide first chart, show second
                          {"title": "Connectivity of least occurring countries"}]
                ),
            ],
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=1.1,
            yanchor="middle"
        ),
    ]
)

# Show the figure
fig.show()

# Export the figure to an HTML file
pio.write_html(fig, file='graphs/topic_1/bar_plot_distribution_of_degrees.html', auto_open=False)

We observe that countries that occur more often in Wikipedia (i.e. many articles are associated with those countries), are immensely more connected, so have much more links that lead in and out of them. Those articles are so called "central hubs" of the Wikipedia graph. 

Finally, we can look at the connectivity between the most occurring and least occuring countries! To visulize this we will draw a circle graph!

In [None]:
sorted_data_countries = data_countries.sort_values(by='occurrence', ascending=False)

In [127]:
def draw_circle_graph(df, out_path):
    color_map = plt.cm.get_cmap('Reds')

    # Normalize the log_counts to [0, 1]
    norm = matplotlib.colors.Normalize(vmin=min(df.occurrence.tolist()), vmax=max(df.occurrence.tolist()))
    most_colors_hex = [matplotlib.colors.to_hex(color_map(norm(count))) for count in df.occurrence.tolist()]


    net = Network(directed=True, 
                notebook=True, 
                font_color='#10000000')

    # Turn off physics so nodes stay fixed
    net.barnes_hut(gravity=-10000,  # Controls the strength of repulsion between nodes
                central_gravity=0.01,  # Weak central gravity so nodes spread out
                spring_length=300000,  # Increase distance between connected nodes
                spring_strength=0.05)  # Adjust spring tightness
    net.toggle_physics(False)

    # write the title of each node
    titles = [f"country: {df.Top_1_name[i]} \n articles: {df.occurrence[i]}" for i in range(len(df))]

    # define position of nodes 
    num_nodes = len(df)

    # Define a circular layout for the nodes
    radius = 500  # Adjust the radius of the circle
    angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)  # Evenly spaced angles
    x_positions = radius * np.cos(angles)  # x-coordinates
    y_positions = radius * np.sin(angles)  # y-coordinates

    net.add_nodes(df.Top_1_name.tolist(), 
                title=titles, 
                color=most_colors_hex, 
                x=x_positions, 
                y=y_positions)

        
    # Let's add edges between articles that are connected in Wikipedia
    for i in range(len(country_connexions)):
        start_country = country_connexions["start_country"][i]
        end_country = country_connexions["end_country"][i]
        if (start_country in df.Top_1_name.tolist()) and (end_country in df.Top_1_name.tolist()):
            net.add_edge(start_country, end_country)

    # save to HTML
    net.show(out_path)

In [133]:
draw_circle_graph(sorted_data_countries.iloc[:10,].reset_index(), "graphs/topic_1/most_countries_graph_circle.html")
draw_circle_graph(sorted_data_countries.iloc[-10:,].reset_index(), "graphs/topic_1/least_countries_graph_circle.html")

combined_countries = pd.concat([
    sorted_data_countries.iloc[:10,],
    sorted_data_countries.iloc[-10:,]
]).reset_index(drop=True)

draw_circle_graph(combined_countries, "graphs/topic_1/combined_countries_graph_circle.html")


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



graphs/topic_1/most_countries_graph_circle.html
graphs/topic_1/least_countries_graph_circle.html
graphs/topic_1/combined_countries_graph_circle.html
