In [31]:
# Importing libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors

import math

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import igraph as ig

import folium
from folium import Map, PolyLine, CircleMarker
from folium.plugins import MarkerCluster
from folium.vector_layers import CircleMarker, PolyLine
from shapely.geometry import Point, LineString
from pyvis.network import Network
from collections import Counter

import community
from community import community_louvain
import matplotlib.cm as cm
import leidenalg

import os
import glob
import re

from bs4 import BeautifulSoup

In [2]:
# Directories the notebook reads from / writes to: filtered route dataframes,
# exported graphs, rendered maps and HTML info tables.
# All four live under one shared data root.

_data_root = '/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/'

dataframes_dir, graphs_dir, maps_dir, info_dir = (
    _data_root + subfolder
    for subfolder in ('filtered_dataframes/', 'graphs/', 'maps/', 'info/')
)

In [3]:
### 0) Prepare dataset for Graph building

# Loading dataset
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs where that exact file exists.

df_routes = pd.read_csv("/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/df_routes.csv")
# Preview the first ten rows as the cell output.
df_routes.head(10)

Unnamed: 0,FromLatitude,FromLongitude,ToLatitude,ToLongitude,RouteDistance,FromLocation,ToLocation,FromCity,ToCity,FromCountry,ToCountry,CustomerName,RouteCount
0,57.71873,11.82862,57.71873,11.82862,0.0,968,77695,Skara,Göteborg,Sweden,Sweden,12414,240268
1,55.47409,9.15797,55.47409,9.15797,0.0,8409,2301,Horsens,Vejen,Denmark,Denmark,12895,195961
2,57.71877,11.8292,57.69555,11.85303,5.714,30251,70713,Gothenburg,Göteborg,Sweden,Sweden,12895,172307
3,57.71877,11.8292,57.71225,11.96575,12.148,30251,70709,Gothenburg,Göteborg,Sweden,Sweden,12895,132002
4,57.72323,11.85834,57.72323,11.85834,0.0,945,11641,DESTELDONK,Göteborg,Sweden,Sweden,12895,115222
5,57.71499,11.82177,57.71499,11.82177,0.0,9483,495,Göteborg,Göteborg,Sweden,Sweden,12417,114706
6,56.63623,9.77687,56.63623,9.77687,0.0,22240,2300,Dublin,Hobro,Denmark,Denmark,4467,78664
7,55.72639,-3.95987,55.72614,-3.96033,0.006,9230,20364,Larkhall,LARKHALL,United Kingdom,United Kingdom,8062,69049
8,57.71225,11.96575,57.71877,11.8292,14.361,30257,70696,Göteborg,Gothenburg,Sweden,Sweden,12415,51297
9,57.71872,11.82861,57.71872,11.82861,0.0,11592,81480,Skara,Göteborg,Sweden,Sweden,12414,50553


In [4]:
### 1) Building functions to interactively filter the dataset


## 1.1) Create function to filter for country and distance

def filter_country_distance(df, country, min_distance, max_distance=None):
    """
    Selects the routes that touch a given country and fall inside a distance range.

    A route is kept when its pick-up OR its drop-off lies in `country`, so domestic
    as well as cross-border deliveries are included.

    Parameters:
    df (pd.DataFrame): Routes with 'FromCountry', 'ToCountry' and 'RouteDistance' columns.
    country (str): Country name matched against 'FromCountry' and 'ToCountry'.
    min_distance (float): Inclusive lower bound on 'RouteDistance'.
    max_distance (float, optional): Inclusive upper bound on 'RouteDistance'. None (default) means no upper bound.

    Returns:
    pd.DataFrame: The rows of `df` that satisfy both the country and the distance condition.
    """

    distances = df['RouteDistance']

    touches_country = df['FromCountry'].eq(country) | df['ToCountry'].eq(country)

    within_range = distances.ge(min_distance)
    if max_distance is not None:
        within_range = within_range & distances.le(max_distance)

    return df.loc[touches_country & within_range]


## 1.2) Create function to export filtered dataframes

def export_filtered_dataframes(filtered_dataframes, directory):

    """
    Writes every dataframe of a dictionary to `directory` as '<key>.csv' (without the index).

    A file that already exists is left untouched; a message is printed in either case.

    Parameters:
    filtered_dataframes (dict): Maps a name (used as the file name stem) to the dataframe to export.
    directory (str): Folder the CSV files are written to.
    """

    for name in filtered_dataframes:
        target = os.path.join(directory, f"{name}.csv")

        # Never overwrite an earlier export
        if os.path.exists(target):
            print(f"Data frame {name} already exists at {target}.")
            continue

        filtered_dataframes[name].to_csv(target, index=False)
        print(f"Data frame {name} has been exported to {target}.")


## 1.3) Create function that requests user input of a country

def get_country_input(prompt):

    """
    Keeps prompting until the user types one of the supported country names.

    The match is exact (case- and whitespace-sensitive).

    Parameters:
    prompt (str): Text shown to the user on every attempt.

    Returns:
    str: The accepted country name.
    """

    valid_countries = ("United Kingdom", "Sweden", "Belgium", "Netherlands", "Germany", "Denmark")

    country = input(prompt)
    while country not in valid_countries:
        print("Invalid country. Please enter one of the following countries:", ', '.join(valid_countries))
        country = input(prompt)

    return country


## 1.4) Create function that requests user input of a minimum or maximum distance (to define the range to look at)

def get_distance_input(prompt):
    """
    Requests user input for a distance and validates the input.

    The prompt is repeated until the user enters either the keyword 'all'
    (surrounding whitespace and letter case are ignored) or a finite,
    non-negative number. 'inf' and 'nan' are rejected because callers convert
    the value with int(), which cannot represent them.

    Parameters:
    prompt (str): The prompt to display when requesting user input.

    Returns:
    float or str: A valid distance input as a float, or the string 'all' for no maximum distance.
    """
    while True:
        raw = input(prompt).strip()

        if raw.lower() == 'all':
            # Always hand back the canonical spelling callers compare against
            return 'all'

        try:
            distance = float(raw)
        except ValueError:
            print("Invalid input. Please enter a valid number or 'all'.")
            continue

        # float() happily parses 'inf' / 'nan', neither of which is a usable distance
        if not math.isfinite(distance):
            print("Invalid input. Please enter a valid number or 'all'.")
            continue

        if distance >= 0:
            return distance

        print("Invalid input. Distance must be greater than or equal to 0.")



## 1.5) Defining the main function to be called to execute the filtering of the dataset

def filter_and_export():

    """
    Interactively requests user input for country and distance range, filters the global
    `df_routes` dataset based on the input, and exports the result as a CSV file into the
    global `dataframes_dir`.

    The file is named 'df_routes_{country}_{min}_{max}.csv' (distances truncated to integers),
    or 'df_routes_{country}_all.csv' when no maximum distance is given. 'United Kingdom' is
    abbreviated to 'UK'. A file that already exists is not overwritten.
    """

    min_prompt = "Please enter a minimum distance (>= 0): "
    max_prompt = "Please enter a maximum distance (> minimum distance) or type 'all' for no maximum: "

    # Get user inputs
    country = get_country_input("Please enter a country (United Kingdom, Sweden, Belgium, Netherlands, Germany, Denmark): ")

    # get_distance_input also accepts 'all', which has no meaning as a lower bound and
    # would break both the numeric comparison and the dataframe filter below.
    min_distance = get_distance_input(min_prompt)
    while min_distance == "all":
        print("Minimum distance must be a number.")
        min_distance = get_distance_input(min_prompt)

    # Keep asking until the maximum is 'all' or strictly above the minimum. 'all' is tested
    # first so it is never compared numerically, including when it is typed on a retry.
    max_distance = get_distance_input(max_prompt)
    while max_distance != "all" and max_distance <= min_distance:
        print("Maximum distance must be greater than the minimum distance.")
        max_distance = get_distance_input(max_prompt)

    if max_distance == "all":
        max_distance = None

    # Filter the dataframe based on user inputs
    filtered_df = filter_country_distance(df_routes, country, min_distance, max_distance)

    # Export the filtered dataframe (exist_ok avoids the check-then-create race)
    os.makedirs(dataframes_dir, exist_ok=True)

    country_abbr = "UK" if country == "United Kingdom" else country.replace(' ', '_')

    # NOTE(review): without a maximum the key ignores min_distance, so every
    # "min..all" selection for a country maps to the same file name.
    if max_distance is None:
        filtered_df_key = f"df_routes_{country_abbr}_all"
    else:
        filtered_df_key = f"df_routes_{country_abbr}_{int(min_distance)}_{int(max_distance)}"

    export_filtered_dataframes({filtered_df_key: filtered_df}, dataframes_dir)



In [6]:
# Calling main function to start filtering process
# (interactive: the call prompts for a country and a distance range)

# if __name__ == "__main__":
filter_and_export()

Data frame df_routes_Denmark_all already exists at /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/filtered_dataframes/df_routes_Denmark_all.csv.


In [7]:
### 2) Building functions for Graph Building 

def build_graph(df_routes):

    """
    Builds a directed graph object from the given dataframe of routes.

    Every distinct (latitude, longitude) pair becomes a node carrying 'latitude' and
    'longitude' attributes; every row becomes an edge from its pick-up to its drop-off
    coordinate carrying 'route_count' and 'route_distance'.

    Args:
        df_routes (pandas.DataFrame): A dataframe containing columns 'FromLatitude', 'FromLongitude', 'ToLatitude',
                                       'ToLongitude', 'RouteCount' and 'RouteDistance', representing the routes
                                       between locations, how often each route was taken and its length.

    Returns:
        G (networkx.DiGraph): A directed graph object representing the routes between locations, with edges
                              annotated by the number of times each route was taken and the route distance.

    Raises:
        ValueError: If the dataframe is empty or lacks one of the required columns.
    """

    # 'RouteDistance' is read for every edge below, so it has to be validated as well
    required_columns = ('FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude', 'RouteCount', 'RouteDistance')
    if df_routes.empty or not all(col in df_routes.columns for col in required_columns):
        raise ValueError("Input dataframe is empty or missing required columns.")

    # Creating empty graph
    G = nx.DiGraph()

    # Add nodes and edges to the graph
    for _, row in df_routes.iterrows():
        from_node = (row['FromLatitude'], row['FromLongitude'])
        to_node = (row['ToLatitude'], row['ToLongitude'])

        if from_node not in G:
            G.add_node(from_node, latitude=row['FromLatitude'], longitude=row['FromLongitude'])
        if to_node not in G:
            G.add_node(to_node, latitude=row['ToLatitude'], longitude=row['ToLongitude'])

        # NOTE(review): rows sharing the same coordinate pair overwrite each other's
        # edge attributes (last row wins) - confirm that this is intended.
        G.add_edge(from_node, to_node, route_count=row['RouteCount'], route_distance=row['RouteDistance'])

    return G


def calculate_node_traffic(graph):

    """
    Sums, for every node, the 'route_count' of all edges that start or end at it.

    Args:
        graph (networkx.DiGraph): The graph object representing the routes between locations;
                                  every edge must carry a 'route_count' attribute.

    Returns:
        node_traffic (dict): Maps each node that appears on at least one edge to its total traffic.
    """

    totals = {}

    for source, target, attrs in graph.edges(data=True):
        count = attrs['route_count']

        # Credit both endpoints; a self-loop is therefore counted twice for its node
        for endpoint in (source, target):
            totals[endpoint] = totals.get(endpoint, 0) + count

    return totals


def normalize_node_traffic(node_traffic):

    """
    Normalizes the traffic values for each node to the range [0, 1] on a logarithmic scale.

    Each value is mapped to log(traffic + 1) / log(max_traffic + 1), so the busiest node
    gets 1.0 and a node without traffic gets 0.0.

    Args:
        node_traffic (dict): A dictionary mapping node coordinates to the total traffic passing through them.

    Returns:
        node_traffic_normalized (dict): A dictionary mapping node coordinates to the normalized traffic values.
                                        Empty input yields an empty dictionary; if no node has any traffic,
                                        every node is mapped to 0.0.
    """

    if not node_traffic:
        return {}

    scale = math.log(max(node_traffic.values()) + 1)
    if scale == 0:
        # The maximum traffic is 0, so log(1) == 0 would lead to a division by zero
        return {node: 0.0 for node in node_traffic}

    return {node: math.log(traffic + 1) / scale for node, traffic in node_traffic.items()}


def create_map(df_routes, graph, node_traffic_normalized):

    """
    Builds a Folium map centred on the mean node coordinate and draws the graph onto it
    by delegating to add_nodes_edges_to_map.

    Args:
        df_routes (pandas.DataFrame): Accepted for call compatibility; the function body does not read it.
        graph (networkx.DiGraph): The graph object representing the routes between locations. Its nodes are
                                  indexed as (latitude, longitude) when the centre is computed.
        node_traffic_normalized (dict): A dictionary mapping node coordinates to the normalized traffic values.

    Returns:
        folium_map (folium.folium.Map): A Folium map object displaying the nodes and edges of the graph.

    Raises:
        ValueError: If the graph has no nodes.
    """

    node_count = graph.number_of_nodes()
    if node_count == 0:
        raise ValueError("Input graph has no nodes.")

    # Centre of the map = average latitude / longitude over all nodes
    latitudes = [coord[0] for coord in graph.nodes()]
    longitudes = [coord[1] for coord in graph.nodes()]
    center = [sum(latitudes) / node_count, sum(longitudes) / node_count]

    result_map = folium.Map(location=center, zoom_start=5, tiles='CartoDB Positron')
    add_nodes_edges_to_map(graph, result_map, node_traffic_normalized)

    return result_map


def add_nodes_edges_to_map(graph, folium_map, node_traffic_normalized):

    """
    Adds the nodes and edges of the given graph to the given Folium map object, with node colors
    scaled according to the normalized traffic values and edge colors scaled by the (log-normalized)
    route count.

    Each node that appears on at least one edge gets exactly one circle marker, added the first time
    an edge touching it is drawn. A graph without edges leaves the map unchanged.

    Args:
        graph (networkx.DiGraph): The graph object representing the routes between locations; every edge
                                  must carry a 'route_count' attribute and node keys are used as map locations.
        folium_map (folium.folium.Map): The Folium map object to which the nodes and edges should be added.
        node_traffic_normalized (dict): A dictionary mapping node coordinates to the normalized traffic values.
    """

    edges = list(graph.edges(data=True))
    if not edges:
        # Nothing to draw; max() below would fail on an empty sequence
        return

    log_max_route_count = math.log(max(data['route_count'] for _, _, data in edges) + 1)

    # Look the colormaps up once. plt.get_cmap is used because plt.cm.get_cmap was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    node_cmap = plt.get_cmap('Blues')
    edge_cmap = plt.get_cmap('coolwarm')

    drawn_nodes = set()

    for start, end, data in edges:

        # One marker per node instead of one per edge endpoint keeps the HTML small
        for node in (start, end):
            if node in drawn_nodes:
                continue
            drawn_nodes.add(node)

            node_color = matplotlib.colors.to_hex(node_cmap(node_traffic_normalized[node]))
            folium_map.add_child(CircleMarker(location=node, radius=0.5, color=node_color, fill=True, fill_opacity=0.5))

        # log(1) == 0 when every route count is 0; fall back to the lowest intensity
        if log_max_route_count:
            intensity = math.log(data['route_count'] + 1) / log_max_route_count
        else:
            intensity = 0.0
        edge_color = matplotlib.colors.to_hex(edge_cmap(intensity))

        folium_map.add_child(PolyLine(locations=[start, end], color=edge_color, weight=1))

In [8]:
### 3) Creating graphs and respective maps for all the created dataframes in step 1) by using the functions defined in step 2)

# Make sure the output folders exist before anything is written to them
os.makedirs(graphs_dir, exist_ok=True)
os.makedirs(maps_dir, exist_ok=True)

# Find all CSV files in the directory (sorted, so files are processed in a stable order)
csv_files = sorted(glob.glob(os.path.join(dataframes_dir, 'df_routes_*.csv')))

for csv_file in csv_files:
    # Load the DataFrame
    df = pd.read_csv(csv_file)

    # Extract country, min_distance, and max_distance from the file name,
    # which looks like 'df_routes_{country}_all' or 'df_routes_{country}_{min}_{max}'
    file_name = os.path.splitext(os.path.basename(csv_file))[0]  # Remove the .csv extension
    file_name_parts = file_name.split('_')

    if file_name_parts[-1] == "all":
        max_distance = None
        min_distance = None
        country = '_'.join(file_name_parts[2:-1])  # Rejoin the country name parts
    else:
        min_distance = file_name_parts[-2]
        max_distance = file_name_parts[-1]
        country = '_'.join(file_name_parts[2:-2])  # Rejoin the country name parts

    # Build the graph and calculate node traffic
    graph = build_graph(df)
    node_traffic = calculate_node_traffic(graph)
    node_traffic_normalized = normalize_node_traffic(node_traffic)

    # Save the graph to a GraphML file and export to directory
    if max_distance is None:
        graph_name = f'G_{country}_all.graphml'
    else:
        graph_name = f'G_{country}_{min_distance}_{max_distance}.graphml'
    graph_path = os.path.join(graphs_dir, graph_name)
    nx.write_graphml(graph, graph_path)

    print(f"Graph saved for {country}_{min_distance}_{max_distance}: {graph_path}")

    # Create and save the map
    map_all = create_map(df, graph, node_traffic_normalized)
    if max_distance is None:
        map_name = f'{country}_map_all_routes.html'
    else:
        map_name = f'{country}_map_all_routes_{min_distance}_{max_distance}.html'
    map_path = os.path.join(maps_dir, map_name)
    map_all.save(map_path)

    print(f"Map saved for {country}_{min_distance}_{max_distance}: {map_path}")

Graph saved for Denmark_None_None: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/graphs/G_Denmark_all.graphml
Map saved for Denmark_None_None: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_map_all_routes.html
Graph saved for Belgium_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/graphs/G_Belgium_5_300.graphml
Map saved for Belgium_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_map_all_routes_5_300.html
Graph saved for Denmark_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/graphs/G_Denmark_5_300.graphml
Map saved for Denmark_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_map_all_routes_5_300.html
Graph saved for Belgium_None_None: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03

In [9]:
def visualize_top_routes(graphs_dir, maps_dir):

    """
    Visualizes the top 2.5% most used routes (at least one route) of a single graph on a folium map
    and saves that map as HTML.

    Parameters:
    graphs_dir (networkx.Graph): Despite its historical name, this is the GRAPH to visualize, not a directory.
        Its `name` attribute must follow 'G_{country}_{min_distance}_{max_distance}.graphml' (any other number
        of '_'-separated parts is labelled 'all'); its nodes must carry 'latitude' / 'longitude' and its edges
        'route_count' / 'route_distance' attributes.
    maps_dir (str): The directory where the generated folium map will be saved in HTML format.

    Returns:
    list: The selected routes as (u, v, route_count, route_distance) tuples, sorted by route_count in
        descending order.

    Raises:
    ValueError: If the graph has no edges.
    """

    # The parameter keeps its old name so existing calls stay valid, but callers pass the graph
    # itself. Binding it locally stops the function from silently reading a global named `graph`.
    graph = graphs_dir

    # Get the graph name and split it into components
    components = graph.name.split("_")

    # Extract the country, min_distance, and max_distance from the components
    country = components[1]
    if len(components) == 4:
        min_distance, max_distance = components[2:4]
        max_distance = max_distance.split(".")[0]  # drop the '.graphml' extension
    else:
        min_distance, max_distance = "all", "all"

    # Get the edge weights and sort them in descending order
    edges_weights = [(u, v, data["route_count"], data["route_distance"]) for u, v, data in graph.edges(data=True)]
    if not edges_weights:
        raise ValueError("Input graph has no edges.")
    edges_weights.sort(key=lambda x: x[2], reverse=True)

    # Filter the top 2.5% of edges; small graphs (< 40 edges) would otherwise select nothing
    n_top_edges = max(1, int(0.025 * len(edges_weights)))
    top_edges = edges_weights[:n_top_edges]

    # Create a folium map centered at the first node's coordinates
    first_node = graph.nodes[top_edges[0][0]]
    m = Map(location=[first_node["latitude"], first_node["longitude"]], zoom_start=5, tiles='CartoDB Positron')

    # Normalize the weights for color mapping
    max_weight = top_edges[0][2]
    min_weight = top_edges[-1][2]
    weight_span = max_weight - min_weight

    def normalize_weight(weight):
        if weight_span == 0:
            # All selected routes are equally busy: treat them all as "hottest"
            return 1.0
        normalized_weight = (weight - min_weight) / weight_span
        normalized_weight = max(0, normalized_weight)  # Ensure the value is non-negative
        # Square root spreads out the many low values
        return normalized_weight ** 0.5

    # Add the top routes to the map, least used first so the busiest routes are drawn on top
    for u, v, weight, _ in reversed(top_edges):
        node_u = graph.nodes[u]
        node_v = graph.nodes[v]

        coordinates = [
            [node_u["latitude"], node_u["longitude"]],
            [node_v["latitude"], node_v["longitude"]],
        ]

        intensity = normalize_weight(weight)

        # Color code the edges using a warm-cold colormap
        edge_color = plt.cm.coolwarm(intensity)

        # Convert the color to a hex format
        edge_color_hex = "#{:02x}{:02x}{:02x}".format(*(int(255 * c) for c in edge_color[:3]))

        # Calculate the weight of the polyline based on the route count
        route_weight = 0.5 + 2 * intensity

        # Add the route to the map
        polyline = PolyLine(
            locations=coordinates,
            color=edge_color_hex,
            weight=route_weight,
            opacity=1,
        )
        m.add_child(polyline)

    # Save the map to the maps directory
    map_path = os.path.join(maps_dir, f"{country}_map_top_routes_{min_distance}_{max_distance}.html")
    m.save(map_path)
    print(f"Map with Top 2.5% most used routes saved for {country}_{min_distance}_{max_distance}: {map_path}")

    # Return the list with the top 2.5% used routes
    return top_edges

In [10]:
# Collects, per graph, the list returned by visualize_top_routes,
# keyed by "{country}_{min_distance}_{max_distance}".
top_routes_dict = {}

for file in os.listdir(graphs_dir):
    # Skip everything that is not a GraphML file, and skip the "_all" graphs
    if file.endswith("_all.graphml") or not file.endswith(".graphml"):
        continue

    graph_file = os.path.join(graphs_dir, file)
    graph = nx.read_graphml(graph_file)
    # The file name is stored on the graph; the functions below parse it for labels
    graph.name = file

    # The graph object (not a directory) is passed as the first argument
    top_edges = visualize_top_routes(graph, maps_dir)
    
    # Get the graph name and split it into components
    components = file.split("_")
    country = components[1]
    if len(components) == 4:
        min_distance, max_distance = components[2:4]
        max_distance = max_distance.split(".")[0]  # strip the ".graphml" extension
    else:
        min_distance, max_distance = "all", "all"
        
    top_routes_dict[f"{country}_{min_distance}_{max_distance}"] = top_edges

Map with Top 2.5% most used routes saved for Belgium_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_map_top_routes_5_300.html
Map with Top 2.5% most used routes saved for Denmark_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_map_top_routes_5_300.html


In [11]:
def calculate_communities(graph, random_state=None):

    """
    Calculates communities in the input graph using the Louvain algorithm.

    The graph is converted to an undirected graph first, then partitioned with
    community_louvain.best_partition.

    Parameters:
    graph (networkx.Graph): The input graph with nodes representing locations and edges representing routes.
    random_state (int or numpy.random.RandomState, optional): Seed forwarded to the Louvain algorithm so the
        partition is reproducible between runs. With the default (None) the library's own default behaviour
        is kept, i.e. results may differ from run to run.

    Returns:
    dict: The partition of the graph's nodes into communities (node -> community id).
    """

    # Convert the directed graph to an undirected graph
    graph_undirected = graph.to_undirected()

    # Only forward the seed when one was requested, so the default call is unchanged
    louvain_options = {}
    if random_state is not None:
        louvain_options["random_state"] = random_state

    # NOTE(review): best_partition is called without a `weight` argument; presumably it then looks for an
    # edge attribute named 'weight', which the graphs built in this notebook do not set (they use
    # 'route_count') - confirm whether an unweighted partition is intended.
    partition = community_louvain.best_partition(graph_undirected, **louvain_options)

    # Return the partition for later use
    return partition

In [12]:
### Get 3 biggest communities for the Graph and visualize

def visualize_top_communities(graph, partition, maps_dir):
    """
    Visualizes the (up to) three biggest communities in the input graph on a map, based on the given
    partition, and saves the map as '{country}_communities_top3_{min_distance}_{max_distance}.html'.

    Parameters:
    graph (networkx.Graph): The input graph with nodes representing locations and edges representing routes.
        Nodes must carry 'latitude' / 'longitude' attributes and `graph.name` must follow
        'G_{country}_{min_distance}_{max_distance}.graphml' (any other number of '_'-separated parts is
        labelled 'all').
    partition (dict): The partition of the graph's nodes into communities.
    maps_dir (str): The directory where the generated folium map will be saved in HTML format.

    Returns:
    dict: The part of `partition` that belongs to the largest communities (node -> community id).

    Raises:
    ValueError: If the graph has no nodes.
    """

    if graph.number_of_nodes() == 0:
        raise ValueError("Input graph has no nodes.")

    # Extract country, min_distance and max_distance from the graph name
    components = os.path.splitext(graph.name)[0].split("_")
    country = components[1]
    if len(components) == 4:
        min_distance, max_distance = components[2:4]
    else:
        min_distance, max_distance = "all", "all"

    # Get the top 3 largest communities (fewer if the partition has fewer)
    community_counts = Counter(partition.values())
    largest_communities = [community_id for community_id, _ in community_counts.most_common(3)]

    # Assign unique colors to each community; zip stops at the shorter sequence,
    # so graphs with fewer than three communities no longer raise an IndexError
    palette = [
        np.array([0, 0, 0.7]),      # dark blue
        np.array([0, 0.5, 1]),      # regular blue
        np.array([0.7, 0.85, 1]),   # light blue
    ]
    community_colors = dict(zip(largest_communities, palette))

    # Calculate the map center using the average latitude and longitude of all nodes
    map_center = [sum(graph.nodes[node]['latitude'] for node in graph.nodes()) / graph.number_of_nodes(),
                  sum(graph.nodes[node]['longitude'] for node in graph.nodes()) / graph.number_of_nodes()]

    # Create a folium map for visualization
    folium_map = folium.Map(location=map_center, zoom_start=5, tiles='CartoDB Positron')

    # Add nodes to the folium map with colors based on their community (only for the largest communities)
    for node, community_id in partition.items():
        if community_id in community_colors:
            marker = folium.CircleMarker(location=[graph.nodes[node]['latitude'], graph.nodes[node]['longitude']],
                                         radius=5,
                                         color=matplotlib.colors.to_hex(community_colors[community_id]),
                                         fill=True,
                                         fill_opacity=1)
            folium_map.add_child(marker)

    # Save the map as an HTML file
    html_filename = f"{country}_communities_top3_{min_distance}_{max_distance}.html"
    map_path = os.path.join(maps_dir, html_filename)
    folium_map.save(map_path)

    print(f"Top 3 Communities map for {country}_{min_distance}_{max_distance} saved as {map_path}")

    # Return the partition of the top 3 communities
    top_communities_partition = {node: community_id for node, community_id in partition.items()
                                 if community_id in community_colors}

    return top_communities_partition

In [17]:
def calculate_visualize_top_nodes(graph, top_communities_partition, save_outputs = True):
    
    """
    Calculates the top nodes for each of the (up to) 3 largest communities in the input graph based on node
    traffic, together with their betweenness and degree centrality. Optionally visualizes the top nodes on a
    map using Folium, and saves the node information in an HTML table.

    The map is written to the global `maps_dir` and the table to the global `info_dir`.

    Parameters:
    graph (networkx.Graph): The input graph with nodes representing locations and edges representing routes.
        Nodes must carry 'latitude' / 'longitude', edges 'route_count', and `graph.name` must follow
        'G_{country}_{min_distance}_{max_distance}.graphml'.
    top_communities_partition (dict): A dictionary that maps each node of the largest communities to its community label.
    save_outputs (bool, optional): When True (default) the map and the HTML table are written to disk.

    Returns:
    list: The top 5 nodes by traffic of each community (up to 15 nodes), grouped by community in order of
        community size.
    """


    # Remove the file extension from the graph name
    graph_name_no_ext = os.path.splitext(graph.name)[0]

    # Extract country, min_distance, and max_distance from the graph name
    country, min_distance, max_distance = graph_name_no_ext.split("_")[1:4]
    max_distance = max_distance.split(".")[0]


    # Calculate node traffic
    node_traffic = calculate_node_traffic(graph)

    # Get top 3 communities from the partition
    top_communities = [comm for comm, _ in Counter(top_communities_partition.values()).most_common(3)]

    # Get top 5 nodes by total traffic for each of the top 3 communities
    top_nodes = []
    for community_id in top_communities:
        community_node_traffic = {node: node_traffic[node]
                                  for node, comm in top_communities_partition.items() if comm == community_id}
        top_community_nodes = sorted(community_node_traffic, key=community_node_traffic.get, reverse=True)[:5]
        top_nodes.extend(top_community_nodes)

    # Calculate the centrality measures ONCE for the whole graph and look the top nodes up afterwards.
    # Both calls return a value for every node, so recomputing them per top node only repeated the same work.
    weight_func = lambda u, v, d: d['route_count']
    betweenness_by_node = nx.betweenness_centrality(graph, weight=weight_func)
    # eigenvector_centrality = nx.eigenvector_centrality(graph, weight='route_count', max_iter=2500)[node]
    degree_by_node = nx.degree_centrality(graph)

    node_data = [(node, betweenness_by_node[node], degree_by_node[node]) for node in top_nodes]


    # Sort the node data by node traffic in descending order
    sorted_node_data = sorted(node_data, key=lambda x: node_traffic[x[0]], reverse=True)

    if save_outputs:
        # Visualization
        # Calculate the map center using the average latitude and longitude of all nodes
        map_center = [sum(graph.nodes[node]['latitude'] for node in graph.nodes()) / graph.number_of_nodes(),
                    sum(graph.nodes[node]['longitude'] for node in graph.nodes()) / graph.number_of_nodes()]

        # Create a folium map for visualization
        folium_map = folium.Map(location=map_center, zoom_start=5, tiles='CartoDB Positron')

        # Assign unique colors to each community; zip stops at the shorter sequence,
        # so fewer than three communities no longer raise an IndexError
        palette = [
            np.array([0, 0, 0.7]),      # dark blue
            np.array([0, 0.5, 1]),      # regular blue
            np.array([0.7, 0.85, 1]),   # light blue
        ]
        community_colors = dict(zip(top_communities, palette))

        # Add top nodes to the folium map with colors based on their community
        for node, betweenness_centrality, degree_centrality in sorted_node_data:
            community_id = top_communities_partition[node]
            marker = folium.CircleMarker(location=[graph.nodes[node]['latitude'], graph.nodes[node]['longitude']],
                                        radius=7,
                                        color=matplotlib.colors.to_hex(community_colors[community_id]),
                                        fill=True,
                                        fill_opacity=1)
            folium_map.add_child(marker)

        # Save the map as an HTML file
        html_filename = f"{country}_top_nodes_{min_distance}_{max_distance}.html"
        print_message = f"Top 15 nodes map for {country}, min_distance: {min_distance}, max_distance: {max_distance} saved as "

        map_path = os.path.join(maps_dir, html_filename)
        folium_map.save(map_path)

        print(print_message + map_path) 


        # Generate an HTML string with the desired table format
        # NOTE(review): the last column is labelled "Weighted Degree Centrality", but
        # nx.degree_centrality above is called without any weight - confirm the label.
        html_string = """
        <html>
        <head>
        <style>
            table {{
                width: 100%;
                border-collapse: collapse;
            }}
            th, td {{
                text-align: left;
                padding: 8px;
                border-bottom: 1px solid #ddd;
            }}
            th {{
                background-color: #f2f2f2;
            }}
        </style>
        </head>
        <body>
        <h1>Top Nodes for {graph_name}</h1>
        <table>
            <tr>
                <th>Node</th>
                <th>Node Traffic</th>
                <th>Weighted Betweenness Centrality</th>
                <th>Weighted Degree Centrality</th>
            </tr>
        """.format(graph_name=graph.name)

        for node, betweenness_centrality, degree_centrality in sorted_node_data:
            html_string += f"""
            <tr>
                <td>{node}</td>
                <td>{node_traffic[node]}</td>
                <td>{betweenness_centrality}</td>
                <td>{degree_centrality}</td>
            </tr>
            """

        html_string += """
        </table>
        </body>
        </html>
        """

        # Save the HTML string to a file
        file_name = f"{graph.name}_top_nodes_info.html"
        file_path = os.path.join(info_dir, file_name)

        with open(file_path, 'w') as file:
            file.write(html_string)

        # Print a confirmation message
        print(f"Node information for {graph.name} saved as {file_path}")

    # Return the top_nodes list for next step
    return top_nodes

In [18]:
# Now calling the functions defined above

# Iterate through the graph files in the graphs_dir directory
# (only distance-bounded graphs are processed; "*_all.graphml" files are skipped)
for file in os.listdir(graphs_dir):
    if file.endswith(".graphml") and not file.endswith("_all.graphml"):
        graph_file = os.path.join(graphs_dir, file)
        graph = nx.read_graphml(graph_file)
        graph.name = file  # Set the graph.name attribute explicitly

        # Calling two functions on community detection
        partition = calculate_communities(graph)
        top_communities_partition = visualize_top_communities(graph, partition, maps_dir)

        # Call the calculate_visualize_top_nodes function
        # NOTE: after the loop, `graph`, `partition` and `top_nodes` hold the values of the LAST file only
        top_nodes = calculate_visualize_top_nodes(graph, top_communities_partition)


Top 3 Communities map for Belgium_5_300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_communities_top3_5_300.html
Top 15 nodes map for Belgium, min_distance: 5, max_distance: 300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_top_nodes_5_300.html
Node information for G_Belgium_5_300.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Belgium_5_300.graphml_top_nodes_info.html
Top 3 Communities map for Denmark_5_300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_communities_top3_5_300.html
Top 15 nodes map for Denmark, min_distance: 5, max_distance: 300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_top_nodes_5_300.html
Node information for G_Denmark_5_300.graphml saved as /Users/jangaydoul/Desktop/Copenh

In [19]:
def compare_top_routes_and_nodes(graph, top_nodes, top_routes, maps_dir):
    """
    Compares the routes attached to the top nodes with the most important routes and
    calculates the share of the top routes that touch at least one of the top nodes.

    Parameters:
    graph (networkx.Graph): The input graph with nodes representing locations and edges representing routes.
    top_nodes (list): Top nodes (graph node identifiers) based on node traffic.
    top_routes (list): The most important routes, as 4-tuples (u, v, route_count, route_distance).
    maps_dir (str): Unused; kept so existing calls keep working.

    Returns:
    tuple: (overlap_routes, electrifiable_share, routes_attached_to_top_nodes) where
        overlap_routes (list): (u, v, data) edges that are attached to a top node AND appear
            in top_routes (direction of the route is ignored when matching).
        electrifiable_share (float): len(overlap_routes) / len(top_routes) * 100, or 0.0 when
            top_routes is empty.
        routes_attached_to_top_nodes (list): every (u, v, data) edge with at least one
            endpoint among the top nodes.
    """

    # Set membership keeps the edge scan linear instead of rescanning the list per edge
    top_node_set = set(top_nodes)

    # Identify routes attached to the top nodes
    routes_attached_to_top_nodes = [
        (u, v, data)
        for u, v, data in graph.edges(data=True)
        if u in top_node_set or v in top_node_set
    ]

    # Endpoint pairs of the top routes; both orientations are checked below
    top_route_edges = {(u, v) for u, v, _route_count, _route_distance in top_routes}

    overlap_routes = [
        (u, v, data)
        for u, v, data in routes_attached_to_top_nodes
        if (u, v) in top_route_edges or (v, u) in top_route_edges
    ]

    # Share of the top routes that are attached to at least one of the top nodes.
    # An empty top_routes list yields 0.0 instead of a ZeroDivisionError.
    if top_routes:
        electrifiable_share = (len(overlap_routes) / len(top_routes)) * 100
    else:
        electrifiable_share = 0.0

    # Print result
    print(f"Electrifiable share of most important routes: {electrifiable_share}%")

    return overlap_routes, electrifiable_share, routes_attached_to_top_nodes

In [20]:
def visualize_overlap(graph, top_routes, overlap_routes, maps_dir,
                      country=None, min_distance=None, max_distance=None):
    """
    Visualizes the overlapping routes between the most used routes and routes attached to the top nodes.

    Parameters:
    graph (networkx.Graph): The input graph; nodes carry 'latitude'/'longitude' attributes and
        graph.name is expected to look like 'G_<country>_<min>_<max>.graphml'.
    top_routes (list): List of the most used routes (first element of each entry is a node id).
    overlap_routes (list): (u, v, data) routes that are both top routes and attached to a top node;
        data must contain 'route_count'.
    maps_dir (str): The directory where the generated folium map will be saved in HTML format.
    country, min_distance, max_distance (optional): Labels used in the output file name.
        When omitted they are parsed from graph.name instead of being read from notebook globals.

    Returns:
    None

    Raises:
    ValueError: if the labels are not given and graph.name does not follow the expected pattern.
    """

    # Resolve the labels for the output file name from graph.name when not supplied,
    # using the same split the driver loop applies to the file name.
    if country is None or min_distance is None or max_distance is None:
        name_parts = str(graph.name).split("_")
        if len(name_parts) < 4:
            raise ValueError(
                f"Cannot derive country/min_distance/max_distance from graph name {graph.name!r}"
            )
        parsed_country, parsed_min, parsed_max = name_parts[1:4]
        parsed_max = parsed_max.split(".")[0]
        country = parsed_country if country is None else country
        min_distance = parsed_min if min_distance is None else min_distance
        max_distance = parsed_max if max_distance is None else max_distance

    # Nothing to draw: avoid max()/min() on an empty sequence
    if not overlap_routes:
        print(f"No overlap routes to visualize for {country}_{min_distance}_{max_distance}")
        return

    # Create a folium map centered at the first top route's start node
    # (fall back to the first overlap route if top_routes is empty)
    anchor_node_id = top_routes[0][0] if top_routes else overlap_routes[0][0]
    first_node = graph.nodes[anchor_node_id]
    m = folium.Map(location=[first_node['latitude'], first_node['longitude']], zoom_start=5, tiles='CartoDB Positron')

    # Normalize the weights for color mapping
    route_counts = [data['route_count'] for _, _, data in overlap_routes]
    max_weight = max(route_counts)
    min_weight = min(route_counts)
    weight_span = max_weight - min_weight

    def normalize_weight(weight):
        # All routes equally used (or a single route): treat them as the top of the scale
        # instead of dividing by zero.
        if weight_span == 0:
            return 1.0
        normalized_weight = (weight - min_weight) / weight_span
        normalized_weight = max(0, normalized_weight)  # Ensure the value is non-negative
        # Square root spreads out the low end so rarely used routes stay distinguishable
        return normalized_weight ** 0.5

    # Add the overlapping routes to the map
    for u, v, data in overlap_routes:
        node_u = graph.nodes[u]
        node_v = graph.nodes[v]

        coordinates = [
            [node_u['latitude'], node_u['longitude']],
            [node_v['latitude'], node_v['longitude']],
        ]

        scaled_weight = normalize_weight(data['route_count'])

        # Color code the edges using a warm-cold colormap and convert RGBA -> hex
        edge_color = plt.cm.coolwarm(scaled_weight)
        edge_color_hex = "#{:02x}{:02x}{:02x}".format(*(int(255 * c) for c in edge_color[:3]))

        # Line thickness grows with the (normalized) route count
        route_weight = 0.5 + 2 * scaled_weight

        # Add the route to the map
        polyline = folium.PolyLine(
            locations=coordinates,
            color=edge_color_hex,
            weight=route_weight,
            opacity=1,
        )
        m.add_child(polyline)

    # Save the map to the maps directory
    map_path = os.path.join(maps_dir, f"{country}_overlap_top_routes_{min_distance}_{max_distance}.html")
    m.save(map_path)
    print(f"Overlap routes map saved for {country}_{min_distance}_{max_distance}: {map_path}")

In [21]:
# Collect the electrifiable share per graph, keyed by graph.name (the graph file name)
electrifiable_share_dict = {}

# Process every distance-filtered graph file ('G_<country>_<min_distance>_<max_distance>.graphml').
# NOTE: all names assigned in this loop (country, min_distance, max_distance, graph,
# top_communities_partition, ...) are notebook-level names and keep the values of the
# last processed file after the loop ends.
for file in os.listdir(graphs_dir):
    # Skip the unfiltered '_all' graphs and anything that is not a GraphML file
    if "_all.graphml" in file or not file.endswith(".graphml"):
        continue

    # 'G_Belgium_5_300.graphml' -> country='Belgium', min_distance='5', max_distance='300'
    country, min_distance, max_distance = file.split("_")[1:4]
    max_distance = max_distance.split(".")[0]  # drop the '.graphml' extension

    # Read the graph
    graph_file = os.path.join(graphs_dir, file)
    graph = nx.read_graphml(graph_file)
    graph.name = file  # Set the graph.name attribute explicitly

    # Calculate the communities and top communities partition for the graph
    partition = calculate_communities(graph)
    top_communities_partition = visualize_top_communities(graph, partition, maps_dir)

    # Calculate the top nodes for the graph
    top_nodes = calculate_visualize_top_nodes(graph, top_communities_partition)

    # Look up the most used routes for this scenario.
    # NOTE(review): top_routes_dict is not defined in this part of the notebook;
    # it must already exist from an earlier cell.
    top_routes = top_routes_dict[f"{country}_{min_distance}_{max_distance}"]

    # Compare the routes attached to the top nodes with the top routes
    overlap_routes, electrifiable_share, routes_attached_to_top_nodes = compare_top_routes_and_nodes(graph, top_nodes, top_routes, maps_dir)

    # Add the electrifiable_share to the dictionary
    electrifiable_share_dict[graph.name] = electrifiable_share

    # Visualize the overlapping routes
    visualize_overlap(graph, top_routes, overlap_routes, maps_dir)

Top 3 Communities map for Belgium_5_300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_communities_top3_5_300.html
Top 15 nodes map for Belgium, min_distance: 5, max_distance: 300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_top_nodes_5_300.html
Node information for G_Belgium_5_300.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Belgium_5_300.graphml_top_nodes_info.html
Electrifiable share of most important routes: 91.17647058823529%
Overlap routes map saved for Belgium_5_300: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Belgium_overlap_top_routes_5_300.html
Top 3 Communities map for Denmark_5_300 saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/maps/Denmark_communities_top3_5_300.html
Top 15 nodes map for Denmark, min_distance

In [23]:
# Show the electrifiable share collected per graph (keys are the graph file names)
print(electrifiable_share_dict)

# Function to convert the dictionary to an HTML table
def dict_to_html_table(dictionary):
    """
    Render a mapping as a bordered two-column HTML table.

    Parameters:
    dictionary (dict): Maps a graph name to its electrifiable share.

    Returns:
    str: The table markup, one row per item in the mapping's iteration order,
    preceded by a 'Graph Name' / 'Electrifiable Share' header row.
    """
    opening = "<table border='1'>" + "<tr><th>Graph Name</th><th>Electrifiable Share</th></tr>"
    body_rows = [
        f"<tr><td>{graph_name}</td><td>{share}</td></tr>"
        for graph_name, share in dictionary.items()
    ]
    return "".join([opening, *body_rows, "</table>"])


# Convert the electrifiable_share_dict to an HTML table (one row per graph)
html_table = dict_to_html_table(electrifiable_share_dict)

# Save the HTML table to a file inside info_dir
os.makedirs(info_dir, exist_ok=True)  # Create the directory if it doesn't exist
html_file_path = os.path.join(info_dir, "electrifiable_share_all_countries.html")

# Opening in "w" mode replaces any table saved by a previous run
with open(html_file_path, "w") as html_file:
    html_file.write(html_table)

print(f"Electrifiable share dictionary saved as an HTML file: {html_file_path}")

{'G_Belgium_5_300.graphml': 91.17647058823529, 'G_Denmark_5_300.graphml': 67.81609195402298}
Electrifiable share dictionary saved as an HTML file: /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/electrifiable_share_all_countries.html


In [24]:
### 4) Create function to get info about Graph and communities

def graph_community_info(graph, partition):

    """
    Writes an HTML report with graph- and community-level statistics for the input graph:
    number of nodes and edges, average degree, average clustering coefficient, number of
    communities, modularity, the three largest communities and a boundary score for each of them.

    The report is saved as '<graph.name>_info.html' inside the notebook-level info_dir and a
    confirmation message is printed.

    Parameters:
    graph (networkx.Graph): The input graph with nodes representing locations and edges representing routes.
    partition (dict): The partition of the graph's nodes into communities (node -> community id).

    Returns:
    None
    """

    # Number of nodes and edges in the graph
    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    # Number of communities and modularity (modularity is evaluated on the undirected view)
    num_communities = len(set(partition.values()))
    modularity = community_louvain.modularity(partition, graph.to_undirected())

    # Top 3 largest communities as (community id, size) pairs
    community_counts = Counter(partition.values())
    largest_communities = community_counts.most_common(3)

    # Collect the member nodes of each of the largest communities.
    # The loop variable is deliberately not called `community`, which is an imported module.
    community_nodes = {comm_id: set() for comm_id, _ in largest_communities}
    for node, comm_id in partition.items():
        if comm_id in community_nodes:
            community_nodes[comm_id].add(node)

    # Score reported as "conductance": boundary edges divided by the number of member nodes.
    # NOTE(review): the textbook conductance divides the cut size by the smaller of the two
    # volumes (see networkx.conductance); the formula is kept as-is so that previously
    # reported numbers do not change - confirm which definition is intended.
    community_conductance = {}
    for comm_id, members in community_nodes.items():
        boundary_size = sum(1 for _ in nx.edge_boundary(graph, members))
        community_conductance[comm_id] = boundary_size / len(members)

    # Largest communities ordered from lowest to highest score
    lowest_conductance_communities = sorted(community_conductance.items(), key=lambda item: item[1])

    # Calculate avg. Degree of nodes
    avg_degree = sum(d for _, d in graph.degree()) / float(num_nodes)

    # Average clustering coefficient
    avg_clustering_coeff = nx.average_clustering(graph)

    # Pre-render the list items so the template below stays readable
    largest_items_html = "".join(
        [f"<li>Community {comm}: {size} nodes</li>" for comm, size in largest_communities]
    )
    conductance_items_html = "".join(
        [f"<li>Community {comm}: {score:.4f}</li>" for comm, score in lowest_conductance_communities]
    )

    # Create an HTML string for storing the information
    html_string = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Graph Information</title>
    </head>
    <body>
        <h1>{graph.name} Information</h1>
        <p>Number of nodes: {num_nodes}</p>
        <p>Number of edges: {num_edges}</p>
        <p>Average degree: {avg_degree:.4f}</p>
        <p>Average clustering coefficient: {avg_clustering_coeff:.4f}</p>
        <p>Number of communities: {num_communities}</p>
        <p>Modularity of the community structure: {modularity:.4f}</p>
        <h2>Top 3 largest communities:</h2>
        <ol>
            {largest_items_html}
        </ol>
        <h2>Conductance score for the largest communities:</h2>
        <ol>
            {conductance_items_html}
        </ol>
    </body>
    </html>
    """

    # Save the HTML string to a file
    file_name = f"{graph.name}_info.html"
    file_path = os.path.join(info_dir, file_name)

    # The document declares charset UTF-8, so write it as UTF-8 regardless of the platform default
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_string)

    # Print a confirmation message
    print(f"Graph information for {graph.name} saved as {file_path}")

In [25]:
# Write the community/graph report for every GraphML file in graphs_dir.
# Unlike the loops above, the '_all' graphs are included here.
for file in os.listdir(graphs_dir):
    if not file.endswith(".graphml"):
        continue

    # Read the graph from the file
    graph_file = os.path.join(graphs_dir, file)
    graph = nx.read_graphml(graph_file)
    graph.name = file  # graph.name is used for the report title and output file name

    # Calculate the communities for the graph
    # (presumably with the Louvain algorithm - see calculate_communities)
    partition = calculate_communities(graph)

    # Call the graph_community_info function (writes '<file>_info.html' to info_dir)
    graph_community_info(graph, partition)

Graph information for G_Belgium_5_300.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Belgium_5_300.graphml_info.html
Graph information for G_Denmark_5_300.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Denmark_5_300.graphml_info.html
Graph information for G_Belgium_all.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Belgium_all.graphml_info.html
Graph information for G_Denmark_all.graphml saved as /Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/info/G_Denmark_all.graphml_info.html


In [None]:
### 5) Create function to compare the top nodes among different countries and distance ranges

def top_nodes_comparison():
    """
    Collects the top nodes of every scenario from the '*graphml_top_nodes_info.html'
    reports stored in the notebook-level info_dir.

    For each report the first cell of every table row (after the header row) is parsed
    into a tuple of floats (the node coordinates).

    Returns:
    tuple: (top_nodes_dict, df) where
        top_nodes_dict (dict): 'G_<country>_<min_distance>_<max_distance>' -> list of coordinate tuples,
            in the order they appear in the report.
        df (pandas.DataFrame): one column per scenario; shorter columns are padded with NaN
            when scenarios have different numbers of top nodes.
    """
    # Signed decimals or signed integers; the sign must be kept for negative coordinates
    coordinate_pattern = re.compile(r"[-+]?\d*\.\d+|[-+]?\d+")

    top_nodes_dict = {}

    # Iterate through all files in the info_dir
    for file_name in os.listdir(info_dir):
        # Only the per-graph top-node reports are of interest
        if not file_name.endswith('graphml_top_nodes_info.html'):
            continue

        # 'G_Belgium_5_300.graphml_top_nodes_info.html' -> 'Belgium', '5', '300'
        country, min_distance, max_distance = file_name.split("_")[1:4]
        max_distance = max_distance.split(".")[0]

        # Parse the report and extract the top nodes from its table
        with open(os.path.join(info_dir, file_name), "r") as html_file:
            soup = BeautifulSoup(html_file.read(), "html.parser")

        top_nodes_table = soup.find("table")
        if top_nodes_table is None:
            print(f"No table found in {file_name}; skipping")
            continue

        top_nodes = []
        for row in top_nodes_table.find_all("tr")[1:]:  # [1:] skips the header row
            cell = row.find_all("td")[0]
            node_coordinates = tuple(map(float, coordinate_pattern.findall(cell.text)))
            top_nodes.append(node_coordinates)

        # Store information about top nodes for each country and range in a dictionary
        top_nodes_dict[f"G_{country}_{min_distance}_{max_distance}"] = top_nodes

    # Generate a dataframe from the dictionary values. Wrapping each list in a Series lets
    # pandas align columns of unequal length instead of raising a ValueError.
    df = pd.DataFrame({
        scenario: pd.Series(nodes, dtype=object)
        for scenario, nodes in top_nodes_dict.items()
    })
    return top_nodes_dict, df

In [None]:
# Call the function above 

top_nodes_dict,top_nodes_df=top_nodes_comparison()

In [None]:
# Visualise the dataframe
top_nodes_df

In [None]:
### 6) Create function to find common nodes between the different scenarios for each country

def _position_or_slash(nodes, node):
    """Return the 1-based rank of node in nodes, or '/' when the node is not in the list."""
    try:
        return nodes.index(node) + 1
    except ValueError:
        return '/'  # Meaning the node is not present among top nodes in this range


def _common_nodes_table_html(positions_by_node):
    """Render the per-node positions as a bordered HTML table."""
    html_table = "<table border='1'>"
    html_table += "<tr><th>Node</th><th> [5,300] Position</th><th>  [5,250] Position</th><th>[5,500] Position</th></tr>"

    for key, value in positions_by_node.items():
        html_table += f"<tr><td>{key}</td><td>{value['300']}</td> <td>{value['250']}</td><td>{value['500']}</td></tr>"

    html_table += "</table>"
    return html_table


def get_node_comparison_by_country(top_nodes_dict, countries=None, output_dir=None):
    """
    Compares, per country, the top nodes of the [5,300] range with the top nodes of the
    [5,250] and [5,500] ranges and saves one HTML report per country.

    Every node of the [5,300] list is reported with its 1-based position in each of the
    three ranges; '/' marks a range in which the node is not among the top nodes.

    Parameters:
    top_nodes_dict (dict): 'G_<country>_<min>_<max>' -> list of top nodes, as returned by
        top_nodes_comparison.
    countries (list, optional): Countries to report on. Defaults to the six study countries.
    output_dir (str, optional): Directory for the reports. Defaults to the notebook-level info_dir.

    Returns:
    dict: country -> {node -> {'300': pos, '250': pos or '/', '500': pos or '/'}}.
        Countries without a [5,300] entry in top_nodes_dict are skipped (with a message).
    """
    if countries is None:
        countries = ["UK", "Sweden", "Belgium", "Netherlands", "Germany", "Denmark"]
    if output_dir is None:
        output_dir = info_dir

    common_nodes = {}

    # Iterate through each country
    for country in countries:
        base_key = f"G_{country}_5_300"
        if base_key not in top_nodes_dict:
            # Without the reference range there is nothing to compare for this country
            print(f"Skipping {country}: no top nodes available for {base_key}")
            continue

        nodes_300 = top_nodes_dict[base_key]
        # A missing comparison range is treated as "no top nodes", i.e. every position is '/'
        nodes_250 = top_nodes_dict.get(f"G_{country}_5_250", [])
        nodes_500 = top_nodes_dict.get(f"G_{country}_5_500", [])

        # How many [5,300] top nodes also appear among the top nodes of the other ranges
        in_250 = sum(1 for n in nodes_300 if n in nodes_250)
        in_500 = sum(1 for n in nodes_300 if n in nodes_500)

        # Position of every [5,300] top node in each range (first occurrence wins)
        common_positions = {}
        for node in nodes_300:
            if node in common_positions:
                continue
            common_positions[node] = {
                '300': nodes_300.index(node) + 1,
                '250': _position_or_slash(nodes_250, node),
                '500': _position_or_slash(nodes_500, node),
            }

        common_nodes[f"{country}"] = common_positions

        # Build the report; the table belongs inside <body>, and the document is closed once
        table_html = _common_nodes_table_html(common_positions)
        html_string = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>Graph Information</title>
    </head>
    <body>
      <h1>{country}: Comparison of nodes based on different ranges </h1>
      <p>Number of nodes in the [5,300] range that are also in the [5,250] range: {in_250}</p>
      <p>Number of nodes in the [5,300] range that are also in the [5,500] range: {in_500}</p>
      {table_html}
    </body>
    </html>
    """

        # Save the HTML string to a file
        file_name = f"{country}_node_comparison_range_based.html"
        file_path = os.path.join(output_dir, file_name)

        # The document declares charset UTF-8, so write it as UTF-8
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html_string)
        # Print a confirmation message
        print(f"Comparison of nodes based on different ranges for {country} saved as {file_path}")

    return common_nodes

----

In [26]:
### OLD 

# NOTE: superseded version. A later cell defines electrification_percentage again with a
# different signature (graphs_dir, info_dir, top_communities_partition); once that cell has
# run, this definition is replaced in the notebook namespace.
def electrification_percentage(graphs_dir, top_communities_partition):
    """
    Calculates the electrification percentages of routes and deliveries for each step 
    of adding the top nodes to the electrification strategy.

    For every graph file in graphs_dir (except the '_all' graphs and hidden files) the
    cumulative number of routes/deliveries attached to the first 1, 2, ... top nodes is
    computed, relative both to the country's '_all' graph and to the filtered graph itself.

    Parameters:
    graphs_dir (str): The directory path where the graph files are located.
    top_communities_partition: Passed through unchanged to calculate_visualize_top_nodes.

    Returns:
    None: The results are written as an HTML table to
    '<country>_<min_distance>_<max_distance>_pareto_electrification.html' in the
    notebook-level info_dir.
    """

    # Iterate through all files in the graphs_dir
    for file_name in os.listdir(graphs_dir):
        # Ignore files ending with '_all.graphml' and hidden files (starting with '.')
        # NOTE(review): there is no check that the file is a .graphml file.
        if not file_name.endswith('_all.graphml') and not file_name.startswith('.'):
            print(f"Processing file: {file_name}")  # Print the current file being processed

            # Read the graph from the file and set its name attribute to the file_name
            graph = nx.read_graphml(os.path.join(graphs_dir, file_name))
            graph.name = file_name  # Set the graph name attribute

            # Extract the country name, min_distance and max_distance from the file_name
            _, country, min_distance, max_distance = file_name.split("_")[:4]
            max_distance = os.path.splitext(max_distance)[0]

            # Read the all_graph file for the same country
            all_graph_file_name = f"G_{country}_all.graphml"
            all_graph = nx.read_graphml(os.path.join(graphs_dir, all_graph_file_name))

            # Calculate the total number of routes and deliveries in the all_graph
            total_routes = all_graph.number_of_edges()
            total_deliveries = sum(data['route_count'] for _, _, data in all_graph.edges(data=True))

            # Same totals for the distance-filtered graph
            total_routes_subset = graph.number_of_edges()
            total_deliveries_subset = sum(data['route_count'] for _, _, data in graph.edges(data=True))

            # Get the top nodes using the calculate_visualize_top_nodes function
            # NOTE(review): confirm that calculate_visualize_top_nodes accepts save_outputs.
            top_nodes = calculate_visualize_top_nodes(graph, top_communities_partition, save_outputs=False)

            # Initialize lists to store the number of electrified routes and deliveries for each step
            electrified_routes = []
            electrified_deliveries = []
            electrified_routes_percentage = []
            electrified_deliveries_percentage = []
            electrified_routes_subset_percentage = []
            electrified_deliveries_subset_percentage = []

            # Iterate through each step of adding the top nodes to the electrification strategy
            for i in range(1, len(top_nodes) + 1):
                # Get the top nodes to be electrified at the current step
                current_top_nodes = top_nodes[:i]

                # Initialize a set to store the electrified routes and a counter for electrified deliveries
                routes_attached_to_top_nodes = set()
                deliveries_attached_to_top_nodes = 0

                # Iterate through all edges (routes) in the graph
                for u, v, data in graph.edges(data=True):
                    # If either endpoint of the route is in the current_top_nodes, add the route to the electrified set
                    # and update the electrified deliveries counter
                    if u in current_top_nodes or v in current_top_nodes:
                        routes_attached_to_top_nodes.add((u, v))
                        deliveries_attached_to_top_nodes += data['route_count']

                # Update the electrified_routes and electrified_deliveries lists with the current step's values
                electrified_routes.append(len(routes_attached_to_top_nodes))
                electrified_deliveries.append(deliveries_attached_to_top_nodes)

                # Calculate the electrification percentages for the current step
                # (relative to the country's '_all' graph)
                electrified_routes_percentage.append((len(routes_attached_to_top_nodes) / total_routes) * 100)
                electrified_deliveries_percentage.append((deliveries_attached_to_top_nodes / total_deliveries) * 100)

                # Same percentages relative to the distance-filtered graph
                electrified_routes_subset_percentage.append((len(routes_attached_to_top_nodes) / total_routes_subset) * 100)
                electrified_deliveries_subset_percentage.append((deliveries_attached_to_top_nodes / total_deliveries_subset) * 100)

                # Create an HTML table for the results
                # NOTE: this runs inside the step loop, so the table is rebuilt and the file
                # rewritten at every step; the inner loop below also reuses the name `i`.
                html_table = "<table border='1'><tr><th>Node</th><th>Electrified Routes</th><th>% Electrification (Routes) Total</th><th>% Electrification (Routes) Subset</th><th>Electrified Deliveries</th><th>% Electrification (Deliveries) Total</th><th>% Electrification (Deliveries) Subset</th></tr>"
                for i, (node, routes, routes_percentage, routes_subset_percentage, deliveries, deliveries_percentage, deliveries_subset_percentage) in enumerate(zip(top_nodes, electrified_routes, electrified_routes_percentage, electrified_routes_subset_percentage, electrified_deliveries, electrified_deliveries_percentage, electrified_deliveries_subset_percentage), start=1):
                    html_table += f"<tr><td>{node}</td><td>{routes}</td><td>{routes_percentage:.2f}%</td><td>{routes_subset_percentage:.2f}%</td><td>{deliveries}</td><td>{deliveries_percentage:.2f}%</td><td>{deliveries_subset_percentage:.2f}%</td></tr>"
                html_table += "</table>"


                # Save the HTML table to a file (info_dir is read from the notebook namespace)
                with open(os.path.join(info_dir, f"{country}_{min_distance}_{max_distance}_pareto_electrification.html"), "w") as f:
                    f.write(html_table)

In [61]:
def electrification_percentage(graphs_dir, info_dir, top_communities_partition):
    """
    Calculates the electrification percentages of routes and deliveries for each step
    of adding the top nodes to the electrification strategy.

    For every distance-filtered GraphML file in graphs_dir, the top nodes are read from the
    matching '<file>_top_nodes_info.html' report in info_dir. For the first 1, 2, ... top nodes
    the routes (edges) and deliveries ('route_count') attached to them are accumulated and
    expressed as a percentage of the country's '_all' graph and of the filtered graph itself.

    Parameters:
    graphs_dir (str): The directory path where the graph files are located.
    info_dir (str): Directory holding the top-node reports; the result tables are saved here too.
    top_communities_partition: Not used by this function; kept so existing calls keep working.

    Returns:
    None: One HTML table per graph is written to
    '<country>_<min_distance>_<max_distance>_pareto_electrification.html' in info_dir,
    and progress messages are printed.
    """
    # Signed decimals or signed integers; the sign must be kept for negative coordinates
    coordinate_pattern = re.compile(r"[-+]?\d*\.\d+|[-+]?\d+")

    # Iterate through all files in the graphs_dir
    for file_name in os.listdir(graphs_dir):
        # Only distance-filtered GraphML files are processed: skip the '_all' graphs,
        # hidden files and anything that is not a .graphml file
        if (not file_name.endswith('.graphml')
                or file_name.endswith('_all.graphml')
                or file_name.startswith('.')):
            continue

        print(f"Processing file: {file_name}")  # Print the current file being processed

        # Read the graph from the file and set its name attribute to the file_name
        graph = nx.read_graphml(os.path.join(graphs_dir, file_name))
        graph.name = file_name  # Set the graph name attribute

        # Node labels are strings such as '(57.7, 11.8)'; convert them to tuples of floats so
        # they compare equal to the coordinates parsed from the HTML report below
        graph = nx.relabel_nodes(graph, {node: tuple(map(float, node.strip('()').split(','))) for node in graph.nodes()})

        # Extract the country name, min_distance and max_distance from the file_name
        _, country, min_distance, max_distance = file_name.split("_")[:4]
        max_distance = os.path.splitext(max_distance)[0]

        # Read the all_graph file for the same country
        all_graph_file_name = f"G_{country}_all.graphml"
        all_graph = nx.read_graphml(os.path.join(graphs_dir, all_graph_file_name))

        # Totals of the unfiltered graph and of the distance-filtered graph
        total_routes = all_graph.number_of_edges()
        total_deliveries = sum(data['route_count'] for _, _, data in all_graph.edges(data=True))

        total_routes_subset = graph.number_of_edges()
        total_deliveries_subset = sum(data['route_count'] for _, _, data in graph.edges(data=True))

        # Find the corresponding HTML file and extract the top nodes (first cell of each row)
        html_file_name = f"G_{country}_{min_distance}_{max_distance}.graphml_top_nodes_info.html"
        with open(os.path.join(info_dir, html_file_name), "r") as html_file:
            soup = BeautifulSoup(html_file.read(), "html.parser")
        top_nodes_table = soup.find("table")
        top_nodes = []
        for row in top_nodes_table.find_all("tr")[1:]:  # [1:] skips the header row
            cell = row.find_all("td")[0]
            node_coordinates = tuple(map(float, coordinate_pattern.findall(cell.text)))
            top_nodes.append(node_coordinates)

        # Check if all top_nodes are present in the graph
        missing_nodes = [node for node in top_nodes if node not in graph.nodes()]
        if missing_nodes:
            print("Missing nodes:", missing_nodes)
        else:
            print("All top_nodes are present in the graph.")

        # Materialize the edges once; they are scanned again for every step
        edges = list(graph.edges(data=True))

        # One table row per step: (node, routes, routes %, routes subset %, deliveries, deliveries %, deliveries subset %)
        step_rows = []

        # Nodes electrified so far; a set makes the per-edge membership test constant time
        electrified_nodes = set()

        # Iterate through each step of adding the top nodes to the electrification strategy
        for node in top_nodes:
            electrified_nodes.add(node)

            # Routes with at least one electrified endpoint, and the deliveries they carry
            routes_attached_to_top_nodes = set()
            deliveries_attached_to_top_nodes = 0
            for u, v, data in edges:
                if u in electrified_nodes or v in electrified_nodes:
                    routes_attached_to_top_nodes.add((u, v))
                    deliveries_attached_to_top_nodes += data['route_count']

            routes_attached = len(routes_attached_to_top_nodes)
            step_rows.append((
                node,
                routes_attached,
                (routes_attached / total_routes) * 100,
                (routes_attached / total_routes_subset) * 100,
                deliveries_attached_to_top_nodes,
                (deliveries_attached_to_top_nodes / total_deliveries) * 100,
                (deliveries_attached_to_top_nodes / total_deliveries_subset) * 100,
            ))

        # Create the HTML table once, after all steps are known
        html_table = "<table border='1'><tr><th>Node</th><th>Electrified Routes</th><th>% Electrification (Routes) Total</th><th>% Electrification (Routes) Subset</th><th>Electrified Deliveries</th><th>% Electrification (Deliveries) Total</th><th>% Electrification (Deliveries) Subset</th></tr>"
        for (node, routes, routes_percentage, routes_subset_percentage,
             deliveries, deliveries_percentage, deliveries_subset_percentage) in step_rows:
            html_table += f"<tr><td>{node}</td><td>{routes}</td><td>{routes_percentage:.2f}%</td><td>{routes_subset_percentage:.2f}%</td><td>{deliveries}</td><td>{deliveries_percentage:.2f}%</td><td>{deliveries_subset_percentage:.2f}%</td></tr>"
        html_table += "</table>"

        # Save the HTML table to a file (a header-only table when there are no top nodes)
        with open(os.path.join(info_dir, f"{country}_{min_distance}_{max_distance}_pareto_electrification.html"), "w") as f:
            f.write(html_table)


In [62]:
# Run the electrification analysis for every filtered graph.
# NOTE: top_communities_partition is whatever an earlier loop cell left in the notebook
# namespace (the value for the last graph it processed), so this cell only works after
# one of those cells has been run.
electrification_percentage(graphs_dir, info_dir, top_communities_partition)

Processing file: G_Belgium_5_300.graphml
All top_nodes are present in the graph.
Processing file: G_Denmark_5_300.graphml
All top_nodes are present in the graph.


In [63]:
### General Calculations
# Share of all routes (rows of df_routes) and of all deliveries (RouteCount) that depart
# from or arrive in each study country, saved as a small HTML report.

# Define the list of countries
countries = ['United Kingdom', 'Sweden', 'Germany', 'Belgium', 'Netherlands', 'Denmark']

# Initialize the HTML string
html_str = "<html>\n<head></head>\n<body>\n"

# Calculate the percentages for each country
for country in countries:
    departs_from_country = df_routes['FromCountry'] == country
    arrives_in_country = df_routes['ToCountry'] == country
    # Domestic routes satisfy both conditions and must only be counted once
    domestic = departs_from_country & arrives_in_country

    # Calculate the number of routes that depart from or arrive in the country
    depart_routes = df_routes[departs_from_country].shape[0]
    arrive_routes = df_routes[arrives_in_country].shape[0]
    total_routes = depart_routes + arrive_routes - df_routes[domestic].shape[0]

    # Calculate the percentage of routes that depart from or arrive in the country
    route_percent = total_routes / len(df_routes) * 100

    # Calculate the number of deliveries that depart from or arrive in the country
    depart_deliveries = df_routes[departs_from_country]['RouteCount'].sum()
    arrive_deliveries = df_routes[arrives_in_country]['RouteCount'].sum()
    total_deliveries = depart_deliveries + arrive_deliveries - df_routes[domestic]['RouteCount'].sum()

    # Calculate the percentage of deliveries that depart from or arrive in the country
    delivery_percent = total_deliveries / df_routes['RouteCount'].sum() * 100

    # Add the results for the country to the HTML string
    html_str += f"<h3>{country}</h3>\n"
    html_str += f"<p>Route percentage: {route_percent:.2f}%</p>\n"
    html_str += f"<p>Delivery percentage: {delivery_percent:.2f}%</p>\n"
    html_str += "<hr>\n"

# Finalize the HTML string
html_str += "</body>\n</html>"

# Write the HTML string to a file (create the directory if it doesn't exist)
os.makedirs(info_dir, exist_ok=True)

with open(os.path.join(info_dir, "general_info.html"), "w") as f:
    f.write(html_str)