In [17]:
# Importing libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, LineString
import folium
from folium.plugins import MarkerCluster
import community as community_louvain
import leidenalg
import igraph as ig

In [2]:
# Loading dataset and filtering for ... 
# 1) ... deliveries with pick-up and drop off in the same country 
# 2) ... route_distance below 300km

df_final = pd.read_csv("/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/csvs/df_final.csv")
df_final_UK = df_final[(df_final['FromCountry'] == 'United Kingdom') & (df_final['ToCountry'] == 'United Kingdom')]
df_final_UK_300 = df_final_UK[df_final_UK['route_distance'] < 300]

len(df_final_UK_300)

906595

In [3]:
# In this cell we'll find out about the # of occurence of each route to filter out the routes that have been taken the most often 

# Count the occurrences of each route
route_counts = df_final_UK_300.groupby(['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude']).size().reset_index(name='route_count')

# Sort the routes by the number of times they were taken
sorted_routes = route_counts.sort_values('route_count', ascending=False)

# Filter the top 10% routes
n = int(len(sorted_routes) * 0.1)
top_routes = sorted_routes.head(n)

# Merge the top_routes data back into the original DataFrame to receive new column 'route_count'
top_routes = pd.merge(df_final_UK_300, top_routes, on=['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude'])


# --> 'top_routes' will be dataset we'll be moving forward with; it only contains the routes that have been taken the most often, 
# but still on delivery level (example UK: 10% of the routes account for ~87% of the deliveries)

In [13]:
# Create a graph out of the dataframe containing the top 10% most used routes 
G = nx.from_pandas_edgelist(
    top_routes,
    source="FromLocation",
    target="ToLocation",
    edge_attr=["route_distance"],
    create_using=nx.DiGraph(),
)

In [31]:
# Adding node attributes for color coding
for index, row in top_routes.iterrows():
    from_location = row['FromLocation']
    to_location = row['ToLocation']
    route_distance = row['route_distance']
    from_lat = row['FromLatitude']
    from_lon = row['FromLongitude']
    to_lat = row['ToLatitude']
    to_lon = row['ToLongitude']

    G.add_node(from_location, latitude=from_lat, longitude=from_lon, geometry=Point(from_lon, from_lat))
    G.add_node(to_location, latitude=to_lat, longitude=to_lon, geometry=Point(to_lon, to_lat))
    G.add_edge(from_location, to_location, weight=route_distance)

# Calculate node attributes for pick up and drop off rates
pickup_counts = top_routes['FromLocation'].value_counts()
dropoff_counts = top_routes['ToLocation'].value_counts()

for node in G.nodes():
    pickups = pickup_counts.get(node, 0)
    dropoffs = dropoff_counts.get(node, 0)
    total = pickups + dropoffs
    pickup_rate = pickups / total
    dropoff_rate = dropoffs / total
    
    #G.nodes[node]['pickup_rate'] = pickup_rate
   # G.nodes[node]['dropoff_rate'] = dropoff_rate

    #if pickups > dropoffs:
        #G.nodes[node]["color"] = "red"
    #else:
        #G.nodes[node]["color"] = "blue"



In [38]:
(top_routes['route_count']).mean()

12136.209063911545

In [16]:
# Adding edge attributes to identify most used routes

threshold = (top_routes['route_distance']).mean()

for u, v, data in G.edges(data=True):
    num_routes = len(top_routes[(top_routes["FromLocation"] == u) & (top_routes["ToLocation"] == v)])
    data["weight"] = num_routes

    if num_routes >= threshold:  # Define a threshold for frequent routes
        data["color"] = "green"
    else:
        data["color"] = "gray"


In [21]:
# Community Detection with Leiden algorithm

# Convert networkx directed graph to igraph directed graph
ig_graph = ig.Graph.from_networkx(G)

# Apply the Leiden algorithm for community detection
partition = leidenalg.find_partition(ig_graph, leidenalg.ModularityVertexPartition)

# Assign communities to nodes in the networkx graph
for node, community in zip(G.nodes(), partition.membership):
    G.nodes[node]["community"] = community


In [22]:
# Apply centrality measures to find out about the most important nodes within every community:

# Calculate centrality measures
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
degree_centrality = nx.degree_centrality(G)

# Assign centrality measures to nodes
for node in G.nodes():
    G.nodes[node]["betweenness"] = betweenness_centrality[node]
    G.nodes[node]["closeness"] = closeness_centrality[node]
    G.nodes[node]["degree"] = degree_centrality[node]

In [39]:
# Extract nodes with highest betweenness centrality
top_betweenness = sorted(G.nodes(data=True), key=lambda x: x[1]['betweenness'], reverse=True)[:10]
print("Top 10 nodes with highest betweenness centrality: ", [node[0] for node in top_betweenness])

# Extract nodes with highest closeness centrality
top_closeness = sorted(G.nodes(data=True), key=lambda x: x[1]['closeness'], reverse=True)[:10]
print("Top 10 nodes with highest closeness centrality: ", [node[0] for node in top_closeness])

# Extract nodes with highest degree centrality
top_degree = sorted(G.nodes(data=True), key=lambda x: x[1]['degree'], reverse=True)[:10]
print("Top 10 nodes with highest degree centrality: ", [node[0] for node in top_degree])


Top 10 nodes with highest betweenness centrality:  [21143, 9423, 4156, 31163, 5802, 9469, 34263, 239, 30446, 21154]
Top 10 nodes with highest closeness centrality:  [52324, 21090, 21048, 21110, 78450, 78441, 48492, 48520, 56646, 86197]
Top 10 nodes with highest degree centrality:  [9358, 9343, 9342, 9225, 21143, 9405, 9401, 9423, 9227, 9408]


In [44]:
import folium
from folium.plugins import MarkerCluster
import geopandas as gpd
from shapely.geometry import Point, LineString


# Create a GeoDataFrame with nodes and their coordinates
nodes_gdf = gpd.GeoDataFrame(
    [(node, data['latitude'], data['longitude'], data['community'], data['betweenness'], data['closeness'], data['degree']) 
     for node, data in G.nodes(data=True)],
    columns=['node', 'latitude', 'longitude', 'community', 'betweenness', 'closeness', 'degree'])

# Create a geometry column from the latitude and longitude columns
nodes_gdf['geometry'] = nodes_gdf.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

# Create a GeoDataFrame with edges and their weights
edges_gdf = gpd.GeoDataFrame(
    [(u, v, data['weight'], data['color']) for u, v, data in G.edges(data=True)],
    columns=['source', 'target', 'weight', 'color'])

# Create a LineString geometry for each edge
edges_gdf['geometry'] = edges_gdf.apply(lambda row: LineString([nodes_gdf.loc[row['source']]['geometry'].coords[0], nodes_gdf.loc[row['target']]['geometry'].coords[0]]), axis=1)


# Create a Folium map
map_center = nodes_gdf['geometry'].mean().coords[0][::-1]
map_1 = folium.Map(location=map_center, zoom_start=10)

# Create markers for each community
markers = []
for i, row in nodes_gdf.iterrows():
    popup_html = f"""Node: {row['node']}<br>
                    Community: {row['community']}<br>
                    Betweenness Centrality: {row['betweenness']}<br>
                    Closeness Centrality: {row['closeness']}<br>
                    Degree Centrality: {row['degree']}"""
    marker = folium.Marker(location=row['geometry'].coords[0][::-1], tooltip=row['node'], popup=popup_html)
    markers.append(marker)

# Add the markers to the map and add the edges as a layer
MarkerCluster(markers).add_to(map_1)
folium.GeoJson(edges_gdf).add_to(map_1)

# Display the map
map_1





KeyError: 9443