In [21]:
# Importing needed libraries

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import networkx as nx
import numpy as np
import pandas as pd

In [3]:
# Load the prepared bookings table from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine with the same directory layout.
df_final_csv_path = "/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/csvs/df_final.csv"
df_final = pd.read_csv(df_final_csv_path)
df_final.head()

Unnamed: 0,BookingId,SubBookingName,Customer Name,StartRequestedDate,EndRequestedDate,FromLocation,ToLocation,FromLatitude,FromLongitude,FromCity,FromCountry,ToLatitude,ToLongitude,ToCity,ToCountry,DomesticDelivery,route_distance,FullLoadIndicator,EmptyBookingIndicator,GrossWeight,Temperature,frozen_load
0,7399574,A,2966,2020-12-31,2020-12-31,33917,32289,53.51131,-1.1254,YORKSHIRE,United Kingdom,53.60475,-0.65636,SCUNTHORPE,United Kingdom,1,46.346,1.0,0.0,0.0,,0
1,7399575,A,8183,2020-12-31,2020-12-31,9443,57716,53.50446,-2.84867,Liverpool,United Kingdom,53.04612,-2.92787,Wrexham,United Kingdom,1,78.192,1.0,0.0,0.0,,0
2,7399576,A,8183,2020-12-31,2020-12-31,9443,57716,53.50446,-2.84867,Liverpool,United Kingdom,53.04612,-2.92787,Wrexham,United Kingdom,1,78.192,1.0,0.0,0.0,,0
3,7399577,A,4737,2020-12-31,2020-12-31,5977,9831,53.77629,-1.52585,Leeds,United Kingdom,53.71874,-1.41235,YORKSHIRE,United Kingdom,1,17.733,0.0,0.0,0.0,,0
4,7399578,A,4737,2020-12-31,2020-12-31,5977,3182,53.77629,-1.52585,Leeds,United Kingdom,55.87784,-3.65235,Bathgate,United Kingdom,1,369.34,1.0,0.0,0.0,,0


In [23]:
# Keep only bookings whose origin AND destination are both in the United Kingdom.
starts_in_uk = df_final['FromCountry'] == 'United Kingdom'
ends_in_uk = df_final['ToCountry'] == 'United Kingdom'
df_final_UK = df_final[starts_in_uk & ends_in_uk]

len(df_final_UK)

1482429

In [24]:
# Keep only UK bookings whose route_distance is strictly below 300
# (rows with a missing route_distance are dropped by the comparison).
is_short_route = df_final_UK['route_distance'].lt(300)
df_final_UK_300 = df_final_UK.loc[is_short_route]

len(df_final_UK_300)

906595

In [86]:
# Trying folium library

import pandas as pd
import networkx as nx
import folium
from matplotlib import colors, cm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


# Create a directed graph with one node per location and one edge per
# (FromLocation, ToLocation) route.
G = nx.DiGraph()

# Columns needed to build the graph, in the order they are unpacked below.
route_columns = [
    'FromLocation', 'ToLocation', 'route_distance',
    'FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude',
]

# Add nodes and edges to the graph.
# itertuples() over only the required columns yields plain tuples and avoids
# building a pandas Series for every row, which is what made iterrows() slow
# on this many bookings. Rows are still visited in DataFrame order, so when a
# location or route occurs more than once the attributes of the last
# occurrence win, exactly as before.
for (from_location, to_location, route_distance,
     from_lat, from_lon, to_lat, to_lon) in df_final_UK_300[route_columns].itertuples(index=False, name=None):
    G.add_node(from_location, latitude=from_lat, longitude=from_lon)
    G.add_node(to_location, latitude=to_lat, longitude=to_lon)
    G.add_edge(from_location, to_location, weight=route_distance)

# Attach pickup/drop-off shares to every node of the graph.
# A location's "pickups" are the bookings that start there, its "dropoffs"
# the bookings that end there; the rates are each count divided by their sum.
pickup_counts = df_final_UK_300['FromLocation'].value_counts()
dropoff_counts = df_final_UK_300['ToLocation'].value_counts()

for node, node_attrs in G.nodes(data=True):
    pickups = pickup_counts.get(node, 0)
    dropoffs = dropoff_counts.get(node, 0)
    total = pickups + dropoffs

    # node_attrs is the live attribute dict of the node, so updating it
    # stores the values on the graph itself.
    node_attrs.update(
        pickup_rate=pickups / total,
        dropoff_rate=dropoffs / total,
        size=total,
    )

# Create a function to map node colors based on pickup and dropoff rates
def get_node_color(pickup_rate, dropoff_rate):
    """Return a hex color encoding whether a node is pickup- or dropoff-heavy.

    Parameters
    ----------
    pickup_rate : float
        Share of the node's bookings that start at the node.
    dropoff_rate : float
        Share of the node's bookings that end at the node.

    Returns
    -------
    str
        A ``#rrggbb`` string. Nodes with ``pickup_rate > dropoff_rate`` are
        shaded on the 'Reds' colormap by ``pickup_rate``; all other nodes
        (including ties) are shaded on the 'Blues' colormap by
        ``dropoff_rate``.
    """
    # plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    if pickup_rate > dropoff_rate:
        return colors.to_hex(plt.get_cmap('Reds')(pickup_rate))
    return colors.to_hex(plt.get_cmap('Blues')(dropoff_rate))

# Create a map centered at the average pickup latitude and longitude
map_center = df_final_UK_300[['FromLatitude', 'FromLongitude']].mean().tolist()
m = folium.Map(location=map_center, zoom_start=5, tiles="CartoDB Positron")

# Count the number of times each (FromLocation, ToLocation) route appears in
# the data. The grouped Series is computed once and reused for both the
# lookup dictionary and the ranking below.
route_sizes = df_final_UK_300.groupby(['FromLocation', 'ToLocation']).size()
route_counts = route_sizes.to_dict()

# Select the top 5% most used routes
threshold = int(len(route_counts) * 0.05)
top_routes = route_sizes.nlargest(threshold).index.tolist()

# Set view of the top routes so the per-edge membership test in the drawing
# loop is a hash lookup rather than a scan of the whole list.
top_route_set = set(top_routes)

# Create a set to store nodes that are part of the top 5% routes
top_route_nodes = {node for route in top_routes for node in route}

# Custom colormap for edges: light blue (rarely used) to dark blue (heavily used)
edge_colormap = mcolors.LinearSegmentedColormap.from_list("custom_cmap", ["lightblue", "darkblue"])

# Normalize the edge weights based on the minimum and maximum route counts
min_edge_weight = min(route_counts.values())
max_edge_weight = max(route_counts.values())

# Logarithmic normalization, because route usage counts are heavily skewed
norm = mcolors.LogNorm(vmin=min_edge_weight, vmax=max_edge_weight)

# Draw every node that takes part in a top route, followed by its outgoing
# top routes.
for node in G.nodes():
    if node not in top_route_nodes:
        continue

    latitude = G.nodes[node]['latitude']
    longitude = G.nodes[node]['longitude']
    pickup_rate = G.nodes[node]['pickup_rate']
    dropoff_rate = G.nodes[node]['dropoff_rate']
    node_color = get_node_color(pickup_rate, dropoff_rate)

    folium.CircleMarker(
        location=[latitude, longitude],
        radius=0.25,
        color=node_color,
        fill=True,
        fill_color=node_color,
        fill_opacity=0.7,
        popup=f"{node}: {pickup_rate:.2f}"
    ).add_to(m)

    for to_node in G.successors(node):
        # Only routes in the top 5% are drawn; membership in top_route_set
        # already implies that to_node is one of the top-route nodes.
        if (node, to_node) not in top_route_set:
            continue

        edge_weight = route_counts[(node, to_node)]
        to_location = [G.nodes[to_node]['latitude'], G.nodes[to_node]['longitude']]

        # Calculate the edge color based on the normalized route count
        edge_color = edge_colormap(norm(edge_weight))

        folium.PolyLine(
            locations=[[latitude, longitude], to_location],
            color=mcolors.to_hex(edge_color),
            weight=1,
            opacity=0.5
        ).add_to(m)


# Save the map to an HTML file
m.save('map_uk_300.html')




In [87]:
# Report how many locations (nodes) and distinct routes (edges) the graph holds
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

print(f"The graph has {num_nodes} nodes and {num_edges} edges.")

The graph has 14507 nodes and 24901 edges.


In [None]:
# Folium Library + Grouped nodes 

import pandas as pd
import networkx as nx
import folium
from matplotlib import colors, cm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.cluster import DBSCAN
from haversine import haversine, Unit



# Create a directed graph with one node per location and one edge per
# (FromLocation, ToLocation) route.
G = nx.DiGraph()

# Columns needed to build the graph, in the order they are unpacked below.
route_columns = [
    'FromLocation', 'ToLocation', 'route_distance',
    'FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude',
]

# Add nodes and edges to the graph.
# itertuples() over only the required columns yields plain tuples and avoids
# building a pandas Series for every row, which is what made iterrows() slow
# on this many bookings. Rows are still visited in DataFrame order, so when a
# location or route occurs more than once the attributes of the last
# occurrence win, exactly as before.
for (from_location, to_location, route_distance,
     from_lat, from_lon, to_lat, to_lon) in df_final_UK_300[route_columns].itertuples(index=False, name=None):
    G.add_node(from_location, latitude=from_lat, longitude=from_lon)
    G.add_node(to_location, latitude=to_lat, longitude=to_lon)
    G.add_edge(from_location, to_location, weight=route_distance)

# Number of bookings starting at / ending at each location
pickup_counts = df_final_UK_300['FromLocation'].value_counts()
dropoff_counts = df_final_UK_300['ToLocation'].value_counts()

### Grouping Nodes 

# Great-circle distance helper
def haversine_distance_km(p1, p2):
    """Return the haversine distance between two points in kilometres.

    ``p1`` and ``p2`` are forwarded unchanged to ``haversine``; the caller in
    this notebook passes them as ``(latitude, longitude)`` pairs.
    """
    distance_km = haversine(p1, p2, unit=Unit.KILOMETERS)
    return distance_km

# Group nodes together within a specified radius
radius = 10  # Radius in kilometers
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius, used to convert km to radians

# Fix the node order once; `clusters` below is aligned with this list.
node_list = list(G.nodes)
node_positions = [(G.nodes[node]['latitude'], G.nodes[node]['longitude']) for node in node_list]

# Cluster on geographic distance.
# The previous version fed nx.floyd_warshall_numpy into DBSCAN, but that
# function returns shortest-path lengths along graph edges (not geographic
# distances), expects `weight` to be an edge-attribute name rather than a
# callable, and needs a dense all-pairs matrix that does not scale to this
# many nodes. scikit-learn's haversine metric works directly on
# (latitude, longitude) pairs expressed in radians and measures distance on
# the unit sphere, so eps is the radius divided by the Earth's radius.
# NOTE(review): with min_samples=1 every node is a core point, so clusters
# are chains of nodes each within `radius` of a neighbour and can span more
# than `radius` end to end.
coordinates_rad = np.radians(np.asarray(node_positions, dtype=float))
dbscan = DBSCAN(
    eps=radius / EARTH_RADIUS_KM,
    min_samples=1,
    metric='haversine',
    algorithm='ball_tree',
)
clusters = dbscan.fit_predict(coordinates_rad)

# Create a new graph to store the grouped nodes and edges
G_grouped = nx.DiGraph()

# Add one node per cluster, positioned at the first member encountered
for node, cluster_label in zip(node_list, clusters):
    if cluster_label not in G_grouped:
        G_grouped.add_node(
            cluster_label,
            latitude=G.nodes[node]['latitude'],
            longitude=G.nodes[node]['longitude'],
        )

# Aggregate edges between different clusters by summing their weights.
# Iterating over the existing edges visits each route once instead of testing
# every possible pair of nodes.
node_to_cluster = dict(zip(node_list, clusters))
for node1, node2, edge_weight in G.edges(data='weight'):
    cluster1 = node_to_cluster[node1]
    cluster2 = node_to_cluster[node2]
    if cluster1 == cluster2:
        continue  # routes inside a single cluster are not drawn as edges
    if G_grouped.has_edge(cluster1, cluster2):
        G_grouped.edges[cluster1, cluster2]['weight'] += edge_weight
    else:
        G_grouped.add_edge(cluster1, cluster2, weight=edge_weight)

# Pickup/drop-off shares for the grouped graph.
# G_grouped's nodes are cluster labels, whereas pickup_counts and
# dropoff_counts are indexed by location id, so the per-location counts must
# be summed per cluster before they can be looked up by cluster label.
# `clusters` is aligned with the iteration order of G.nodes.
location_to_cluster = dict(zip(G.nodes, clusters))

cluster_pickups = {}
for location, count in pickup_counts.items():
    cluster_label = location_to_cluster[location]
    cluster_pickups[cluster_label] = cluster_pickups.get(cluster_label, 0) + int(count)

cluster_dropoffs = {}
for location, count in dropoff_counts.items():
    cluster_label = location_to_cluster[location]
    cluster_dropoffs[cluster_label] = cluster_dropoffs.get(cluster_label, 0) + int(count)

for node in G_grouped.nodes():
    pickups = cluster_pickups.get(node, 0)
    dropoffs = cluster_dropoffs.get(node, 0)
    total = pickups + dropoffs
    # Guard against a cluster without any bookings instead of dividing by zero
    pickup_rate = pickups / total if total else 0.0
    dropoff_rate = dropoffs / total if total else 0.0

    G_grouped.nodes[node]['pickup_rate'] = pickup_rate
    G_grouped.nodes[node]['dropoff_rate'] = dropoff_rate
    G_grouped.nodes[node]['size'] = total

# Create a function to map node colors based on pickup and dropoff rates
def get_node_color(pickup_rate, dropoff_rate):
    """Return a hex color encoding whether a node is pickup- or dropoff-heavy.

    Parameters
    ----------
    pickup_rate : float
        Share of the node's bookings that start at the node.
    dropoff_rate : float
        Share of the node's bookings that end at the node.

    Returns
    -------
    str
        A ``#rrggbb`` string. Nodes with ``pickup_rate > dropoff_rate`` are
        shaded on the 'Reds' colormap by ``pickup_rate``; all other nodes
        (including ties) are shaded on the 'Blues' colormap by
        ``dropoff_rate``.
    """
    # plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    if pickup_rate > dropoff_rate:
        return colors.to_hex(plt.get_cmap('Reds')(pickup_rate))
    return colors.to_hex(plt.get_cmap('Blues')(dropoff_rate))

# Create a map centered at the average pickup latitude and longitude
map_center = df_final_UK_300[['FromLatitude', 'FromLongitude']].mean().tolist()
m = folium.Map(location=map_center, zoom_start=5, tiles="CartoDB Positron")

# G_grouped's nodes are cluster labels, so route usage has to be counted per
# (from cluster, to cluster) pair rather than per pair of location ids.
# `clusters` is aligned with the iteration order of G.nodes.
location_to_cluster = dict(zip(G.nodes, clusters))

location_route_sizes = df_final_UK_300.groupby(['FromLocation', 'ToLocation']).size()
route_counts = {}
for (src_location, dst_location), count in location_route_sizes.items():
    src_cluster = location_to_cluster[src_location]
    dst_cluster = location_to_cluster[dst_location]
    if src_cluster == dst_cluster:
        continue  # routes inside one cluster have no edge in G_grouped
    cluster_route = (src_cluster, dst_cluster)
    route_counts[cluster_route] = route_counts.get(cluster_route, 0) + int(count)

# Select the top 5% most used cluster-to-cluster routes
threshold = int(len(route_counts) * 0.05)
top_routes = sorted(route_counts, key=route_counts.get, reverse=True)[:threshold]

# Set view of the top routes so the per-edge membership test in the drawing
# loop is a hash lookup rather than a scan of the whole list.
top_route_set = set(top_routes)

# Create a set to store clusters that are part of the top 5% routes
top_route_nodes = {node for route in top_routes for node in route}

# Custom colormap for edges: light blue (rarely used) to dark blue (heavily used)
edge_colormap = mcolors.LinearSegmentedColormap.from_list("custom_cmap", ["lightblue", "darkblue"])

# Normalize the edge weights based on the minimum and maximum route counts
# (the default only applies when no inter-cluster route exists at all)
min_edge_weight = min(route_counts.values(), default=1)
max_edge_weight = max(route_counts.values(), default=1)

# Logarithmic normalization, because route usage counts are heavily skewed
norm = mcolors.LogNorm(vmin=min_edge_weight, vmax=max_edge_weight)

# Draw every cluster that takes part in a top route, followed by its outgoing
# top routes.
for node in G_grouped.nodes():
    if node not in top_route_nodes:
        continue

    latitude = G_grouped.nodes[node]['latitude']
    longitude = G_grouped.nodes[node]['longitude']
    pickup_rate = G_grouped.nodes[node]['pickup_rate']
    dropoff_rate = G_grouped.nodes[node]['dropoff_rate']
    node_color = get_node_color(pickup_rate, dropoff_rate)

    folium.CircleMarker(
        location=[latitude, longitude],
        radius=0.25,
        color=node_color,
        fill=True,
        fill_color=node_color,
        fill_opacity=0.7,
        popup=f"{node}: {pickup_rate:.2f}"
    ).add_to(m)

    for to_node in G_grouped.successors(node):
        # Only routes in the top 5% are drawn; membership in top_route_set
        # already implies that to_node is one of the top-route clusters.
        if (node, to_node) not in top_route_set:
            continue

        edge_weight = route_counts[(node, to_node)]
        to_location = [G_grouped.nodes[to_node]['latitude'], G_grouped.nodes[to_node]['longitude']]

        # Calculate the edge color based on the normalized route count
        edge_color = edge_colormap(norm(edge_weight))

        folium.PolyLine(
            locations=[[latitude, longitude], to_location],
            color=mcolors.to_hex(edge_color),
            weight=1,
            opacity=0.5
        ).add_to(m)


# Save the map to an HTML file
# NOTE(review): this is the same file name the ungrouped map is saved under,
# so running this cell overwrites that map - confirm this is intended.
m.save('map_uk_300.html')