In [80]:
# Importing libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors


import math

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import igraph as ig

import folium
from folium import PolyLine, CircleMarker
from folium.plugins import MarkerCluster
from shapely.geometry import Point, LineString
from pyvis.network import Network

from community import community_louvain
import matplotlib.cm as cm
import leidenalg


ImportError: cannot import name 'community_louvain' from 'community' (/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/community/__init__.py)

In [2]:
# Count how many routes are in df_final_UK_300
# Create df with unique routes 
# Check if .nunique() & len(routes) are the same 

# Then build graph based on unique routes df 

In [3]:
### 1) Prepare dataset for Graph building

# Loading dataset

df_final = pd.read_csv("/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/csvs/df_final.csv")
coordinates_df = pd.read_csv("/Users/jangaydoul/Desktop/Copenhagen Business School/4. Semester :: Thesis/03_Data/csvs/df_graph.csv")

In [4]:
print(len(coordinates_df)) #162,112
print(len(df_final)) #4,973,604

162112
4973604


In [6]:
df_final['FromLocation'].nunique() #37,994
df_final['ToLocation'].nunique() #87,093

37994

In [7]:
# Group the data by pick-up and drop-off points
grouped = df_final.groupby(['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude'])

# Count the number of unique groups
unique_routes = len(grouped)

# Print the result
print("Number of unique routes:", unique_routes)

Number of unique routes: 162113


In [8]:
# Drop duplicates from df_final based on the four coordinate variables
df_final_unique = df_final.drop_duplicates(subset=['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude'])

# Select the columns to merge from df_final
columns_to_merge = ['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude',
                    'FromLocation', 'ToLocation', 'FromCity', 'ToCity', 'FromCountry', 'ToCountry', 'FullLoadIndicator']

# Merge the two dataframes on the four coordinate variables
df_graph = pd.merge(coordinates_df, df_final_unique[columns_to_merge], on=['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude'])


In [9]:
# Count the frequency of each unique route in df_final
route_frequency = df_final.groupby(['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude']).size().reset_index(name='route_count')

# Merge the frequency information with merged_df
df_graph = pd.merge(df_graph, route_frequency, on=['FromLatitude', 'FromLongitude', 'ToLatitude', 'ToLongitude'])


In [10]:
df_graph.head()

Unnamed: 0,FromLatitude,FromLongitude,ToLatitude,ToLongitude,route_distance,FromLocation,ToLocation,FromCity,ToCity,FromCountry,ToCountry,FullLoadIndicator,route_count
0,50.63656,4.78251,63.44178,10.40893,2004.311,17287,42342,Perwez,Trondheim,Belgium,Norway,1.0,17
1,56.20318,14.8734,57.42278,15.06204,160.384,14689,67460,Karlshamn,Vetlanda,Sweden,Sweden,0.0,3
2,57.73572,11.98862,56.16157,15.58364,376.025,25161,27919,Göteborg,Karlskrona,Sweden,Sweden,0.0,5
3,60.71076,10.60526,51.00683,-0.42949,1991.237,21503,28913,Raufoss,BILLINGSHURST,Norway,United Kingdom,1.0,2
4,57.73572,11.98862,56.21754,15.27024,348.243,25161,30402,Göteborg,Ronneby,Sweden,Sweden,0.0,2


In [11]:
df_graph['route_count'].sum()

4973250

In [13]:
len(df_graph)

162112

In [14]:
### 1.1) Filtering for Country (UK in this case) and distance < 300km 

df_graph_UK = df_graph[(df_graph['FromCountry'] == 'United Kingdom') & (df_graph['ToCountry'] == 'United Kingdom')]
df_graph_UK_300 = df_graph_UK[df_graph_UK['route_distance'] < 300]

len(df_graph_UK_300)

19720

In [56]:
df_graph_UK_300['route_distance'].describe()

count    19720.000000
mean       163.646162
std         80.758959
min         -1.000000
25%        107.263500
50%        174.570000
75%        227.372250
max        299.908000
Name: route_distance, dtype: float64

In [75]:
### 2) Building Graph

### BASELINE FOR FURTHER CODING

# Build the empty directed graph
G = nx.DiGraph()

# Filter the top 2.5% most often used routes and add them as edges to the graph
top_routes = df_graph_UK_300.nlargest(int(len(df_graph_UK_300) * 0.025), 'route_count')

for idx, row in top_routes.iterrows():
    G.add_edge((row['FromLatitude'], row['FromLongitude']),
               (row['ToLatitude'], row['ToLongitude']),
               route_count=row['route_count'])

# Calculate the traffic for each node
node_traffic = {}
for edge in G.edges(data=True):
    start, end, data = edge
    route_count = data['route_count']
    
    if start not in node_traffic:
        node_traffic[start] = 0
    if end not in node_traffic:
        node_traffic[end] = 0
    
    node_traffic[start] += route_count
    node_traffic[end] += route_count

# Normalize the node traffic values
max_node_traffic = max(node_traffic.values())
node_traffic_normalized = {node: math.log(traffic + 1) / math.log(max_node_traffic + 1) for node, traffic in node_traffic.items()}

def add_nodes_edges_to_map(graph, folium_map, node_traffic_normalized):
    max_route_count = max(data['route_count'] for _, _, data in graph.edges(data=True))
    
    for edge in graph.edges(data=True):
        start, end, data = edge
        
        # Set the color intensity based on the logarithmic scale of the node traffic
        start_intensity = node_traffic_normalized[start]
        end_intensity = node_traffic_normalized[end]
        
        # Get the color from the color gradient
        start_color = plt.cm.get_cmap('Blues')(start_intensity)
        end_color = plt.cm.get_cmap('Blues')(end_intensity)
        
        start_color = matplotlib.colors.to_hex(start_color)
        end_color = matplotlib.colors.to_hex(end_color)
        
        start_node_marker = CircleMarker(location=start, radius=0.5, color=start_color, fill=True, fill_opacity=0.5)
        end_node_marker = CircleMarker(location=end, radius=0.5, color=end_color, fill=True, fill_opacity=0.5)
        
        intensity = math.log(data['route_count'] + 1) / math.log(max_route_count + 1)
        edge_color = plt.cm.get_cmap('coolwarm')(intensity)
        edge_color = matplotlib.colors.to_hex(edge_color)
        
        polyline = PolyLine(locations=[start, end], color=edge_color, weight=1)
        
        folium_map.add_child(start_node_marker)
        folium_map.add_child(end_node_marker)
        folium_map.add_child(polyline)

# Create the folium map centered based on the coordinates and add nodes and edges to the map
map_center = df_graph_UK_300[['FromLatitude', 'FromLongitude']].mean().tolist()
map_UK = folium.Map(location=map_center, zoom_start=5, tiles='CartoDB Positron')
add_nodes_edges_to_map(G, map_UK, node_traffic_normalized)

# Save the map 
map_UK.save('UK_map.html')

In [76]:
# Counting number of nodes and edges

num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 432
Number of edges: 493


In [78]:
### 3) Detecting Communities

# Louvain algorithm returns a dictionary, where the keys represent the nodes and the values represent the community each node belongs to 
partition = community_louvain.best_partition(G)


AttributeError: module 'community' has no attribute 'best_partition'