In [1]:
from coordgenerator import generate_random_coordinates_df
from resolutionselector import select_resolution
from pyspark.sql import SparkSession
import h3
from pyspark.sql.functions import udf, monotonically_increasing_id, explode, collect_list, col
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import expr, posexplode, radians, sin, cos, sqrt, atan2, lit
import random
import math
import numpy as np
from itertools import combinations
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType

# Initialize Spark session
spark = SparkSession.builder.appName("RandomCoordinates")\
    .getOrCreate()

# Settings
distance = 5
num_points = 10000

resolution = select_resolution(distance)
print(f"The selected resolution for a distance of {distance} km is: {resolution}")


25/02/25 22:50:45 WARN Utils: Your hostname, Quintens-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
25/02/25 22:50:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/25 22:50:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


The selected resolution for a distance of 5 km is: 6


## Data Prep

In [2]:
%%time

# Generate random coordinates DataFrame (assuming this function is defined)
print(f'There are {math.comb(num_points, 2)} combos of points to calculate')
min_lat, min_lon = -25.86653, 26.74617
max_lat, max_lon = 49.65699, 70.25976

random_coordinates = [
    (_, random.uniform(min_lat, max_lat), random.uniform(min_lon, max_lon))
    for _ in range(num_points)
]

# Define schema
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

# Create DataFrame
coordinates_df = spark.createDataFrame(random_coordinates, schema)
coordinates_df.cache()


There are 49995000 combos of points to calculate
CPU times: user 21.4 ms, sys: 2.96 ms, total: 24.4 ms
Wall time: 1.03 s


DataFrame[id: int, latitude: double, longitude: double]

In [3]:
%%time
# Define UDF to convert latitude and longitude to H3 index
def lat_lon_to_h3(lat, lon, resolution):
    return h3.latlng_to_cell(lat, lon, resolution)

def grid_disk_k1(cell):
    return h3.grid_disk(cell, k=1)

# Register UDF
lat_lon_to_h3_udf = udf(lat_lon_to_h3, StringType())
grid_disk_k1_udf = udf(grid_disk_k1, ArrayType(StringType()))

# Add H3 column to DataFrame
coordinates_df = coordinates_df.withColumn("h3_index", lat_lon_to_h3_udf(coordinates_df["latitude"], coordinates_df["longitude"], lit(resolution)))
coordinates_df = coordinates_df.withColumn("h3_neighbours", grid_disk_k1_udf(coordinates_df["h3_index"]))

# Explode the 'h3_neighbours' column
exploded_df = coordinates_df.withColumn("h3_neighbour", explode(coordinates_df["h3_neighbours"]))

# Group by 'h3_neighbour' and collect list of IDs
grouped_df = exploded_df.groupBy("h3_neighbour").agg(collect_list("id").alias("id_list"))

# Filter groups with more than one ID and create combinations
combinations_df = grouped_df.filter(expr("size(id_list) > 1"))

# Define UDF to generate combinations
def generate_combinations(id_list):
    return list(combinations(id_list, 2))

combination_schema = ArrayType(StructType([
    StructField("ID1", StringType(), False),
    StructField("ID2", StringType(), False)
]))

generate_combinations_udf = udf(generate_combinations, combination_schema)

# Generate combinations and explode
combinations_df = grouped_df.withColumn("combinations", explode(generate_combinations_udf(grouped_df["id_list"])))

# Select and rename columns
combinations_df = combinations_df.select(
    combinations_df["combinations.ID1"].cast("string"),
    combinations_df["combinations.ID2"].cast("string")
)

CPU times: user 8.49 ms, sys: 4.33 ms, total: 12.8 ms
Wall time: 221 ms


# Self-join to create combinations
combinations_df = posexploded_df.alias("a").join(
    posexploded_df.alias("b"),
    (col("a.h3_neighbour") == col("b.h3_neighbour")) & (col("a.pos") < col("b.pos")),
    "inner"
).select(
    col("a.col").alias("ID1"),
    col("b.col").alias("ID2")
).filter(col("ID1") != col("ID2"))

df1 = posexploded_df.toPandas()


In [4]:
coordinates_df1 = coordinates_df.withColumnRenamed('id', 'ID1')\
    .withColumnRenamed('latitude', 'lat1')\
    .withColumnRenamed('longitude', 'lon1')\
    .select('ID1', 'lat1', 'lon1')

coordinates_df2 = coordinates_df.withColumnRenamed('id', 'ID2')\
    .withColumnRenamed('latitude', 'lat2')\
    .withColumnRenamed('longitude', 'lon2')\
    .select('ID2', 'lat2', 'lon2')

combinations_df1 = combinations_df.join(
    coordinates_df1,
    combinations_df.ID1 == coordinates_df1.ID1,
    "left"
)

combinations_final = combinations_df1.join(
    coordinates_df2,
    combinations_df1.ID2 == coordinates_df2.ID2,
    "left"
)

print(combinations_final.count())


                                                                                

2532


In [5]:

# Calculate Haversine distance
earth_radius_km = 6371.0

combinations_filtered = combinations_final.withColumn(
    "distance_km",
    2 * earth_radius_km * atan2(
        sqrt(
            (sin(radians(col("lat2")) - radians(col("lat1"))) / 2)**2 +
            cos(radians(col("lat1"))) * cos(radians(col("lat2"))) *
            (sin(radians(col("lon2")) - radians(col("lon1"))) / 2)**2
        ),
        sqrt(1 - (
            (sin(radians(col("lat2")) - radians(col("lat1"))) / 2)**2 +
            cos(radians(col("lat1"))) * cos(radians(col("lat2"))) *
            (sin(radians(col("lon2")) - radians(col("lon1"))) / 2)**2
        ))
    )
)

25/02/25 22:50:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [8]:
# Assuming combinations_filtered is your DataFrame
filtered_df = combinations_filtered.filter(col('distance_km') <= lit(5))

# Show the result
result = filtered_df.toPandas()

In [15]:
result = result.drop_duplicates()

In [19]:
result

Unnamed: 0,ID1,ID2,ID1.1,lat1,lon1,ID2.1,lat2,lon2,distance_km
0,5717,5912,5717,42.644089,67.604921,5912,42.613841,67.623255,3.682656
1,272,2399,272,47.106743,59.709476,2399,47.077581,59.665841,4.628949
8,2122,6612,2122,42.750976,59.988563,6612,42.719121,59.986514,3.546035
15,1364,6850,1364,45.570782,64.467730,6850,45.530379,64.484392,4.676133
19,150,308,150,44.231257,63.971191,308,44.252806,63.974016,2.406611
...,...,...,...,...,...,...,...,...,...
623,2221,5864,2221,-16.668124,68.529431,5864,-16.640283,68.499944,4.410336
630,729,4135,729,-17.071971,62.121634,4135,-17.041750,62.109644,3.594065
634,556,1437,556,-24.283514,37.217519,1437,-24.306630,37.244247,3.734258
638,1947,3156,1947,-25.467807,31.011293,3156,-25.472502,31.025279,1.498044


In [17]:
result.distance_km.max()

4.967901618131541

In [11]:
%%time
import numpy as np
from scipy.spatial import KDTree
import pandas as pd

# Function to convert latitude and longitude to Cartesian coordinates
def latlon_to_cartesian(lat, lon, R=6371.0):
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    x = R * np.cos(lat_rad) * np.cos(lon_rad)
    y = R * np.cos(lat_rad) * np.sin(lon_rad)
    z = R * np.sin(lat_rad)
    return x, y, z

# Function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2, R=6371.0):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

# generate data

_, latitudes , longitudes = zip(*random_coordinates) 

# Convert to Cartesian coordinates
x, y, z = latlon_to_cartesian(latitudes, longitudes)
cartesian_coords = np.vstack((x, y, z)).T

# Build KD-Tree
tree = KDTree(cartesian_coords)

# Query the tree for pairs within 5 km
pairs = tree.query_ball_tree(tree, r=5.0)

# Collect pairs of points within 5 km and calculate distances
close_pairs = []
for i, neighbors in enumerate(pairs):
    for j in neighbors:
        if i < j:  # Ensure each pair is unique
            distance = haversine(latitudes[i], longitudes[i], latitudes[j], longitudes[j])
            close_pairs.append((i, j, distance))

# Create a DataFrame to display the results
results = []
for idx1, idx2, distance in close_pairs:
    results.append({
        "ID1": idx1,
        "ID2": idx2,
        "lat1": latitudes[idx1],
        "lon1": longitudes[idx1],
        "lat2": latitudes[idx2],
        "lon2": longitudes[idx2],
        "distance_km": distance
    })

df = pd.DataFrame(results)
print(df)

      ID1   ID2       lat1       lon1       lat2       lon2  distance_km
0      47  6910 -17.647175  32.514896 -17.665524  32.488072     3.498723
1      70  2391  35.807396  50.076699  35.823026  50.081246     1.785751
2     111  3674  47.662532  36.898429  47.621662  36.905420     4.574572
3     119  2013  -8.228947  57.203200  -8.215842  57.226930     2.990459
4     130  4595  12.497893  57.280299  12.463571  57.306346     4.749940
..    ...   ...        ...        ...        ...        ...          ...
118  8078  9908  34.953270  41.991642  34.909057  41.999321     4.965843
119  8257  8746  30.098598  58.490214  30.067345  58.517623     4.362559
120  8663  9214 -14.790933  31.706412 -14.774200  31.733506     3.456519
121  9146  9151 -11.592742  37.390043 -11.615392  37.382562     2.647119
122  9307  9850   4.369673  61.658412   4.353718  61.618784     4.738310

[123 rows x 7 columns]
CPU times: user 19 ms, sys: 1.89 ms, total: 20.9 ms
Wall time: 19.9 ms


In [18]:
df.distance_km.max()

4.967902016999107

In [None]:
4.967901618131541

In [16]:
df.groupby(['ID1', 'ID2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat1,lon1,lat2,lon2,distance_km
ID1,ID2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,34359774511,4,4,4,4,4
0,34359776062,4,4,4,4,4
1,77309461461,7,7,7,7,7
2,51539673302,4,4,4,4,4
2,68719573184,4,4,4,4,4
...,...,...,...,...,...,...
77309511229,77309496802,6,6,6,6,6
77309511230,25769807370,3,3,3,3,3
77309511230,25769860153,2,2,2,2,2
77309511230,77309508699,3,3,3,3,3


In [12]:
df_spark = combinations_with_coords_df.toPandas()

25/02/25 21:14:02 WARN TaskSetManager: Stage 40 contains a task of very large size (1951 KiB). The maximum recommended task size is 1000 KiB.
25/02/25 21:14:04 WARN TaskSetManager: Stage 41 contains a task of very large size (1951 KiB). The maximum recommended task size is 1000 KiB.
25/02/25 21:14:08 WARN TaskSetManager: Stage 42 contains a task of very large size (1951 KiB). The maximum recommended task size is 1000 KiB.
25/02/25 21:14:08 WARN TaskSetManager: Stage 43 contains a task of very large size (1951 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [5]:
import plotly.express as px 

In [6]:
px.histogram(df, x = 'distance_km')

In [25]:
coordinates_df.show()

+------------------+-----------------+---------------+--------------------+
|          latitude|        longitude|       h3_index|       h3_neighbours|
+------------------+-----------------+---------------+--------------------+
|50.843179677131126|4.398299691722763|891fa44e69bffff|[891fa44e69bffff,...|
|50.830293351691324|4.281895732206881|891fa44134bffff|[891fa44134bffff,...|
| 50.89328489172853| 4.30124550560784|891fa440607ffff|[891fa440607ffff,...|
| 50.87738152420777|4.250578511597902|891fa441497ffff|[891fa441497ffff,...|
| 50.82143090662322|4.305432395662898|891fa44ad57ffff|[891fa44ad57ffff,...|
| 50.84476970807411|4.343533754090476|891fa441853ffff|[891fa441853ffff,...|
|50.899389345798184|4.292722718417278|891fa440683ffff|[891fa440683ffff,...|
|  50.7610585766942|4.358594565497318|891fa448907ffff|[891fa448907ffff,...|
| 50.80388274752319|4.271307419948896|891fa44aeabffff|[891fa44aeabffff,...|
| 50.84073518235208|4.344546229450199|891fa44185bffff|[891fa44185bffff,...|
|50.82373502

In [None]:
!conda install -n .conda ipykernel --update-deps --force-reinstall

: 