In [1]:
from resolutionselector import select_resolution
from pyspark.sql import SparkSession
import h3
from pyspark.sql.functions import udf, monotonically_increasing_id, explode, collect_list, col
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import expr, posexplode, radians, sin, cos, sqrt, atan2, lit
import random
import math
import numpy as np
from itertools import combinations
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
import pyspark.sql.functions as F

# Initialize Spark session
spark = SparkSession.builder.appName("RandomCoordinates")\
    .getOrCreate()

# Settings
distance_km = 5
num_points = 100000

# Select resolution
resolution = select_resolution(distance_km)
print(f"The selected resolution for a distance of {distance_km} km is: {resolution}")

resolution = 5

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/04 20:37:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


The selected resolution for a distance of 5 km is: 6


In [12]:
%%time
# Generate random coordinates DataFrame (assuming this function is defined)
print(f'There are {math.comb(num_points, 2)} combos of points to calculate')
min_lat, min_lon = -25.86653, 26.74617
max_lat, max_lon = 49.65699, 70.25976

random_coords = [
    (_, random.uniform(min_lat, max_lat), random.uniform(min_lon, max_lon))
    for _ in range(num_points)
]

# Define schema
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

# Create DataFrame
coords_df = spark.createDataFrame(random_coords, schema)

# Define UDF to convert latitude and longitude to H3 index
def lat_lon_to_h3(lat, lon, resolution):
    return h3.latlng_to_cell(lat, lon, resolution)

def grid_disk_k1(cell):
    return h3.grid_disk(cell, k=1)

# Register UDF
lat_lon_to_h3_udf = udf(lat_lon_to_h3, StringType())
grid_disk_k1_udf = udf(grid_disk_k1, ArrayType(StringType()))

# Add H3 column to DataFrame
coords_df = coords_df.withColumn("h3_index", lat_lon_to_h3_udf(coords_df["latitude"], coords_df["longitude"], lit(resolution)))
coords_df = coords_df.withColumn("h3_neighbours", grid_disk_k1_udf(coords_df["h3_index"]))

# Explode the 'h3_neighbours' column, note: h3 neighbours contains h3_index as well
exploded_df = coords_df.withColumn("h3_neighbour", explode(coords_df["h3_neighbours"]))

# Group by 'h3_neighbour' and collect list of IDs
grouped_df = exploded_df.groupBy("h3_neighbour").agg(collect_list("id").alias("id_list"))

# Filter groups with more than one ID and create combinations
combinations_df = grouped_df.filter(expr("size(id_list) > 1"))

# Define UDF to generate combinations
def generate_combinations(id_list):
    return list(combinations(id_list, 2))

combination_schema = ArrayType(StructType([
    StructField("ID1", StringType(), False),
    StructField("ID2", StringType(), False)
]))

generate_combinations_udf = udf(generate_combinations, combination_schema)

# Generate combinations and explode
sdf = grouped_df.withColumn("combinations", explode(generate_combinations_udf(grouped_df["id_list"])))

# Select and rename columns
sdf = sdf.select(
    sdf["combinations.ID1"],
    sdf["combinations.ID2"]
)

# Create a unique - sorted - ID column and drop duplicates and self idents
sdf = sdf.filter(col('ID1') !=  col('ID2'))
sdf = sdf.withColumn('ID', F.concat_ws('_', F.array_sort(F.array(F.col('ID1').cast('int'), F.col('ID2').cast('int')))))
sdf = sdf.dropDuplicates(['ID'])

# Add additional details
coords_sdf1 = coords_df.withColumnRenamed('id', 'ID1')\
    .withColumnRenamed('latitude', 'lat1')\
    .withColumnRenamed('longitude', 'lon1')\
    .select('ID1', 'lat1', 'lon1')

coords_sdf2 = coords_df.withColumnRenamed('id', 'ID2')\
    .withColumnRenamed('latitude', 'lat2')\
    .withColumnRenamed('longitude', 'lon2')\
    .select('ID2', 'lat2', 'lon2')

sdf = sdf.join(
    coords_sdf1,
    sdf.ID1 == coords_sdf1.ID1,
    "left"
).drop(coords_sdf1.ID1)

sdf = sdf.join(
    coords_sdf2,
    sdf.ID2 == coords_sdf2.ID2,
    "left"
).drop(coords_sdf2.ID2)

# Calculate Haversine distance and filter
earth_radius_km = 6371.0

sdf = sdf.withColumn(
    "distance_km",
    2 * earth_radius_km * atan2(
        sqrt(
            (sin(radians(col("lat2")) - radians(col("lat1"))) / 2)**2 +
            cos(radians(col("lat1"))) * cos(radians(col("lat2"))) *
            (sin(radians(col("lon2")) - radians(col("lon1"))) / 2)**2
        ),
        sqrt(1 - (
            (sin(radians(col("lat2")) - radians(col("lat1"))) / 2)**2 +
            cos(radians(col("lat1"))) * cos(radians(col("lat2"))) *
            (sin(radians(col("lon2")) - radians(col("lon1"))) / 2)**2
        ))
    )
)

# Assuming combinations_filtered is your DataFrame
#sdf = sdf.filter(col('distance_km') <= lit(distance_km))

# Show the result
df_h3 = sdf.toPandas()

There are 4999950000 combos of points to calculate


                                                                                

CPU times: user 1.67 s, sys: 108 ms, total: 1.78 s
Wall time: 5.67 s


In [13]:
df_h3.distance_km.max()

56.26090374478646

In [14]:
e = 10.837

In [18]:
np.sqrt(27)*e

56.31070380487177

In [3]:
%%time
import numpy as np
from scipy.spatial import KDTree
import pandas as pd

# Function to convert latitude and longitude to Cartesian coordinates
def latlon_to_cartesian(lat, lon, R=6371.0):
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    x = R * np.cos(lat_rad) * np.cos(lon_rad)
    y = R * np.cos(lat_rad) * np.sin(lon_rad)
    z = R * np.sin(lat_rad)
    return x, y, z

# Function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2, R=6371.0):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

# generate data

_, latitudes , longitudes = zip(*random_coords) 

# Convert to Cartesian coordinates
x, y, z = latlon_to_cartesian(latitudes, longitudes)
cartesian_coords = np.vstack((x, y, z)).T

# Build KD-Tree
tree = KDTree(cartesian_coords)

# Query the tree for pairs within 5 km
pairs = tree.query_ball_tree(tree, r=5.0)

# Collect pairs of points within 5 km and calculate distances
close_pairs = []
for i, neighbors in enumerate(pairs):
    for j in neighbors:
        if i < j:  # Ensure each pair is unique
            distance = haversine(latitudes[i], longitudes[i], latitudes[j], longitudes[j])
            close_pairs.append((i, j, distance))

# Create a DataFrame to display the results
results = []
for idx1, idx2, distance in close_pairs:
    results.append({
        "ID1": idx1,
        "ID2": idx2,
        "lat1": latitudes[idx1],
        "lon1": longitudes[idx1],
        "lat2": latitudes[idx2],
        "lon2": longitudes[idx2],
        "distance_km": distance
    })

df_kd = pd.DataFrame(results)

CPU times: user 328 ms, sys: 36.7 ms, total: 365 ms
Wall time: 394 ms


In [8]:
def concat(row): 
    id1 = row['ID1']
    id2 = row['ID2']
    if (id1 < id2):
        return str(id1) + '_' + str(id2)
    return str(round(id2,0)) + '_' + str(round(id1,0))

df_kd = df_kd[df_kd.ID1 != df_kd.ID2]
df_kd.loc[:,'ID'] = df_kd.apply(lambda l: concat(l), axis =1)


In [9]:
df_h3['in_kd'] = df_h3.ID.isin(df_kd.ID.to_list())
df_kd['in_h3'] = df_kd.ID.isin(df_h3.ID.to_list())

In [10]:
df_h3[df_h3.in_kd==False]

Unnamed: 0,ID1,ID2,ID,lat1,lon1,lat2,lon2,distance_km,in_kd


In [11]:
df_kd[df_kd.in_h3==False]

Unnamed: 0,ID1,ID2,lat1,lon1,lat2,lon2,distance_km,ID,in_h3


In [19]:
df_h3.merge(df_kd, how = 'outer', on='ID')

Unnamed: 0,ID1_x,ID2_x,ID,lat1_x,lon1_x,lat2_x,lon2_x,distance_km_x,ID1_y,ID2_y,lat1_y,lon1_y,lat2_y,lon2_y,distance_km_y
0,1038,8757,1038_8757,13.259611,29.615150,13.270710,29.611042,1.311809,1038.0,8757.0,13.259611,29.615150,13.270710,29.611042,1.311809
1,1101,4085,1101_4085,21.191203,64.938863,21.177537,64.942041,1.554904,1101.0,4085.0,21.191203,64.938863,21.177537,64.942041,1.554904
2,1112,9370,1112_9370,5.853653,49.255454,5.820389,49.279189,4.535970,1112.0,9370.0,5.853653,49.255454,5.820389,49.279189,4.535970
3,1113,3470,1113_3470,30.587342,39.318453,30.603832,39.290178,3.268999,1113.0,3470.0,30.587342,39.318453,30.603832,39.290178,3.268999
4,1165,5358,1165_5358,-20.967546,67.377595,-20.962584,67.381289,0.672036,1165.0,5358.0,-20.967546,67.377595,-20.962584,67.381289,0.672036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,,,884_7101,,,,,,884.0,7101.0,-11.063056,49.886743,-11.074043,49.859522,3.211985
132,,,902_6301,,,,,,902.0,6301.0,0.073485,57.376817,0.091374,57.336306,4.924243
133,,,925_4474,,,,,,925.0,4474.0,35.995860,34.734392,35.973159,34.720862,2.802383
134,9667,9825,9667_9825,40.899663,68.949384,40.866151,68.988135,4.949578,9667.0,9825.0,40.899663,68.949384,40.866151,68.988135,4.949578


In [None]:
df_h3.distance_km = df_h3.distance_km.apply(lambda l: round(l,6))
df_kd.distance_km = df_kd.distance_km.apply(lambda l: round(l,6))
df_h3.merge(df_kd, how = 'outer', on='distance_km')

Unnamed: 0,ID1_x,ID2_x,ID,lat1_x,lon1_x,lat2_x,lon2_x,distance_km_x,ID1_y,ID2_y,lat1_y,lon1_y,lat2_y,lon2_y,distance_km_y
0,,,1038.0_8757.0,,,,,,1038.0,8757.0,13.259611,29.615150,13.270710,29.611042,1.311809
1,1038,8757,1038_8757,13.259611,29.615150,13.270710,29.611042,1.311809,,,,,,,
2,,,1101.0_4085.0,,,,,,1101.0,4085.0,21.191203,64.938863,21.177537,64.942041,1.554904
3,1101,4085,1101_4085,21.191203,64.938863,21.177537,64.942041,1.554904,,,,,,,
4,,,1112.0_9370.0,,,,,,1112.0,9370.0,5.853653,49.255454,5.820389,49.279189,4.535970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,,,902.0_6301.0,,,,,,902.0,6301.0,0.073485,57.376817,0.091374,57.336306,4.924243
250,,,925.0_4474.0,,,,,,925.0,4474.0,35.995860,34.734392,35.973159,34.720862,2.802383
251,,,9667.0_9825.0,,,,,,9667.0,9825.0,40.899663,68.949384,40.866151,68.988135,4.949578
252,9667,9825,9667_9825,40.899663,68.949384,40.866151,68.988135,4.949578,,,,,,,


In [70]:
df_kd.shape

(10662, 8)

In [69]:
df_h3.shape

(10662, 8)

In [56]:
df_h3.sort_values(['lat1','lon1'])

Unnamed: 0,ID1,ID2,ID,lat1,lon1,lat2,lon2,distance_km
3252,19629,35873,19629_35873,-25.847455,61.501017,-25.837059,61.490208,1.583177
3253,19629,50561,19629_50561,-25.847455,61.501017,-25.859684,61.463291,4.012560
3254,19629,91536,19629_91536,-25.847455,61.501017,-25.820973,61.520370,3.524531
1535,35873,91536,35873_91536,-25.837059,61.490208,-25.820973,61.520370,3.508952
8656,35873,50561,35873_50561,-25.837059,61.490208,-25.859684,61.463291,3.685676
...,...,...,...,...,...,...,...,...
7831,50106,73343,50106_73343,49.619588,30.358522,49.592381,30.385443,3.593832
1552,36831,83641,36831_83641,49.619687,31.736289,49.651817,31.784247,4.969041
1722,4911,70560,4911_70560,49.625118,41.823067,49.609416,41.840132,2.135405
7217,15290,48735,15290_48735,49.629183,49.764763,49.628767,49.784580,1.428045


In [57]:
df_kd.sort_values(['lat1','lon1'])

Unnamed: 0,ID1,ID2,lat1,lon1,lat2,lon2,distance_km,ID
3772,19629,35873,-25.847455,61.501017,-25.837059,61.490208,1.583177,19629_35873
3773,19629,50561,-25.847455,61.501017,-25.859684,61.463291,4.012560,19629_50561
3774,19629,91536,-25.847455,61.501017,-25.820973,61.520370,3.524531,19629_91536
6239,35873,50561,-25.837059,61.490208,-25.859684,61.463291,3.685676,35873_50561
6240,35873,91536,-25.837059,61.490208,-25.820973,61.520370,3.508952,35873_91536
...,...,...,...,...,...,...,...,...
7972,50106,73343,49.619588,30.358522,49.592381,30.385443,3.593832,50106_73343
6367,36831,83641,49.619687,31.736289,49.651817,31.784247,4.969042,36831_83641
1020,4911,70560,49.625118,41.823067,49.609416,41.840132,2.135405,4911_70560
3062,15290,48735,49.629183,49.764763,49.628767,49.784580,1.428045,15290_48735


In [44]:
res = df_h3.merge(df_kd, how = 'outer', on = ID, suffixes = ('_h3', '_kd'))

In [45]:
df_h3

Unnamed: 0,ID1,ID2,ID,lat1,lon1,lat2,lon2,distance_km
0,10022,13088,10022_13088,48.257185,32.706276,48.296534,32.718233,4.464020
1,10064,43974,10064_43974,42.243198,57.947964,42.236467,57.987249,3.319487
2,10153,67823,10153_67823,-23.883419,59.977061,-23.884713,59.943239,3.441750
3,1017,72874,1017_72874,40.649281,28.731336,40.646341,28.747444,1.397725
4,10358,83835,10358_83835,29.261179,40.614183,29.239040,40.643747,3.779874
...,...,...,...,...,...,...,...,...
10657,89175,97494,89175_97494,13.331477,58.943732,13.306501,58.970753,4.032430
10658,91109,93180,91109_93180,-2.764336,62.370914,-2.806537,62.382561,4.867598
10659,93219,98175,93219_98175,5.315255,30.290333,5.311097,30.297576,0.925635
10660,93623,94471,93623_94471,5.088229,38.895291,5.097604,38.927465,3.712837


In [47]:
res

Unnamed: 0,ID1_h3,ID2_h3,ID,lat1_h3,lon1_h3,lat2_h3,lon2_h3,distance_km_h3,ID1_kd,ID2_kd,lat1_kd,lon1_kd,lat2_kd,lon2_kd,distance_km_kd
0,10001,12420,10001_12420,-22.286532,27.498840,-22.266076,27.519078,3.083855,10001.0,12420.0,-22.286532,27.498840,-22.266076,27.519078,3.083855
1,10008,92567,10008_92567,46.997350,36.213867,47.007191,36.278083,4.990997,10008.0,92567.0,46.997350,36.213867,47.007191,36.278083,4.990998
2,10011,54818,10011_54818,27.488692,55.640833,27.485646,55.597161,4.321183,10011.0,54818.0,27.488692,55.640833,27.485646,55.597161,4.321184
3,10022,13088,10022_13088,48.257185,32.706276,48.296534,32.718233,4.464020,10022.0,13088.0,48.257185,32.706276,48.296534,32.718233,4.464021
4,1002,91445,1002_91445,31.769200,41.350782,31.793979,41.368380,3.218482,1002.0,91445.0,31.769200,41.350782,31.793979,41.368380,3.218482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11672,,,9977_86550,,,,,,9977.0,86550.0,37.489388,52.467281,37.485091,52.431063,3.231095
11673,,,9979_87431,,,,,,9979.0,87431.0,46.057797,50.148513,46.055723,50.094813,4.150052
11674,,,9980_90217,,,,,,9980.0,90217.0,-18.653362,30.771769,-18.641671,30.784924,1.900292
11675,,,9989_47277,,,,,,9989.0,47277.0,-19.474269,39.151617,-19.492938,39.127603,3.262833


In [49]:
res[res.ID1_h3.isna()]

Unnamed: 0,ID1_h3,ID2_h3,ID,lat1_h3,lon1_h3,lat2_h3,lon2_h3,distance_km_h3,ID1_kd,ID2_kd,lat1_kd,lon1_kd,lat2_kd,lon2_kd,distance_km_kd
902,,,1379_12938,,,,,,1379.0,12938.0,1.705445,59.348819,1.687092,59.373602,3.428163
997,,,1420_11733,,,,,,1420.0,11733.0,40.525590,38.966197,40.532675,38.958575,1.017639
1261,,,1544_14754,,,,,,1544.0,14754.0,1.887844,29.601810,1.912184,29.597392,2.750663
1316,,,1574_11183,,,,,,1574.0,11183.0,-15.599463,41.264305,-15.563614,41.275033,4.148598
1757,,,1790_16871,,,,,,1790.0,16871.0,48.330948,68.179275,48.329099,68.234084,4.057085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11672,,,9977_86550,,,,,,9977.0,86550.0,37.489388,52.467281,37.485091,52.431063,3.231095
11673,,,9979_87431,,,,,,9979.0,87431.0,46.057797,50.148513,46.055723,50.094813,4.150052
11674,,,9980_90217,,,,,,9980.0,90217.0,-18.653362,30.771769,-18.641671,30.784924,1.900292
11675,,,9989_47277,,,,,,9989.0,47277.0,-19.474269,39.151617,-19.492938,39.127603,3.262833


In [48]:
res[res.ID1_kd.isna()]

Unnamed: 0,ID1_h3,ID2_h3,ID,lat1_h3,lon1_h3,lat2_h3,lon2_h3,distance_km_h3,ID1_kd,ID2_kd,lat1_kd,lon1_kd,lat2_kd,lon2_kd,distance_km_kd
8,5071,10053,10053_5071,44.345638,35.830748,44.318253,35.871929,4.472211,,,,,,,
26,3783,10104,10104_3783,38.628404,62.934056,38.653744,62.944335,2.955770,,,,,,,
51,7378,10235,10235_7378,-11.868321,49.131645,-11.854714,49.167351,4.169720,,,,,,,
65,4301,10285,10285_4301,5.281058,40.448351,5.286791,40.472086,2.704201,,,,,,,
92,9549,10450,10450_9549,31.123059,66.750283,31.090241,66.714598,4.985901,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11456,9592,92804,92804_9592,44.485377,36.490495,44.452147,36.532248,4.962930,,,,,,,
11487,9970,93660,93660_9970,16.340829,34.683395,16.374880,34.662886,4.373086,,,,,,,
11511,9687,94229,94229_9687,2.869149,34.264531,2.904584,34.237573,4.948516,,,,,,,
11527,9735,94487,94487_9735,39.337849,37.389942,39.361927,37.423511,3.936990,,,,,,,


In [31]:
df_h3[df_h3.ID2 == 11494]

Unnamed: 0,ID1,ID2,ID,lat1,lon1,lat2,lon2,distance_km
