# Geopy
https://geopy.readthedocs.io/en/stable/

In [1]:
# autoreload: re-import modified modules before each cell executes
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

# obtain the Spark session from the project helper and display it
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 15:47:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/11 15:47:35 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
import os
from pathlib import Path

# Root directory for all data paths: the current user's home directory.
# Path.home() is the pathlib idiom for expanding "~" and fails loudly if the
# home directory cannot be resolved instead of silently yielding a "~" path.
root = Path.home()
# shell out to `date` to print the current date and time
! date

Fri Apr 11 03:47:38 PM EDT 2025


In [4]:
# base directory of the shared parquet datasets (under the home directory)
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/parquet"

# path to the train parquet files (no test path is defined in this cell)
train_path = f"{data_path}/train"

# read the parquet files into a spark DataFrame
train_df = spark.read.parquet(train_path)

# print the schema and preview the first 5 rows
train_df.printSchema()
train_df.show(n=5)

                                                                                

root
 |-- image_name: string (nullable = true)
 |-- path: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- organ: string (nullable = true)
 |-- species_id: integer (nullable = true)
 |-- obs_id: long (nullable = true)
 |-- license: string (nullable = true)
 |-- partner: string (nullable = true)
 |-- author: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- gbif_species_id: string (nullable = true)
 |-- species: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- family: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- references: string (nullable = true)
 |-- url: string (nullable = true)
 |-- learn_tag: string (nullable = true)
 |-- image_backup_url: string (nullable = true)



                                                                                

+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------+--------------------+
|          image_name|                path|                data|organ|species_id|    obs_id|             license|partner|          author|altitude|         latitude|         longitude|gbif_species_id|             species|        genus|    family| dataset|  publisher|          references|                 url|learn_tag|    image_backup_url|
+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------

In [5]:
# how many rows are missing a latitude, and how many are missing a longitude
null_latitude_count = train_df.where(train_df["latitude"].isNull()).count()
null_longitude_count = train_df.where(train_df["longitude"].isNull()).count()
print(f"Number of NULL latitude values: {null_latitude_count}")
print(f"Number of NULL longitude values: {null_longitude_count}")



Number of NULL latitude values: 702608
Number of NULL longitude values: 702592


                                                                                

In [6]:
# keep only rows where both coordinates are present
filtered_df = train_df.where(
    train_df["latitude"].isNotNull() & train_df["longitude"].isNotNull()
)
# mean latitude / longitude per species, with the aggregate columns
# renamed from "avg(...)" to "avg_..."
grouped_df = (
    filtered_df.groupBy("species_id")
    .agg({"latitude": "avg", "longitude": "avg"})
    .withColumnRenamed("avg(latitude)", "avg_latitude")
    .withColumnRenamed("avg(longitude)", "avg_longitude")
)
# preview the first 5 rows
grouped_df.show(n=5)



+----------+------------------+------------------+
|species_id|      avg_latitude|     avg_longitude|
+----------+------------------+------------------+
|   1737493| 46.36360531186132| 5.639311481005929|
|   1363733| 42.12276511772893| 6.716418020291944|
|   1743681|  43.9162545329139|3.4286095476539384|
|   1359356|38.116486922794394| 8.350581385996264|
|   1741934|         42.625339|         25.145366|
+----------+------------------+------------------+
only showing top 5 rows



                                                                                

In [7]:
# grouped_df holds one row per species_id, so its row count is the number
# of distinct species that have at least one geolocated observation
unique_species_count = grouped_df.count()
print(f"Number of unique species: {unique_species_count}")



Number of unique species: 6208


                                                                                

In [8]:
import folium


def plot_map(
    df,
    col_latitude: str,
    col_longitude: str,
    col_popup: str = "species_id",
    map_center=(44.0, 4.0),
    zoom_start: int = 5,
):
    """
    Plot one circle marker per row of a Spark DataFrame on a folium map.

    Args:
        df: Spark DataFrame; it is converted with ``toPandas()``, so it should
            already be reduced to a plottable number of rows.
        col_latitude: name of the column holding the marker latitude.
        col_longitude: name of the column holding the marker longitude.
        col_popup: name of the column whose value is shown in the marker popup.
        map_center: ``(latitude, longitude)`` the map is initially centered on;
            the default is roughly southern France.
        zoom_start: initial zoom level of the map.

    Returns:
        The ``folium.Map`` with all markers added.
    """
    pandas_df = df.toPandas()
    # a marker cannot be placed without both coordinates, so skip incomplete rows
    pandas_df = pandas_df.dropna(subset=[col_latitude, col_longitude])

    m = folium.Map(
        location=list(map_center), zoom_start=zoom_start, tiles="OpenStreetMap"
    )

    # Iterate column-wise rather than with iterrows(): iterrows() builds one
    # Series per row, which upcasts an integer id to float next to the float
    # coordinates and makes the popup read e.g. "1737493.0".
    rows = zip(pandas_df[col_popup], pandas_df[col_latitude], pandas_df[col_longitude])
    for popup_value, latitude, longitude in rows:
        folium.CircleMarker(
            location=[float(latitude), float(longitude)],
            radius=4,
            popup=str(popup_value),
            color="green",
            fill=True,
            fill_opacity=0.6,
        ).add_to(m)

    # The caller decides whether to display the map or save it with m.save(...)
    return m

### Euclidean distance

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# reference point (Southern France / SW Europe) every observation is compared to
ref_lat, ref_lon = 44.0, 4.0

# drop observations that lack either coordinate
filtered_df = train_df.where(
    F.col("latitude").isNotNull() & F.col("longitude").isNotNull()
)

# squared Euclidean distance to the reference point, computed directly on the
# raw latitude / longitude values (no square root is needed for ranking)
lat_offset = F.col("latitude") - F.lit(ref_lat)
lon_offset = F.col("longitude") - F.lit(ref_lon)
df_with_distance = filtered_df.withColumn(
    "distance", F.pow(lat_offset, 2) + F.pow(lon_offset, 2)
)

# within each species, order observations from nearest to farthest
window_spec = Window.partitionBy("species_id").orderBy(F.col("distance").asc())

# number the rows per species and keep only the nearest one
ranked_df = df_with_distance.withColumn("rank", F.row_number().over(window_spec))
closest_points_df = ranked_df.filter(F.col("rank") == 1)

# one (species_id, latitude, longitude) row per species
final_df = closest_points_df.select("species_id", "latitude", "longitude")
final_df.show(n=5)



+----------+-----------------+-----------------+
|species_id|         latitude|        longitude|
+----------+-----------------+-----------------+
|   1355868|43.95999191666667|        3.9117065|
|   1355869|     44.056356625|3.862885908333333|
|   1355870|       43.8347442|        3.8857002|
|   1355871|        44.050021|          3.85021|
|   1355872|        44.046747|4.105631972222222|
+----------+-----------------+-----------------+
only showing top 5 rows



                                                                                

In [10]:
# Bind the folium map to a name that does not shadow the built-in `map`;
# shadowing it would break any later `map(...)` call in this kernel session.
species_map = plot_map(final_df, col_latitude="latitude", col_longitude="longitude")
species_map

                                                                                

In [19]:
import os
import time
import requests
from dotenv import load_dotenv

# get API key: load_dotenv() is called first, then the key is read from the
# environment; a missing GEOAPIFY_API_KEY raises KeyError right here
load_dotenv()
api_key = os.environ["GEOAPIFY_API_KEY"]


def get_country_geoapify(lat: float, lon: float, timeout: float = 10.0) -> str:
    """
    Uses Geoapify reverse geocoding to get the country name from latitude and longitude.

    Args:
        lat: latitude of the point to look up.
        lon: longitude of the point to look up.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The ``country`` field of the first result, or "Unknown" when the
        request fails, the body is not valid JSON, or no country is present.
    """
    # let requests build and escape the query string instead of formatting it by hand
    params = {"lat": lat, "lon": lon, "apiKey": api_key, "format": "json"}
    try:
        # without a timeout a stalled connection would block the caller forever
        response = requests.get(
            "https://api.geoapify.com/v1/geocode/reverse",
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Do not print str(e): requests embeds the request URL, and with it the
        # API key, in its error messages. Report only the error type and status.
        status = getattr(getattr(e, "response", None), "status_code", None)
        detail = type(e).__name__
        if status is not None:
            detail = f"{detail} (HTTP {status})"
        print(f"Geoapify error at ({lat}, {lon}): {detail}")
        return "Unknown"

    # tolerate unexpected payload shapes rather than raising on them
    results = data.get("results") if isinstance(data, dict) else None
    if isinstance(results, list) and results and isinstance(results[0], dict):
        return results[0].get("country") or "Unknown"
    return "Unknown"

In [None]:
# bring the per-species closest points into a pandas DataFrame
pandas_df = final_df.toPandas()

# several species can share a point, so look up each distinct coordinate once
unique_coords = pandas_df[["latitude", "longitude"]].drop_duplicates()

# reverse-geocode every distinct (latitude, longitude) pair
coord_to_country = {}
for lat, lon in unique_coords.itertuples(index=False, name=None):
    coord_to_country[(lat, lon)] = get_country_geoapify(lat, lon)
    time.sleep(0.4)  # throttle requests to stay under the Geoapify rate limit

# attach the looked-up country to every row via its coordinate pair
pandas_df["country"] = [
    coord_to_country.get(coord, "Unknown")
    for coord in zip(pandas_df["latitude"], pandas_df["longitude"])
]

# write the enriched table next to the other shared data files
output_csv_path = (
    f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/species_euclidean_countries.csv"
)
pandas_df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")

# preview the result
pandas_df.head()