# Geopy
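Reverse-geocode latitude/longitude pairs to country names with geopy, then map the mean observation coordinates of each species.

Documentation: https://geopy.readthedocs.io/en/stable/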

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

# Obtain the session object from the project helper and render it as the
# cell output. NOTE(review): presumably a SparkSession, since it is used
# through `spark.read` further down; confirm against plantclef.spark.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/10 17:39:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/10 17:39:01 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
import os
from pathlib import Path

# Resolve the current user's home directory ("~") and wrap it in a Path;
# the dataset location used later in the notebook is built from this root.
root = Path(os.path.expanduser("~"))
! date

Thu Apr 10 05:39:05 PM EDT 2025


In [4]:
# Shared PlantCLEF parquet directory, located under the home directory
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/parquet"

# Location of the training split inside that directory
train_path = data_path + "/train"

# Load the training split as a Spark DataFrame
train_df = spark.read.parquet(train_path)

# Inspect the schema, then preview the first five rows
train_df.printSchema()
train_df.show(5)

                                                                                

root
 |-- image_name: string (nullable = true)
 |-- path: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- organ: string (nullable = true)
 |-- species_id: integer (nullable = true)
 |-- obs_id: long (nullable = true)
 |-- license: string (nullable = true)
 |-- partner: string (nullable = true)
 |-- author: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- gbif_species_id: string (nullable = true)
 |-- species: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- family: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- references: string (nullable = true)
 |-- url: string (nullable = true)
 |-- learn_tag: string (nullable = true)
 |-- image_backup_url: string (nullable = true)



                                                                                

+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------+--------------------+
|          image_name|                path|                data|organ|species_id|    obs_id|             license|partner|          author|altitude|         latitude|         longitude|gbif_species_id|             species|        genus|    family| dataset|  publisher|          references|                 url|learn_tag|    image_backup_url|
+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------

In [5]:
from geopy.geocoders import Nominatim
from geopy.location import Location


def get_country_name(
    latitude: float, longitude: float, geolocator: "Nominatim | None" = None
) -> str:
    """
    Converts latitude and longitude coordinates to a country name.

    Args:
        latitude (float): Latitude coordinate in decimal degrees.
        longitude (float): Longitude coordinate in decimal degrees.
        geolocator: Optional geocoder exposing
            ``reverse(query, language=...)``. When omitted, a new
            ``Nominatim`` client is created for this call; pass a shared
            instance to avoid rebuilding the client for every lookup.

    Returns:
        str: Country name, or "Unknown" when either coordinate is missing
        (None or NaN/inf), when the geocoder finds no location, or when the
        response carries no country.
    """
    import math

    # Missing coordinates cannot be geocoded; short-circuit instead of
    # sending a meaningless query such as "None, nan" to the service.
    if latitude is None or longitude is None:
        return "Unknown"
    if not (math.isfinite(latitude) and math.isfinite(longitude)):
        return "Unknown"

    if geolocator is None:
        geolocator = Nominatim(user_agent="geocoding_app")

    # Pass a (lat, lon) tuple so the lookup does not depend on how the floats
    # happen to be rendered as text (e.g. scientific notation for tiny values).
    location: Location | None = geolocator.reverse(
        (latitude, longitude), language="en"
    )
    if location is None:
        return "Unknown"

    # The raw response may lack an "address" entry; treat that as unknown
    # rather than raising KeyError.
    address = location.raw.get("address") or {}
    return address.get("country", "Unknown")


# Example usage: reverse-geocode a handful of known coordinates
for latitude, longitude in (
    (40.7128, -74.0060),  # Example: New York City
    (51.5074, 0.1278),  # Example: London
    (-23.5505, -46.6333),  # Example: Sao Paulo
    (45.74738691666667, 14.416461944444444),
):
    country = get_country_name(latitude, longitude)
    print(f"The country for coordinates ({latitude}, {longitude}) is: {country}")

The country for coordinates (40.7128, -74.006) is: United States
The country for coordinates (51.5074, 0.1278) is: United Kingdom
The country for coordinates (-23.5505, -46.6333) is: Brazil
The country for coordinates (45.74738691666667, 14.416461944444444) is: Slovenia


In [6]:
# How many training rows are missing each coordinate?
null_latitude_count = train_df.where(train_df["latitude"].isNull()).count()
null_longitude_count = train_df.where(train_df["longitude"].isNull()).count()

# Report both totals
print(f"Number of NULL latitude values: {null_latitude_count}")
print(f"Number of NULL longitude values: {null_longitude_count}")



Number of NULL latitude values: 702608
Number of NULL longitude values: 702592


                                                                                

In [7]:
# Keep only the rows where both coordinates are present
filtered_df = train_df.where(
    train_df["latitude"].isNotNull() & train_df["longitude"].isNotNull()
)

# Mean latitude/longitude per species, with the auto-generated aggregate
# column names replaced by friendlier ones
grouped_df = (
    filtered_df.groupBy("species_id")
    .agg({"latitude": "avg", "longitude": "avg"})
    .withColumnRenamed("avg(latitude)", "avg_latitude")
    .withColumnRenamed("avg(longitude)", "avg_longitude")
)

# Preview the first five species
grouped_df.show(5)



+----------+------------------+------------------+
|species_id|      avg_latitude|     avg_longitude|
+----------+------------------+------------------+
|   1737493| 46.36360531186132| 5.639311481005929|
|   1363733| 42.12276511772893| 6.716418020291944|
|   1743681|  43.9162545329139|3.4286095476539384|
|   1359356|38.116486922794394| 8.350581385996264|
|   1741934|         42.625339|         25.145366|
+----------+------------------+------------------+
only showing top 5 rows



                                                                                

In [8]:
# grouped_df holds exactly one row per species_id, so its row count is the
# number of distinct species that have coordinates
unique_species_count = grouped_df.count()
print(f"Number of unique species: {unique_species_count}")



Number of unique species: 6208


                                                                                

In [9]:
import folium


# Collect the per-species mean coordinates to the driver as a pandas DataFrame
pandas_df = grouped_df.toPandas()

# Create a folium map centered around southwestern Europe
map_center = [44.0, 4.0]  # roughly southern France
m = folium.Map(location=map_center, zoom_start=5, tiles="OpenStreetMap")

# Add one marker per species. itertuples() keeps each column's own dtype,
# whereas iterrows() upcasts every row to float and would render the popup
# label as e.g. "1737493.0" instead of "1737493".
for row in pandas_df.itertuples(index=False):
    folium.CircleMarker(
        location=[row.avg_latitude, row.avg_longitude],
        radius=4,
        popup=str(row.species_id),
        color="green",
        fill=True,
        fill_opacity=0.6,
    ).add_to(m)

# Save or display map
# m.save("species_map.html")
m

                                                                                