In [7]:
%load_ext autoreload
%autoreload 2
from geolifeclef.utils import get_spark
from pyspark.sql import functions as F
from IPython.display import display

spark = get_spark()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
po_path = "gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceOnlyOccurrences/GLC24-PO-metadata-train.csv"
pa_train_path = "gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceAbsenceSurveys/GLC24-PA-metadata-train.csv"
pa_test_path = "gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceAbsenceSurveys/GLC24-PA-metadata-test.csv"

! gcloud storage du {po_path}
! gcloud storage du {pa_train_path}
! gcloud storage du {pa_test_path}

376527330    gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceOnlyOccurrences/GLC24-PO-metadata-train.csv
97677806     gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceAbsenceSurveys/GLC24-PA-metadata-train.csv
275277       gs://dsgt-clef-geolifeclef-2024/data/downloaded/2024/PresenceAbsenceSurveys/GLC24-PA-metadata-test.csv


In [6]:
po = spark.read.csv(po_path, header=True, inferSchema=True)
pa_train = spark.read.csv(pa_train_path, header=True, inferSchema=True)
pa_test = spark.read.csv(pa_test_path, header=True, inferSchema=True)

po.printSchema()
pa_train.printSchema()
pa_test.printSchema()

                                                                                

root
 |-- year: integer (nullable = true)
 |-- month: double (nullable = true)
 |-- day: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- geoUncertaintyInM: double (nullable = true)
 |-- taxonRank: string (nullable = true)
 |-- date: date (nullable = true)
 |-- dayOfYear: integer (nullable = true)
 |-- speciesId: double (nullable = true)
 |-- surveyId: integer (nullable = true)

root
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- geoUncertaintyInM: double (nullable = true)
 |-- areaInM2: string (nullable = true)
 |-- region: string (nullable = true)
 |-- country: string (nullable = true)
 |-- speciesId: double (nullable = true)
 |-- surveyId: integer (nullable = true)

root
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- geoUncertaintyInM: double (nullable = true)
 |-- areaInM2: string (nullable = true)
 |-- 

In [10]:
po.groupBy("speciesId").count().describe().show()
pa_train.groupBy("speciesId").count().describe().show()

                                                                                

+-------+------------------+------------------+
|summary|         speciesId|             count|
+-------+------------------+------------------+
|  count|              9709|              9709|
|   mean| 5632.511896178803| 523.2049644659594|
| stddev|3246.4039722940065|1232.3447606902587|
|    min|               0.0|                 1|
|    max|           11254.0|              9265|
+-------+------------------+------------------+





+-------+-----------------+------------------+
|summary|        speciesId|             count|
+-------+-----------------+------------------+
|  count|             5016|              5016|
|   mean|5589.410287081339|295.78090111642746|
| stddev|3261.619462257373|1230.4359959873577|
|    min|              2.0|                 1|
|    max|          11254.0|             21489|
+-------+-----------------+------------------+



                                                                                

In [12]:
po.groupBy("surveyId").count().describe().show()
pa_train.groupBy("surveyId").count().describe().show()
pa_test.groupBy("surveyId").count().describe().show()

                                                                                

+-------+------------------+------------------+
|summary|          surveyId|             count|
+-------+------------------+------------------+
|  count|           3845533|           3845533|
|   mean|1959744.2132570439|1.3209604494357479|
| stddev|1131424.5684791987|1.8456836644964556|
|    min|                 1|                 1|
|    max|           3919662|               424|
+-------+------------------+------------------+



                                                                                

+-------+------------------+------------------+
|summary|          surveyId|             count|
+-------+------------------+------------------+
|  count|             88987|             88987|
|   mean|1963014.2896602876|16.672513962713655|
| stddev|1134917.6216001941|12.800096644885837|
|    min|               212|                 1|
|    max|           3919655|               837|
+-------+------------------+------------------+

+-------+------------------+-----+
|summary|          surveyId|count|
+-------+------------------+-----+
|  count|              4716| 4716|
|   mean|1947626.8112807465|  1.0|
| stddev|1132995.2122041124|  0.0|
|    min|               642|    1|
|    max|           3919234|    1|
+-------+------------------+-----+



In [23]:
train_survey = po.select("surveyId").union(pa_train.select("surveyId")).distinct()
train_survey.count()
train_survey.join(pa_test.select("surveyId"), "surveyId").count()

                                                                                

0

In [17]:
# it turns out there is no overlap between the surveyId in the train and test sets...
# if so, how do we manage the mapping between the two? If I understand correctly,
# we need to transform it to the new coordinate frame, and then project down to a
# 10m resolution using the reference geojson file as the min and max bounds
import pyproj

geojson = {
    "type": "Polygon",
    "coordinates": [
        [
            [-32.26344, 26.63842],
            [-32.26344, 72.18392],
            [35.58677, 72.18392],
            [35.58677, 26.63842],
            [-32.26344, 26.63842],
        ]
    ],
}

transformer = pyproj.Transformer.from_crs("epsg:4326", "epsg:32738", always_xy=True)

transformer.transform(-32.26344, 26.63842)

(-8038789.129055042, 17375122.09825012)

In [26]:
def get_projection_udf():
    import pyproj

    transformer = pyproj.Transformer.from_crs("epsg:4326", "epsg:32738", always_xy=True)

    @F.udf("struct<lat: double, lon: double>")
    def proj(lat, lon):
        x, y = transformer.transform(lon, lat)
        return (x, y)

    return proj


proj_udf = get_projection_udf()
po.select(proj_udf("lat", "lon").alias("proj")).select("proj.*").show()

+-------------------+--------------------+
|                lat|                 lon|
+-------------------+--------------------+
| -2979126.136755766| 1.585455714308697E7|
| -3193887.910451185|1.5745715441722106E7|
|-2824402.5018607844| 1.645507842670022E7|
|-2502843.7578417095|1.7056012382777754E7|
|-2124385.4058199157|1.6208507587851327E7|
| -2206077.994453682|1.6244647380441677E7|
|-3380603.6566508533|1.5892591287252612E7|
|  -2488542.33058973|1.6800044898976296E7|
| -3416479.978686857|1.6076315118694443E7|
| -2623890.300033362|1.5179856786659785E7|
|-2556523.5317795426|1.6343512585769545E7|
| -4092689.092628615|1.5500934344744682E7|
| -4092689.092628615|1.5500934344744682E7|
| -4092689.092628615|1.5500934344744682E7|
| -4092689.092628615|1.5500934344744682E7|
|-3014634.6328360667|1.6262366291070731E7|
| -2108168.822707106|1.5175912337132711E7|
|-2173674.5218280726|1.6477110667559408E7|
|-2139231.2840318372|1.6530050428715443E7|
|-2627579.5554473437|1.5824728693168718E7|
+----------