In [1]:
from shapely.geometry import Point
from pyspark.sql import SparkSession
import geopandas as gpd

In [2]:
# Path to the shapefile (judging by its name, the CDMX alcaldía polygons).
shapefile_path = "/content/alcaldias/poligonos_alcaldias_cdmx.shp"

# Load the shapefile with GeoPandas, reading attribute text as UTF-8.
alcaldias = gpd.read_file(
    shapefile_path,
    encoding="utf-8",
)

In [3]:
# Obtain the Spark session for this notebook: getOrCreate() reuses an
# already-running session when there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("Metrobus Delegations from TXT")
    .getOrCreate()
)

In [4]:
# Flatten the GeoDataFrame into a list with one plain dict per row, so the
# records (geometry included) can be handed to Spark as ordinary Python objects.
alcaldias_list = alcaldias.to_dict(orient="records")

# Broadcast the list so functions running inside Spark tasks can read it
# through ``broadcast_alcaldias.value``.
broadcast_alcaldias = spark.sparkContext.broadcast(alcaldias_list)

In [5]:
def find_alcaldia(lat, lon):
    """Return the name of the alcaldía whose polygon contains a coordinate.

    The broadcast records are scanned in order and the first one whose
    geometry contains the point wins.

    Args:
        lat: Latitude of the point; used as the Y coordinate. May be None.
        lon: Longitude of the point; used as the X coordinate. May be None.

    Returns:
        The ``NOMGEO`` value of the first matching record, or the string
        "No se encontró la alcaldía" when either coordinate is missing or
        no record's geometry contains the point.
    """
    not_found = "No se encontró la alcaldía"

    # A null coordinate (e.g. an empty cell in the input file) cannot be
    # turned into a Point; report "not found" instead of failing the task.
    if lat is None or lon is None:
        return not_found

    # Shapely points are (x, y), i.e. (longitude, latitude).
    point = Point(lon, lat)

    for alcaldia in broadcast_alcaldias.value:
        geometry = alcaldia["geometry"]
        # Skip records that carry no geometry at all.
        if geometry is None:
            continue
        if point.within(geometry):
            return alcaldia["NOMGEO"]

    return not_found

In [6]:
# Input file, parsed as CSV.
txt_file_path = "stops.txt"

# The first line is the header; column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(txt_file_path)
)

In [7]:
# Keep only the columns needed for the lookup: stop name and coordinates.
columnas = [
    "stop_name",
    "stop_lat",
    "stop_lon",
]
df_selected = df.select(columnas)

In [8]:
# Turn each Row into a plain (stop_name, stop_lat, stop_lon) tuple.
rdd = df_selected.rdd.map(
    lambda stop: (stop["stop_name"], stop["stop_lat"], stop["stop_lon"])
)

In [9]:
# Append a fourth element to each tuple: the alcaldía found for the stop's
# (lat, lon), which sit at positions 1 and 2 of the tuple.
rdd_with_alcaldias = rdd.map(
    lambda stop: stop[:3] + (find_alcaldia(stop[1], stop[2]),)
)

In [10]:
# Column names for the 4-tuples, in tuple order.
columns = [
    "stop_name",
    "stop_lat",
    "stop_lon",
    "alcaldia",
]
df_resultado = rdd_with_alcaldias.toDF(schema=columns)

In [11]:
# Print at most 400 rows of the result as a text table.
df_resultado.show(n=400)

+--------------------+------------------+------------------+-------------------+
|           stop_name|          stop_lat|          stop_lon|           alcaldia|
+--------------------+------------------+------------------+-------------------+
|             Potrero| 19.47660808499176|-99.13265208566766|  Gustavo A. Madero|
|         Circuito L1| 19.46262217936452|-99.14386723712764|         Cuauhtémoc|
|           San Simón| 19.45951903848959|-99.14643816862963|         Cuauhtémoc|
|     Manuel González|   19.456615046343|-99.14948159428556|         Cuauhtémoc|
|       Buenavista L1| 19.44691022319351|-99.15307779648693|         Cuauhtémoc|
|            El Chopo| 19.44349969896119|-99.15434496585382|         Cuauhtémoc|
|          Revolución| 19.44038555031536|-99.15537707771638|         Cuauhtémoc|
|Plaza de la Repúb...| 19.43597072042957|-99.15739044966492|         Cuauhtémoc|
|          Reforma L1| 19.43279916313105|-99.15879315610292|         Cuauhtémoc|
|         Hamburgo L1| 19.42

In [12]:
# Destination passed to Spark's CSV writer.
output_path = "paradas.csv"

# Collapse to a single partition before writing, emit a header row, and
# replace any existing output at the destination.
(
    df_resultado
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)