# Address Matching with Apache Spark

## Iniciar Spark

In [None]:
import time
start_time = time.time()

In [None]:
from pyspark.sql.session import SparkSession

# Build (or reuse) the Spark session, applying each setting in turn.
session_builder = SparkSession.builder.appName("Address Matching")
for option_name, option_value in (
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "20g"),
    ("spark.driver.maxResultSize", "4g"),
    ("spark.local.dir", "/spark-tmp"),
):
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

In [None]:
conf = spark.sparkContext.getConf()
for key, value in conf.getAll():
    print(f"{key}: {value}")

## Lectura de ficheros

El dataset contiene exactamente 24 columnas; gran parte de ellas son utilizadas por el ISTAC para su propio trabajo. En este proyecto se utilizarán las siguientes columnas, pues contienen la información más relevante:
- **uuid_idt**: contiene un identificador alfanumérico único para cada dirección.
- **latitud**
- **longitud**
- **tvia**: contiene el tipo de vía, es decir, si es una calle, avenida, carretera, etc.
- **nvia**: contiene el nombre de la vía.
- **numer**: contiene el número de la vía.
- **codmun**: está compuesto por un código numérico que identifica a un municipio, es decir, el código postal.
- **nommun**: contiene el nombre del municipio.
- **direccion**: contiene la dirección completa.

In [None]:
from pyspark.sql.functions import upper

# Read the tab-separated source file into a DataFrame.
file = "../data/raw_data/TFM_Direcciones.tab"
first_df = (
    spark.read.option("delimiter", "\t")
    .option("encoding", "latin1")  # the source file is decoded as Latin-1
    .csv(file, header=True, inferSchema=True)
)

# Keep only the columns used in this project.
selected_columns = [
    "uuid_idt",
    "latitud",
    "longitud",
    "tvia",
    "nvia",
    "numer",
    "codmun",
    "nommun",
    "direccion",
]

first_df = first_df.select(selected_columns)
# Municipality names are upper-cased so they can be joined with the second dataset.
first_df = first_df.withColumn("nommun", upper(first_df["nommun"]))

# Show a sample, the schema and the number of rows.
first_df.show()
# A bare `first_df.schema` expression in the middle of a cell displays nothing,
# so the schema is printed explicitly.
first_df.printSchema()
print(first_df.count())

In [None]:
file = "../data/raw_data/data-09022024.csv"
second_df = spark.read.option("header", True).csv(file)

selected_columns = ["uuid_idt", "latitud", "longitud", "codmun", "nommun", "direccion"]

# Keep the relevant columns, upper-casing the identifier and the municipality
# name. A single projection is enough; selecting the same columns twice added
# nothing.
second_df = second_df.select(
    upper("uuid_idt").alias("uuid_idt"),
    "latitud",
    "longitud",
    "codmun",
    upper("nommun").alias("nommun"),
    "direccion",
)

# Restrict the second dataset to municipalities that appear in the first one.
municipalities = first_df.select("nommun").distinct()
second_df = second_df.join(municipalities, "nommun", "inner")

second_df.show()
# Each count() is a full Spark job, so the result is computed once and reused.
second_df_size = second_df.count()
print(second_df_size)
print(second_df_size + first_df.count())

In [None]:
from pyspark.sql.functions import count


def uuid_frecuency(dataframe):
    """Show how many uuid_idt values have 1..9 and "10 or more" addresses.

    Args:
        dataframe: DataFrame with a ``uuid_idt`` column.

    The result is only displayed with ``show()``; nothing is returned.
    """
    label_column = "Número de direcciones asociadas"
    per_uuid = dataframe.groupBy("uuid_idt").count()

    # One output row per address count below ten.
    below_ten = per_uuid.filter("count <= 9").groupBy("count")
    below_ten = below_ten.agg(count("*").alias("Frecuencia")).orderBy("count")
    below_ten = below_ten.withColumnRenamed("count", label_column)

    # Every identifier with ten or more addresses is folded into one row.
    ten_or_more_total = per_uuid.filter("count > 9").count()
    ten_or_more = spark.createDataFrame(
        [["10 o más", ten_or_more_total]],
        [label_column, "Frecuencia"],
    )

    below_ten.union(ten_or_more).show()


uuid_frecuency(first_df)
uuid_frecuency(second_df)

## Unión de los dataframes

In [None]:
# Distinct identifiers present in each source, and those they share.
unique_uuid_first_df = first_df.select("uuid_idt").distinct()
unique_uuid_second_df = second_df.select("uuid_idt").distinct()
uuid_comunes = unique_uuid_first_df.join(unique_uuid_second_df, "uuid_idt", "inner")

# Only rows of the second dataset whose identifier also exists in the first
# one are appended; columns missing from the second dataset are filled with
# nulls.
second_df_shared = second_df.join(uuid_comunes, "uuid_idt", "inner")
addresses_df = first_df.unionByName(second_df_shared, allowMissingColumns=True)

addresses_df.show()

print("Tamaño del dataframe ampliado: ", addresses_df.count())
second_only_uuids = unique_uuid_second_df.subtract(uuid_comunes)
print(
    "Número de uuid_idt nuevos del segundo dataframe: ",
    second_only_uuids.count(),
)

In [None]:
from pyspark.sql.functions import col

# Cast both coordinate columns to float, latitude first and then longitude.
addresses_df = addresses_df.withColumn("latitud", col("latitud").cast("float"))
addresses_df = addresses_df.withColumn("longitud", col("longitud").cast("float"))

In [None]:
uuid_frecuency(addresses_df)

## Partición del dataset

In [None]:
addresses_df = addresses_df.repartition(32)

## Limpieza del dataset

Tras visualizar el dataset, se observa que hay columnas que tienen valores desconocidos, representados con '_U', y valores nulos.

In [None]:
print(f"Initial dataset size: {addresses_df.count()}")

### Valores nulos

In [None]:
from pyspark.sql.functions import isnan, when, count, col

addresses_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in addresses_df.columns]
).show()

In [None]:
addresses_df = addresses_df.na.fill("_U")

addresses_df.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in addresses_df.columns]
).show()

### Extracción del prefijo para eliminar valores '_U'

Tras analizar el dataset, se observa que, para una dirección, el tipo de vía (tvia) puede aparecer vacío aunque ese valor sí aparezca en la columna 'direccion'. Por ello, se extraerá el prefijo de la columna 'direccion' para rellenar los valores vacíos de la columna 'tvia', evitando así la pérdida de información.

In [None]:
print(
    f"Unknown tvia before prefix extraction: {addresses_df.filter(addresses_df.tvia == '_U').count()}"
)

In [None]:
tvia_types = addresses_df.select("tvia").distinct().collect()
tvia_types = {t.tvia for t in tvia_types if t.tvia != "_U" and not t.tvia.isnumeric()}
print(tvia_types)

In [None]:
from pyspark.sql.functions import regexp_extract, when

# First whitespace-delimited token of the full address.
direccion_prefix = regexp_extract("direccion", r"^(\S+)", 1)

# Only unknown street types whose address prefix is a known type are filled.
condition = (addresses_df.tvia == "_U") & direccion_prefix.isin(tvia_types)
filled_tvia = when(condition, direccion_prefix).otherwise(addresses_df.tvia)
addresses_df = addresses_df.withColumn("tvia", filled_tvia)

In [None]:
print(
    f"Unknown tvia after prefix extraction: {addresses_df.filter(addresses_df.tvia == '_U').count()}"
)

Se puede observar que los tipos de vía que siguen siendo desconocidos lo son porque, a partir de la columna de dirección, no se ha podido extraer un prefijo que aparezca en la lista de tipos de vía presentes en el dataset.

### Limpieza de entradas con valor '_U'

Tras realizar la limpieza del dataset y extraer el tipo de vía de la columna 'direccion', se procederá a eliminar las entradas restantes que contengan el valor '_U' en la columna 'tvia', pues no aportan información relevante.

In [None]:
print(
    f"Unknown tvia before duplicated null cleaning: {addresses_df.filter(addresses_df.tvia == '_U').count()}"
)
print(addresses_df.count())

In [None]:
addresses_df = addresses_df.filter(addresses_df.tvia != "_U")

In [None]:
print(
    f"Unknown tvia after duplicated null cleaning: {addresses_df.filter(addresses_df.tvia == '_U').count()}"
)

In [None]:
print(f"Tamaño final del dataframe: {addresses_df.count()}")

## Filtros de prueba

In [None]:
AUMENTED_DATA = False
MIN_FRENCUENCY = 50

## Aumento de datos

### Frecuencia de uuid_idt antes

In [None]:
uuid_frecuency(addresses_df)

In [None]:
addresses_df.filter("uuid_idt == '015AF5E4-05FD-4636-B80E-1835A087D3DC'").show()

### Aumento de datos

Si así se especifica, se realizará un aumento de datos para crear direcciones artificiales a partir de las existentes, con el fin de reducir el número de uuid_idt que tienen pocas direcciones asociadas. Los cambios a realizar en las nuevas direcciones consistirán en:
- Añadir errores ortográficos intercambiando dos letras adyacentes de una palabra.
- Cambiar el tipo de vía por uno aleatorio.

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import random


# Define la función UDF
def switch_letters(direcction: str) -> str:
    """Swap two adjacent letters in one randomly chosen word of an address.

    Only purely alphabetic words with at least two characters are candidates.
    Exactly one word occurrence is altered, even when the same word appears
    several times in the address.

    Args:
        direcction: Address text to perturb.

    Returns:
        The words of the address re-joined with single spaces and one of them
        altered, or the input unchanged when it has no candidate word.
    """
    words = direcction.split()
    candidate_positions = [
        index
        for index, word in enumerate(words)
        if word.isalpha() and len(word) >= 2
    ]
    if not candidate_positions:
        return direcction
    # Choose a position rather than a word value so repeated words are not all
    # modified at once.
    target = random.choice(candidate_positions)
    word = words[target]
    pos = random.randint(0, len(word) - 2)
    words[target] = word[:pos] + word[pos + 1] + word[pos] + word[pos + 2 :]
    return " ".join(words)


def update_direction_via(tvia: str, direction: str) -> str:
    """Replace the leading street-type token of an address with ``tvia``.

    The replacement only happens when the upper-cased ``tvia`` is one of the
    known street types in ``tvia_types``. Only the text before the first space
    is replaced; later occurrences of the same text are left untouched.

    Args:
        tvia: New street type.
        direction: Full address whose first token is the current street type.

    Returns:
        The address with its first token replaced, or the address unchanged
        when ``tvia`` is not a known street type.
    """
    if tvia.upper() not in tvia_types:
        return direction
    # str.replace would rewrite every occurrence of the old prefix (and, for
    # an empty prefix, insert the type between all characters), so the address
    # is split once and rebuilt instead.
    _old_tvia, separator, remainder = direction.partition(" ")
    return tvia + separator + remainder


def select_via_type(tvia: str) -> str:
    """Pick a random street type different from ``tvia``.

    The comparison ignores case, so the current type is excluded whatever
    casing it uses in ``tvia_types``.

    Args:
        tvia: Current street type.

    Returns:
        A street type drawn from ``tvia_types``, or ``tvia`` itself when no
        alternative exists.
    """
    current = tvia.casefold()
    alternatives = [t for t in tvia_types if t.casefold() != current]
    if not alternatives:
        # random.choice raises IndexError on an empty sequence.
        return tvia
    return random.choice(alternatives)


# Register the augmentation functions with Spark under their own names.
spark.udf.register("switch_letters", switch_letters, StringType())
spark.udf.register("select_via_type", select_via_type, StringType())
spark.udf.register("update_direction_via", update_direction_via, StringType())

if AUMENTED_DATA:
    print("enter")

    # The UDFs that draw random values are flagged as non-deterministic so the
    # optimizer does not duplicate their calls.
    select_via_type_udf = udf(select_via_type, StringType()).asNondeterministic()
    switch_letters_udf = udf(switch_letters, StringType()).asNondeterministic()
    update_direction_via_udf = udf(update_direction_via, StringType())

    # Half of the identifiers with at most MIN_FRENCUENCY addresses.
    unique_filtered_uuids = (
        addresses_df.groupBy("uuid_idt")
        .count()
        .filter(f"count <= {MIN_FRENCUENCY}")
        .select("uuid_idt")
        .sample(0.5)
    )

    to_extend = addresses_df.join(unique_filtered_uuids, "uuid_idt")

    # Cached so the rows shown and counted below are the rows that get added;
    # without it every action would draw a new sample and new random edits.
    # NOTE(review): an evicted cache partition would still be recomputed.
    extended_ds = (
        to_extend.withColumn("tvia", select_via_type_udf("tvia"))
        .withColumn("direccion", update_direction_via_udf("tvia", "direccion"))
        .withColumn("direccion", switch_letters_udf("direccion"))
        .cache()
    )

    extended_ds.show(truncate=False)
    print(extended_ds.count(), addresses_df.count())
    # Columns are matched by name so the result does not depend on their order.
    addresses_df = addresses_df.unionByName(extended_ds)
    print(addresses_df.count())

### Frecuencia de uuid_idt después

In [None]:
uuid_frecuency(addresses_df)

## Filtro


Con el fin de reducir el problema, se aplica un filtro por el municipio con más direcciones asociadas.

In [None]:
MUNICIPIO = "SAN CRISTÓBAL DE LA LAGUNA"

In [None]:
if MUNICIPIO:
    # Identifiers with at least MIN_FRENCUENCY addresses in the whole dataset.
    uuid_counts = addresses_df.groupBy("uuid_idt").count()
    filtered_uuids = uuid_counts.filter(
        uuid_counts["count"] >= MIN_FRENCUENCY
    ).select("uuid_idt")

    frequent_addresses = addresses_df.join(filtered_uuids, "uuid_idt")
    # Column expressions are used instead of interpolated SQL text so a
    # municipality name containing a quote cannot break the predicate.
    addresses_df = frequent_addresses.filter(
        frequent_addresses["nommun"] == MUNICIPIO
    )
addresses_df = addresses_df.select("uuid_idt", "latitud", "longitud", "direccion")

## División de dataset

In [None]:
# Collect the distinct identifiers into a Python list.
unique_uuids = [row[0] for row in addresses_df.select("uuid_idt").distinct().collect()]

# Train split: roughly 80% of the rows of every identifier.
# Cached so later actions reuse one sample rather than re-drawing it.
# NOTE(review): caching is best-effort; evicted partitions are recomputed.
train_df = addresses_df.sampleBy(
    "uuid_idt", fractions={uuid: 0.8 for uuid in unique_uuids}, seed=42
).cache()

# Test split: the rows left after removing the train rows. exceptAll keeps
# duplicates, whereas subtract is a DISTINCT set difference: it would collapse
# repeated rows and drop any test row identical to a train row.
test_df = addresses_df.exceptAll(train_df)

In [None]:
print(addresses_df.count())
print(train_df.count())
print(test_df.count())

In [None]:
print(addresses_df.select("uuid_idt").distinct().count())
print(train_df.select("uuid_idt").distinct().count())
print(test_df.select("uuid_idt").distinct().count())

In [None]:
uuid_frecuency(train_df)
uuid_frecuency(test_df)

## Representación gráfica

In [None]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="direccion", outputCol="words")
tokenized_sentences = tokenizer.transform(addresses_df).select("words").collect()
tokenized_sentences = [sentence.words for sentence in tokenized_sentences]
print(tokenized_sentences[:10])

In [None]:
from collections import Counter
from itertools import chain

# Count every token occurrence across all tokenized addresses.
token_counts = Counter(chain.from_iterable(tokenized_sentences))
total_tokens = sum(token_counts.values())

word_counter = spark.createDataFrame(
    list(token_counts.items()), ["word", "count"]
).orderBy("count", ascending=False)

# Relative frequency: occurrences of the word over the total number of tokens.
# The previous denominator was the number of distinct words, which does not
# produce a frequency.
word_counter = word_counter.withColumn(
    "frequency", word_counter["count"] / total_tokens
)
word_counter.show()

In [None]:
import matplotlib.pyplot

# Plot the 15 most frequent words. word_counter is ordered by descending count,
# so only the first 15 rows are brought to the driver.
word_counter.limit(15).toPandas().plot.bar(x="word", y="frequency")

## Representación con embeddings

### Representación mediante _Word2Vec_

In [None]:
from pyspark.ml.feature import Word2Vec, Tokenizer

# Split every address into tokens, for both splits.
tokenizer = Tokenizer(inputCol="direccion", outputCol="words")
train_tokens, test_tokens = (
    tokenizer.transform(split_df) for split_df in (train_df, test_df)
)

# Fit Word2Vec on the train tokens only, then embed both splits with it.
word2Vec = Word2Vec(vectorSize=200, minCount=0, inputCol="words", outputCol="embedding")
word2vec_model = word2Vec.fit(train_tokens)

train_embeddings_word2vec, test_embeddings_word2vec = (
    word2vec_model.transform(tokens) for tokens in (train_tokens, test_tokens)
)

### Representación con _GPT_

Lectura de la variable de entorno donde se almacena la API key de OpenAI

In [None]:
import os

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("No API key found in environment variables")

In [None]:
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import udf
from openai import OpenAI
import numpy as np


def normalize_l2(x):
    """Scale a vector, or each row of a matrix, to unit Euclidean length.

    Args:
        x: Array-like of numbers. One-dimensional input is treated as a single
            vector; any other input is normalized row by row (axis 1).

    Returns:
        The normalized values as (nested) Python lists. All-zero vectors and
        all-zero rows are returned as they are, not divided by zero.
    """
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x.tolist()
        return (x / norm).tolist()

    norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it at zero, matching the 1-D guard above
    # and avoiding NaN values.
    safe_norm = np.where(norm == 0, 1, norm)
    return (x / safe_norm).tolist()
    

def get_embedding(text, model="text-embedding-3-small"):
    """Embed ``text`` with the OpenAI embeddings API.

    Line breaks in the text are replaced by spaces, the first 256 components
    of the returned embedding are kept and the result is L2-normalized.

    Args:
        text: Text to embed.
        model: Name of the OpenAI embedding model.

    Returns:
        The truncated, normalized embedding as a list.
    """
    openai_client = OpenAI(api_key=api_key)
    single_line = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line], model=model)
    truncated = response.data[0].embedding[:256]
    return normalize_l2(truncated)


# Wrap get_embedding as a Spark UDF that returns an array of floats.
embedding_udf = udf(get_embedding, ArrayType(FloatType()))

# Attach the embedding of each address to both splits.
test_embeddings_gpt = test_df.withColumn("embedding", embedding_udf("direccion"))
train_embeddings_gpt = train_df.withColumn("embedding", embedding_udf("direccion"))

### Representación con _MiniLM_

In [None]:
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import udf
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")


@udf(returnType=ArrayType(FloatType()))
def minilm_embedding_list(direccion: str):
    """Encode an address with the MiniLM model and return it as a plain list.

    Args:
        direccion: Address to be represented by the embedding model.

    Returns:
        A list with the embedding of the address, matching the declared
        array-of-float return type.
    """
    return model.encode(direccion).tolist()


train_embeddings_minilm = train_df.withColumn(
    "embedding", minilm_embedding_list("direccion")
)
test_embeddings_minilm = test_df.withColumn(
    "embedding", minilm_embedding_list("direccion")
)

## Similitud de direcciones

### Distancia del coseno

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import numpy as np


def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (any array-like accepted by ``np.array``).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a Python float, or 0.0
        when either vector has zero norm.
    """
    v1_np = np.array(v1)
    v2_np = np.array(v2)
    norm_product = np.linalg.norm(v1_np) * np.linalg.norm(v2_np)
    if norm_product == 0:
        # A zero vector has no direction; dividing would give NaN, which would
        # then leak into any ranking built on this score.
        return 0.0
    return float(np.dot(v1_np, v2_np) / norm_product)


cosine_similarity_udf = udf(cosine_similarity, DoubleType())

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, row_number, expr, max as spark_max
from pyspark.sql.window import Window


def get_top_matches(
    test_df: DataFrame,
    train_df: DataFrame,
    embedding_col: str,
    address_col: str,
    top_n: int = 3,
) -> DataFrame:
    """Rank train addresses by cosine similarity for every test address.

    Every test row is paired with every train row (cross join), each pair is
    scored with ``cosine_similarity_udf`` and the pairs are ranked inside a
    window partitioned by the test address text.

    Args:
        test_df: DataFrame containing the addresses to evaluate.
        train_df: DataFrame containing the reference addresses.
        embedding_col: Name of the column holding the precomputed embeddings.
        address_col: Name of the column holding the address text.
        top_n: Number of best-ranked pairs kept per test address.

    Returns:
        DataFrame with the columns of both inputs (aliased ``test_df`` and
        ``train_df``) plus ``cosine_similarity`` and ``rank``, restricted to
        rows whose rank is at most ``top_n``.
    """
    queries = test_df.alias("test_df")
    references = train_df.alias("train_df")

    similarity = cosine_similarity_udf(
        col(f"test_df.{embedding_col}"), col(f"train_df.{embedding_col}")
    )
    scored_pairs = queries.crossJoin(references).withColumn(
        "cosine_similarity", similarity
    )

    # Highest similarity first within each test address.
    ranking = Window.partitionBy(col(f"test_df.{address_col}")).orderBy(
        col("cosine_similarity").desc()
    )
    ranked_pairs = scored_pairs.withColumn("rank", row_number().over(ranking))
    return ranked_pairs.filter(col("rank") <= top_n)


def evaluate_and_count_matches(
    top_n_df: DataFrame, test_id_col: str, train_id_col: str, address_col: str
) -> int:
    """Count the test addresses that have at least one matching candidate.

    A candidate row is a match when its test identifier equals its train
    identifier. Rows are grouped by the test address text, and an address
    counts once if any of its candidate rows is a match.

    Args:
        top_n_df: DataFrame with one row per (test address, candidate) pair,
            whose test-side columns are reachable under the ``test_df`` alias.
        test_id_col: Name of the column with the test identifier.
        train_id_col: Name of the column with the candidate identifier.
        address_col: Name of the address column; it is looked up as
            ``test_df.<address_col>``.

    Returns:
        The number of distinct test addresses with at least one match.
    """
    # Flag each candidate row whose identifiers coincide.
    top_n_df = top_n_df.withColumn("evaluation", col(test_id_col) == col(train_id_col))
    # has_match is 1 when any candidate of the address matched, else 0.
    matches_by_address = top_n_df.groupBy(f"test_df.{address_col}").agg(
        spark_max(expr("case when evaluation = true then 1 else 0 end")).alias(
            "has_match"
        )
    )
    matches = matches_by_address.filter("has_match = 1")
    return matches.count()

### Word2Vec

In [None]:
print(train_embeddings_word2vec.count())
print(test_embeddings_word2vec.count())

In [None]:
# Build the ranking of the best candidates for every test address.
best_results_df = get_top_matches(
    test_embeddings_word2vec,
    train_embeddings_word2vec,
    "embedding",
    "direccion",
    top_n=3,
)

# get_top_matches only chains DataFrame transformations; the count inside
# evaluate_and_count_matches is what runs the job. The elapsed time is taken
# after it so the matching itself is included.
matches_count = evaluate_and_count_matches(
    best_results_df, "test_df.uuid_idt", "train_df.uuid_idt", "direccion"
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")
print(f"Number of addresses with matches: {matches_count}")


### MiniLM

In [None]:
# Build the ranking of the best candidates for every test address.
best_results_df = get_top_matches(
    test_embeddings_minilm,
    train_embeddings_minilm,
    "embedding",
    "direccion",
    top_n=3,
)

# get_top_matches only chains DataFrame transformations; the count inside
# evaluate_and_count_matches is what runs the job. The elapsed time is taken
# after it so the matching itself is included.
matches_count = evaluate_and_count_matches(
    best_results_df, "test_df.uuid_idt", "train_df.uuid_idt", "direccion"
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")
print(f"Number of addresses with matches: {matches_count}")

### GPT

In [None]:
best_results_df = get_top_matches(
    test_embeddings_gpt,
    train_embeddings_gpt,
    "embedding",
    "direccion",
    top_n=3,
)
matches_count = evaluate_and_count_matches(
    best_results_df, "test_df.uuid_idt", "train_df.uuid_idt", "direccion"
)
print(f"Number of addresses with matches: {matches_count}")

## Similitud de direcciones con Chroma DB

#### Connect

In [None]:
import chromadb

chroma_client = chromadb.Client()

In [None]:
from pyspark.ml.linalg import DenseVector
from pyspark.sql import DataFrame


def extract_to_chroma(dataframe: DataFrame, type_df: str):
    """Collect a DataFrame into the parallel lists used to fill a collection.

    Args:
        dataframe: DataFrame with the columns ``uuid_idt``, ``latitud``,
            ``longitud``, ``direccion`` and ``embedding``.
        type_df: Label stored in every metadata entry ("train" or "test").

    Returns:
        embeddings: One embedding per row; ``DenseVector`` values are
            converted to plain lists, other values are kept as they are.
        documents: The address text of every row.
        metadata: One dict per row with ``uuid_idt``, ``latitud``,
            ``longitud`` and ``type_df``.
        ids: Consecutive string ids ("0", "1", ...) in row order.
    """
    collected = dataframe.select(
        "uuid_idt", "latitud", "longitud", "direccion", "embedding"
    ).collect()

    ids, documents, embeddings, metadata = [], [], [], []
    for position, record in enumerate(collected):
        vector = record.embedding
        if isinstance(vector, DenseVector):
            vector = vector.toArray().tolist()

        ids.append(str(position))
        documents.append(str(record.direccion))
        embeddings.append(vector)
        metadata.append(
            {
                "uuid_idt": str(record.uuid_idt),
                "latitud": float(record.latitud),
                "longitud": float(record.longitud),
                "type_df": type_df,
            }
        )
    return embeddings, documents, metadata, ids


def _result_rows(test_result_collection: dict, train_result_collection: dict) -> list:
    """Pair one evaluated test address with each of its retrieved candidates."""
    test_metadata = test_result_collection["metadatas"][0]
    test_data = (
        test_metadata["uuid_idt"],
        test_metadata["latitud"],
        test_metadata["longitud"],
        test_result_collection["documents"][0],
    )
    candidates = zip(
        train_result_collection["metadatas"][0],
        train_result_collection["documents"][0],
    )
    return [
        test_data
        + (metadata["uuid_idt"], metadata["latitud"], metadata["longitud"], direccion)
        for metadata, direccion in candidates
    ]


def extract_data(
    test_result_collection: dict, train_result_collection: dict, dataframe
):
    """Append the test/candidate pairs of one evaluation to a DataFrame.

    Args:
        test_result_collection (dict): Result of fetching one test entry; its
            first metadata and document describe the evaluated address.
        train_result_collection (dict): Result of querying the train
            collection; the first list of metadatas/documents holds the
            retrieved candidates.
        dataframe (DataFrame): DataFrame (with the module-level ``schema``) to
            which the new rows are appended.

    Returns:
        DataFrame: ``dataframe`` plus one row per candidate, with the columns
            - uuid_idt_test
            - latitud_test
            - longitud_test
            - direccion_test
            - uuid_idt_train
            - latitud_train
            - longitud_train
            - direccion_train
    """
    rows = _result_rows(test_result_collection, train_result_collection)
    return dataframe.union(spark.createDataFrame(rows, schema=schema))


def evaluation_collections(
    collection_test: chromadb.api.models.Collection.Collection,
    collection_train: chromadb.api.models.Collection.Collection,
    n_results: int = 3,
):
    """Query the train collection with every test embedding and count hits.

    Test entries are fetched by the consecutive string ids "0", "1", ... A
    test entry is a hit when its ``uuid_idt`` appears among the ``uuid_idt``
    values of the retrieved candidates.

    Args:
        collection_test: Collection with the data to search for.
        collection_train: Collection queried for the closest embeddings.
        n_results: Number of candidates requested per test entry.

    Returns:
        match: Number of hits.
        no_match: Number of misses.
        evaluated_dirs: DataFrame with one row per (missed test address,
            candidate) pair; hits are not included.
    """
    # The size does not change during evaluation, so it is queried once.
    total = collection_test.count()

    # Rows are gathered in plain Python and turned into a DataFrame once at the
    # end; one union per miss would build an ever-growing query plan.
    missed_rows = []
    match = 0
    no_match = 0
    for counter in range(total):
        data_to_evaluate = collection_test.get(
            ids=[str(counter)], include=["embeddings", "metadatas", "documents"]
        )

        embedding_to_evaluate = data_to_evaluate["embeddings"][0]
        uuid_to_evaluate = data_to_evaluate["metadatas"][0]["uuid_idt"]

        best_similarities = collection_train.query(
            query_embeddings=[embedding_to_evaluate], n_results=n_results
        )

        best_uuids = {
            candidate["uuid_idt"]
            for group in best_similarities["metadatas"]
            for candidate in group
        }

        if uuid_to_evaluate in best_uuids:
            match += 1
        else:
            no_match += 1
            missed_rows.extend(_result_rows(data_to_evaluate, best_similarities))

    evaluated_dirs = spark.createDataFrame(missed_rows, schema)
    return match, no_match, evaluated_dirs


def chunk_data(data, chunk_size: int):
    """Lazily split a sliceable sequence into consecutive batches.

    Args:
        data: Sequence supporting ``len`` and slicing.
        chunk_size: Maximum number of items per batch.

    Yields:
        Slices of ``data`` with ``chunk_size`` items each; the last one may be
        shorter.
    """
    yield from (
        data[start : start + chunk_size]
        for start in range(0, len(data), chunk_size)
    )


def add_to_collection(
    collection: chromadb.api.models.Collection.Collection, 
    embeddings_list: list, 
    documents_list: list, 
    metadata_list: list, 
    ids_list: list,
    batch_size: int = 41666,
):
    """
    Add elements to a collection in batches.

    The four lists are parallel: position ``i`` of each one describes the same
    element.

    Args:
        collection: The collection to which the elements will be added.
        embeddings_list (list): A list of embeddings.
        documents_list (list): A list of documents.
        metadata_list (list): A list of metadata.
        ids_list (list): A list of IDs.
        batch_size (int): Maximum number of elements sent per ``add`` call.
            NOTE(review): the default presumably mirrors a Chroma batch-size
            limit — confirm against the installed version.

    Returns:
        None

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # The chunk generators are consumed in lockstep, so only one batch of each
    # list is sliced at a time.
    batches = zip(
        chunk_data(embeddings_list, batch_size),
        chunk_data(documents_list, batch_size),
        chunk_data(metadata_list, batch_size),
        chunk_data(ids_list, batch_size),
    )
    for embeddings, documents, metadata, ids in batches:
        collection.add(
            embeddings=embeddings, documents=documents, metadatas=metadata, ids=ids
        )


# Schema string passed to spark.createDataFrame for the evaluation results:
# the four fields of the evaluated (test) address followed by the same four
# fields of a retrieved (train) candidate.
schema = "uuid_idt_test: string, \
        latitud_test: float, \
        longitud_test: float, \
        direccion_test: string, \
        uuid_idt_train: string, \
        latitud_train: float, \
        longitud_train: float, \
        direccion_train: string"

#### Word2Vec

In [None]:
# extract_to_chroma returns (embeddings, documents, metadata, ids) and
# add_to_collection takes them in that same order. The names below now say
# what each value holds; the values passed are unchanged.
embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    test_embeddings_word2vec, "test"
)
collection_test_w2v = chroma_client.create_collection(name="test_df_w2v")
add_to_collection(
    collection_test_w2v, embeddings_list, documents_list, metadata_list, ids_list
)

embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    train_embeddings_word2vec, "train"
)
collection_train_w2v = chroma_client.create_collection(name="train_df_w2v")
add_to_collection(
    collection_train_w2v, embeddings_list, documents_list, metadata_list, ids_list
)

In [None]:
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

In [None]:
matches_count, no_matches_count, best_results_df = evaluation_collections(
    collection_test_w2v, 
    collection_train_w2v, 
)
print(matches_count, no_matches_count)

#### GPT

In [None]:
chroma_client.delete_collection("test_df")
chroma_client.delete_collection("train_df")

In [None]:
collection_test_gpt = chroma_client.create_collection(name="test_df")
collection_train_gpt = chroma_client.create_collection(name="train_df")

In [None]:
# extract_to_chroma returns (embeddings, documents, metadata, ids) and
# add_to_collection takes them in that same order. The names below now say
# what each value holds; the values passed are unchanged.
embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    test_embeddings_gpt, "test"
)
add_to_collection(
    collection_test_gpt, embeddings_list, documents_list, metadata_list, ids_list
)

embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    train_embeddings_gpt, "train"
)
add_to_collection(
    collection_train_gpt, embeddings_list, documents_list, metadata_list, ids_list
)

In [None]:
matches_count, no_matches_count, best_results_df = evaluation_collections(
    collection_test_gpt, 
    collection_train_gpt, 
)
print(matches_count, no_matches_count)

#### MiniLM

In [None]:
chroma_client.delete_collection("test_df_minilm")
chroma_client.delete_collection("train_df_minilm")

In [None]:
collection_test_minilm = chroma_client.create_collection(name="test_df_minilm")
collection_train_minilm = chroma_client.create_collection(name="train_df_minilm")

In [None]:
embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    test_embeddings_minilm, "test"
)
add_to_collection(
    collection_test_minilm, embeddings_list, documents_list, metadata_list, ids_list
)

embeddings_list, documents_list, metadata_list, ids_list = extract_to_chroma(
    train_embeddings_minilm, "train"
)
add_to_collection(
    collection_train_minilm, embeddings_list, documents_list, metadata_list, ids_list
)

In [None]:
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

In [None]:
matches_count, no_matches_count, best_results_df = evaluation_collections(
    collection_test_minilm, 
    collection_train_minilm
)
print(matches_count, no_matches_count)

## Evaluación

In [None]:
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Tiempo de ejecución: {elapsed_time} segundos")

Se implementa la distancia de Haversine, con la cual se podrá saber qué tan lejos está la dirección de evaluación con respecto a la media de las N mejores direcciones.

In [None]:
# modelo = 'Word2Vec'
# modelo = 'GTP'
modelo = "MiniLM"

# tecnology = 'Spark'
tecnology = "Chroma DB"

In [None]:
from pyspark.sql.functions import col, udf
from haversine import haversine
from pyspark.sql.types import DoubleType

# UDF computing the haversine distance between two (latitude, longitude) points.
haversine_udf = udf(
    lambda lat1, lon1, lat2, lon2: haversine((lat1, lon1), (lat2, lon2)),
    returnType=DoubleType(),
)

# Coordinate columns (test latitude, test longitude, train latitude, train
# longitude) as they are named in the results of each technology.
coordinate_columns = {
    "Chroma DB": ("latitud_test", "longitud_test", "latitud_train", "longitud_train"),
    "Spark": (
        "test_df.latitud",
        "test_df.longitud",
        "train_df.latitud",
        "train_df.longitud",
    ),
}

if tecnology not in coordinate_columns:
    # Fail here with a clear message; continuing would either raise a NameError
    # below or silently reuse a stale evaluated_dirs from an earlier run.
    raise ValueError(
        f"Unknown tecnology {tecnology!r}; expected one of {sorted(coordinate_columns)}"
    )

evaluated_dirs = best_results_df.withColumn(
    "haversine_distance",
    haversine_udf(*(col(name) for name in coordinate_columns[tecnology])),
)

evaluated_dirs = evaluated_dirs.repartition(32)

Exporta el dataframe con la evaluación.

In [None]:
# Write the evaluated pairs as CSV after coalescing them into one partition.
# NOTE(review): the target path ends in a bare ".csv" with no name before the
# extension — presumably a name was meant to be filled in; confirm.
evaluated_dirs.coalesce(1).write.csv("../data/proccesed_data/.csv")

In [None]:
from pyspark.sql.functions import avg
# Compute both distance statistics in a single Spark job.
distance_stats = evaluated_dirs.selectExpr(
    "avg(haversine_distance) AS mean_distance",
    "max(haversine_distance) AS max_distance",
).first()

# Both aggregates are NULL when there are no evaluated rows; round(None) would
# raise, so the missing value is kept as None.
mean_value = (
    round(distance_stats["mean_distance"], 2)
    if distance_stats["mean_distance"] is not None
    else None
)
max_value = (
    round(distance_stats["max_distance"], 2)
    if distance_stats["max_distance"] is not None
    else None
)

# Guard against an empty test split to avoid a division by zero.
test_size = test_df.count()
hit_rate = round(matches_count / test_size, 2) if test_size else 0.0

In [None]:
import csv
import os


def write_model_results(
    cabecera: list,
    data: list,
    csv_file_path: str = "../data/proccesed_data/results.csv",
):
    """Append result rows to a CSV file, creating it when needed.

    The header is written only when the file does not exist yet or is empty,
    so repeated calls accumulate rows under a single header.

    Args:
        cabecera: Column names written as the header row.
        data: Rows to append; each row is a list of values.
        csv_file_path: Destination file. Missing parent directories are
            created.
    """
    parent_dir = os.path.dirname(csv_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    escribir_cabecera = (
        not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0
    )

    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        if escribir_cabecera:
            writer.writerow(cabecera)
        writer.writerows(data)

    print(f"CSV file '{csv_file_path}' created successfully.")


# One results row, keyed by its column title; insertion order defines both the
# header and the order of the values.
result_record = {
    "Model": modelo,
    "Tecnology": tecnology,
    "Municipaly": MUNICIPIO,
    "Train dataset size": train_df.count(),
    "Test dataframe size": test_df.count(),
    "Aumented data": AUMENTED_DATA,
    "Frecuency": MIN_FRENCUENCY,
    "Hit rate": hit_rate,
    "Mean Haversine distance": mean_value,
    "Max Haversine distance": max_value,
}

cabecera = list(result_record)
data = [list(result_record.values())]

write_model_results(cabecera, data)