In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd

import numpy as np
from typing import Union

  from tqdm.autonotebook import tqdm, trange


In [2]:
# load data with descriptions
df = pd.read_csv('data\geocoded_addresses.csv')

In [3]:
def generate_embeddings(
    input_csv: Union[str, pd.DataFrame],
    output_csv: str,
    embedding_model_name: str = "all-MiniLM-L6-v2",
    ) -> None:
    """
    Generates combined text and geospatial embeddings for places.

    Parameters:
    - input_csv (str): Path to the input CSV file with place data.
    - output_csv (str): Path to save the output CSV with embeddings.
    - embedding_model_name (str): Pretrained SentenceTransformer model name.
    - pca_components (int): Number of PCA components for dimensionality reduction.

    Returns:
    - None
    """

    if isinstance(input_csv, str):
        df = pd.read_csv(input_csv)
    elif isinstance(input_csv, pd.DataFrame):
        df = input_csv
    else:
        raise ValueError("input_csv should be a path to a CSV file or a DataFrame")

    # check if there is description and used columns
    if "description" not in df.columns:
        raise ValueError("Input CSV should contain a 'description' column")


    model = SentenceTransformer(embedding_model_name)

    df["text"] = df["name"].fillna("") + " " + df["description"].fillna("") + " " + df['category'].fillna("") + " " + df['type'].fillna("")

    text_embeddings = model.encode(df["text"].tolist(), convert_to_numpy=True)

    embeddings_df = pd.DataFrame(
        text_embeddings,
        columns=[f"emb_{i}" for i in range(text_embeddings.shape[1])],
    )

    df = pd.concat([df, embeddings_df], axis=1)
    
    # create an id column
    df['id'] = np.arange(len(df))
    
    df.to_csv(output_csv, index=False)

In [None]:
input_csv = 'data\geocoded_addresses.csv'
output_csv = 'data\database_embeddings.csv'
model = "all-MiniLM-L6-v2"
# model = "embaas/sentence-transformers-e5-large-v2"
# model = "BAAI/bge-m3"

In [5]:

generate_embeddings(input_csv, output_csv, model)

In [6]:
embeddings = pd.read_csv('data\database_embeddings.csv')

In [7]:
# are there nan in emb columns?
emb_cols = [col for col in embeddings.columns if 'emb' in col]
embeddings[emb_cols].isna().sum().sum()

0

In [8]:
def compute_similarity_matrix(embeddings: pd.DataFrame) -> np.ndarray:
    """
    Computes a similarity matrix from the embeddings.

    Parameters:
    - embeddings (pd.DataFrame): DataFrame with embeddings.

    Returns:
    - np.ndarray: Similarity matrix.
    """

    emb_cols = [col for col in embeddings.columns if col.startswith("emb_")]
    
    embeddings = embeddings[emb_cols].values

    similarity_matrix = np.inner(embeddings, embeddings)

    return similarity_matrix


def top_n_similar(
    similarity_matrix: np.ndarray,
    place_id: int,
    n: int = 5,
    ) -> pd.DataFrame:
    """
    Finds top N similar places for a given place.

    Parameters:
    - similarity_matrix (np.ndarray): Similarity matrix.
    - place_id (int): Place ID to find similar places for.
    - n (int): Number of similar places to return.

    Returns:
    - pd.DataFrame: DataFrame with similar places.
    """

    similar_places = np.argsort(similarity_matrix[place_id])[::-1][1:n+1]

    return similar_places


def create_top_n_cols(
    embeddings: pd.DataFrame,
    similarity_matrix: np.ndarray,
    n: int = 5,
    ) -> pd.DataFrame:
    """
    Creates top N similar places columns for each place.

    Parameters:
    - embeddings (pd.DataFrame): DataFrame with embeddings.
    - similarity_matrix (np.ndarray): Similarity matrix.
    - n (int): Number of similar places to return.

    Returns:
    - pd.DataFrame: DataFrame with top N similar places columns.
    """

    top_n_cols = []

    for i in range(len(embeddings)):
        top_n = top_n_similar(similarity_matrix, i, n)
        top_n_cols.append(top_n)

    top_n_df = pd.DataFrame(top_n_cols, columns=[f"top_{i+1}" for i in range(n)])

    top_n_df['id'] = np.arange(len(top_n_df))
    
    embeddings = pd.merge(embeddings, top_n_df, on='id', how='left')

    return embeddings

def compute_similarity_vector(
    embedding_query: np.ndarray,
    embeddings_base: pd.DataFrame,
    ) -> np.ndarray:
    
    """
    Computes a similarity vector for a given place.

    Parameters:
    - embedding (np.ndarray): Embedding of a query place.
    - embeddings (pd.DataFrame): DataFrame with embeddings.
    
    Returns:
    - np.ndarray: Similarity vector.
    
    """

    emb_cols = [col for col in embeddings_base.columns if col.startswith("emb_")]
    embeddings = embeddings_base[emb_cols].values
    
    similarity_vector = np.inner(embedding_query, embeddings)
    
    return similarity_vector

In [9]:
matrix = compute_similarity_matrix(embeddings)
top_n_df = create_top_n_cols(embeddings, matrix, 5)

In [None]:
def get_embedding(
    search_query: str,
    model_name: str = "all-MiniLM-L6-v2",
    ) -> np.ndarray:
    """
    Generates an embedding for a search query.
    
    Parameters:
    - search_query (str): Search query.
    - model_name (str): Pretrained SentenceTransformer model name.
    
    Returns:
    - np.ndarray: Embedding for the search query.
    """
    
    model = SentenceTransformer(model_name)
    
    embedding = model.encode(search_query, convert_to_numpy=True)
    
    return embedding

def top_n_query(
    search_query: str,
    embeddings: pd.DataFrame,
    n: int = 5,
    model_name: str = "all-MiniLM-L6-v2",
    ) -> pd.DataFrame:
    """
    Finds top N similar places for a given search query.
    
    Parameters:
    - search_query (str): Search query.
    - embeddings (pd.DataFrame): DataFrame with embeddings.
    - n (int): Number of similar places to return.
    - model_name (str): Pretrained SentenceTransformer model name.
    
    Returns:
    - pd.DataFrame: DataFrame with similar places.
    """
    
    embedding_query = get_embedding(search_query, model_name)
    
    similarity_vector = compute_similarity_vector(embedding_query, embeddings)
    
    similar_places = np.argsort(similarity_vector)[::-1][:n]
    
    return similar_places

def get_places_by_ids(
    ids: list,
    df: pd.DataFrame,
    ) -> pd.DataFrame:
    """
    Gets places by their IDs.
    
    Parameters:
    - ids (list): List of place IDs.
    - embeddings (pd.DataFrame): DataFrame with embeddings.
    
    Returns:
    - pd.DataFrame: DataFrame with places.
    """
    
    return df.iloc[ids]


In [47]:
test_str = "Cafe local"
get_places_by_ids(top_n_query(test_str, embeddings, 5, model_name=model), embeddings)[['name', 'description', 'category', 'type', 'Longitude', 'Latitude']]

Unnamed: 0,name,description,category,type,Longitude,Latitude
20,Café Noir Bar & Lounge,Dine-in Takeout Delivery,eco_cafes,Cafe,-75.567941,6.207344
23,Cafe Fundación.,Dine-in Takeout,eco_cafes,Cafe,-75.594601,6.243397
30,General Cafe Bar,Dine-in Takeout,eco_cafes,Coffee shop,-75.573069,6.214551
31,Pergamino | Cafe - Laureles,Dine-in Takeout Delivery,eco_cafes,Cafe,-75.596886,6.243245
353,Urbania Café,Dine-in Curbside pickup Delivery,community_gardens,Cafe,-75.567982,6.207446


In [44]:
import json

data_strs = [
    "Eco-friendly hotel",
    "Local Coffee Shop",
    "Local restaurant",
    "Park with beautiful views",
    "Susteinable hostel",
]

results = []

for i, data_str in enumerate(data_strs, start=1):
    result = get_places_by_ids(top_n_query(data_str, embeddings, 5, model_name=model), embeddings)[['name', 'description', 'Longitude', 'Latitude']]
    result_dict = {
        "number": i,
        "prompt": data_str,
        "data": result.to_dict(orient='records')
    }
    results.append(result_dict)

with open('results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [49]:
top_n_similar(embeddings)

TypeError: top_n_similar() missing 1 required positional argument: 'place_id'