# Trying out lancedb


In [1]:
import pandas as pd
import torch
import lancedb
from pathlib import Path
import numpy as np

CACHE_PATH = "./local"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class VectorDBConnection:
    """
    Small wrapper around a LanceDB connection.

    For every table registered through `add_table` it remembers which column
    holds the row id and which function turns query text into an embedding.
    """

    def __init__(self, database_dir="lancedb"):
        """
        Connect to the LanceDB database located at CACHE_PATH / database_dir.

        Args:
            database_dir (str): Directory name, relative to CACHE_PATH, of the database.
        """
        Path(CACHE_PATH).mkdir(parents=True, exist_ok=True)
        p = Path(CACHE_PATH) / database_dir

        print("Making new LanceDB connection")

        self.connection = lancedb.connect(p)
        # NOTE: the two registries below live in memory only, so every table has
        # to be registered again with add_table() after a fresh start.
        self.id_cols = {}
        self.embed_funcs = {}

    def _check(self, table_name, check_conn=False, check_id=False, check_embed=False):
        """
        Validate that `table_name` is ready for the requested kind of access.

        Args:
            table_name (str): The name of the table.
            check_conn (bool): Require the table to exist in the LanceDB connection.
            check_id (bool): Require an id column to be registered for the table.
            check_embed (bool): Require an embedding function to be registered for the table.

        Raises:
            ValueError: If any requested check fails.
        """
        if check_conn and table_name not in self.connection:
            raise ValueError(f"Table {table_name} not found in LanceDB.")

        if check_id and table_name not in self.id_cols:
            raise ValueError(f"Table {table_name} does not have an id column saved.")

        if check_embed and table_name not in self.embed_funcs:
            raise ValueError(f"Table {table_name} does not have an embedding function.")

    @staticmethod
    def _format_filter_value(value):
        """
        Render `value` as a literal for a LanceDB SQL filter expression.

        Non-string values, and strings that Python can parse as a number, are
        emitted bare, which is exactly what the filter contained before. Any
        other string is emitted as a single-quoted SQL string literal with
        embedded quotes doubled, so it can no longer break (or inject into)
        the filter.
        """
        if isinstance(value, str):
            try:
                float(value)
            except ValueError:
                return "'" + value.replace("'", "''") + "'"
        return f"{value}"

    def add_table(
        self, table_name: str, data: pd.DataFrame, id_col_name: str, embed_func
    ):
        """
        Add a table to the database, replacing any existing table of the same name.

        Args:
            table_name (str): The name of the table.
            data (pd.DataFrame): The dataframe containing the metadata and a column called "vector" with numpy arrays per row representing the vector representation.
            id_col_name (str): The name of the column in the dataframe that contains the unique identifier for each row.
            embed_func: function that takes a string and returns a numpy array embedding for this table.
                This MUST be the same model as the original embeddings or else comparison will not work.

        Returns:
            None
        """
        # self.table always refers to the most recently added table only.
        self.table = self.connection.create_table(
            table_name, data=data, mode="overwrite"
        )
        self.id_cols[table_name] = id_col_name
        self.embed_funcs[table_name] = embed_func

    def search(self, table_name: str, vector: np.ndarray, limit: int = 5):
        """
        Find ids of KNN docs to vector

        Args:
            table_name (str): The name of the table to search.
            vector (np.ndarray): The query vector.
            limit (int): Maximum number of neighbours to return.

        Returns:
            pd.DataFrame: The table's id column and the "_distance" column of the matches.

        Raises:
            ValueError: If the table does not exist or has no id column registered.
        """
        self._check(table_name, check_conn=True, check_id=True)

        id_col = self.id_cols[table_name]
        result = (
            self.connection[table_name].search(vector).limit(limit).select([id_col])
        )

        return result.to_pandas()[[id_col, "_distance"]]

    def get_embedding_from_string(self, table_name, text: str) -> np.ndarray:
        """
        Embed `text` with the embedding function registered for `table_name`.

        Raises:
            ValueError: If no embedding function is registered for the table.
        """
        self._check(table_name, check_embed=True)

        return self.embed_funcs[table_name](text)

    def get_embedding_from_id(self, table_name, id: "int | str") -> np.ndarray:
        """
        Get the embedding from the table by id

        Args:
            table_name (str): The name of the table.
            id: Value to match against the table's registered id column.

        Returns:
            The "vector" value of the first matching row.

        Raises:
            ValueError: If the table does not exist or has no id column registered.
            IndexError: If no row has the requested id.
        """
        self._check(table_name, check_conn=True, check_id=True)

        id_col = self.id_cols[table_name]
        filter_expr = f"{id_col} = {self._format_filter_value(id)}"
        vectors = (
            self.connection[table_name]
            .search()
            .where(filter_expr)
            .to_pandas()["vector"]
        )

        # Same exception type an empty result produced before, but with a
        # message that names the id instead of "out-of-bounds".
        if len(vectors) == 0:
            raise IndexError(
                f"No row with {id_col} = {id!r} found in table {table_name}."
            )

        return vectors.iloc[0]

In [3]:
import sentence_transformers


# SentenceTransformer instances already built by get_embedding, keyed by model name.
_EMBEDDING_MODELS = {}


def get_embedding(col: np.ndarray, model_name):
    """
    Encode `col` with the sentence-transformers model called `model_name`.

    The model object is built once per model name and reused on later calls,
    instead of being rebuilt for every single query.

    Args:
        col: Input handed straight to `model.encode`. The hint says np.ndarray,
            but this notebook also calls it with a single string.
        model_name (str): Name passed to `sentence_transformers.SentenceTransformer`.

    Returns:
        Whatever `model.encode(col)` returns.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = sentence_transformers.SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(col)

In [4]:
# Open the local LanceDB database and pick the table name used by the cells below.
vectordbconn = VectorDBConnection()
table_name = "vis_papers"

print("Loading vector data...")
# Paper metadata plus the saved embeddings for the same papers.
# NOTE: torch.load unpickles the file, so only point it at files you trust.
vis_paper_df = pd.read_parquet("../datasets/local/vis_papers/processed/vis_papers.parquet")
vis_paper_embeddings = torch.load("../datasets/local/vis_papers/processed/vis_papers_embeddings.pt")

# One array per dataframe row; assumes the embeddings are stored in the same
# order as the rows of the parquet file -- TODO confirm.
vis_paper_df["vector"] = list(vis_paper_embeddings.numpy())


def embed_func(x):
    """Embed query text with the all-mpnet-base-v2 model."""
    return get_embedding(x, "all-mpnet-base-v2")


vectordbconn.add_table(table_name, vis_paper_df, "id", embed_func)

Making new LanceDB connection
Loading vector data...


In [6]:
# Fetch the stored vector of the row with id 1 and search with it; the row
# itself comes back first with distance 0 in the output below.
vec = vectordbconn.get_embedding_from_id(table_name, id=1)
r = vectordbconn.search(table_name, vector=vec)

# One JSON object per hit.
r.to_json(orient="records")

'[{"id":1,"_distance":0.0},{"id":410,"_distance":0.7563974261},{"id":385,"_distance":0.7908704877},{"id":517,"_distance":0.7967621684},{"id":642,"_distance":0.7972276211}]'

In [27]:
r.to_json()

'{"id":{"0":1,"1":410,"2":385,"3":517,"4":642},"_distance":{"0":0.0,"1":0.7563974261,"2":0.7908704877,"3":0.7967621684,"4":0.7972276211}}'

In [28]:
# Embed a free-text query with the table's embedding function, then look up
# its nearest neighbours.
vec2 = vectordbconn.get_embedding_from_string(table_name, text="visualizing text data system")

r = vectordbconn.search(table_name, vector=vec2)

In [29]:
r

Unnamed: 0,id,_distance
0,1593,0.603509
1,819,0.647131
2,3160,0.664667
3,1832,0.666675
4,2271,0.682674


In [33]:
vis_paper_df[vis_paper_df.id.isin([1593, 819, 3160])]

Unnamed: 0,id,Conference,Year,Title,DOI,Link,FirstPage,LastPage,PaperType,Abstract,...,AminerCitationCount,CitationCount_CrossRef,PubsCited_CrossRef,Award,Abstract_text_length,Abstract_num_words,Abstract_max_word_length,Abstract_avg_word_length,Abstract_perc_special_chars,vector
819,819,VAST,2016,TextTile: An Interactive Visualization Tool fo...,10.1109/TVCG.2016.2598447,http://dx.doi.org/10.1109/TVCG.2016.2598447,161,170,J,"We describe TextTile, a data visualization too...",...,39.0,21.0,43.0,,804,127,15,5.338583,0.00995,"[-0.022825448, 0.027768482, -0.058228966, -0.0..."
1593,1593,VAST,2010,Understanding text corpora with multiple facets,10.1109/VAST.2010.5652931,http://dx.doi.org/10.1109/VAST.2010.5652931,99,106,C,Text visualization becomes an increasingly mor...,...,86.0,36.0,29.0,,1113,172,16,5.476744,0.020665,"[-0.012490144, -0.013318891, -0.05780364, 0.00..."
3160,3160,InfoVis,1996,Visualizing the results of multimedia Web sear...,10.1109/INFVIS.1996.559219,http://dx.doi.org/10.1109/INFVIS.1996.559219,64,"65, 122",M,Search engines are very useful because they al...,...,32.0,10.0,2.0,,414,69,14,5.014493,0.021739,"[0.0064505064, -0.017319938, -0.033198442, 0.0..."


In [None]:
uri = "local/sample-lancedb"
db = lancedb.connect(uri)

In [None]:
df["vector"] = list(vectors.numpy())

Make a LanceDB table. If you want, you can create an index with `tbl.create_index()` to support faster searching: by default LanceDB does an exhaustive KNN search, but once an index has been created it can do a faster approximate nearest-neighbour (ANN) search.

The docs say an index is not really necessary unless the table has more than 50k rows.


In [None]:
db.create_table("vis_papers", data=df, mode="overwrite")

In [None]:
def search_by_index(idx):
    """
    Search for the neighbours of the row at position `idx` of the global `df`.

    Returns:
        tuple: (the selected row, the result of `do_search` on that row's vector).
    """
    row = df.iloc[idx]
    return row, do_search(row.vector)


def do_search(vector, limit=5, id_col_name="id"):
    """
    Run a vector search against the "vis_papers" table of the global `db`.

    Args:
        vector: Query vector.
        limit (int): Maximum number of hits.
        id_col_name (str): Name of the id column to return.

    Returns:
        pd.DataFrame: The id column and the "_distance" column of the hits.
    """
    table = db["vis_papers"]
    query = table.search(vector).limit(limit).select([id_col_name])
    hits = query.to_pandas()
    return hits[[id_col_name, "_distance"]]

In [None]:
"vis_papers" in db

In [None]:
original, result = search_by_index(100)

In [None]:
original

In [None]:
result

In [None]:
# `result` is already the pandas DataFrame returned by do_search, so it has no
# query-builder .select()/.to_pandas(); pick the id column directly.
result[["id"]]

In [None]:
dir(result)

In [None]:
print(original.Title)

In [None]:
# `result` only has the id and _distance columns, so the titles are looked up
# in `df`; merging on id keeps the nearest-first order of `result`.
matches = result.merge(df[["id", "Title"]], on="id")
for title in matches["Title"]:
    print(title)

In [None]:
df

In [None]:
import sentence_transformers

In [None]:
# Embed one free-text query with the all-mpnet-base-v2 model.
query = "A new approach to visualizing data"

model = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
e = model.encode(query)

print("Created embedding of shape", e.shape)

In [None]:
type(e)

In [None]:
def embed_my_data(query):
    """
    Embed `query` with the all-mpnet-base-v2 model.

    Delegates to `get_embedding` rather than repeating its model-loading code.
    """
    return get_embedding(query, "all-mpnet-base-v2")

In [None]:
embed_my_data("A new approach to visualizing data")

In [None]:
p = db["vis_papers"].search().where("id = 55").to_pandas()["vector"]

p.iloc[0]
# p.values[0]

In [None]:
df