# LanceDB vector database

In [138]:
import lancedb 

vector_db = lancedb.connect(uri = "vector_database")

vector_db

LanceDBConnection(uri='/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database')

In [139]:
vector_db.uri

'/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database'

## Create table

In [140]:
import json 

with open("animals_text_embeddings.json") as file:
    data = json.loads(file.read())

data 

[{'text': 'A small brown dog running.', 'vector': [0.12, 0.85, 0.33]},
 {'text': 'A cat resting quietly on a sofa.', 'vector': [0.4, 0.91, 0.1]},
 {'text': 'A large gray elephant drinking water.',
  'vector': [0.88, 0.22, 0.55]},
 {'text': 'A fast cheetah sprinting across the savannah.',
  'vector': [0.95, 0.12, 0.72]},
 {'text': 'A colorful parrot perched on a branch.',
  'vector': [0.25, 0.66, 0.81]},
 {'text': 'A frog sitting on a lily pad.', 'vector': [0.14, 0.44, 0.27]}]

In [141]:
vector_db.create_table("animals", exist_ok=True, data=data)

LanceTable(name='animals', version=3, _conn=LanceDBConnection(uri='/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database'))

In [142]:
vector_db.table_names()

['animals']

In [143]:
# operator overloaded __getitem__
vector_db["animals"]

LanceTable(name='animals', version=3, _conn=LanceDBConnection(uri='/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database'))

In [144]:
vector_db["animals"].head()

pyarrow.Table
text: string
vector: fixed_size_list<item: float>[3]
  child 0, item: float
----
text: [["A small brown dog running.","A cat resting quietly on a sofa.","A large gray elephant drinking water.","A fast cheetah sprinting across the savannah.","A colorful parrot perched on a branch."]]
vector: [[[0.12,0.85,0.33],[0.4,0.91,0.1],[0.88,0.22,0.55],[0.95,0.12,0.72],[0.25,0.66,0.81]]]

In [145]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"
8,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
9,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


In [146]:
more_data = [
    {"text": "A panda eating bamboo peacefully.", "vector": [0.51, 0.37, 0.82]},
    {"text": "A lion roaring loudly on a rock.", "vector": [0.93, 0.18, 0.41]},
]

vector_db["animals"].add(more_data)

AddResult(version=4)

In [147]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"
8,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
9,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


## Create an empty table and drop it

In [148]:
from lancedb.pydantic import LanceModel

class JokeSchema(LanceModel):
    joke: str
    rating: int

vector_db.create_table(name = "joke_throw_away_table", schema=JokeSchema)

LanceTable(name='joke_throw_away_table', version=1, _conn=LanceDBConnection(uri='/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database'))

## drop the throw_away_table

In [149]:
vector_db.table_names()

['animals', 'joke_throw_away_table']

In [150]:
vector_db.drop_table("joke_throw_away_table")

In [151]:
vector_db.table_names()

['animals']

## vector search in lancedb

search with ANN - Approximate Nearest Neighbour

* search with vector directly
* search with natural text and it automatically calculates its embedding

In [152]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"
8,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
9,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


In [153]:
query_vector = [.8,.2,.6]

vector_db["animals"].search(query_vector).limit(5).to_pandas()

Unnamed: 0,text,vector,_distance
0,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]",0.0093
1,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]",0.0433
2,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]",0.0534
3,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]",0.0534
4,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]",0.0534


In [154]:
# can't search with natural text
try:
    vector_db["animals"].search("pandas eat bamboo").limit(3).to_pandas()
except RuntimeError as err:
    print(err)

lance error: Invalid user input: Cannot perform full text search unless an INVERTED index has been created on at least one column, /Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/lance-index-0.39.0/src/scalar/inverted/query.rs:703:25


## Embeddings model

In [157]:
from lancedb.embeddings import get_registry
import numpy as np

embedding_model = get_registry().get("gemini-text").create(name="gemini-embedding-001")

test_embedding = np.array(embedding_model.compute_query_embeddings("hej på dig"))
test_embedding.shape # based on this, we know that the model embeds vectors with dim 3072

(1, 3072)

In [158]:
vector_db.drop_table("jokes")

ValueError: Table 'jokes' was not found

In [159]:
from lancedb.pydantic import LanceModel, Vector

class JokeModel(LanceModel):
    joke: str = embedding_model.SourceField() # joke becomes our source column
    vector: Vector(3072) = embedding_model.VectorField() # target vector column

vector_db.create_table("jokes", schema=JokeModel, exist_ok=True)
vector_db["jokes"]

LanceTable(name='jokes', version=1, _conn=LanceDBConnection(uri='/Users/admin/Documents/github/DP_ML_AI_eyoub_beraki_de_24/09_lancedb_vector_database/vector_database'))

In [160]:
import pandas as pd 
with open("jokes.json") as file:
    jokes_data = json.loads(file.read())

df_jokes = pd.DataFrame(jokes_data).rename({"jokes":"joke"}, axis=1)
df_jokes.head()

Unnamed: 0,joke
0,Parallel lines have so much in common—it’s sad...
1,"ETL stands for “Extract, Transform, Leave for ..."
2,What do you call a snake that runs your script...
3,"Gold walks into a bar. The bartender says, “Au..."
4,C# devs don’t argue; they just throw exceptions.
