# Generating, Indexing and Searching Embeddings

## Installing the Package

In [1]:
%cd ../../../
!python3 -m pip install --upgrade .

/Users/ruxuez/Desktop/dev/GreenplumPython
Processing /Users/ruxuez/Desktop/dev/GreenplumPython
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: greenplum-python
  Building wheel for greenplum-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for greenplum-python: filename=greenplum_python-1.0.1-py3-none-any.whl size=84165 sha256=9746078492ea731ec973e8d87764de0b8fda43f8a9af1de2d4db7a13799ddb8c
  Stored in directory: /private/var/folders/jf/ycmq4_px3nj7gcrs015qqhxm0000gq/T/pip-ephem-wheel-cache-d86u32fo/wheels/56/a3/62/fb507748981bea497278b550674de9ab4cfa5150c30722b3d5
Successfully built greenplum-python
Installing collected packages: greenplum-python
  Attempting uninstall: greenplum-python
    Found existing installation: greenplum-python 1.0.1
    Uninstalling greenplum-python-1.0.1:
      Successfully unins

## Preparing Data

In [2]:
import greenplumpython as gp

# Sample documents that will be embedded and searched later in the notebook.
content = ["I have a dog.", "I like eating apples."]

# Connection to the database server used throughout this notebook.
db = gp.database("postgres://localhost:7000")

# Build a two-column dataframe (row number as `id`, sentence as `content`),
# persist it as table `text_sample` hash-distributed on `id`, then apply
# check_unique on `id`.
# NOTE(review): check_unique presumably declares/validates `id` as a unique
# key for the later embedding index -- confirm against the GreenplumPython docs.
t = (
    db.create_dataframe(
        columns={
            "id": range(len(content)),
            "content": content,
        }
    )
    .save_as(
        table_name="text_sample",
        column_names=["id", "content"],
        distribution_key={"id"},
        distribution_type="hash",
    )
    .check_unique(columns={"id"})
)

## Generating and Indexing Embeddings

In [3]:
# Imported without using any name from it directly.
# NOTE(review): presumably this import is what makes `.embedding()` available
# on dataframes -- confirm against the GreenplumPython docs.
import greenplumpython.experimental.embedding

# Create an embedding index on the `content` column with the
# "all-MiniLM-L6-v2" model. `t` is rebound to the value returned by
# create_index, so later cells operate on that returned dataframe.
t = t.embedding().create_index(column="content", model="all-MiniLM-L6-v2")
# Bare last expression: display the dataframe as the cell output.
t

id,content
0,I have a dog.
1,I like eating apples.


## Searching Embeddings

In [4]:
# Search the `content` column with one query string; top_k=1 asks for only
# the single best match.
t.embedding().search(column="content", query="apple", top_k=1)

id,content
1,I like eating apples.


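Batched k-NN search: instead of a single query string, pass a column of queries to retrieve up to `top_k` matches for each query row.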

In [5]:
# Three query strings, keyed by `idd` (named differently from `id` in the
# `text_sample` table built earlier).
query_columns = {"idd": range(3), "query": ["apple", "dog", "banana"]}

# Persist the queries as table `query_sample` hash-distributed on `idd`,
# apply check_unique on `idd`, then create an embedding index on the `query`
# column with the same model name used for the `content` column.
query = (
    db.create_dataframe(columns=query_columns)
    .save_as(
        table_name="query_sample",
        column_names=["idd", "query"],
        distribution_key={"idd"},
        distribution_type="hash",
    )
    .check_unique(columns={"idd"})
    .embedding()
    .create_index(column="query", model="all-MiniLM-L6-v2")
)

In [6]:
# Same search as before, but `query` is the `query` column of the dataframe
# built in the previous cell rather than a single string; top_k=2 here.
t.embedding().search(column="content", query=query["query"], top_k=2)

idd,id,query,content
1,0,dog,I have a dog.
2,0,banana,I have a dog.
2,1,banana,I like eating apples.
0,1,apple,I like eating apples.


## Cleaning All at Once

In [None]:
%reload_ext sql
%sql postgresql://localhost:7000
%sql DROP TABLE text_sample CASCADE;
%sql DROP TABLE query_sample CASCADE;