# Cluedo Vector Database Setup
This project uses the Pinecone vector database (see https://www.pinecone.io/). The database stores the data, converts it into vector embeddings, and provides the fast vector similarity search that this project relies on. In this scenario, the data is loaded into the database only once, because it does not change over time.

In [90]:
import yaml
from pinecone import Pinecone

In [91]:
# load the project configuration; the context manager closes conf.yml
# as soon as it has been parsed instead of leaking the file handle
with open("conf.yml", "r") as conf_file:
    conf = yaml.load(conf_file, Loader=yaml.SafeLoader)

# credentials and configuration
pc_token = conf["pinecone"]["token"]
pc_index_name = "cluedo-py"

## Extract data from text file

In [92]:
# this function reads the lines of a txtfile and returns them as an array
def load_from_file(file):
    """Read a text file and return its lines as a list of strings.

    Args:
        file: path of the text file to read.

    Returns:
        A list with one string per line, without line terminators.
        An empty file yields an empty list.
    """
    # the context manager closes the file even if reading raises
    with open(file, "r") as f:
        return f.read().splitlines()

## Transform data

In [93]:
# this function wraps each string in a record dict for the vector db
def transform_data(data):
    """Convert an iterable of strings into a list of record dicts.

    Args:
        data: iterable of strings, one per record.

    Returns:
        A list of dicts, one per input item and in the same order. Each
        dict has an "_id" of the form "rec<N>", where N is the item's
        zero-based position in `data`, and a "chunk_text" holding the
        item itself.
    """
    # `position` instead of `id` avoids shadowing the builtin id()
    return [
        {"_id": f"rec{position}", "chunk_text": text}
        for position, text in enumerate(data)
    ]

## Load data into Vector Database

In [94]:
# Pinecone client used by upload_to_db, authenticated with the token read from the configuration
pc = Pinecone(api_key=pc_token)

# this function uploads an array of record dicts to the vector db
def upload_to_db(data, batch_size=96):
    """Recreate the Pinecone index and upload `data` into it.

    An existing index named `pc_index_name` is deleted first, then a new
    index with integrated embedding is created and the records are
    upserted into the "__default__" namespace.

    Args:
        data: list of record dicts; the index embeds the "chunk_text"
            field of each record (see `field_map` below).
        batch_size: maximum number of records sent per upsert request.
            NOTE(review): the default of 96 follows Pinecone's documented
            per-request limit for text upserts with integrated embedding
            at the time of writing; confirm against the current limits.

    Returns:
        An empty string (kept so existing callers see the same result).

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # empty vector database
    if pc.has_index(pc_index_name):
        pc.delete_index(name=pc_index_name)

    # create vdb index
    pc.create_index_for_model(
        name=pc_index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "chunk_text"},
        },
    )

    # upload data entries to database in batches, so that inputs larger
    # than the per-request record limit do not make the upload fail
    index = pc.Index(pc_index_name)
    for start in range(0, len(data), batch_size):
        index.upsert_records("__default__", data[start:start + batch_size])

    return ""

## Execution

In [95]:
def setup_db(txt_file="facts.txt"):
    """Load a text file, turn its lines into records and upload them.

    Args:
        txt_file: path of the text file to load; defaults to "facts.txt".
    """
    # load data from textfile
    lines = load_from_file(txt_file)

    # transform data into a different format
    records = transform_data(lines)

    # upload to database
    upload_to_db(records)

# run the load -> transform -> upload pipeline
setup_db()