# Reads cleaned text records from JSON files, computes an embedding for each record, and stores the records with their embeddings in a PostgreSQL table

In [18]:
import os

In [19]:
# Read parameters
import json

# JSON file holding the run settings; later cells read the
# "embedding_model", "table_name" and "vector_size" keys from it.
parameters_file = "pg_params.json"

# Parse the settings directly from the open file handle
with open(parameters_file) as params_fh:
    params = json.load(params_fh)

In [20]:
# JSON input files to ingest, listed in the order they are read
files_list = [
    "../data/text_data_ordered.json",
    "../data/pdf_data_ordered.json",
]

In [21]:
# Read the cleaned records from every JSON file and collect them in one list.
# Each file is assumed to hold a JSON array of record dicts -- TODO confirm.

data = []
for path in files_list:
    # JSON is UTF-8 by specification, so do not rely on the platform default
    with open(path, 'r', encoding="utf-8") as fh:
        # Extend rather than assign: assigning here would discard the records
        # of every file except the last one in files_list.
        data.extend(json.load(fh))

In [25]:
import ollama

# Compute one embedding per record, using the model named in the parameters.
# The vectors are appended in the same order as the records in `data`.
vectors = []
for record in data:
    response = ollama.embeddings(model=params["embedding_model"], prompt=record['text'])
    vectors.append(response["embedding"])

In [26]:
import psycopg2

# Settings used to open the database connection.
# NOTE(review): the credentials are hard-coded here -- consider moving them
# to the parameters file or the environment.
db_settings = {
    "database": "test_db",
    "user": "postgres",
    "host": 'localhost',
    "password": "deep",
    "port": 5432,
}

# Open the connection
conn = psycopg2.connect(**db_settings)

# Commit every statement immediately so no explicit commit() call is needed
conn.autocommit = True

In [27]:
# Create the storage table (if it does not exist yet), using the table name
# and embedding dimension taken from the parameters.
cur = conn.cursor()

table_name = params['table_name']
vector_size = str(params['vector_size'])

# Columns: auto-incrementing id, document name, order value, text,
# and the embedding vector of the configured size.
create_table_sql = (
    f"CREATE TABLE IF NOT EXISTS {table_name}( \n"
    "            doc_id SERIAL PRIMARY KEY,  \n"
    "            doc_name TEXT NOT NULL,\n"
    "            doc_order INT NOT NULL,\n"
    "            doc_text TEXT NOT NULL,\n"
    f"            embedding VECTOR({vector_size})\n"
    "            );"
)
cur.execute(create_table_sql)

# Release the cursor; the connection stays open for the following cells
cur.close()

In [28]:
# Insert every record together with its embedding into the database.
#
# The values are passed as bound parameters instead of being formatted into
# the SQL text: document text containing a single quote would otherwise break
# the statement (and could inject SQL). The table name cannot be a bound
# parameter, so it is still formatted in, unquoted, exactly as in the
# CREATE TABLE statement; it comes from the local parameters file.
insert_sql = (
    f"INSERT INTO {params['table_name']} (doc_name, doc_order, doc_text, embedding) "
    "VALUES (%s, %s, %s, %s);"
)

try:
    # The cursor is closed on leaving the `with` block, even if an insert fails
    with conn.cursor() as cur:
        for x, vector in zip(data, vectors):
            cur.execute(
                insert_sql,
                (
                    str(x["f_no"]),   # stored as text, as before
                    x["f_order"],
                    x["text"],
                    str(vector),      # "[v1, v2, ...]" text form, cast to VECTOR by the server
                ),
            )
finally:
    # Always end communication with the database, also on error
    conn.close()