# Reads data from text files, chunks them and stores them in vector DB along with their embeddings

In [None]:
import os
import dspy

In [58]:
# Read parameters

import json

parameters_file = "params.json"

with open(parameters_file, 'r') as fh:
    params = json.loads(fh.read())

In [57]:
files_list = ["../data/text_data_ordered.json",
              "../data/pdf_data_ordered.json"]

In [59]:

data = []

for file in files_list:
    with open(file, 'r') as fh:
        data.extend(json.loads(fh.read()))

In [60]:
data[0].keys()

dict_keys(['f_no', 'f_order', 'text'])

In [61]:
len(data)

606

In [62]:
data[:5]

[{'f_no': 'tf_1',
  'f_order': 1,
  'text': 'Student to Student Support Peer support recognizes that students naturally turn to each other for support and connection. Our Student Ambassadors utilize their lived experience as international students to provide friendly, respectful support to help students build a healthy and successful college experience. Click here(https://tbcollege0.sharepoint.com/:b:/s/ExternShare/EWPHCe-qNINHgqTW486g3-YBD-0vkdNy2Ke0sKixC01BLQ?e=cAGXGg) to know more. '},
 {'f_no': 'tf_2',
  'f_order': 1,
  'text': ' STUDENT SUCCESS: Student to Student Support Peer support recognizes that students naturally turn to each other for support and connection. Our Student Ambassadors utilize their lived experience as international students to provide friendly, respectful support to help students build a healthy and successful college experience. Click here(https://tbcollege0.sharepoint.com/:b:/s/ExternShare/EWPHCe-qNINHgqTW486g3-YBD-0vkdNy2Ke0sKixC01BLQ?e=cAGXGg) to know more

In [63]:
import ollama
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

In [64]:
ids = list(range(0, len(data)))
docs = data

# Create vectors for the text using the embedding model
vectors = [ollama.embeddings(model=params["embedding_model"], prompt=doc['text'])["embedding"] for doc in docs]

In [65]:
# Create connection with Qdrant DB
client = QdrantClient("localhost", port=6333)

In [66]:
# Create a collection in the vector DB
client.recreate_collection(
    collection_name=params["collection_name"],
    vectors_config=VectorParams(size=params["vector_size"], distance=Distance.COSINE)
    )

  client.recreate_collection(


True

In [67]:
# Organize data in a PointStruct object

points = []

for (id, vector, doc) in  zip(ids, vectors, docs):
    points.append(
        PointStruct(
            id=id,
            vector=vector,
            payload={'f_no': doc['f_no'], 'f_order': doc['f_order'], 'text': doc['text']},
        )
    )

In [68]:
# Upload the data to the vector DB
client.upsert(params["collection_name"], points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)