In [1]:
import os
from tqdm import tqdm

from langchain.vectorstores.chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings

In [2]:
dir_path = './db'
# The merge destination ('./db/chroma_db_main') lives inside dir_path as well.
# It must never be listed as a source, otherwise re-running the notebook would
# read the main store and add its records back into itself.
main_db_name = 'chroma_db_main'
# Keep only sub-directories: anything else under dir_path is not a persisted store.
# sorted() makes the processing order deterministic, since os.listdir() returns
# entries in arbitrary order.
db_paths = [
    os.path.join(dir_path, name)
    for name in sorted(os.listdir(dir_path))
    if name != main_db_name and os.path.isdir(os.path.join(dir_path, name))
]

In [3]:
# Destination store: the merge loop writes every source collection's records into it.
new_db = Chroma(
    persist_directory='./db/chroma_db_main',
    embedding_function=OpenAIEmbeddings(),
)

In [4]:
def chunks(lst: list, n: int) -> list:
    """Split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Sequence to split; it is only sliced, never modified.
        n: Maximum number of items per slice. Must be at least 1.

    Returns:
        A list of slices in original order. Every slice has ``n`` items except
        possibly the last one; an empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject invalid sizes explicitly instead of returning [].
    if n < 1:
        raise ValueError(f'chunk size must be a positive integer, got {n}')
    return [lst[start:start + n] for start in range(0, len(lst), n)]

In [5]:
# Copy every record (metadata, stored embedding, document text, id) from each
# source store in db_paths into the main store.

# Records sent per add() call.
# NOTE(review): presumably chosen to stay under Chroma's per-call batch limit - confirm.
BATCH_SIZE = 1000

# The embedding wrapper does not depend on the source path, so build it once
# instead of once per iteration.
source_embeddings = OpenAIEmbeddings()

for path in db_paths:
    print(f'Processing {path}')
    db = Chroma(persist_directory=path, embedding_function=source_embeddings)
    # 'ids' is read from the result below although it is not listed in `include`.
    raw = db._collection.get(
        include=['metadatas', 'embeddings', 'documents']
    )
    # Re-shape the column-wise result into one dict per record so the four
    # fields of a record stay together when the list is split into batches.
    data = [
        {'metadata': meta, 'embedding': emb, 'document': doc, 'id': id_}
        for meta, emb, doc, id_ in zip(raw['metadatas'], raw['embeddings'], raw['documents'], raw['ids'])
    ]
    print(f'Adding {len(data)} documents to main db')

    for chunk in tqdm(chunks(data, BATCH_SIZE)):
        # The stored vectors are passed through explicitly together with the
        # original ids, so records keep their identity in the main store.
        new_db._collection.add(
            metadatas=[el['metadata'] for el in chunk],
            embeddings=[el['embedding'] for el in chunk],
            documents=[el['document'] for el in chunk],
            ids=[el['id'] for el in chunk]
        )


Processing ./db\chroma_db_html
Adding 9142 documents to main db


100%|██████████| 10/10 [00:18<00:00,  1.85s/it]


Processing ./db\chroma_db_paths
Adding 7172 documents to main db


100%|██████████| 8/8 [00:10<00:00,  1.31s/it]


Processing ./db\chroma_db_semantic
Adding 9592 documents to main db


100%|██████████| 10/10 [00:24<00:00,  2.42s/it]


Processing ./db\chroma_db_znaki_1000
Adding 19458 documents to main db


100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


Processing ./db\chroma_db_znaki_150
Adding 130201 documents to main db


100%|██████████| 131/131 [05:51<00:00,  2.69s/it]


Processing ./db\chroma_db_znaki_400
Adding 50191 documents to main db


100%|██████████| 51/51 [02:38<00:00,  3.10s/it]
