In [1]:
from codes.utils import set_ipynb_config, time_it
set_ipynb_config()

In [2]:
path_data = 'data/multi_docs/'

In [3]:
path_persist_db = '/Users/prb000j/Downloads/downloaded_models/vector_dbs/'

## A. Data Processing
**Expected Input:** New Data as txt, csv, etc.

```python
data_load(file) -> raw_doc
data_preprocess(raw_doc) -> clean_doc
doc_chunk(clean_doc) -> chunks
add_metadata() -> dictionary of metadata
```
**Expected Output:** List of Docs and their respective metadata

In [4]:
from codes.file_to_docs import Data2Docs

In [5]:
filename = 'Luminate Report Builder.docx'
file_path = path_data + filename

metadata_test = {
    'title': filename,
    'topic': 'RB, Luminate', 
    'data_type': 'word document',
}

chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

doc_w_md = Data2Docs.main(file_path, metadata_test, **chunking_strategy)
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

28

[Document(page_content='Luminate Report Builder', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='Introduction', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='Report Builder is a tool that', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='that is part of the Luminate', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'})]

[Document(page_content='Timeline', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='We will be rolling out the', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='the first version by the end', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'}),
 Document(page_content='end of July, 2024.', metadata={'source': 'data/multi_docs/Luminate Report Builder.docx', 'title': 'Luminate Report Builder.docx', 'topic': 'RB, Luminate', 'data_type': 'word document'})]

## B. Docs to Vector DB
**Expected Input:** List of Docs with metadata (from single or multiple document)

```python
embed_docs(chunk) -> embedding
add_docs_to_vdb(embedding, index) -> none
```
**Expected Output:** None (Just a confirmation msg that docs have been added to vector db)

In [6]:
from codes.file_to_docs import Docs2VectorDb

### If vector store needs to be:
- created from scratch (`True`)
- loaded from directory (`False`)

In [7]:
create_vector_db = True

In [8]:
import os, shutil

In [9]:
# deletes the vector store and creates the path again
if create_vector_db:  # create vector store from scratch
    shutil.rmtree(path_persist_db, ignore_errors=True)
    os.mkdir(path_persist_db)
    print(f'vector store deleted from:\n{path_persist_db}')
    vector_store_multi = Docs2VectorDb.main(doc_w_md, path_persist_db)
    print(f'vector store created and persists here:\n{path_persist_db}')
else:  # load vector store
    vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)
    print(f'vector store loaded and persists here:\n{path_persist_db}')

vector store deleted from:
/Users/prb000j/Downloads/downloaded_models/vector_dbs/


  from .autonotebook import tqdm as notebook_tqdm


vector store created and persists here:
/Users/prb000j/Downloads/downloaded_models/vector_dbs/


### Add one file

In [10]:
filename = 'dataframe.csv'
file_path = path_data + filename

metadata_file = {
    'title': filename,
    'topic': 'qna on topics like RB, luminate', 
    'data_type': 'dataframe',
}

chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

21

[Document(page_content='question,answer,source', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='what is this tool,this tool', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='tool is called report', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='builder. It is part of', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'})]

[Document(page_content='on the top right corner of', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='of the page. You will need to', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='to follow along the', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='the guidelines.,testsource2', metadata={'source': 'data/multi_docs/dataframe.csv', 'title': 'dataframe.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'})]

In [11]:
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 21 indices to vector store
sources available after insertion:
{'source': {'data/multi_docs/dataframe.csv', 'data/multi_docs/Luminate Report Builder.docx'}}


### Add one file

In [12]:
filename = 'marketing.txt'
file_path = path_data + filename

metadata_file = {
    title = filename,
    'topic': 'marketing, toys', 
    'data_type': 'txt',
}

chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 200,
    'chunk_overlap': 20,
}

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

144

[Document(page_content='Girls Are Caring, Boys Are Violent: How Toys’ Marketing Reinforces Gender Norms\nAnd why its impact on children’s development shouldn’t be dismissed', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='One recurring critique of my articles on gender norms and how they’re mostly learned, not innate, comes from parents with young kids.', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='If these norms are really learned, how come little girls and boys often gravitate to gender-typical toys anyway, even when raised in mostly gender-neutral environments with full freedom to choose', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='freedom to choose their toys?', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys

[Document(page_content='and use your energy somewhere else.', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Just like you have to sell your resume and experience to a potential employer to land a job, the same applies to your writing.', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Either sell your writing, or stay at a job and keep selling your resume to earn a non-writing salary. There’s no in-between.', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Looking to grow online with writing? You could try trial and error. Or you could join 200K+ people who get the shortcuts from me.', metadata={'source': 'data/multi_docs/marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'})]

In [13]:
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 144 indices to vector store
sources available after insertion:
{'source': {'data/multi_docs/marketing.txt', 'data/multi_docs/dataframe.csv', 'data/multi_docs/Luminate Report Builder.docx'}}


### Update Doc from a Source
- Create new docs from updated source
- Delete all docs from the source in vector store
- Ingest new docs to vector store

In [14]:
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
sources

{'source': {'data/multi_docs/Luminate Report Builder.docx',
  'data/multi_docs/dataframe.csv',
  'data/multi_docs/marketing.txt'}}

#### Add a test file

In [15]:
filename = 'Luminate Report Builder.pdf'
file_path = path_data + filename

metadata_file = {
    'title': filename,
    'topic': 'marketing, toys', 
    'data_type': 'pdf',
}

chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)


25

[Document(page_content='Luminate Report Builder', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='Introduction Report Builder', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='is a tool that is part of the', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='the Luminate Suite. RB', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'})]

[Document(page_content='to create reports with ease.', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='Timeline We will be rolling', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='out the ﬁrst version by the', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'}),
 Document(page_content='the end of July, 2024.', metadata={'source': 'data/multi_docs/Luminate Report Builder.pdf', 'page': 0, 'title': 'Luminate Report Builder.pdf', 'topic': 'marketing, toys', 'data_type': 'pdf'})]

In [16]:
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 25 indices to vector store
sources available after insertion:
{'source': {'data/multi_docs/marketing.txt', 'data/multi_docs/Luminate Report Builder.pdf', 'data/multi_docs/dataframe.csv', 'data/multi_docs/Luminate Report Builder.docx'}}


#### Delete docs from existing source

In [17]:
metadata_file = {
    'data_type': 'pdf',
    # 'topic': 'marketing, toys',
}

vector_store_multi = Docs2VectorDb.update_docs_using_metadata(vector_store_multi, doc_w_md, metadata_file)

deleted 25 indices from vector store
sources available after deletion:
{'source': {'data/multi_docs/marketing.txt', 'data/multi_docs/dataframe.csv', 'data/multi_docs/Luminate Report Builder.docx'}}
added 25 indices to vector store
sources available after insertion:
{'source': {'data/multi_docs/marketing.txt', 'data/multi_docs/Luminate Report Builder.pdf', 'data/multi_docs/dataframe.csv', 'data/multi_docs/Luminate Report Builder.docx'}}


### What you can do with the vector store

- get all documents
- filter some documents based on a condition
- add documents
- delete documents
- update documents