In [1]:
# Project-local notebook helpers.
from codes.utils import set_ipynb_config, time_it
# NOTE(review): presumably this switches IPython to echo every bare expression
# in a cell (later cells show several outputs each) — confirm in codes/utils.py.
set_ipynb_config()

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the cells below
# read PATH_DATA and PATH_PERSIST_DB from it via os.getenv.
load_dotenv()

True

In [3]:
import os

In [4]:
# Directory holding the input files, taken from the PATH_DATA environment
# variable (None when the variable is not set).
path_data = os.environ.get('PATH_DATA')
path_data

'data/'

In [5]:
# Directory where the vector store is persisted, taken from the
# PATH_PERSIST_DB environment variable (None when the variable is not set).
path_persist_db = os.environ.get('PATH_PERSIST_DB')
path_persist_db

'/Users/pinaki/Downloads/models/vector-dbs/'

## A. Data Processing
**Expected Input:** New Data as txt, csv, etc.

```python
data_load(file) -> raw_doc
data_preprocess(raw_doc) -> clean_doc
doc_chunk(clean_doc) -> chunks
add_metadata() -> dictionary of metadata
```
**Expected Output:** List of Docs and their respective metadata

In [6]:
from codes.file_to_docs import Data2Docs

In [7]:
# File to ingest. os.path.join keeps the path correct whether or not PATH_DATA
# ends with a path separator (plain string concatenation would not).
filename = 'garb_in_garb_out.pdf'
file_path = os.path.join(path_data, filename)

# Extra metadata passed alongside the file; the output below shows these keys
# on every returned Document.
metadata_test = {
    'title': filename,
    'topic': 'philosophy',
    'data_type': 'pdf',
}

# Keyword arguments forwarded to Data2Docs.main.
# NOTE(review): 'key1' looks like a placeholder — confirm Data2Docs.main ignores it.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters — confirm.
chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

# Load the file, split it into chunks and attach the metadata.
doc_w_md = Data2Docs.main(file_path, metadata_test, **chunking_strategy)

# Quick inspection: number of chunks, then the first and last four.
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

53

[Document(page_content='In the quiet town of Eldoria,', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='an old tailor named Ethan ran', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='ran a small, dimly lit shop.', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='The sign above his door read', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'})]

[Document(page_content='a reminder that every action,', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='like every thread, wove the', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='the fabric of their', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='collective existence.', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'})]

## B. Docs to Vector DB
**Expected Input:** List of Docs with metadata (from a single document or multiple documents)

```python
embed_docs(chunk) -> embedding
add_docs_to_vdb(embedding, index) -> none
```
**Expected Output:** None (just a confirmation message that the docs have been added to the vector DB)

In [8]:
from codes.file_to_docs import Docs2VectorDb

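### Set `create_vector_db` depending on whether the vector store should be:
- created from scratch (`True`) — this deletes any existing store in the persist directory
- loaded from the persist directory (`False`)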

In [9]:
# True: delete the persist directory and rebuild the vector store from doc_w_md.
# False: load the vector store that already exists in the persist directory.
create_vector_db = True

In [10]:
import os, shutil

In [11]:
# Either rebuild the vector store from scratch (destructive) or load the
# persisted one, depending on create_vector_db.
if create_vector_db:  # create vector store from scratch
    if not path_persist_db:
        # PATH_PERSIST_DB comes from the environment; stop with a clear message
        # before touching the filesystem if it is missing or empty.
        raise ValueError('PATH_PERSIST_DB is not set; cannot reset the vector store directory')
    if os.path.isdir(path_persist_db):
        # Let real deletion failures (permissions, locked files) surface here
        # instead of being swallowed and reappearing as a FileExistsError below.
        shutil.rmtree(path_persist_db)
    # makedirs also creates missing parent directories, so this works on a
    # machine where the persist location has never been created.
    os.makedirs(path_persist_db)
    print(f'vector store deleted from:\n{path_persist_db}')
    vector_store_multi = Docs2VectorDb.main(doc_w_md, path_persist_db)
    print(f'vector store created and persists here:\n{path_persist_db}')
else:  # load vector store
    vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)
    print(f'vector store loaded and persists here:\n{path_persist_db}')

vector store deleted from:
/Users/pinaki/Downloads/models/vector-dbs/




vector store created and persists here:
/Users/pinaki/Downloads/models/vector-dbs/


### Add one file (CSV)

In [12]:
# File to ingest. os.path.join keeps the path correct whether or not PATH_DATA
# ends with a path separator (plain string concatenation would not).
filename = 'qna_table.csv'
file_path = os.path.join(path_data, filename)

# Extra metadata passed alongside the file; the output below shows these keys
# on every returned Document.
metadata_file = {
    'title': filename,
    'topic': 'qna on topics like RB, luminate',
    'data_type': 'dataframe',
}

# Keyword arguments forwarded to Data2Docs.main.
# NOTE(review): 'key1' looks like a placeholder — confirm Data2Docs.main ignores it.
chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

# Load the file, split it into chunks and attach the metadata.
doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Quick inspection: number of chunks, then the first and last four.
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

131

[Document(page_content='questions,answers,level', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='What is the difference', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='between supervised and', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='and unsupervised', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'})]

[Document(page_content='overfitting. The goal is to', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='to find a model with low bias', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='bias and variance to achieve', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'}),
 Document(page_content='good generalization.",l1', metadata={'source': 'data/qna_table.csv', 'title': 'qna_table.csv', 'topic': 'qna on topics like RB, luminate', 'data_type': 'dataframe'})]

In [13]:
# Insert the chunks built in the previous cell into the existing vector store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether add_docs_to_vector_db de-duplicates.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 131 indices to vector store
sources available after insertion:
{'source': {'data/garb_in_garb_out.pdf', 'data/qna_table.csv'}}


### Add one file (TXT)

In [14]:
# File to ingest. os.path.join keeps the path correct whether or not PATH_DATA
# ends with a path separator (plain string concatenation would not).
filename = 'marketing.txt'
file_path = os.path.join(path_data, filename)

# Extra metadata passed alongside the file; the output below shows these keys
# on every returned Document.
metadata_file = {
    'title': filename,
    'topic': 'marketing, toys',
    'data_type': 'txt',
}

# Keyword arguments forwarded to Data2Docs.main; this file uses larger chunks
# than the earlier examples.
# NOTE(review): 'key1' looks like a placeholder — confirm Data2Docs.main ignores it.
chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 200,
    'chunk_overlap': 20,
}

# Load the file, split it into chunks and attach the metadata.
doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Quick inspection: number of chunks, then the first and last four.
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

144

[Document(page_content='Girls Are Caring, Boys Are Violent: How Toys’ Marketing Reinforces Gender Norms\nAnd why its impact on children’s development shouldn’t be dismissed', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='One recurring critique of my articles on gender norms and how they’re mostly learned, not innate, comes from parents with young kids.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='If these norms are really learned, how come little girls and boys often gravitate to gender-typical toys anyway, even when raised in mostly gender-neutral environments with full freedom to choose', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='freedom to choose their toys?', metadata={'source': 'data/marketi

[Document(page_content='and use your energy somewhere else.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Just like you have to sell your resume and experience to a potential employer to land a job, the same applies to your writing.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Either sell your writing, or stay at a job and keep selling your resume to earn a non-writing salary. There’s no in-between.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'}),
 Document(page_content='Looking to grow online with writing? You could try trial and error. Or you could join 200K+ people who get the shortcuts from me.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys', 'data_type': 'txt'})]

In [15]:
# Insert the chunks built in the previous cell into the existing vector store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether add_docs_to_vector_db de-duplicates.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 144 indices to vector store
sources available after insertion:
{'source': {'data/garb_in_garb_out.pdf', 'data/qna_table.csv', 'data/marketing.txt'}}


### Update Doc from a Source
- Create new docs from updated source
- Delete all docs from the source in vector store
- Ingest new docs to vector store

In [16]:
# List the distinct 'source' values currently held in the vector store, to
# check which files have been ingested before updating one of them.
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
sources

{'source': {'data/garb_in_garb_out.pdf',
  'data/marketing.txt',
  'data/qna_table.csv'}}

#### Add a test file

In [17]:
# File to ingest. os.path.join keeps the path correct whether or not PATH_DATA
# ends with a path separator (plain string concatenation would not).
filename = 'garb_in_garb_out.txt'
file_path = os.path.join(path_data, filename)

# Extra metadata passed alongside the file; the output below shows these keys
# on every returned Document.
metadata_file = {
    'title': filename,
    'topic': 'philosophy',
    'data_type': 'txt',
}

# Keyword arguments forwarded to Data2Docs.main.
# NOTE(review): 'key1' looks like a placeholder — confirm Data2Docs.main ignores it.
chunking_strategy = {
    'key1': 'bla bla bla',
    'chunk_size': 30,
    'chunk_overlap': 5,
}

# Load the file, split it into chunks and attach the metadata.
doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Quick inspection: number of chunks, then the first and last four.
len(doc_w_md)
doc_w_md[0:4]
doc_w_md[-4:]

53

[Document(page_content='In the quiet town of Eldoria,', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='an old tailor named Ethan ran', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='ran a small, dimly lit shop.', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='The sign above his door read', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'})]

[Document(page_content='a reminder that every action,', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='like every thread, wove the', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='the fabric of their', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='collective existence.', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'})]

In [18]:
# Insert the .txt chunks built in the previous cell into the vector store.
# NOTE(review): the update cell further down passes this same doc_w_md to
# update_docs_using_metadata, which also reports adding 53 chunks — the .txt
# chunks are presumably stored twice afterwards; confirm and drop one of the two.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 53 indices to vector store
sources available after insertion:
{'source': {'data/garb_in_garb_out.pdf', 'data/garb_in_garb_out.txt', 'data/qna_table.csv', 'data/marketing.txt'}}


#### Replace docs from an existing source (delete by metadata filter, then re-ingest)

In [22]:
# Filter selecting the chunks to replace: data_type equal to 'pdf' AND topic
# equal to 'philosophy'.
# NOTE(review): metadata_file held per-file metadata in earlier cells and is
# reused here for a query filter — consider a dedicated name.
metadata_file = {
    '$and': [
        {'data_type': {'$eq': 'pdf'}},
        {'topic': {'$eq': 'philosophy'}},
    ],
}

# Per the printed output, this deletes the matching chunks and then adds doc_w_md.
# NOTE(review): doc_w_md (the .txt chunks) was already added by an earlier cell,
# so this presumably inserts it a second time — confirm.
# NOTE(review): this cell's execution count (22) does not follow the previous
# one (18); re-check with Restart Kernel -> Run All.
vector_store_multi = Docs2VectorDb.update_docs_using_metadata(
    vector_store_multi,
    doc_w_md,
    metadata_file,
)

deleted 53 indices from vector store
sources available after deletion:
{'source': {'data/garb_in_garb_out.txt', 'data/qna_table.csv', 'data/marketing.txt'}}
added 53 indices to vector store
sources available after insertion:
{'source': {'data/garb_in_garb_out.txt', 'data/qna_table.csv', 'data/marketing.txt'}}


### What you can do with the vector store

- get all documents
- filter some documents based on a condition
- add documents
- delete documents
- update documents