In [1]:
from codes.utils import set_ipynb_config, time_it
# Apply the project's notebook settings before anything else runs.
# NOTE(review): cells below display several bare expressions per cell, so this
# presumably switches IPython to show every expression result — confirm in codes.utils.
set_ipynb_config()

In [4]:
import os
from dotenv import load_dotenv

In [3]:
# Load environment variables from a .env file; PATH_PERSIST_DB is read from the
# environment further down. The recorded output for this call is True.
load_dotenv()

True

In [2]:
# Folder holding the input files. The trailing slash matters: file paths below
# are built by plain string concatenation (path_data + filename).
path_data = 'data/'

In [5]:
# Directory where the vector store is persisted, taken from the environment.
# The value is None when PATH_PERSIST_DB is not set.
path_persist_db = os.environ.get('PATH_PERSIST_DB')

## A. Data Processing
**Expected Input:** New Data as txt, csv, etc.

```python
data_load(file) -> raw_doc
data_preprocess(raw_doc) -> clean_doc
doc_chunk(clean_doc) -> chunks
add_metadata() -> dictionary of metadata
```
**Expected Output:** List of Docs and their respective metadata

In [4]:
from codes.file_to_docs import Data2Docs

In [5]:
# Turn one raw text file into a list of chunked Documents carrying our metadata.
filename = 'marketing.txt'
file_path = f'{path_data}{filename}'

# File-level metadata; it appears merged into each Document's metadata in the output.
metadata_test = dict(
    title=filename,
    topic='toys, gender bias',
    data_type='txt',
)

# Keyword arguments forwarded as-is to Data2Docs.main.
chunking_strategy = dict(
    key1='bla bla bla',  # NOTE(review): looks like a placeholder — confirm it is needed
    chunk_size=200,
    chunk_overlap=20,
)

doc_w_md = Data2Docs.main(file_path, metadata_test, **chunking_strategy)

# Sanity check: number of chunks, then the first and the last four chunks.
len(doc_w_md)
doc_w_md[:4]
doc_w_md[-4:]

144

[Document(page_content='Girls Are Caring, Boys Are Violent: How Toys’ Marketing Reinforces Gender Norms\nAnd why its impact on children’s development shouldn’t be dismissed', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='One recurring critique of my articles on gender norms and how they’re mostly learned, not innate, comes from parents with young kids.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='If these norms are really learned, how come little girls and boys often gravitate to gender-typical toys anyway, even when raised in mostly gender-neutral environments with full freedom to choose', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='freedom to choose their toys?', metadata={'source': 'data/m

[Document(page_content='and use your energy somewhere else.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='Just like you have to sell your resume and experience to a potential employer to land a job, the same applies to your writing.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='Either sell your writing, or stay at a job and keep selling your resume to earn a non-writing salary. There’s no in-between.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'}),
 Document(page_content='Looking to grow online with writing? You could try trial and error. Or you could join 200K+ people who get the shortcuts from me.', metadata={'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'toys, gender bias', 'data_type': 'txt'})

## B. Docs to Vector DB
**Expected Input:** List of Docs with metadata (from a single document or from multiple documents)

```python
embed_docs(chunk) -> embedding
add_docs_to_vdb(embedding, index) -> none
```
**Expected Output:** None (just a confirmation message that the docs have been added to the vector db)

In [6]:
from codes.file_to_docs import Docs2VectorDb

### If vector store needs to be:
- created from scratch (`True`)
- loaded from directory (`False`)

In [7]:
# True: wipe the persist directory and build the store from the docs created above.
# False: load the store already persisted at path_persist_db.
create_vector_db = True

In [8]:
import os, shutil

In [9]:
# Build the vector store from scratch, or load the persisted one, depending on
# the `create_vector_db` flag.
if create_vector_db:  # create vector store from scratch
    # os.getenv returns None when PATH_PERSIST_DB is missing; fail with a clear
    # message here instead of an opaque TypeError from the filesystem calls.
    if not path_persist_db:
        raise ValueError('PATH_PERSIST_DB is not set; cannot create the vector store')
    # Remove any previous store; ignore_errors also covers "directory not there yet".
    shutil.rmtree(path_persist_db, ignore_errors=True)
    # makedirs (unlike mkdir) creates missing parent directories and does not
    # raise if the directory still exists because rmtree silently failed.
    os.makedirs(path_persist_db, exist_ok=True)
    print(f'vector store deleted from:\n{path_persist_db}')
    vector_store_multi = Docs2VectorDb.main(doc_w_md, path_persist_db)
    print(f'vector store created and persists here:\n{path_persist_db}')
else:  # load vector store
    vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)
    print(f'vector store loaded and persists here:\n{path_persist_db}')

vector store deleted from:
/Users/pinaki/Downloads/models/vector-dbs/




vector store created and persists here:
/Users/pinaki/Downloads/models/vector-dbs/


### Add one file

In [10]:
# Chunk a second text file so it can be appended to the existing vector store.
filename = 'personal_finance.txt'
file_path = f'{path_data}{filename}'

# File-level metadata; it appears merged into each Document's metadata in the output.
metadata_file = dict(
    title=filename,
    topic='finance, investment, insurance',
    data_type='txt',
)

# Keyword arguments forwarded as-is to Data2Docs.main.
chunking_strategy = dict(
    key1='bla bla bla',  # NOTE(review): looks like a placeholder — confirm it is needed
    chunk_size=200,
    chunk_overlap=20,
)

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Sanity check: number of chunks, then the first and the last four chunks.
len(doc_w_md)
doc_w_md[:4]
doc_w_md[-4:]

251

[Document(page_content='5 habits to have better finances than 99% of people\nHow to end up richer than your peers, on the long game', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Document(page_content='Finances are the only topic of conversation that never gets old. Everyone is always interested in making and managing money. And they always will!\n\nIt’s in our nature! Providing, being abundant.', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Document(page_content='There’s a reason our species was pre-historically called “hunter gatherers”. It’s because we like to gather, to collect, to have! It offers us security, safety and status.', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Doc

[Document(page_content='and unrealistic expectations.', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Document(page_content='That’s why things have gotten worse. Our expectations are no longer aligned with the reality we face.', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Document(page_content='The book is Nickel and Dimed by Barbara Ehrenreich. It highlights how hard it was for low-wage workers to get by at the turn of the century — and how much harder it’s become for professional workers', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance, investment, insurance', 'data_type': 'txt'}),
 Document(page_content='workers to get by today.', metadata={'source': 'data/personal_finance.txt', 'title': 'personal_finance.txt', 'topic': 'finance

In [11]:
# Append the personal_finance.txt chunks built above to the existing store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether add_docs_to_vector_db de-duplicates.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 251 indices to vector store
sources available after insertion:
{'source': {'data/marketing.txt', 'data/personal_finance.txt'}}


### Add another file (CSV)

In [16]:
# Chunk a CSV file; smaller chunks are used here than for the text files.
filename = 'qna_ds.csv'
file_path = f'{path_data}{filename}'

# File-level metadata; it appears merged into each Document's metadata in the output.
metadata_file = dict(
    title=filename,
    topic='data science',
    data_type='csv',
)

# Keyword arguments forwarded as-is to Data2Docs.main.
chunking_strategy = dict(
    key1='bla bla bla',  # NOTE(review): looks like a placeholder — confirm it is needed
    chunk_size=100,
    chunk_overlap=10,
)

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Sanity check: number of chunks, then the first and the last four chunks.
len(doc_w_md)
doc_w_md[:4]
doc_w_md[-4:]

41

[Document(page_content='questions,answers,level', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content='What is the difference between supervised and unsupervised learning?,"Supervised learning uses', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content='uses labeled data to train models, while unsupervised learning uses unlabeled data to find patterns', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content='patterns and relationships.",l0', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'})]

[Document(page_content='Can you explain the bias-variance tradeoff?,"The bias-variance tradeoff is the balance between a', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content="between a model's simplicity and its ability to generalize. High bias means underfitting, while", metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content='while high variance means overfitting. The goal is to find a model with low bias and variance to', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'}),
 Document(page_content='to achieve good generalization.",l1', metadata={'source': 'data/qna_ds.csv', 'title': 'qna_ds.csv', 'topic': 'data science', 'data_type': 'csv'})]

In [17]:
# Append the qna_ds.csv chunks built above to the existing store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether add_docs_to_vector_db de-duplicates.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 41 indices to vector store
sources available after insertion:
{'source': {'data/qna_ds.csv', 'data/marketing.txt', 'data/personal_finance.txt'}}


### Update Docs from a Source
- Create new docs from the updated source
- Delete all docs from that source in the vector store
- Ingest the new docs into the vector store

In [18]:
# List the distinct `source` values currently held in the vector store, to see
# what is there before updating anything.
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
sources

{'source': {'data/marketing.txt',
  'data/personal_finance.txt',
  'data/qna_ds.csv'}}

#### Add a test file

In [19]:
# Chunk a small PDF that serves as the test document for the update flow.
filename = 'garb_in_garb_out.pdf'
file_path = f'{path_data}{filename}'

# File-level metadata; it appears merged into each Document's metadata in the output.
metadata_file = dict(
    title=filename,
    topic='philosophy',
    data_type='pdf',
)

# Keyword arguments forwarded as-is to Data2Docs.main.
chunking_strategy = dict(
    key1='bla bla bla',  # NOTE(review): looks like a placeholder — confirm it is needed
    chunk_size=70,
    chunk_overlap=20,
)

doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Sanity check: number of chunks, then the first and the last four chunks.
len(doc_w_md)
doc_w_md[:4]
doc_w_md[-4:]

25

[Document(page_content='In the quiet town of Eldoria, an old tailor named Ethan ran a small,', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='Ethan ran a small, dimly lit shop. The sign above his door read "Garb', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='his door read "Garb in, Garb out." For decades, townsfolk brought', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='townsfolk brought their clothes to him, but Ethan’s work extended', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'})]

[Document(page_content='streets were filled with Ethan’s quiet influence. "Garb in, Garb out"', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='"Garb in, Garb out" became a mantra for mindful living, a reminder', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='living, a reminder that every action, like every thread, wove the', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'}),
 Document(page_content='thread, wove the fabric of their collective existence.', metadata={'source': 'data/garb_in_garb_out.pdf', 'page': 0, 'title': 'garb_in_garb_out.pdf', 'topic': 'philosophy', 'data_type': 'pdf'})]

In [20]:
# Append the garb_in_garb_out.pdf chunks built above to the existing store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether add_docs_to_vector_db de-duplicates.
vector_store_multi = Docs2VectorDb.add_docs_to_vector_db(vector_store_multi, doc_w_md)

added 25 indices to vector store
sources available after insertion:
{'source': {'data/garb_in_garb_out.pdf', 'data/qna_ds.csv', 'data/marketing.txt', 'data/personal_finance.txt'}}


#### Replace the test file's docs with docs from an updated version

In [21]:
# Build the replacement docs from the .txt version of the test document.
# They are kept in `upd_doc_w_md` so the docs ingested earlier stay untouched.
filename = 'garb_in_garb_out.txt'
file_path = f'{path_data}{filename}'

# File-level metadata; it appears merged into each Document's metadata in the output.
metadata_file = dict(
    title=filename,
    topic='philosophy',
    data_type='txt',
)

# Keyword arguments forwarded as-is to Data2Docs.main.
chunking_strategy = dict(
    key1='bla bla bla',  # NOTE(review): looks like a placeholder — confirm it is needed
    chunk_size=70,
    chunk_overlap=20,
)

upd_doc_w_md = Data2Docs.main(file_path, metadata_file, **chunking_strategy)

# Sanity check: number of chunks, then the first and the last four chunks.
len(upd_doc_w_md)
upd_doc_w_md[:4]
upd_doc_w_md[-4:]

26

[Document(page_content='In the quiet town of Eldoria, an old tailor named Ethan ran a small,', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='Ethan ran a small, dimly lit shop. The sign above his door read "Garb', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='his door read "Garb in, Garb out." For decades, townsfolk brought', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='townsfolk brought their clothes to him, but Ethan’s work extended', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'})]

[Document(page_content='Eldoria’s streets were filled with Ethan’s quiet influence. "Garb in,', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='"Garb in, Garb out" became a mantra for mindful living, a reminder', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='living, a reminder that every action, like every thread, wove the', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'}),
 Document(page_content='thread, wove the fabric of their collective existence.', metadata={'source': 'data/garb_in_garb_out.txt', 'title': 'garb_in_garb_out.txt', 'topic': 'philosophy', 'data_type': 'txt'})]

In [22]:
# Metadata filter selecting the docs to replace. Judging by the printed output,
# matching docs are deleted first and the updated docs are inserted afterwards.
# NOTE(review): the filter is on `topic`, not `source`, so every doc tagged
# 'philosophy' is presumably replaced — confirm this is the intended scope.
metadata_file = dict(topic='philosophy')

vector_store_multi = Docs2VectorDb.update_docs_using_metadata(
    vector_store_multi,
    upd_doc_w_md,
    metadata_file,
)

deleted 25 indices from vector store
sources available after deletion:
{'source': {'data/qna_ds.csv', 'data/marketing.txt', 'data/personal_finance.txt'}}
added 26 indices to vector store
sources available after insertion:
{'source': {'data/qna_ds.csv', 'data/garb_in_garb_out.txt', 'data/marketing.txt', 'data/personal_finance.txt'}}


### What you can do with the vector store

- get all documents
- filter some documents based on a condition
- add documents
- delete documents
- update documents