# January 30, 2024 - Session Notes

### Data Connection

#### 1. Load 
(Source -> Load)

In [1]:
from langchain_community.document_loaders import TextLoader 

In [2]:
loader = TextLoader(file_path="./sample_datasets/100_tokens_story.txt")

In [3]:
story = loader.load()

In [4]:
print(story[0].page_content)


In the clockwork heart of steampunk London, young Amelia tinkered with her latest invention - a hummingbird automaton designed to fly messages 
past smog-choked streets. But when a sinister airship loomed, spewing clockwork spiders, Amelia's metal bird became her only hope. With a whirring 
heart and wings of brass, she soared, messages clutched in her tiny talons, a spark of rebellion glinting in her eyes. 
The fate of London, tick by tick, depended on her flight.


#### 2. Splitting into chunks 
(Transform)

In [5]:
from langchain.text_splitter import CharacterTextSplitter

In [6]:
splitter_func = CharacterTextSplitter(separator="\n", chunk_size=150, chunk_overlap=5)

In [7]:
chunks = splitter_func.split_documents(story)

In [8]:
chunks[0].page_content

'In the clockwork heart of steampunk London, young Amelia tinkered with her latest invention - a hummingbird automaton designed to fly messages'

In [9]:
print(len(chunks))

4


In [10]:
chunks[0].metadata

{'source': './sample_datasets/100_tokens_story.txt'}

#### 3. Embedding transformation
(Embed)

In [25]:
from langchain_openai import OpenAIEmbeddings
import os 

In [26]:
embeddings_func = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY_PERSONAL"))

In [27]:
embeded_text = embeddings_func.embed_documents(chunks[0].page_content)

In [28]:
len(embeded_text)

142

In [29]:
# first 10 embeded values of the given document
embeded_text[0][:10]

[-0.013163183686005138,
 -0.03380830552625906,
 0.0077192372812761945,
 -0.014559067642206104,
 -0.020826585385515878,
 0.0194027831727709,
 -0.014363643571688296,
 -0.02120347486394077,
 0.012918904296349802,
 -0.005607963018711344]

#### 3. Storing the embeded values
(Store)

In [30]:
from langchain_community.vectorstores import Chroma

In [31]:
stateUnion_db = Chroma.from_documents(chunks, embedding=embeddings_func, persist_directory="./chromaDB")

In [33]:
stateUnion_db.persist()

<<< End of Document >>>