# January 30, 2024 - Session Notes

### Data Connection

#### 1. Load 
(Source -> Load)

In [1]:
from langchain_community.document_loaders import TextLoader 

In [2]:
loader = TextLoader(file_path="./sample_datasets/100_tokens_story.txt")

In [3]:
story = loader.load()

In [4]:
print(story[0].page_content)


In the clockwork heart of steampunk London, young Amelia tinkered with her latest invention - a hummingbird automaton designed to fly messages 
past smog-choked streets. But when a sinister airship loomed, spewing clockwork spiders, Amelia's metal bird became her only hope. With a whirring 
heart and wings of brass, she soared, messages clutched in her tiny talons, a spark of rebellion glinting in her eyes. 
The fate of London, tick by tick, depended on her flight.


#### 2. Splitting into chunks 
(Transform)

In [5]:
from langchain.text_splitter import CharacterTextSplitter

In [6]:
splitter_func = CharacterTextSplitter(separator="\n", chunk_size=150, chunk_overlap=5)

In [7]:
chunks = splitter_func.split_documents(story)

In [8]:
chunks[0].page_content

'In the clockwork heart of steampunk London, young Amelia tinkered with her latest invention - a hummingbird automaton designed to fly messages'

In [9]:
print(len(chunks))

4


In [10]:
chunks[0].metadata

{'source': './sample_datasets/100_tokens_story.txt'}

#### 3. Embedding transformation
(Embed)

In [11]:
from langchain_community.embeddings import OpenAIEmbeddings
import os 

In [12]:
embeddings_func = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY_PERSONAL"))

  warn_deprecated(


In [13]:
embded_text = embeddings_func.embed_documents(chunks[0].page_content)

In [14]:
embded_text

[[-0.013163183686005138,
  -0.03380830552625906,
  0.0077192372812761945,
  -0.014559067642206104,
  -0.020826585385515878,
  0.0194027831727709,
  -0.014363643571688296,
  -0.02120347486394077,
  0.012918904296349802,
  -0.005607963018711344,
  0.01718332974063424,
  0.0177277231703878,
  0.01592703334186306,
  0.00525550213117778,
  -0.010615696094581108,
  -0.0007332751920798355,
  0.028978546795659855,
  0.010825078827709637,
  0.01277233647629209,
  -0.012946821621571248,
  -0.012423365254411206,
  0.001605266340083533,
  -0.011208946706117325,
  -0.015968909329695226,
  -0.009526906441106292,
  0.00929658571406168,
  0.01871880125426499,
  -0.014852203282321532,
  0.009652536267247923,
  -0.03193781865939045,
  0.04207193437464368,
  -0.015159297585047683,
  -0.008794067340817722,
  -0.0210080498621004,
  -0.019793631313806518,
  -0.010992584641683435,
  -0.009324503039283125,
  -0.021594321142331254,
  0.0005544275572812037,
  -0.0034199151804649545,
  0.004459848583479677,
  0.

In [15]:
from langchain_community.vectorstores import Chroma

In [16]:
stateUnion_db = Chroma.from_documents(chunks, embedding=embeddings_func, persist_directory="./chromaDB")

In [17]:
stateUnion_db.persist()