# Indexing

#### Dependencies

In [1]:
import langchain, warnings

from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

#### Disable Warnings

In [2]:
warnings.filterwarnings('ignore')

#### OpenAI Key

In [3]:
openai_key = None
with open('openai.pem', 'r') as file:
    openai_key = file.read().strip()

#### Load Model

In [4]:
llm = OpenAI(openai_api_key=openai_key, model_name='gpt-3.5-turbo')
llm

OpenAIChat(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', model_kwargs={}, openai_api_key='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', openai_api_base=None, openai_proxy=None, max_retries=6, prefix_messages=[], streaming=False, allowed_special=set(), disallowed_special='all')

#### Document Splitter

In [5]:
splitter = CharacterTextSplitter(
    separator='\n\n',
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    # is_separator_regex=False
)

splitter

<langchain.text_splitter.CharacterTextSplitter at 0x10be451c0>

<div style="text-align:justify">
Including 10 or more retrieved documents in a model's architecture leads to significant performance decline, regardless of the model's design. This problem arises when models need to find relevant information within lengthy contexts, causing them to overlook the provided documents. To tackle this, a suggested solution is to rearrange the retrieved documents post-retrieval, effectively preventing the drop in performance. Source: <a href="https://arxiv.org/abs/2307.03172">https://arxiv.org/abs/2307.03172</a>.
</div>

#### Load and Split Documents

In [6]:
loader = TextLoader('croxson.txt')

document = loader.load()
pages = splitter.split_documents(document)

Created a chunk of size 1312, which is longer than the specified 1000
Created a chunk of size 1052, which is longer than the specified 1000
Created a chunk of size 1235, which is longer than the specified 1000
Created a chunk of size 1047, which is longer than the specified 1000
Created a chunk of size 1207, which is longer than the specified 1000
Created a chunk of size 1294, which is longer than the specified 1000
Created a chunk of size 1209, which is longer than the specified 1000
Created a chunk of size 1578, which is longer than the specified 1000
Created a chunk of size 1838, which is longer than the specified 1000
Created a chunk of size 1846, which is longer than the specified 1000
Created a chunk of size 1681, which is longer than the specified 1000
Created a chunk of size 1353, which is longer than the specified 1000
Created a chunk of size 1354, which is longer than the specified 1000
Created a chunk of size 1162, which is longer than the specified 1000
Created a chunk of s

#### Persist Embeddings

In [7]:
db = Chroma.from_documents(
    pages,
    OpenAIEmbeddings(openai_api_key=openai_key),
    persist_directory='./chroma_db'
)

db.persist()

<a href="https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture">Click here</a> to see more.