### Installing libraries

In [None]:
%pip install -qU langchain_openai langchain_community

### OpenAI API key setup

In [None]:
import os

# Expose the OpenAI key through the OPENAI_API_KEY environment variable.
# On Google Colab the key is read from the notebook's secret store, exactly as
# before. Outside Colab (where `google.colab` cannot be imported) an already
# exported OPENAI_API_KEY is kept, otherwise the user is prompted for it
# without echoing the value.
try:
    from google.colab import userdata
except ImportError:
    from getpass import getpass

    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
else:
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

### LLM Configuration

In [None]:
# `set_llm_cache` comes from langchain_core (a dependency of langchain_openai),
# so this cell does not rely on the umbrella `langchain` package, which the
# install cell does not request explicitly.
from langchain_core.globals import set_llm_cache
from langchain_openai import OpenAI

# Completion-style OpenAI model used by the `llm.invoke` calls in this notebook.
llm = OpenAI(model="gpt-3.5-turbo-instruct")

### Response generation on a cache miss (first call)

In [None]:
%%time
from langchain.cache import InMemoryCache

set_llm_cache(InMemoryCache())

# The first time, it is not yet in cache, so it should take longer
llm.invoke("What is memory caching? Explain in less than 100 words")

CPU times: user 35 ms, sys: 1.13 ms, total: 36.2 ms
Wall time: 1.8 s


"\n\nMemory caching is a technique used in computer systems to improve the performance of accessing data. It involves storing frequently used data in a faster and closer location, such as the computer's main memory, rather than retrieving it from a slower and more distant location, such as the hard drive. This allows for quicker access to data, reducing the time and resources needed to retrieve it. When data is requested, the system checks the cache first and if the data is found, it is retrieved from the cache instead of the original location. This results in faster data retrieval and improved overall system performance."

### Response generation on a cache hit (second call)

In [None]:
%%time
# The second time it is, so it goes faster
llm.invoke("What is memory caching? Explain in less than 100 words")

CPU times: user 716 µs, sys: 0 ns, total: 716 µs
Wall time: 752 µs


"\n\nMemory caching is a technique used in computer systems to improve the performance of accessing data. It involves storing frequently used data in a faster and closer location, such as the computer's main memory, rather than retrieving it from a slower and more distant location, such as the hard drive. This allows for quicker access to data, reducing the time and resources needed to retrieve it. When data is requested, the system checks the cache first and if the data is found, it is retrieved from the cache instead of the original location. This results in faster data retrieval and improved overall system performance."

### Caching Embeddings

In [None]:
%pip install --upgrade --quiet  langchain-openai faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain.embeddings.cache import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Embedding model whose results will be cached.
underlying_embeddings = OpenAIEmbeddings()

# File-backed store located under ./cache/.
store = LocalFileStore("./cache/")

# Wrap the embedding model with the store. The model name is used as the
# namespace, so it shows up as the prefix of every cache key.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=underlying_embeddings.model,
)

# Show the keys currently held by the store.
[*store.yield_keys()]

[]

In [None]:
# Load the script with an explicit encoding so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes GOT_script.txt is UTF-8 encoded; adjust if it is not.
raw_documents = TextLoader("GOT_script.txt", encoding="utf-8").load()

# Split into chunks of up to 1000 characters with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

### Creating the vector embeddings

In [None]:
%%time
db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 819 ms, sys: 46.7 ms, total: 866 ms
Wall time: 1.71 s


### Creating the vector store again is faster: the embeddings are served from the cache, so the vectors do not need to be recomputed

In [None]:
%%time
db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 4.74 ms, sys: 56 µs, total: 4.79 ms
Wall time: 12 ms


In [None]:
from itertools import islice

# Preview at most five cache keys. islice stops the key iterator after five
# items instead of listing every key in the store first.
list(islice(store.yield_keys(), 5))

['text-embedding-ada-002cb0f15e2-ae17-50ef-809e-cdd44eb36665',
 'text-embedding-ada-0023fea82f3-f0a6-5b24-84cb-cefe7dd8ba8b']