## Load document

In [23]:
from langchain_community.document_loaders import PyPDFLoader

pdf_url = "https://arxiv.org/pdf/2312.16862.pdf"

pdf_loader = PyPDFLoader(pdf_url)
pdf_pages = pdf_loader.load()

In [24]:
len(pdf_pages)

14

## Split document 

In [25]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 30
chunk_overlap = 0

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
)

docs = splitter.split_documents(pdf_pages)

In [26]:
len(docs)

2252

In [27]:
docs[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-06-24T00:23:57+00:00', 'author': 'Zhengqing Yuan, Zhaoxu Li, Weiran Huang, Yanfang Ye, Lichao Sun', 'keywords': 'Efficient Deep Learning, WANT, ICML2024', 'moddate': '2024-06-24T00:23:57+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': 'Proceedings of the International Conference on Machine Learning 2024', 'title': 'TinyGPT-V: Efficient Multimodal Large Language Model  via Small Backbones', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2312.16862.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1'}, page_content='TinyGPT-V: Efficient')

## Embedding model

In [28]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings()

  embedding_model = HuggingFaceEmbeddings()


# 1. Chrom vector database

In [7]:
# %pip install langchain-chroma

In [29]:
from langchain_chroma import Chroma

In [32]:
chroma_db = Chroma.from_documents(docs, embedding=embedding_model)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [10]:
query = "what is multimodal large language models?"

In [11]:
similar_docs = chroma_db.similarity_search(query, k=4)

In [12]:
len(similar_docs)

4

In [13]:
similar_docs

[Document(page_content='1 Introduction\nIn recent years, the field of artificial intelligence has seen significant advancements through\nthe development of multimodal large language models (MLLMs), such as GPT-4V , which\nhave shown exceptional performance across a range of vision-language tasks (Yang', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='In recent years, multimodal large language models (MLLMs) such as GPT-\n4V have demonstrated remarkable advancements, excelling in a variety\nof vision-language tasks. Despite their prowess, the closed-source na-\nture and computational demands of such models limit their accessibility', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre

# 2. FASII vector database

In [14]:
# %pip install faiss-cpu

In [15]:
from langchain_community.vectorstores import FAISS

In [16]:
faiss_db = FAISS.from_documents(docs, embedding=embedding_model)

In [17]:
query = "what is multimodal large language models?"

In [18]:
similar_docs = chroma_db.similarity_search(query, k=4)

In [19]:
len(similar_docs)

4

In [20]:
similar_docs

[Document(page_content='1 Introduction\nIn recent years, the field of artificial intelligence has seen significant advancements through\nthe development of multimodal large language models (MLLMs), such as GPT-4V , which\nhave shown exceptional performance across a range of vision-language tasks (Yang', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='In recent years, multimodal large language models (MLLMs) such as GPT-\n4V have demonstrated remarkable advancements, excelling in a variety\nof vision-language tasks. Despite their prowess, the closed-source na-\nture and computational demands of such models limit their accessibility', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre