<a href="https://colab.research.google.com/github/gforgurups/LLMOps/blob/main/1_LangChain_1_DocLoaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r sample_data/requirements.txt

In [None]:
# Text loader: read sample_data/speech.txt through TextLoader and display the result.
from langchain_community.document_loaders import TextLoader

txt_loader = TextLoader(file_path='sample_data/speech.txt')
txt_documents = txt_loader.load()
txt_documents

In [None]:
# PDF loader: read sample_data/attention.pdf through PyPDFLoader and display the result.
from langchain_community.document_loaders import PyPDFLoader

pdf_loader = PyPDFLoader(file_path="sample_data/attention.pdf")
pdf_documents = pdf_loader.load()
pdf_documents

In [None]:
# Web-based loader: fetch one blog post and restrict parsing to selected page elements.
import bs4
from langchain_community.document_loaders import WebBaseLoader

web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    # BeautifulSoup is told to parse only elements carrying one of these classes.
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)
web_documents = web_loader.load()
web_documents

In [None]:
# Arxiv loader: query arXiv for "reasoning", capped at two documents.
from langchain_community.document_loaders import ArxivLoader

# ArxivLoader supports all arguments of `ArxivAPIWrapper`
# (for example doc_content_chars_max or load_all_available_meta).
arxiv_loader = ArxivLoader(query="reasoning", load_max_docs=2)
arxiv_documents = arxiv_loader.load()
arxiv_documents[0]

In [None]:
# Show the metadata attached to the first arXiv document.
first_arxiv_metadata = arxiv_documents[0].metadata
print(first_arxiv_metadata)

In [None]:
# Wikipedia loader: search for "Generative AI", capped at two documents.
from langchain_community.document_loaders import WikipediaLoader

docs = WikipediaLoader(query="Generative AI", load_max_docs=2).load()
# A bare `len(docs)` in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is displayed), so print the count explicitly.
print(len(docs))
print(docs)

In [None]:
# Show the metadata attached to the first Wikipedia document.
first_wiki_metadata = docs[0].metadata
print(first_wiki_metadata)

## How to recursively split text by characters

In [None]:
# Split the loaded PDF documents with a recursive character splitter
# (chunk_size=500, chunk_overlap=50) and print the resulting chunks.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
pdf_texts = text_splitter.split_documents(pdf_documents)
print(pdf_texts)


## HTML Header Text Splitter

In [None]:
# Split an inline HTML sample on its h1/h2/h3 headers and display the pieces.
from langchain_text_splitters import HTMLHeaderTextSplitter

# Small self-contained page with nested sections to demonstrate the splitter.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

# (tag, label) pairs for header levels 1-3: ("h1", "Header 1"), ("h2", "Header 2"), ...
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 4)]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
html_header_splits


## JSON Splitter

In [None]:
# Download the LangSmith OpenAPI spec and split the JSON with max_chunk_size=300.
import json
import requests
from langchain_text_splitters import RecursiveJsonSplitter

# Bound the request time and fail fast on an HTTP error status, so an error
# payload is never handed to the splitter and the cell cannot hang indefinitely.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = json_splitter.split_json(json_data)
print(json_chunks)

In [None]:
# Build documents from the same JSON payload with the existing json_splitter.
json_docs = json_splitter.create_documents([json_data])
print(json_docs)

## Embedding - OpenAIEmbeddings

In [None]:
# OpenAI embeddings guide: https://platform.openai.com/docs/guides/embeddings/
from google.colab import userdata
from langchain_openai import OpenAIEmbeddings

# The API key is read from Colab's userdata store instead of being hardcoded.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
)
print(embeddings)

In [None]:
# Embed a single query string and print the resulting vector.
embedded_text = embeddings.embed_query(text="Hello world")
print(embedded_text)

In [None]:
# Report the length of the embedded vector.
embedding_length = len(embedded_text)
print(embedding_length)

## Text Splitting + Embedding + Vectorstore

In [None]:
# Chunk the speech text with the same splitter settings used for the PDF
# (chunk_size=500, chunk_overlap=50) and print the chunks.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
txt_docs = text_splitter.split_documents(txt_documents)
print(txt_docs)

In [None]:
# Build a Chroma vector store from the text chunks using the current `embeddings` object.
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(documents=txt_docs, embedding=embeddings)

In [None]:
# Run a similarity search against the vector store for the query text.
query = "It is a distressing and oppressive duty"
retrieved_results = db.similarity_search(query=query)
print(retrieved_results)

## Embedding - OllamaEmbeddings

In [None]:
# Ollama embeddings configured with the "gemma:2b" model.
# NOTE: this rebinds `embeddings`, which the OpenAI cell bound to OpenAIEmbeddings;
# cells that use `embeddings` will pick up this object if re-run after this cell
# without first re-running the OpenAI cell.
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="gemma:2b")
print(embeddings)

In [None]:
# Embed two sample sentences in a single call and print the result.
sample_sentences = [
    "Alpha is the first character of the alphabet.",
    "Beta is the second character of the alphabet",
]
r1 = embeddings.embed_documents(sample_sentences)

print(r1)
