In [None]:
!pip install -q  transformers sentence-transformers torch langchain-community bitsandbytes langchain-huggingface

1. **Web Crawling and Scraping:** Crawl and scrape a website to build the corpus for LLM RAG, deduplicating data across pages.
2. **PDF Support:** Extract and process data from PDFs.
3. **QA Support:** Implement a question-answering functionality.
4. **Text Blob Support:** Handle and process large text blobs.

The system should use FAISS for efficient indexing and OpenAI for embedding and retrieving data. The final deliverable is a fully functional Jupyter notebook, runnable in Google Colab, that demonstrates these capabilities.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline,BitsAndBytesConfig
import torch
from langchain_huggingface import HuggingFacePipeline

def load_llm(model_name: str="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
             device: str='cuda') -> HuggingFacePipeline:
    """Load a causal language model and wrap it as a LangChain LLM.

    Args:
        model_name: Hugging Face Hub id (or local path) of the model to load.
        device: Where to place the model. ``"cuda"`` (the default) loads the
            model 4-bit quantized and lets ``device_map="auto"`` decide the
            placement. A specific CUDA device such as ``"cuda:0"`` pins the
            quantized model to that device. Any other value (e.g. ``"cpu"``)
            loads the model unquantized on that device.

    Returns:
        A ``HuggingFacePipeline`` around a ``text-generation`` pipeline that
        samples up to 512 new tokens and returns only the generated
        continuation, not the prompt.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if device.startswith("cuda"):
        load_kwargs = {
            "torch_dtype": torch.float16,
            # "auto" keeps the default placement; an explicit "cuda:N" is honoured.
            "device_map": "auto" if device == "cuda" else device,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            ),
        }
    else:
        # 4-bit bitsandbytes quantization is only applied on CUDA devices;
        # elsewhere the model is loaded with its default dtype.
        load_kwargs = {"device_map": device}

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # temperature / top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        repetition_penalty=1.15,
        # Without this the pipeline returns prompt + completion, so a QA chain
        # would report the whole prompt as its "answer".
        return_full_text=False,
    )
    return HuggingFacePipeline(pipeline=pipe)


In [None]:
!pip install -q langchain langchain-community  langchain-openai faiss-cpu pypdf nltk

In [None]:
import nltk
# NLTK data presumably needed by the TextBlob analysis further down (sentence
# tokenizer models, POS tagger, Brown corpus). "punkt_tab" is the tokenizer
# resource that NLTK 3.9+ looks up in place of "punkt"; requesting both keeps
# the notebook working regardless of which NLTK release is installed.
for resource in ("punkt", "punkt_tab", "averaged_perceptron_tagger", "brown"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
from langchain_community.document_loaders import AsyncHtmlLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os
from typing import List, Union
from langchain.schema import Document

def _is_http_url(location: str) -> bool:
    """Return True if ``location`` starts with an http(s) scheme (case-insensitive)."""
    return location.lower().startswith(("http://", "https://"))


def _is_pdf_location(location: str) -> bool:
    """Return True if ``location`` (local path or URL) points at a ``.pdf`` file."""
    from urllib.parse import urlparse  # function-scope import keeps this block self-contained

    # For URLs only the path component is inspected, so a query string or
    # fragment ("?download=1", "#page=2") does not hide the extension.
    # Local paths are checked verbatim.
    path = urlparse(location).path if _is_http_url(location) else location
    return path.lower().endswith(".pdf")


def load_documents(sources: List[Union[str, dict]]) -> List[Document]:
    """Load web pages and PDFs into LangChain documents.

    Each entry of ``sources`` is one of:

    * a string with an http(s) URL -- fetched with ``AsyncHtmlLoader``;
    * a string naming a ``.pdf`` file (local path or http(s) URL, extension
      matched case-insensitively) -- loaded with ``PyPDFLoader``;
    * a dict with a ``"url"`` key and an optional ``"metadata"`` dict -- loaded
      like the string forms above (HTML unless the URL names a PDF), with the
      extra metadata merged into every resulting document.

    NOTE(review): ``AsyncHtmlLoader`` appears to return the page's raw HTML as
    ``page_content`` (the recorded run of this notebook shows markup in the
    retrieved context). Consider converting HTML to text before chunking.

    Args:
        sources: Locations to load, in the order the documents should appear.

    Returns:
        All loaded documents, concatenated in source order.

    Raises:
        ValueError: If an entry is neither a supported string nor a dict with
            a string ``"url"``.
    """
    documents = []
    for source in sources:
        if isinstance(source, dict) and isinstance(source.get("url"), str):
            location = source["url"]
            extra_metadata = source.get("metadata") or {}
            # Dict entries have always been treated as web pages unless they
            # clearly name a PDF.
            treat_as_web_page = True
        elif isinstance(source, str):
            location = source
            extra_metadata = {}
            treat_as_web_page = _is_http_url(source)
        else:
            raise ValueError(f"Unsupported source type: {source}")

        # PDF detection comes first so that "https://host/file.pdf" is parsed
        # as a PDF instead of being fetched as an HTML page.
        if _is_pdf_location(location):
            loader = PyPDFLoader(location)
        elif treat_as_web_page:
            loader = AsyncHtmlLoader([location])
        else:
            raise ValueError(f"Unsupported source type: {source}")

        loaded = loader.load()
        for doc in loaded:
            doc.metadata.update(extra_metadata)
        documents.extend(loaded)
    return documents



from textblob import TextBlob

def process_with_textblob(document: Document) -> Document:
    """Annotate a document in place with TextBlob sentiment and noun phrases.

    Three metadata keys are written: ``"sentiment_polarity"``,
    ``"sentiment_subjectivity"`` and ``"noun_phrases"`` (at most the first
    five phrases TextBlob reports, in the order it reports them).

    Args:
        document: The document whose ``page_content`` is analysed.

    Returns:
        The same ``document`` instance, with its metadata updated.
    """
    analysis = TextBlob(document.page_content)

    # Run both analyses before touching the metadata, so a failure in either
    # one leaves the document unchanged.
    sentiment_scores = analysis.sentiment
    phrases = analysis.noun_phrases

    document.metadata.update({
        "sentiment_polarity": sentiment_scores.polarity,
        "sentiment_subjectivity": sentiment_scores.subjectivity,
        "noun_phrases": phrases[:5],  # keep only the first five phrases
    })
    return document


def split_documents(
    documents: List[Document],
    chunk_size: int = 300,
    chunk_overlap: int = 30
) -> List[Document]:
    """Split documents into chunks and annotate each chunk via TextBlob.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``, each passed
        through ``process_with_textblob``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separator candidates, from coarsest (blank line) to finest (any character).
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    annotated_chunks = []
    for chunk in chunks:
        annotated_chunks.append(process_with_textblob(chunk))
    return annotated_chunks


def create_vectorstore(texts: List[Document],embeddings_model_name: str="text-embedding-3-small") -> FAISS:
    """Embed document chunks with OpenAI and index them in a FAISS store.

    NOTE: no API key is passed here; ``OpenAIEmbeddings`` is expected to pick
    it up from the environment (e.g. ``OPENAI_API_KEY``) -- confirm it is set
    before running.

    Args:
        texts: Document chunks to embed and index. Must not be empty.
        embeddings_model_name: Name of the OpenAI embedding model to use.

    Returns:
        A FAISS vector store built from ``texts``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    # Fail early with a clear message: an empty input would otherwise surface
    # as an obscure error from inside the FAISS wrapper.
    if not texts:
        raise ValueError("Cannot build a FAISS vector store from an empty list of documents.")

    embeddings = OpenAIEmbeddings(model=embeddings_model_name)
    return FAISS.from_documents(texts, embeddings)



In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


def create_qa_chain(vectorstore: FAISS, llm: HuggingFacePipeline) -> RetrievalQA:
    """Build a ``RetrievalQA`` chain that answers questions from ``vectorstore``.

    The chain uses the ``"stuff"`` chain type with a custom prompt exposing
    ``{context}`` and ``{question}``, retrieves through
    ``vectorstore.as_retriever()`` with its default settings, and is configured
    to return the source documents alongside the answer.

    Args:
        vectorstore: Store that supplies the retriever.
        llm: Language model that generates the answer.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    qa_template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}
    Answer:"""
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=qa_template,
    )

    chain_options = {
        "llm": llm,
        "chain_type": "stuff",
        "retriever": vectorstore.as_retriever(),
        "return_source_documents": True,
        "chain_type_kwargs": {"prompt": qa_prompt},
    }
    return RetrievalQA.from_chain_type(**chain_options)

# Web query

In [None]:
# Web query demo: run the full pipeline (load -> split -> index -> ask)
# against a single documentation page.
urls = [
    "https://python.langchain.com/docs/integrations/llms/llamacpp"
]
# Step 1: fetch the page(s) listed above.
print("Loading documents...")
documents = load_documents(urls)
print(f"Loaded {len(documents)} documents")

# Step 2: chunk the documents (split_documents defaults: 300 characters with
# 30 overlap) and annotate each chunk with TextBlob metadata.
print("Splitting documents...")
split_docs = split_documents(documents)
print(f"Created {len(split_docs)} document chunks")

# Step 3: embed the chunks and build the FAISS index.
print("Creating vector store...")
vectorstore = create_vectorstore(split_docs)

# Step 4: load the generation model with load_llm's default arguments.
print("Loading language model...")
llm = load_llm()

# Step 5: wire the retriever and the model into a RetrievalQA chain.
print("Creating QA chain...")
qa_chain = create_qa_chain(vectorstore, llm)

# Step 6: ask a question. The chain is built with return_source_documents=True,
# so `result` also carries the retrieved chunks; only the answer text under
# the "result" key is printed here.
print("Generating answer...")
result = qa_chain.invoke({"query": "Are any option install llama-cpp package?"})
print(f"\nAnswer: {result['result']}")

Loading documents...


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.20it/s]


Loaded 1 documents
Splitting documents...
Created 1016 document chunks
Creating vector store...
Loading language model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Creating QA chain...
Generating answer...





Answer: Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    pip install llama</span><span class="token operator" style="color:rgb(0, 0, 0)">-</span><span class="token plain">cpp</span><span class="token operator" style="color:rgb(0, 0, 0)">-</span><span class="token plain">python</span><br></span></code></pre><div class="buttonGroup__atx"><button

to Compiling and installing" title="Direct link to Compiling and installing">​</a></h4><p>Now you can <code>cd</code> into the <code>llama-cpp-python</code> directory and install the package</p><div class="codeBlockContainer_Ckt0 theme-code-block"

class="token plain"> pip install llama</span><span class="token operator" style="color:rgb(0, 0, 0)">-</span><span class="token plain">cpp</span><span class="token operator" style="color:rgb(0, 0, 0)">-</span><span class="token plain">python</span><br></span></code></pre><div

l