# Step 1: Scrape data from the URL and save it in a folder

In [None]:
import requests
from bs4 import BeautifulSoup
import os
# Page to scrape, and the folder that receives the scraped text and images.
# The same folder name is later read by SimpleDirectoryReader in step 2.
url = "https://en.wikipedia.org/wiki/History_of_India"
output_folder = "langchain-doc1"
def scrape_website(url, output_folder, timeout=30):
    """Download a web page and save its text and images into a folder.

    The page text is written to ``text_content.txt`` inside *output_folder*;
    every ``<img>`` that can be downloaded is stored next to it under the
    file name taken from the image URL's path.

    Args:
        url: Address of the page to scrape.
        output_folder: Directory that receives the files (created if missing).
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: If the page itself cannot be fetched.
            Failures on individual images are reported and skipped.
    """
    from urllib.parse import urljoin, urlparse

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes

    soup = BeautifulSoup(response.content, 'html.parser')

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Extract and save text content
    text_content = soup.get_text(separator='\n')
    text_file_path = os.path.join(output_folder, 'text_content.txt')
    with open(text_file_path, 'w', encoding='utf-8') as f:
        f.write(text_content)

    # Extract and save images
    for img in soup.find_all('img'):
        img_src = img.get('src')
        # Inline data: URIs carry no downloadable resource.
        if not img_src or img_src.startswith('data:'):
            continue

        # 'src' is frequently relative or protocol-relative ("//host/path");
        # resolve it against the page URL so requests gets an absolute URL.
        img_url = urljoin(url, img_src)

        # Name the file after the URL path only, so query strings and
        # fragments do not end up in the file name.
        img_filename = os.path.basename(urlparse(img_url).path)
        if not img_filename:
            continue

        try:
            img_response = requests.get(img_url, timeout=timeout)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image should not abort the whole scrape.
            print(f"Skipping image {img_url}: {exc}")
            continue

        img_file_path = os.path.join(output_folder, img_filename)
        with open(img_file_path, 'wb') as f:
            f.write(img_response.content)
# Run the scrape when this cell executes; this performs HTTP requests and
# writes files into output_folder.
scrape_website(url, output_folder)


# Step 2: Create embeddings and store them in ChromaDB

In [None]:
from bs4 import BeautifulSoup
from llama_index.llms.gemini import Gemini
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb
import os

from dotenv import load_dotenv

# Load every file that step 1 saved into the scrape folder.
documents = SimpleDirectoryReader("langchain-doc1").load_data()

print(len(documents))

# Credentials must not live in source code. Read the key from the environment
# (or a local .env file), the same way the query cell in step 3 does.
# NOTE: the key that was previously hard-coded here should be treated as
# compromised and revoked.
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
llm = Gemini(api_key=api_key, model_name="models/gemini-pro")

# The same embedding model name is used again in step 3 when the persisted
# collection is queried.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Persist the vectors on disk under ./chroma_db1 in the "quickstart" collection.
db = chromadb.PersistentClient(path="./chroma_db1")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed the documents and write them into the Chroma-backed vector store.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Step 3: Load the ChromaDB collection and start a query session

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb
from llama_index.llms.gemini import Gemini
import gradio as gr
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment
# so the API key below can be read with os.getenv.
load_dotenv()

# The Gemini key comes from the environment; os.getenv returns None if unset.
api_key = os.getenv("GEMINI_API_KEY")
llm = Gemini(api_key=api_key, model_name="models/gemini-1.5-flash")
# Same embedding model name as the one used when the index was built in step 2.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Load from disk: reopen the persisted Chroma database and the "quickstart"
# collection written in step 2, then wrap it as a vector-store-backed index.
db2 = chromadb.PersistentClient(path="./chroma_db1")
chroma_collection = db2.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)
# Register Gemini as the global default LLM before the query engine is built.
Settings.llm = llm

# Query Data from the persisted index
query_engine = index.as_query_engine()

def chat(message: str) -> str:
    """Answer a user message by querying the persisted index.

    Args:
        message: The question typed into the Gradio textbox.

    Returns:
        The query engine's answer as plain text.
    """
    response = query_engine.query(message)
    # The output component is a Textbox, which expects a string; convert the
    # response object explicitly instead of relying on implicit coercion.
    return str(response)

# Minimal Gradio UI: one input textbox wired to chat(), one output textbox
# showing whatever chat() returns.
interface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(label="Enter your message"),
    outputs=gr.Textbox(label="Response"),
    # title="Chatbot Interface"
)

# Start the web app with debug mode and public sharing both turned off.
interface.launch(debug=False, share=False)

