In [1]:
pip install langchain pdfminer.six unstructured chromadb langchain_community langchain_google_genai python-dotenv

Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting unstructured
  Downloading unstructured-0.15.1-py3-none-any.whl.metadata (29 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-1.0.8-py3-none-any.whl.metadata (3.8 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.28-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
 

In [2]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.2.0 (from gradio)
  Downloading gradio_client-1.2.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.5.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Downloading gradio-4

In [4]:
!pip install -U langchain-huggingface sentence-transformers

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch

In [5]:
import os
from dotenv import load_dotenv

from getpass import getpass

# Resolve the Gemini API key without ever storing it in the notebook:
#   1) an already-exported GOOGLE_API_KEY,
#   2) a GOOGLE_API_KEY entry in a local .env file,
#   3) an interactive prompt.
# SECURITY: an earlier revision of this cell embedded a literal key and wrote it
# to .env; that key must be treated as compromised and revoked.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    load_dotenv()  # loads variables from an existing .env file, if present
    api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # getpass keeps the key out of the cell source and the rendered output.
    api_key = getpass("Enter your GOOGLE_API_KEY: ").strip()
if not api_key:
    # Fail with a clear message instead of a TypeError from assigning None below.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it, add it to .env, or enter it when prompted."
    )

os.environ["GOOGLE_API_KEY"] = api_key

In [6]:
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [7]:
from pdfminer.high_level import extract_text
from langchain.text_splitter import CharacterTextSplitter

def process_pdf(file_path):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        The list of text chunks produced by the splitter
        (chunk_size=1000, chunk_overlap=200).
    """
    raw_text = extract_text(file_path)
    return CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(raw_text)

In [8]:
def process_text(file_path):
    """Read a plain-text file and split it into overlapping chunks.

    Args:
        file_path: Path to the text file to read.

    Returns:
        The list of text chunks produced by the splitter
        (chunk_size=1000, chunk_overlap=200).

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return text_splitter.split_text(text)

In [9]:
def create_vectorstore(texts):
    """Embed text chunks and index them in a fresh Chroma collection.

    Args:
        texts: List of text chunks to embed and store.

    Returns:
        A Chroma vector store built from ``texts``.

    Raises:
        ValueError: If ``texts`` is empty, i.e. there is nothing to index.
    """
    import uuid

    if not texts:
        raise ValueError("No text could be extracted from the document")

    embeddings = HuggingFaceEmbeddings()
    # Without an explicit name, each call uses Chroma's default collection, so
    # chunks from earlier uploads (or from repeated queries on the same file)
    # pile up and leak into later answers. A unique name isolates every call.
    collection_name = f"doc-{uuid.uuid4().hex}"
    return Chroma.from_texts(texts, embeddings, collection_name=collection_name)

In [10]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.chains import RetrievalQA

def create_qa_chain(vectorstore, model="gemini-pro", temperature=0.7):
    """Build a retrieval question-answering chain over a vector store.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies context.
        model: Name of the Google Generative AI model to use. Defaults to the
            previously hard-coded ``"gemini-pro"``.
        temperature: Sampling temperature passed to the model. Defaults to the
            previously hard-coded ``0.7``.

    Returns:
        A ``RetrievalQA`` chain using the "stuff" chain type.
    """
    llm = GoogleGenerativeAI(model=model, temperature=temperature)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )

In [11]:
def process_file(file_path):
    """Turn a PDF or text file into a ready-to-query QA chain.

    The file type is chosen by case-insensitive extension: ``.pdf`` files go
    through ``process_pdf`` and ``.txt`` files through ``process_text``. The
    resulting chunks are indexed and wrapped in a QA chain.

    Args:
        file_path: Path string of the file to ingest.

    Returns:
        The QA chain built by ``create_qa_chain``.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
    """
    lowered = file_path.lower()
    is_pdf = lowered.endswith('.pdf')

    # Reject unsupported extensions up front.
    if not is_pdf and not lowered.endswith('.txt'):
        raise ValueError("Unsupported file type")

    chunks = process_pdf(file_path) if is_pdf else process_text(file_path)
    return create_qa_chain(create_vectorstore(chunks))

In [12]:
def handle_query(qa_chain, query):
    """Run a question through the QA chain and return the answer text.

    Args:
        qa_chain: A retrieval QA chain (as built by ``create_qa_chain``) that
            accepts a ``{"query": ...}`` input and returns a mapping with a
            ``"result"`` entry.
        query: The user's question.

    Returns:
        The answer string taken from the chain's ``"result"`` output.
    """
    # Chain.run() is deprecated in LangChain; invoke() is the supported entry
    # point and returns a dict rather than the bare answer string.
    response = qa_chain.invoke({"query": query})
    return response["result"]

In [13]:
from google.colab import files
import gradio as gr

In [14]:
def gradio_app(file, query):
    """Gradio callback: answer ``query`` against the uploaded ``file``.

    Args:
        file: Value of the file-upload input. It is passed straight to
            ``process_file``, which treats it as a path string. Falsy when
            nothing has been uploaded.
        query: Question text from the query box.

    Returns:
        The answer text, or a short human-readable message when an input is
        missing or the file cannot be processed.
    """
    # Submit can be clicked before both inputs are filled in; without these
    # guards process_file(None) fails with an opaque AttributeError.
    if not file:
        return "Please upload a PDF or text file first."
    if not query or not query.strip():
        return "Please enter a query."

    try:
        qa_chain = process_file(file)
    except ValueError as exc:
        # e.g. "Unsupported file type" raised by process_file
        return f"Error: {exc}"
    return handle_query(qa_chain, query)

# Build the Gradio UI: a header banner followed by one column holding the
# file upload, the query box, the submit button, and the response box.
with gr.Blocks() as app:
    gr.HTML("""
    <div style="text-align: center; background-color: #f7f9f6; padding: 20px;">
        <h1>Chat with Documents using Gemini</h1>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload a PDF or Text File")
            query_input = gr.Textbox(label="Enter your query")
            submit_button = gr.Button("Submit")
            # interactive=False: the response box is for display, not user input.
            output_text = gr.Textbox(label="Response", interactive=False)

    # Clicking Submit calls gradio_app(file, query) and writes its return
    # value into the response box.
    submit_button.click(fn=gradio_app, inputs=[file_input, query_input], outputs=output_text)

# share=True requests a public share URL (visible in this cell's output).
# NOTE(review): server_name="0.0.0.0" with the fixed port 7860 presumably binds
# on all interfaces and may fail on re-run if the port is still held — confirm.
app.launch(share=True, inbrowser=True, server_name="0.0.0.0", server_port=7860)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e4b09e8f49dc45d91e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


