In [1]:
import gradio as gr
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
import ollama
import re
import os
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the %pip magic so the package lands in the environment of the running kernel;
# a `!pip` shell escape may resolve to a different interpreter on PATH.
%pip install pypdf

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Read the notebook settings from config.yaml (path is relative to the working directory).
# The explicit encoding keeps parsing independent of the platform's locale default.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [4]:


# Pull the run settings out of the mapping parsed from config.yaml.
model_embedding = config['models']['embedding']  # model name; later passed to both OllamaEmbeddings and ollama.chat
embeddings_directory = config['path']['emmbeddings']  # NOTE(review): key is spelled 'emmbeddings'; it must match config.yaml exactly
pdf_bytes = config['path']['pdfs']  # despite the name, this is handed to PyPDFDirectoryLoader — presumably a directory path, confirm
create_embeddings = config['parameters']['create_embeddings']  # ask_question compares this with `is True`
chunk_size = config['parameters']['chunk_size']
chunk_overlap = config['parameters']['chunk_overlap']

In [5]:
# Sample query that is sent through the retrieval + chat pipeline further down.
question ='please provide me the structure of a docker-compose.yaml file'


In [6]:
def process_pdf(pdf_bytes, model_embedding, persist_directory, chunk_size=500, chunk_overlap=100):
    """Load PDFs, split them into chunks, embed the chunks and store them in Chroma.

    Args:
        pdf_bytes: Value handed to ``PyPDFDirectoryLoader``. Despite the name it is
            used as the loader's path argument, not as raw bytes. ``None`` short-circuits
            the function.
        model_embedding: Model name passed to ``OllamaEmbeddings``.
        persist_directory: Passed to ``Chroma.from_documents`` as ``persist_directory``.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` for the same splitter.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        Tuple ``(text_splitter, vectorstore, retriever)``, or ``(None, None, None)``
        when ``pdf_bytes`` is ``None``.
    """
    if pdf_bytes is None:
        return None, None, None

    loader = PyPDFDirectoryLoader(pdf_bytes)
    data = loader.load()

    # Chunking is configurable so callers can use the same settings as load_embeddings.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(data)

    embeddings = OllamaEmbeddings(model=model_embedding)
    vectorstore = Chroma.from_documents(
        documents=chunks, embedding=embeddings, persist_directory=persist_directory
    )
    retriever = vectorstore.as_retriever()

    return text_splitter, vectorstore, retriever

In [7]:
def load_embeddings(model_embedding, persist_directory, chunk_size, chunk_overlap):
    """Build a Chroma store bound to ``persist_directory`` plus a splitter and retriever.

    Args:
        model_embedding: Model name passed to ``OllamaEmbeddings``.
        persist_directory: Passed to ``Chroma`` as ``persist_directory``.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` for the same splitter.

    Returns:
        Tuple ``(text_splitter, vectordb, retriever)``.
    """
    embedder = OllamaEmbeddings(model=model_embedding)
    store = Chroma(persist_directory=persist_directory, embedding_function=embedder)

    # The splitter takes no part in loading; it is only returned alongside the store.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return splitter, store, store.as_retriever()

In [8]:
def combine_docs(docs):
    """Join the ``page_content`` of every document in ``docs`` with a blank line between them."""
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [9]:

def ollama_llm(question, context, model_embedding):
    """Send the question and context to ``ollama.chat`` and return the cleaned reply.

    Args:
        question: Text placed after ``Question:`` in the prompt.
        context: Text placed after ``Context:`` in the prompt.
        model_embedding: Value passed to ``ollama.chat`` as ``model``.

    Returns:
        The reply's message content with every ``<think>...</think>`` span
        removed and surrounding whitespace stripped.
    """
    user_message = {
        "role": "user",
        "content": f"Question: {question}\n\nContext: {context}",
    }
    reply = ollama.chat(model=model_embedding, messages=[user_message])
    raw_text = reply["message"]["content"]

    # Drop the model's thinking output; DOTALL lets a think block span several lines.
    think_block = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    return think_block.sub("", raw_text).strip()

In [10]:
def rag_chain(question, text_splitter, retriever, model_embedding):
    """Retrieve documents for ``question`` and ask the model to answer using them.

    Args:
        question: Query given to ``retriever.invoke`` and forwarded to ``ollama_llm``.
        text_splitter: Accepted but not used in this function.
        retriever: Object whose ``invoke(question)`` returns the documents to use.
        model_embedding: Model name forwarded to ``ollama_llm``.

    Returns:
        Whatever ``ollama_llm`` returns for the question and the combined context.
    """
    context_text = combine_docs(retriever.invoke(question))
    return ollama_llm(question, context_text, model_embedding)

In [16]:
def ask_question(pdf_bytes, question, create_embeddings, embeddings_directory, model_embedding, chunk_size, chunk_overlap):
    """Answer ``question`` from the PDF embeddings, building or loading the store first.

    Args:
        pdf_bytes: PDF source forwarded to ``process_pdf``; only used when
            ``create_embeddings is True``.
        question: The user's question.
        create_embeddings: When exactly ``True`` the embeddings are (re)built with
            ``process_pdf``; any other value loads them with ``load_embeddings``.
        embeddings_directory: Persist directory of the vector store.
        model_embedding: Model name used for the embeddings and forwarded to ``rag_chain``.
        chunk_size: Forwarded to ``load_embeddings``.
        chunk_overlap: Forwarded to ``load_embeddings``.

    Returns:
        ``None`` when there is nothing to query (no embeddings directory, or no PDF
        source while embeddings must be created); otherwise a one-element ``set``
        holding the answer string.
    """
    # Check the inputs before any embedding work is started.
    if embeddings_directory is None:
        return None

    if create_embeddings is True:
        if pdf_bytes is None:
            return None  # No PDF uploaded
        # process_pdf expects (pdf_bytes, model_embedding, persist_directory), in that order.
        text_splitter, vectorstore, retriever = process_pdf(
            pdf_bytes,
            model_embedding,
            embeddings_directory,
        )
    else:
        text_splitter, vectorstore, retriever = load_embeddings(
            model_embedding,
            embeddings_directory,
            chunk_size,
            chunk_overlap,
        )

    # process_pdf signals "nothing loaded" with a (None, None, None) triple.
    if retriever is None:
        return None

    result = rag_chain(question, text_splitter, retriever, model_embedding)

    # NOTE(review): the answer is wrapped in a set literal; kept as-is for existing
    # callers, but returning the plain string is probably what was intended.
    return {result}

In [17]:
# Run the pipeline once with the settings read from config.yaml.
result = ask_question(
    pdf_bytes=pdf_bytes,
    question=question,
    create_embeddings=create_embeddings,
    embeddings_directory=embeddings_directory,
    model_embedding=model_embedding,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [18]:
# Display the value returned by ask_question (bare last expression -> notebook output).
result


{'Here\'s an example of a `docker-compose.yaml` file for a machine learning pipeline using pandas and ydata_profiling:\n\n```yaml\nservices:\n  data-loader:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: Data loader service\n    description: Loads CSV data for EDA\n    arguments:\n      - python324: $input_path\n    parameters:\n      - input_path: String\n\n  ydata_profiling_server:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: YData Profiling Server\n    description: Runtime service for generating EDA report\n    arguments:\n      - $input_path: String\n\n  report-generator:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: Report Generator\n    description: Exports EDA report to HTML\n    parameters:\n      - output_path: String\n\n  scheduler:\n    context: root_context\n    arguments:\n      - "data-loader $ydata_profiling_server report-generator"\n\n  start_from: root_context\nendservices\n\ndocker-com

In [14]:
# NOTE(review): this cell carries execution count In [14], i.e. it ran before the
# ask_question definition (In [16]) and call (In [17]) above. Its saved `str` output
# does not match the `{...}` set that ask_question currently returns; re-run to refresh.
type(result)

str

'Here\'s an example of a `docker-compose.yaml` file for a machine learning pipeline using pandas and ydata_profiling:\n\n```yaml\nservices:\n  data-loader:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: Data loader service\n    description: Loads CSV data for EDA\n    arguments:\n      - python324: $input_path\n    parameters:\n      - input_path: String\n\n  ydata_profiling_server:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: YData Profiling Server\n    description: Runtime service for generating EDA report\n    arguments:\n      - $input_path: String\n\n  report-generator:\n    environment: python324\n    ports:\n      - "10800:10800"\n    name: Report Generator\n    description: Exports EDA report to HTML\n    parameters:\n      - output_path: String\n\n  scheduler:\n    context: root_context\n    arguments:\n      - "data-loader $ydata_profiling_server report-generator"\n\n  start_from: root_context\nendservices\n\ndocker-compose up --build\n```\n\n### Explanation:\n\n1. **Services**:\n   - `data-loader`: A service that reads a CSV file from an input path.\n   - `ydata_profiling_server`: The runtime service that performs the EDA using ydata_profiling.\n   - `report-generator`: A service that exports the EDA report to HTML.\n\n2. **Containers and Context**:\n   - Services are defined in the root container context.\n   - The `scheduler` task starts services after `data-loader` initializes, ensuring all services start in order.\n\n3. **Imports and Runtime Profiles**:\n   - All necessary libraries (pandas, ydata_profiling) are imported at the root level to ensure they\'re available when the container runs.\n   - Both `ydata_profiling_server` and `report-generator` use `ServerProfile`, which is recommended for runtime dependencies that should run in the background.\n\n4. 
**Port Usage**:\n   - Uses port 10800 for multiple containers, allowing services to coexist efficiently.\n\nThis structure ensures a clean and efficient containerized pipeline for EDA using pandas and ydata_profiling.'