In [1]:
# Install dependencies
!pip install -q litellm mlflow langchain langchain-community faiss-cpu sentence-transformers pypdf ragas langchain-groq groq datasets langchain_huggingface polars

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# --- Imports ---
import os
import mlflow
import litellm
import polars as pl
from dotenv import load_dotenv
from datasets import Dataset
from pypdf import PdfReader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall, ContextPrecision, FactualCorrectness
from ragas.llms import LangchainLLMWrapper
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings as RagasHFEmbeddings

In [3]:
# --- Environment Setup ---
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY_")
mlflow.set_experiment("SolidPrinciple_CorrectiveRAG")
mlflow.litellm.autolog()

2025/07/13 16:40:02 INFO mlflow.tracking.fluent: Experiment with name 'SolidPrinciple_SelfRAG' does not exist. Creating a new experiment.


In [4]:
# --- Start MLflow Run ---
with mlflow.start_run() as run:
    # --- Load PDFs ---
    pdf_folder = "/content/IncidentManuals"  # Replace with your local path
    loaders = [PyPDFLoader(os.path.join(pdf_folder, fn)) for fn in os.listdir(pdf_folder) if fn.endswith(".pdf")]
    documents = []
    for loader in loaders:
        documents.extend(loader.load())

    mlflow.log_param("num_documents", len(documents))
    mlflow.log_param("pdf_files", [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")])

    # --- Chunking ---
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    mlflow.log_param("num_chunks", len(chunks))

    # Save sample chunks to artifact
    chunk_texts = [chunk.page_content for chunk in chunks[:5]]
    with open("sample_chunks.txt", "w") as f:
        f.write("\n---\n".join(chunk_texts))
    mlflow.log_artifact("sample_chunks.txt")

    # --- Embedding & VectorStore ---
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    embedding = HuggingFaceEmbeddings(model_name=embedding_model)
    mlflow.log_param("embedding_model", embedding_model)

    vectordb = FAISS.from_documents(chunks, embedding)
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"lambda_mult": 0.5, "k": 5})

    # --- Prompts ---
    rag_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a mining safety expert. Use the context below to answer questions accurately."),
        ("user", "Context:\n{context}\n\nQuestion: {question}")
    ])
    editor_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a technical editor. Refine the answer for accuracy and clarity."),
        ("user", "Draft:\n{draft}")
    ])

    memory = FAISS.from_texts(["init"], embedding)

    def retrieve_correct(query):
        memory_results = memory.similarity_search(query)
        if memory_results:
            mlflow.log_param("memory_retrieval_used", True)
            return memory_results
        mlflow.log_param("memory_retrieval_used", False)
        return retriever.get_relevant_documents(query)

    def generate(query):
        context_docs = retrieve_correct(query)
        context = "\n\n".join([doc.page_content for doc in context_docs])

        draft = litellm.completion(
            model="openai/gpt-4o-mini",
            messages=[{"role": "user", "content": rag_prompt.format(context=context, question=query)}]
        )
        draft_content = draft['choices'][0]['message']['content']

        refined = litellm.completion(
            model="groq/llama3-70b-8192",
            messages=[{"role": "user", "content": editor_prompt.format(draft=draft_content)}]
        )
        refined_content = refined['choices'][0]['message']['content']

        memory.add_texts([refined_content])

        return refined_content, context, draft_content

    # --- Sample Coal Mining Q&A Dataset ---
    questions = [
        "What are the primary safety concerns in underground coal mining?",
        "How do you prevent methane explosions in coal mines?",
        "What role does ventilation play in coal mine safety?",
        "How is dust managed in underground coal mines?",
        "What personal protective equipment is mandatory in coal mining?"
    ]

    references = [
        "Underground coal mining involves risks such as roof collapse, gas explosion, and equipment hazards.",
        "Methane explosions are prevented using gas detectors, proper ventilation, and drainage systems.",
        "Ventilation ensures removal of harmful gases and maintains breathable air in mines.",
        "Dust is controlled using water sprays, ventilation, and dust collectors.",
        "Helmets, boots, respirators, gloves, and high-visibility clothing are mandatory in coal mines."
    ]

    # --- Generate Answers ---
    data = {"question": [], "response": [], "retrieved_contexts": [], "reference": []}
    for i, (q, ref) in enumerate(zip(questions, references)):
        ans, ctx, draft_ans = generate(q)
        data["question"].append(q)
        data["response"].append(ans)
        data["retrieved_contexts"].append([ctx])
        data["reference"].append(ref)

        # Log individual question, context, draft, and refined answers as artifacts
        mlflow.log_text(q, f"question_{i+1}.txt")
        mlflow.log_text(ctx, f"retrieved_context_{i+1}.txt")
        mlflow.log_text(draft_ans, f"draft_answer_{i+1}.txt")
        mlflow.log_text(ans, f"refined_answer_{i+1}.txt")

    # --- Create HuggingFace Dataset ---
    ds = Dataset.from_dict(data)

    # --- RAGAS Evaluation ---
    llm = LangchainLLMWrapper(
        ChatGroq(api_key=os.environ["GROQ_API_KEY"], model_name="llama3-8b-8192", temperature=0.0)
    )
    ragas_emb = RagasHFEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    result = evaluate(
        ds,
        metrics=[Faithfulness(), AnswerRelevancy(), ContextRecall(), ContextPrecision(), FactualCorrectness()],
        embeddings=ragas_emb,
        llm=llm
    )

    ragas_df = result.to_pandas()
    ragas_df.to_csv("ragas_eval_results.csv", index=False)
    mlflow.log_artifact("ragas_eval_results.csv")

    # --- Log final dataset ---
    final_df = pl.DataFrame(data)
    # mlflow.log_input(final_df, context="training")

    print("Evaluation complete. Results tracked in MLflow.")

  embedding = HuggingFaceEmbeddings(model_name=embedding_model)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[21]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01j3302wb3e99bxkkghndsye1b` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 20769, Requested 899. Please try again in 2m36.686s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
ERROR:ragas.executor:Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01j3302wb3e99bxkkghndsye1b` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 20608, Requested 1439. Please try again in 2m40.473s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
ERROR:ragas.executor:Exception raised in Job[17]: RateLimitError(Err

Evaluation complete. Results tracked in MLflow.


In [5]:
!pip install -q pyngrok

In [6]:
import os
import getpass
from pyngrok import ngrok, conf
from google.colab import userdata
os.environ["NGROK"]=userdata.get('NGROK_TOKEN')

In [7]:
# Set up ngrok tunnel for MLflow UI
conf.get_default().auth_token = os.environ["NGROK"]
ngrok.kill()
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://1a5d220640c1.ngrok-free.app


In [8]:
!mlflow ui

[2025-07-13 16:51:20 +0000] [6079] [INFO] Starting gunicorn 23.0.0
[2025-07-13 16:51:20 +0000] [6079] [INFO] Listening at: http://127.0.0.1:5000 (6079)
[2025-07-13 16:51:20 +0000] [6079] [INFO] Using worker: sync
[2025-07-13 16:51:20 +0000] [6081] [INFO] Booting worker with pid: 6081
[2025-07-13 16:51:20 +0000] [6082] [INFO] Booting worker with pid: 6082
[2025-07-13 16:51:20 +0000] [6083] [INFO] Booting worker with pid: 6083
[2025-07-13 16:51:20 +0000] [6084] [INFO] Booting worker with pid: 6084

[2025-07-13 16:53:16 +0000] [6079] [INFO] Handling signal: int
Aborted!
[2025-07-13 16:53:16 +0000] [6084] [INFO] Worker exiting (pid: 6084)
[2025-07-13 16:53:16 +0000] [6081] [INFO] Worker exiting (pid: 6081)
[2025-07-13 16:53:16 +0000] [6082] [INFO] Worker exiting (pid: 6082)
[2025-07-13 16:53:16 +0000] [6083] [INFO] Worker exiting (pid: 6083)
[2025-07-13 16:53:17 +0000] [6079] [INFO] Shutting down: Master
