In [1]:
%%capture
%pip install langchain langchain-community langchain-openai pymupdf faiss-cpu pydantic python-dotenv 
%pip install langchain-ollama
%pip install langchain-groq

In [2]:
%%capture
%pip install sentence-transformers langchain-huggingface 
!pip install ipywidgets
!pip install pdfplumber

In [3]:
# cell -2 Imports and API Setup
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# NOTICE: No OpenAI imports here anymore!
from langchain_ollama import ChatOllama # New free LLM
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional
# # Securely enter your API Key if not already set in environment
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

# Configuration
PDF_PATH = "sample-service-manual.pdf"  

In [4]:
# Cell 3: Load and Inspect PDF Text

print(f"Loading PDF: {PDF_PATH}...")
loader = PyMuPDFLoader(PDF_PATH)
documents = loader.load()

# Debug: Print the first 500 characters of page 2 
# This helps you check if table rows are being read line-by-line or column-by-column.
print(f"--- Preview of Page 24 ---")
print(documents[1].page_content[:1000])

Loading PDF: sample-service-manual.pdf...
--- Preview of Page 24 ---
Symptom Chart ‚Äî Suspension System 
Condition 
Possible Sources 
Action 
z Incorrect thrust 
angle (dogtracking) 
z Rear 
suspension 
components 
z INSPECT the rear suspension 
system. CHECK the rear alignment 
for the correct thrust angle. 
REPAIR or INSTALL new 
suspension components as 
necessary. REFER to Section 204-
02 . 
z Vehicle drifts/pulls 
z Unevenly loaded 
or overloaded 
vehicle 
z Tires/tire 
pressure 
z Alignment is not 
within 
specification 
z Brake drag 
z Steering 
components 
z GO to Pinpoint Test A . 
z Front bottoming or 
riding low 
z Worn, damaged 
or incorrect 
springs 
z MEASURE the ride height. REFER 
to Ride Height Measurement in this 
section. INSTALL new springs as 
necessary. Refer to the appropriate 
section in Group 204 for the 
procedure. 
z Worn front 
shock absorbers 
z INSTALL new shock absorbers as 
necessary. Refer to the appropriate 
section in Group 204 for the 
procedure. 
z

In [5]:
# --- CELL 4: Smart "Header Search" Chunking Strategy  ---
import pdfplumber
from langchain_core.documents import Document

def process_pdf_with_header_injection(pdf_path, batch_size=5):
    """
    Reads the PDF. Hunts for the TRUE header row (containing 'Nm' or 'lb-ft') 
    before processing, ensuring we don't accidentally use page titles as headers.
    """
    print(f"‚öôÔ∏è Processing {pdf_path} with Smart Header Search...")
    smart_docs = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # Aggressive table extraction
            table = page.extract_table({
                "vertical_strategy": "text", 
                "horizontal_strategy": "text"
            })
            
            if table:
                # --- LOGIC UPDATE: Find the Real Header ---
                headers = None
                data_start_idx = 0
                
                # Look through the first 5 rows to find the true header
                for idx, row in enumerate(table[:5]):
                    # Check if this row looks like a header (contains units)
                    row_str = " ".join([str(c).lower() for c in row if c])
                    if "nm" in row_str or "lb-ft" in row_str or "description" in row_str:
                        headers = row
                        data_start_idx = idx + 1
                        break
                
                # If we found a valid header, proceed
                if headers:
                    # Clean headers
                    clean_headers = [str(h).replace('\n', ' ') if h else f"Col_{j}" for j, h in enumerate(headers)]
                    data_rows = table[data_start_idx:]
                    
                    current_batch = []
                    for row_idx, row in enumerate(data_rows):
                        clean_row = [str(cell).replace('\n', ' ') if cell else "N/A" for cell in row]
                        
                        # Match header length
                        if len(clean_headers) == len(clean_row):
                            # Create context string: "Component: Bolt, Nm: 17..."
                            row_context = ", ".join([f"{h}: {r}" for h, r in zip(clean_headers, clean_row)])
                            current_batch.append(row_context)
                        
                        if len(current_batch) >= batch_size or row_idx == len(data_rows) - 1:
                            if current_batch:
                                doc = Document(
                                    page_content="\n".join(current_batch),
                                    metadata={"source": pdf_path, "page": i + 1, "type": "table_chunk"}
                                )
                                smart_docs.append(doc)
                                current_batch = []
                else:
                    # If no unit header found, fallback to text (safer than using a bad header)
                    text = page.extract_text()
                    if text:
                        smart_docs.append(Document(page_content=text, metadata={"source": pdf_path, "page": i+1}))
            else:
                # Fallback if no table detected
                text = page.extract_text()
                if text:
                    smart_docs.append(Document(page_content=text, metadata={"source": pdf_path, "page": i+1}))

    return smart_docs

# --- EXECUTE & RELOAD ---
pdf_filename = "sample-service-manual.pdf" 
try:
    chunks = process_pdf_with_header_injection(pdf_filename)
    print(f"‚úÖ Success! Created {len(chunks)} smart chunks.")
    
    # RELOAD DATABASE
    from langchain_community.vectorstores import FAISS
    # Re-run embedding (Cell 5 logic)
    # vector_store = FAISS.from_documents(chunks, embeddings) 
    # vector_store.save_local("faiss_db_index")
    # print("‚úÖ Database updated.")
except Exception as e:
    print(f"‚ùå Error: {e}")

‚öôÔ∏è Processing sample-service-manual.pdf with Smart Header Search...
‚úÖ Success! Created 1009 smart chunks.


In [6]:
# ---CELL 5: Vector Store with Sentence Transformers ---
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS 

print("Loading local embedding model ...")

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Creating vector store...")

vector_store = FAISS.from_documents(chunks, embeddings)
print("Vector store created successfully using Sentence Transformers!") 


Loading local embedding model ...
Creating vector store...
Vector store created successfully using Sentence Transformers!


In [7]:
# Cell 6: Test Retrieval (Debugging Step)
test_query = "Torque specifications for suspension"
results = vector_store.similarity_search(test_query, k=10)

print(f"--- Top Retrieval Result for '{test_query}' ---")
print(results[0].page_content)

--- Top Retrieval Result for 'Torque specifications for suspension' ---
2014 F-150 Workshop Manual Page 1sur 1
SECTION 204-01A: Front Suspension ‚ÄîRear Wheel Drive (RWD) 2014 F-150 Workshop Manual
SPECIFICATIONS Procedure revision date: 10/25/2013
Torque Specifications
Description Nm lb-ft lb-in
Brake disc shield bolts 17 ‚Äî 150
Brake hose bracket bolt 12 ‚Äî 106
Lower arm forward and rearward nuts 350 258 ‚Äî
Lower ball joint nut 175 129 ‚Äî
Shock absorber lower nuts 90 66 ‚Äî
Shock absorber upper mount nuts 63 46 ‚Äî
Shock rod nut 55 41 ‚Äî
Stabilizer bar bracket nuts 55 41 ‚Äî
Stabilizer bar link nuts 70 52 ‚Äî
Tie-rod end nut 115 85 ‚Äî
Upper arm-to-frame nuts 150 111 ‚Äî
Upper ball joint nut 115 85 ‚Äî
Wheel bearing and wheel hub bolts 175 129 ‚Äî
Wheel speed sensor bolt 18 ‚Äî 159
Wheel speed sensor harness bracket bolt 12 ‚Äî 106
file:///C:/TSO/tsocache/VDTOM2_10764/SE2~us~en~file=SE241A01.HTM~gen~ref.H... 2014-03-01


In [8]:
# Cell 7: Define Output Structure

class VehicleSpec(BaseModel):
    """Information about a specific vehicle specification."""
    component: str = Field(..., description="The specific part or component name (e.g., 'Brake Caliper Bolt').")
    spec_type: str = Field(..., description="The type of specification (e.g., 'Torque', 'Capacity', 'Clearance').")
    value: str = Field(..., description="The numerical value of the specification.")
    unit: Optional[str] = Field(None, description="The unit of measurement (e.g., 'Nm', 'lb-ft', 'L').")

class SpecList(BaseModel):
    """A list of extracted vehicle specifications."""
    specs: List[VehicleSpec]

In [9]:
# --- CELL 8: Main Extraction Loop (Optimized for Smart Chunks) ---
import json
import time
import re
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ==========================================
# 1. SETUP CLOUD LLM
# ==========================================
GROQ_API_KEY = "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"

llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile",
    api_key=GROQ_API_KEY
)

# ==========================================
# 2. HELPER: Bulletproof JSON Extractor
# ==========================================
def extract_json_from_text(text):
    try:
        text = text.replace("```json", "").replace("```", "")
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match: return json.loads(match.group(0))
        return json.loads(text)
    except: return None

# ==========================================
# 3. MASTER PROMPT (Optimized for Header Injection)
# ==========================================
prompt_template = """
You are a highly accurate technical data extractor.
Analyze the provided text context and extract specifications for: '{question}'.

CRITICAL INSTRUCTIONS:
1. **TRUST EXPLICIT LABELS**: The text often contains explicit keys like "Nm: 175" or "lb-ft: 129". Use these labels to identify values and units.
2. **FALLBACK PATTERN**: If labels are missing or look like "Column_1", use the standard manual pattern: **Nm** (1st number) -> **lb-ft** (2nd) -> **lb-in** (3rd).
3. **EXTRACT EVERYTHING**: If the context lists multiple components (e.g., "Nut", "Bolt", "Link"), extract ALL of them.

Output JSON: {{ "specs": [ {{ "component": "...", "spec_type": "...", "value": "...", "unit": "..." }} ] }}
If no relevant data is found, return: {{ "specs": [] }}

Context:
{context}
"""

queries = [
    "Torque specifications for front suspension",
    "Torque for lower ball joint",  # Added your specific failing query
    "Torque specifications for braking system",
    "Fluid capacities"
]

all_extracted_data = []

print("üöÄ Starting Batch Extraction Job...")

for query in queries:
    print(f"   Processing: {query}...")
    start_ts = time.time()
    
    # --- FIX IS HERE: Increased k from 3 to 6 ---
    # Since we broke tables into small 5-row chunks, we need to retrieve MORE chunks
    # to ensure we capture the specific row the user asked for.
    docs = vector_store.similarity_search(query, k=12) 
    context = "\n\n".join([d.page_content for d in docs])
    
    # Generate
    try:
        chain = ChatPromptTemplate.from_template(prompt_template) | llm
        response = chain.invoke({"context": context, "question": query})
        
        # Parse
        data = extract_json_from_text(response.content)
        
        if data and "specs" in data:
            items = data["specs"]
            # Filter out empty results
            if items:
                all_extracted_data.extend(items)
                print(f"   ‚úÖ Found {len(items)} items in {time.time()-start_ts:.2f}s.")
            else:
                print("   ‚ö†Ô∏è Valid JSON, but no specific data found.")
        else:
            print("   ‚ö†Ô∏è No JSON found.")
            
    except Exception as e:
        print(f"   ‚ùå Error: {e}")
    # ---  PAUSE FOR 10 SECONDS ---
    print("   ‚è≥ Sleeping 10s to respect API limits...")
    time.sleep(10)

# Save
with open("vehicle_specs.json", "w") as f:
    json.dump(all_extracted_data, f, indent=4)

print(f"\nüéâ DONE! Saved {len(all_extracted_data)} total specs.")

üöÄ Starting Batch Extraction Job...
   Processing: Torque specifications for front suspension...
   ‚úÖ Found 40 items in 3.28s.
   ‚è≥ Sleeping 10s to respect API limits...
   Processing: Torque for lower ball joint...
   ‚úÖ Found 2 items in 0.98s.
   ‚è≥ Sleeping 10s to respect API limits...
   Processing: Torque specifications for braking system...
   ‚úÖ Found 141 items in 9.88s.
   ‚è≥ Sleeping 10s to respect API limits...
   Processing: Fluid capacities...
   ‚úÖ Found 6 items in 3.18s.
   ‚è≥ Sleeping 10s to respect API limits...

üéâ DONE! Saved 189 total specs.


In [10]:
# Cell 9: Save and View Results

import json

# Save to JSON file
output_file = "vehicle_specs.json"
with open(output_file, "w") as f:
    json.dump(all_extracted_data, f, indent=4)

print(f"Saved data to {output_file}")

# Display first 5 results
print(json.dumps(all_extracted_data[:5], indent=2))

Saved data to vehicle_specs.json
[
  {
    "component": "Brake disc shield bolts",
    "spec_type": "Torque",
    "value": "17",
    "unit": "Nm"
  },
  {
    "component": "Brake disc shield bolts",
    "spec_type": "Torque",
    "value": "150",
    "unit": "lb-in"
  },
  {
    "component": "Brake hose bracket bolt",
    "spec_type": "Torque",
    "value": "12",
    "unit": "Nm"
  },
  {
    "component": "Brake hose bracket bolt",
    "spec_type": "Torque",
    "value": "106",
    "unit": "lb-in"
  },
  {
    "component": "Lower arm forward and rearward nuts",
    "spec_type": "Torque",
    "value": "350",
    "unit": "Nm"
  }
]


In [11]:
# Run this in your notebook to save the index to disk
vector_store.save_local("faiss_db_index_test")
print("‚úÖ Index saved to folder 'faiss_db_index_test'")

‚úÖ Index saved to folder 'faiss_db_index_test'


In [12]:
%%capture
%pip install langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# --- Basic UI -  Mechanic AI BOT(Groq Llama 3.3) ---
import ipywidgets as widgets
from IPython.display import display, HTML
import json
import time
import re
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ============================================
# 1. SETUP GROQ API
# ============================================   
GROQ_API_KEY = "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"

if GROQ_API_KEY.startswith("PASTE"):
    print("‚ö†Ô∏è PLEASE PASTE YOUR ACTUAL GROQ API KEY ABOVE!")

gui_llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile",
    api_key=GROQ_API_KEY
)

# ============================================
# 2. UNIVERSAL PROMPT
# ============================================
prompt_template = """
You are a technical assistant. Extract ALL specifications for: '{question}'.

CRITICAL INSTRUCTIONS:
1. **Torque Tables**: Pattern is **Nm** -> **lb-ft** -> **lb-in**.
2. **Fluids/Parts**: If no unit exists (like a part number), return null for unit.
3. **General**: Extract EVERY row. Clean up values (numbers only for torque).

Output JSON: {{ "specs": [ {{ "component": "...", "spec_type": "...", "value": "...", "unit": "..." }} ] }}
If empty, return {{ "specs": [] }}

Context:
{context}
"""

# ============================================
# 3. HELPER: Bulletproof JSON Cleaner
# ============================================
def clean_and_parse_json(raw_response):
    try:
        if isinstance(raw_response, list): text = "".join([str(item) for item in raw_response])
        else: text = str(raw_response)
        
        # Remove markdown ticks and find the JSON object
        text = text.replace("```json", "").replace("```", "")
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match: return json.loads(match.group(0))
        return {"specs": []}
    except:
        return {"specs": []}

# ============================================
# 4. PROFESSIONAL UI (Updated CSS for Left Align)
# ============================================
header_html = """
<style>
    .mechanic-header { 
        background: #2C3E50; 
        color: white; 
        padding: 15px 20px; 
        border-radius: 8px 8px 0 0; 
        text-align: left; 
        font-family: sans-serif; 
    }
    .mechanic-subheader { font-size: 12px; opacity: 0.8; margin-top: 5px; }
    
    .result-table { 
        width: 100%; 
        border-collapse: collapse; 
        margin-top: 15px; 
        font-family: sans-serif; 
        font-size: 14px; 
        table-layout: fixed; 
    }
    
    .result-table th { 
        background: #34495E; 
        color: white; 
        padding: 10px; 
        text-align: left; /* Header Left Align */
    }
    
    .result-table td { 
        border-bottom: 1px solid #ddd; 
        padding: 8px; 
        color: #333; 
        word-wrap: break-word;
        text-align: left !important; /* <--- FORCE LEFT ALIGNMENT ON ROWS */
    }
    
    .result-table tr {
        text-align: left !important;
    }
    
    .result-table th:nth-child(1) { width: 45%; } 
    .highlight-val { color: #C0392B; font-weight: bold; }
</style>
<div class="mechanic-header">
    <h2>üöó Mechanic AI: Service Hub (Cloud Edition)</h2>
    <div class="mechanic-subheader">Powered by Groq Llama 3.3</div>
</div>
"""
header_widget = widgets.HTML(value=header_html)

PRESET_QUERIES = [
    "--- Select a Quick Query ---",
    "Torque specifications for front suspension",
    "Torque specifications for braking system",
    "Service materials",             
    "Wheel alignment specifications",
    "Grease and lubricants"
]

dropdown = widgets.Dropdown(options=PRESET_QUERIES, layout=widgets.Layout(width='98%'))
query_input = widgets.Text(placeholder='...or type a specific question here (PLEASE DO NOT PASTE QUERY)', layout=widgets.Layout(width='98%'))
search_btn = widgets.Button(description=' Extract Data', icon='search', button_style='primary', layout=widgets.Layout(width='200px', height='40px'))
output_area = widgets.Output(layout={'border': '1px solid #ddd', 'padding': '15px', 'margin_top':'15px', 'min_height': '100px'})

def on_dropdown_change(change):
    if change['new'] != "--- Select a Quick Query ---":
        query_input.value = change['new']

def on_search_click(b):
    output_area.clear_output()
    q = query_input.value
    
    if not q or q == "--- Select a Quick Query ---":
        with output_area: display(HTML("<b style='color:orange;'>‚ö†Ô∏è Please select or type a query.</b>"))
        return
    
    with output_area:
        display(HTML("<b>‚öôÔ∏è Querying Groq (Llama 3.3)... please wait...</b>"))
        start_ts = time.time()
        
        try:
            # 1. RETRIEVE
            docs = vector_store.similarity_search(q, k=3)
            context = "\n\n".join([d.page_content for d in docs])
            
            # 2. GENERATE
            prompt = ChatPromptTemplate.from_template(prompt_template)
            chain = prompt | gui_llm
            response = chain.invoke({"context": context, "question": q})
            
            # 3. PARSE
            data = clean_and_parse_json(response.content)
            specs = data.get("specs", [])
            elapsed = time.time() - start_ts
            output_area.clear_output()
            
            if specs:
                print(f"‚úÖ Found {len(specs)} specs in {elapsed:.2f}s:")
                rows = ""
                for s in specs:
                    rows += f"<tr><td>{s.get('component','-')}</td><td>{s.get('spec_type','-')}</td><td class='highlight-val'>{s.get('value','-')}</td><td>{s.get('unit','') or ''}</td></tr>"
                table = f"<table class='result-table'><thead><tr><th>Component</th><th>Type</th><th>Value</th><th>Unit</th></tr></thead><tbody>{rows}</tbody></table>"
                display(HTML(table))
            else:
                display(HTML(f"<div style='color:red; padding:10px;'>‚ö†Ô∏è No data found for: '{q}'</div>"))
                
        except Exception as e:
            output_area.clear_output()
            print(f"‚ùå Error: {e}")

dropdown.observe(on_dropdown_change, names='value')
search_btn.on_click(on_search_click)

controls = widgets.VBox([
    widgets.Label(value="Quick Select:"), dropdown, 
    widgets.Label(value="Custom Query:"), query_input, 
    widgets.Box([search_btn], layout=widgets.Layout(margin='15px 0 0 0'))
], layout=widgets.Layout(padding='20px'))

display(widgets.VBox([header_widget, controls, output_area]))



VBox(children=(HTML(value='\n<style>\n    .mechanic-header { \n        background: #2C3E50; \n        color: w‚Ä¶