In [None]:
# ================================
# CELL 1 — INSTALL DEPENDENCIES
# ================================

!pip install pymupdf sentence-transformers faiss-cpu transformers langchain --quiet
!pip install -U langchain-text-splitters --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# IMPORTS

import os
import re
import numpy as np
import pandas as pd

# For PDF using PyMuPDF
import fitz

# For Embeddings
from sentence_transformers import SentenceTransformer

# For Vector Database
import faiss

# For LLM (Hugging Face)
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# For Text Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# GPU check: use CUDA when torch reports an available GPU, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cuda


In [None]:
# ================================
# CELL 3 — LOAD PDF PATH
# ================================

# ---- If using Google Colab ----
# Mount Google Drive at /content/drive so the PDF stored there can be read
from google.colab import drive
drive.mount('/content/drive')

# ---- Define PDF path ----
# Location of the service-manual PDF inside the mounted Drive
pdf_path = "/content/drive/MyDrive/predi/sample-service-manual 1.pdf"




Mounted at /content/drive


In [None]:
# TEXT EXTRACTION

def extract_text(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output. Pages that
        yield no text contribute nothing.
    """
    # The context manager closes the document even when a page fails to
    # extract; a trailing doc.close() call would be skipped by the exception.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() or "" for page in doc)


# Pull the full text out of the configured PDF.
raw_text = extract_text(pdf_path)

# Report the size, then show only the first 1000 characters as a preview.
print(f"Total characters: {len(raw_text)}")

print(raw_text[:1000])


Total characters: 856936
Suspension System 
Inspection and Verification 
1.
Road test. 
z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are 
apparent, refer to Section 100-04 . 
2.
Inspect tires. 
z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the 
Vehicle Certification (VC) label. 
z Verify that all tires are sized to specification. Refer to the VC label. 
z Inspect the tires for incorrect wear and damage. Install new tires as necessary. 
3.
Inspect chassis and underbody. 
4.
Inspect for aftermarket equipment. 
z Check for aftermarket changes to the steering, suspension, wheel and tire components (such 
as competition, heavy duty, etc.). The specifications shown in this manual do not apply to 
vehicles equipped with aftermarket equipment. 
  Visual Inspection Chart 
5.
If an obvious cause for an observed or reported condition is found, correct the cause (if possible) 
before proceeding to the ne

In [None]:
# ================================
# CELL 5 — TEXT CLEANING
# ================================

import re

def clean_text(text):
    """Normalize extracted PDF text into one whitespace-collapsed line.

    Em dashes are turned into spaces first, and only then is every run of
    whitespace (spaces, tabs, newlines) collapsed to a single space. Replacing
    the dash after collapsing would turn "a — b" into "a   b" and leave runs
    of spaces in the result.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Remove unwanted special characters before collapsing whitespace
    text = text.replace("—", " ")
    # Collapse every whitespace run (newlines included) to a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# Normalize the raw PDF text.
cleaned_text = clean_text(raw_text)

# Report the new size, then show only the first 1000 characters as a preview.
print(f"Cleaned text length: {len(cleaned_text)}")

print(cleaned_text[:1000])


Cleaned text length: 837853
Suspension System Inspection and Verification 1. Road test. z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are apparent, refer to Section 100-04 . 2. Inspect tires. z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the Vehicle Certification (VC) label. z Verify that all tires are sized to specification. Refer to the VC label. z Inspect the tires for incorrect wear and damage. Install new tires as necessary. 3. Inspect chassis and underbody. 4. Inspect for aftermarket equipment. z Check for aftermarket changes to the steering, suspension, wheel and tire components (such as competition, heavy duty, etc.). The specifications shown in this manual do not apply to vehicles equipped with aftermarket equipment. Visual Inspection Chart 5. If an obvious cause for an observed or reported condition is found, correct the cause (if possible) before proceeding to the next step. 6. If t

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Create splitter: chunks of at most 800 characters, with up to 100 characters
# of overlap between consecutive chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
# Split text
chunks = splitter.split_text(cleaned_text)
print("Total chunks created:", len(chunks))
# Preview few chunks; slicing avoids an IndexError when fewer than two exist
for i, chunk in enumerate(chunks[:2], start=1):
    print(f"\nChunk {i}:\n")
    print(chunk[:500])


Total chunks created: 1197

Chunk 1:

Suspension System Inspection and Verification 1. Road test. z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are apparent, refer to Section 100-04 . 2. Inspect tires. z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the Vehicle Certification (VC) label. z Verify that all tires are sized to specification. Refer to the VC label. z Inspect the tires for incorrect wear and damage. Install new tires as necessary. 

Chunk 2:

etc.). The specifications shown in this manual do not apply to vehicles equipped with aftermarket equipment. Visual Inspection Chart 5. If an obvious cause for an observed or reported condition is found, correct the cause (if possible) before proceeding to the next step. 6. If the fault is not visually evident, GO to Symptom Chart - Suspension System or GO to Symptom Chart - NVH . Symptom Chart   Suspension System SECTION 204-00: Suspension Syste

In [None]:

# CREATE EMBEDDINGS
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for both the chunks and, later, the queries
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Encode every chunk in one call, showing a progress bar while it runs
embeddings = embed_model.encode(chunks, show_progress_bar=True)

print(f"Total embeddings created: {len(embeddings)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Total embeddings created: 1197


In [None]:

# FAISS is fed float32 vectors, so convert the embeddings up front
embeddings_np = np.array(embeddings, dtype="float32")
# Vector length is the second axis of the (n_chunks, dim) matrix
dimension = embeddings_np.shape[1]
# Flat L2 index over all chunk vectors
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)
print(f"Total vectors stored: {index.ntotal}")


Total vectors stored: 1197


In [None]:

#RETRIEVAL FUNCTION
def retrieve_chunks(query, k=5):
    """Return the text chunks whose embeddings are closest to ``query``.

    Args:
        query: Query string; embedded with the module-level ``embed_model``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` entries of the module-level ``chunks`` list, in the order
        the index search returned them. Empty when ``k`` is not positive or
        the index holds no vectors.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with the label -1, and chunks[-1] would silently hand back the
    # last chunk instead of "no result".
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    # Converting query to embedding
    query_embedding = np.array(embed_model.encode([query])).astype("float32")
    # Search FAISS index
    _distances, indices = index.search(query_embedding, k)
    # Retrieving relevant chunks, skipping any -1 padding labels
    return [chunks[i] for i in indices[0] if i >= 0]


In [None]:

# LOAD LLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Checkpoint used for answer generation
model_name = "google/flan-t5-large"
# Tokenizer and seq2seq model are both loaded from that checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights and place them on the device selected earlier (GPU if available)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
print("LLM loaded successfully")


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/558 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

LLM loaded successfully


In [None]:
# ================================
# CELL 11 — SPEC EXTRACTION
# ================================

def extract_specifications(query):
    """Ask the LLM to extract the specifications relevant to ``query``.

    The chunks returned by ``retrieve_chunks(query, k=5)`` are joined into a
    context that is placed at the END of the prompt. The tokenizer is called
    with ``truncation=True``; with the default right-side truncation an
    over-long prompt loses its tail, so keeping the context last means the
    instructions and the query survive and only surplus context is dropped.
    With the query after the context, the query was the first thing cut.

    Args:
        query: Natural-language description of the specifications wanted.

    Returns:
        The decoded model output as a string.
    """
    # Retrieve relevant chunks
    retrieved_chunks = retrieve_chunks(query, k=5)
    # Combine context
    context = "\n\n".join(retrieved_chunks)

    # Prompt: instructions and query first, retrieved text last (see docstring)
    prompt = f"""
Extract vehicle specifications from the text below.

Return output in this format:

Component | Spec Type | Value | Unit

If not found, return 'Not Available'.

Query:
{query}

Text:
{context}
"""

    # Tokenize input and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    # Generate output (at most 200 new tokens)
    outputs = model.generate(**inputs, max_new_tokens=200)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response


In [None]:
# FINAL STRUCTURED EXTRACTION
# Table-header and unit tokens that leak into component names. Matched
# case-insensitively and only as whole tokens, so "Description" and "Nm" are
# removed while the "nm" inside a word such as "alignment" is left alone.
_COMPONENT_NOISE_RE = re.compile(
    r"(?<![A-Za-z])(?:descriptions?|specifications?|nm|lb-ft|lb-in|pt\))(?![A-Za-z])",
    re.IGNORECASE,
)


def clean_component_name(name):
    """Strip table-header/unit tokens from a component name and tidy spacing.

    Args:
        name: Raw component text as captured by the extraction regex.

    Returns:
        ``name`` without the noise tokens ("description", "specification",
        "nm", "lb-ft", "lb-in", "pt)" in any letter case), with whitespace
        runs collapsed to single spaces and no leading/trailing whitespace.
    """
    # Replace each noise token with a space so neighbouring words never fuse,
    # then collapse the whitespace that leaves behind.
    name = _COMPONENT_NOISE_RE.sub(" ", name)
    return " ".join(name.split())

def structured_extraction(query):
    """Extract torque rows from the chunks retrieved for ``query`` with a regex.

    The retrieved chunks are joined and scanned for "<words> <integer>"
    pairs (optionally followed by "Nm"). A pair is kept when the cleaned
    component name has at least 8 characters, contains none of the noise
    words, and the value is at most 500. Retrieved chunks can overlap, so a
    (component, value) pair is reported only once, at its first occurrence.

    Args:
        query: Query string forwarded to ``retrieve_chunks``.

    Returns:
        A list of dicts with keys ``component``, ``spec_type`` (always
        "Torque"), ``value`` (the matched digits, as a string) and ``unit``
        (always "Nm"; the unit captured by the regex is not consulted).
    """
    retrieved_chunks = retrieve_chunks(query, k=5)
    context = "\n".join(retrieved_chunks)
    pattern = r'([A-Za-z \-\(\)]+)\s+(\d+)\s*(Nm)?'
    noise_words = (
        "section", "manual", "page", "specification",
        "information", "procedure", "workshop",
        "general", "item", "fluid", "thickness",
        "dot", "performance", "capacity",
        "system", "actuation", "control",
    )

    data = []
    seen = set()  # (component, value) pairs already emitted

    for comp, value, _unit in re.findall(pattern, context):
        comp_clean = clean_component_name(comp)
        # Very short names are fragments, not component descriptions
        if len(comp_clean) < 8:
            continue
        lowered = comp_clean.lower()
        if any(word in lowered for word in noise_words):
            continue
        # Values above 500 are treated as non-torque numbers
        if int(value) > 500:
            continue
        # Overlapping chunks repeat the same row; keep the first one only
        key = (comp_clean, value)
        if key in seen:
            continue
        seen.add(key)

        data.append({
            "component": comp_clean,
            "spec_type": "Torque",
            "value": value,
            "unit": "Nm"
        })

    return data


In [None]:
# CELL 12 — TEST EXTRACTION
# Ask the LLM-based extractor about front brake torque and show its raw answer
query = "Front brake torque values"
result = extract_specifications(query)
print(f"Query:\n {query}")
print("\nExtracted Specifications:\n")
print(result)


Query:
 Front brake torque values

Extracted Specifications:

Component | Spec Type | Value | Unit


In [None]:
# Run the regex-based extractor and inspect the first ten rows
query = "Brake torque specifications"
results = structured_extraction(query)
print(f"Total specs extracted: {len(results)}")
results[:10]


Total specs extracted: 22


[{'component': 'Description Nm Brake booster nuts a Brake master cylinder nuts',
  'spec_type': 'Torque',
  'value': '25',
  'unit': 'Nm'},
 {'component': 'lower Air Cleaner (ACL) housing assembly bolts',
  'spec_type': 'Torque',
  'value': '15',
  'unit': 'Nm'},
 {'component': 'Description Nm ABS module screws',
  'spec_type': 'Torque',
  'value': '3',
  'unit': 'Nm'},
 {'component': 'Front wheel speed sensor bolt',
  'spec_type': 'Torque',
  'value': '17',
  'unit': 'Nm'},
 {'component': 'Front wheel speed sensor harness bolt',
  'spec_type': 'Torque',
  'value': '12',
  'unit': 'Nm'},
 {'component': 'HCU -to-bracket nuts',
  'spec_type': 'Torque',
  'value': '8',
  'unit': 'Nm'},
 {'component': 'HCU bracket bolts',
  'spec_type': 'Torque',
  'value': '20',
  'unit': 'Nm'},
 {'component': 'Master cylinder primary brake tube-to- HCU fitting',
  'spec_type': 'Torque',
  'value': '28',
  'unit': 'Nm'},
 {'component': 'Master cylinder secondary brake tube-to- HCU fitting',
  'spec_type':

In [None]:
# Run extraction
query = "Brake torque specifications"
results = structured_extraction(query)
# Convert to DataFrame; naming the columns keeps the CSV header present even
# when nothing was extracted (an empty list would otherwise give no columns)
df = pd.DataFrame(results, columns=["component", "spec_type", "value", "unit"])
# Save CSV
csv_path = "vehicle_torque_specs.csv"

df.to_csv(csv_path, index=False)

print("CSV saved as:", csv_path)

df.head()



CSV saved as: vehicle_torque_specs.csv


Unnamed: 0,component,spec_type,value,unit
0,Description Nm Brake booster nuts a Brake mast...,Torque,25,Nm
1,lower Air Cleaner (ACL) housing assembly bolts,Torque,15,Nm
2,Description Nm ABS module screws,Torque,3,Nm
3,Front wheel speed sensor bolt,Torque,17,Nm
4,Front wheel speed sensor harness bolt,Torque,12,Nm
