In [1]:
# Install necessary libraries
! apt-get install -y tesseract-ocr poppler-utils tesseract-ocr-hin
! pip install pytesseract opencv-python-headless pillow numpy pdf2image datasets groq numpy scikit-learn


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-hin tesseract-ocr-osd
0 upgraded, 5 newly installed, 0 to remove and 29 not upgraded.
Need to get 5,915 kB of archives.
After this operation, 17.5 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy/universe amd64 te

In [2]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import cv2
from google.colab import files
from pdf2image import convert_from_path

# In Google Colab, Tesseract is installed in /usr/bin/tesseract
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [3]:
# import pytesseract
# from PIL import Image, ImageEnhance
# import cv2
# import numpy as np
# from pdf2image import convert_from_path

# # Set the correct path for Tesseract in Google Colab
# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# # Function to adjust image properties
# def preprocess_image(image_path):
#     img = Image.open(image_path)  # Open the image using PIL

#     img_gray = img.convert('L')  # Convert the image to grayscale (helpful for OCR)

#     enhancer_contrast = ImageEnhance.Contrast(img_gray)  # Enhance contrast
#     img_contrast = enhancer_contrast.enhance(2)  # Increase contrast by a factor of 2

#     enhancer_brightness = ImageEnhance.Brightness(img_contrast)  # Enhance brightness
#     img_bright = enhancer_brightness.enhance(1.5)  # Increase brightness by a factor of 1.5

#     enhancer_sharpness = ImageEnhance.Sharpness(img_bright)  # Enhance sharpness
#     img_sharp = enhancer_sharpness.enhance(2)  # Increase sharpness by a factor of 2

#     return img_sharp  # Return the processed image

# # Function to process PDF
# def pdf_to_text(pdf_path):
#     images = convert_from_path(pdf_path)  # Convert PDF to a list of images

#     text = ""
#     for page_img in images:
#         processed_img = preprocess_image(page_img)  # Preprocess each page image
#         page_text = pytesseract.image_to_string(processed_img)  # Extract text from the image
#         text += page_text  # Add extracted text to the overall text

#     return text  # Return the extracted text from the entire PDF

# # Function to process image (JPG/PNG)
# def image_to_text(image_path):
#     processed_img = preprocess_image(image_path)  # Preprocess the image
#     text = pytesseract.image_to_string(processed_img)  # Extract text using Tesseract
#     return text  # Return the extracted text

# # Main function
# def convert_to_text(file_path):
#     if file_path.lower().endswith('.pdf'):  # Check if the file is a PDF
#         return pdf_to_text(file_path)  # If it's a PDF, process it as a PDF

#     elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):  # If it's an image file
#         return image_to_text(file_path)  # Process the image and extract text

#     return "Unsupported file format"  # Return an error if the file format is not supported

# # Example usage
# file_path = '/content/hw2.png'  # Path to the file you want to process
# extracted_text = convert_to_text(file_path)  # Extract text from the file
# print(extracted_text)  # Print the extracted text


In [4]:
# Set the correct path for Tesseract in Google Colab
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Function to preprocess the image for better OCR results (for handwritten text)
def preprocess_image_for_handwriting(img):
    # Convert the image to grayscale
    img_gray = img.convert('L')

    # Thresholding (adaptive) to make the image more binary (black and white)
    img_threshold = img_gray.point(lambda p: p > 150 and 255)

    # Enhance contrast
    enhancer_contrast = ImageEnhance.Contrast(img_threshold)
    img_contrast = enhancer_contrast.enhance(2)

    # Enhance sharpness
    enhancer_sharpness = ImageEnhance.Sharpness(img_contrast)
    img_sharp = enhancer_sharpness.enhance(2)

    # Optional: Convert to NumPy array to use OpenCV for additional noise removal (if needed)
    img_np = np.array(img_sharp)
    img_filtered = cv2.medianBlur(img_np, 3)  # Apply median filter to remove noise

    # Convert the NumPy array back to PIL image
    img_final = Image.fromarray(img_filtered)

    return img_final

# Set custom Tesseract configurations for handwriting and Hindi + English languages
custom_config = r'--oem 3 --psm 6'  # --psm 6 is good for uniform blocks like tables

# Function to extract text using Tesseract after preprocessing
def image_to_text_with_tesseract(img, language='eng'):
    processed_img = preprocess_image_for_handwriting(img)  # Preprocess the image
    text = pytesseract.image_to_string(processed_img, config=custom_config, lang=language)  # Extract text using Tesseract
    return text

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    # Convert each page of the PDF into an image
    images = convert_from_path(pdf_path, dpi=300)  # You can adjust dpi for better quality
    return images

# Function to process image or PDF
def process_pdf_or_image(file_path, language='eng'):
    if file_path.endswith('.pdf'):
        images = pdf_to_images(file_path)  # If it's a PDF, convert to images
        all_text = ''
        for page_num, img in enumerate(images):
            print(f"Processing page {page_num + 1}...")
            text = image_to_text_with_tesseract(img, language)  # Process and extract text for each page
            all_text += f"--- Page {page_num + 1} ---\n{text}\n"
        return all_text
    else:
        # If it's an image, use the image to text extraction logic
        img = Image.open(file_path)
        return image_to_text_with_tesseract(img, language)

# Upload the PDF or image file to session storage (via Google Colab file upload)
uploaded = files.upload()

# Get the file path of the uploaded file from the session storage
file_name = list(uploaded.keys())[0]
file_path = '/content/' + file_name  # Path to the uploaded file

# Set language to both Hindi and English
language = 'eng'  # Hindi and English

# Extract text using Tesseract (after preprocessing)
extracted_text = process_pdf_or_image(file_path, language)

# Print the result
print("Extracted text:\n", extracted_text)


Saving Land Register Extract.pdf to Land Register Extract.pdf
Processing page 1...
Processing page 2...
Extracted text:
 --- Page 1 ---
Land Register Extract

District: Sample District

Tehsil: Sample Tehsil

Village: Green Valley

Khata No.: 23456

Khasra No.: 78901

Plot No.: 112

Owner Name: Jane Smith
Father'’s/Husband's Name: William Smith
Address: 456 Green Avenue, Green City
Area: 3000 sq meters

Class of Land: Agricultural

Soil Type: Loamy

Boundary: North: Highway, South: River, East: Orchard, West: Forest
Current Use: Agriculture

Previous Use: Agricultural

Date of Mutation: 12-02-2023

Mutation Number: MUT234567

Reason for Mutation: Sale

Mortgage: None

Lease: None

Court Decree: No pending cases
Registering Authority: Officer Name
Designation: Land Records Officer

Date: 10-03-2025

Land Register Extract

District: Blue Hills District

Tehsil: Blue Hills Tehsil

Village: Riverstone

Khata No.: 34567

Khasra No.: 89012

Plot No.: 113

Owner Name: Sarah Williams
Father's/

In [None]:
import os
import time
from groq import Groq
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Set up Groq API (Replace with your actual API key)
GROQ_API_KEY = ""
client = Groq(api_key=GROQ_API_KEY)

# Define the prompt template
system_prompt = """
[INST]<<SYS>> You are a Json maker and not a chatbot, just give json, no other text.You have a text extracted from an image. Please correct the text while keeping the format and items intact. If something is unclear or not easily understandable, do not change it.
Kindly ensure the following:
Correct any obvious errors in spelling, numbers, and formatting.
Do not modify or guess any unclear words or numbers.
Maintain the structure and layout as it is, only correcting what is necessary."
now fetch all necessary information and convert it to json, use snake case in json

do not add any text other than json so that it can be directly sent to in api response

<</SYS>>[/INST]"""

In [6]:
response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": extracted_text}
        ],
        max_tokens=1000,
        temperature=0,
        top_p=0.95,
        stop=['INST']
        )

refined_document = response.choices[0].message.content

print(refined_document)

[
    {
        "district": "sample_district",
        "tehsil": "sample_tehsil",
        "village": "green_valley",
        "khata_no": 23456,
        "khasra_no": 78901,
        "plot_no": 112,
        "owner_name": "jane_smith",
        "father_husband_name": "william_smith",
        "address": "456_green_avenue_green_city",
        "area": 3000,
        "class_of_land": "agricultural",
        "soil_type": "loamy",
        "boundary": "north_highway_south_river_east_orchard_west_forest",
        "current_use": "agriculture",
        "previous_use": "agricultural",
        "date_of_mutation": "12-02-2023",
        "mutation_number": "mut234567",
        "reason_for_mutation": "sale",
        "mortgage": "none",
        "lease": "none",
        "court_decree": "no_pending_cases",
        "registering_authority": "officer_name",
        "designation": "land_records_officer",
        "date": "10-03-2025"
    },
    {
        "district": "blue_hills_district",
        "tehsil": "blue_hi

In [7]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install scikit-learn

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

In [8]:
import json
import requests
import faiss
import numpy as np

In [9]:
from sentence_transformers import SentenceTransformer

print(f"Type of refined_document before conversion: {type(refined_document)}")
if isinstance(refined_document, str):
    print("Parsing refined_document from JSON string to dictionary...")
    refined_document = json.loads(refined_document)
print(f"Type of refined_document before conversion: {type(refined_document)}")
print(refined_document)

Type of refined_document before conversion: <class 'str'>
Parsing refined_document from JSON string to dictionary...
Type of refined_document before conversion: <class 'list'>
[{'district': 'sample_district', 'tehsil': 'sample_tehsil', 'village': 'green_valley', 'khata_no': 23456, 'khasra_no': 78901, 'plot_no': 112, 'owner_name': 'jane_smith', 'father_husband_name': 'william_smith', 'address': '456_green_avenue_green_city', 'area': 3000, 'class_of_land': 'agricultural', 'soil_type': 'loamy', 'boundary': 'north_highway_south_river_east_orchard_west_forest', 'current_use': 'agriculture', 'previous_use': 'agricultural', 'date_of_mutation': '12-02-2023', 'mutation_number': 'mut234567', 'reason_for_mutation': 'sale', 'mortgage': 'none', 'lease': 'none', 'court_decree': 'no_pending_cases', 'registering_authority': 'officer_name', 'designation': 'land_records_officer', 'date': '10-03-2025'}, {'district': 'blue_hills_district', 'tehsil': 'blue_hills_tehsil', 'village': 'riverstone', 'khata_no'

In [10]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# def extract_text_from_json(refined_document):
#     text_list = []
#     key_value_list = []  # Will store key-value pairs for direct referencing

#     # Combine related fields into larger chunks
#     for page, content in refined_document.items():
#         combined_chunk = " ".join([f"{key}: {value}" for key, value in content.items()])
#         text_list.append(combined_chunk)
#         key_value_list.append((page, combined_chunk))  # Store the page and its combined content

#     return text_list, key_value_list
# This function extracts text from a list of records (refined_document)
def extract_text_from_json(refined_document):
    text_list = []
    key_value_list = []  # Will store key-value pairs for direct referencing

    # Iterate over each record in the refined_document
    for idx, record in enumerate(refined_document):
        # Combine related fields into a larger chunk for each record
        combined_chunk = " ".join([f"{key}: {value}" for key, value in record.items()])

        # Append the combined chunk to the text_list
        text_list.append(combined_chunk)

        # Store the original index (or record identifier) and its combined content
        key_value_list.append((f"Record {idx + 1}", combined_chunk))  # Using "Record 1", "Record 2", etc.

    return text_list, key_value_list

# FAISS Search Function
def search_faiss(query, index, text_list, key_value_list, top_k=5):
    # Convert the query to an embedding
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().detach().numpy()

    # Perform the search in the FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the matching text or key-value pairs from the FAISS index results
    search_results = []

    # Loop over the first dimension (queries) and retrieve the key-value pairs or text
    for idx, dist in zip(indices[0], distances[0]):
        search_results.append((text_list[idx], dist))  # Use the full sentence/response
    return search_results

# Function to process the results with LLM (Groq or similar)
def process_with_llm(search_results, query):
    # Join the relevant data from FAISS search results
    relevant_data = "\n".join([result[0] for result in search_results])

    # Now craft a prompt that includes the query and the relevant data
    prompt = f"""
    You are a helpful assistant. Based on the data provided below, answer the user's question as specifically as possible in the form of a summary text, give it in form of table only when if asked specifically.
    The user is asking: {query}

    Here is the relevant data:
    {relevant_data}
    """

    # Assuming client is properly set up for LLM (Groq or similar)
    response = client.chat.completions.create(
        model="llama3-70b-8192",  # Specify the LLM model
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}],
    )

    answer = response.choices[0].message.content
    return answer

def handle_query(query):
    # Extract text and key-value pairs from the refined document
    text_list, key_value_list = extract_text_from_json(refined_document)

    # Generate embeddings for the combined chunks of data
    embeddings = model.encode(text_list, convert_to_tensor=True)
    embeddings = embeddings.cpu().detach().numpy()

    # Create FAISS index to store vectors
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Search relevant data in FAISS index based on the query
    search_results = search_faiss(query, index, text_list, key_value_list, top_k=5)

    # Process search results with LLM for the final answer
    answer = process_with_llm(search_results, query)
    return answer

# Example dynamic query
user_query = "father name of person having land with khasra number 89012?"
answer = handle_query(user_query)
print("Answer from LLM:", answer)

Answer from LLM: The father's name of the person having land with khasra number 89012 is Michael Williams.
