In [3]:
import os
import json
import random
import string

# ---------- CONFIGURABLE DATA ----------
# Maps each synthetic document type to the ordered list of field names that
# make up its "response" payload. Every type below lists eight fields; the
# field names also drive value generation, which matches on substrings such
# as "date", "name" or "number".
document_types = {
    "passport": ["name", "dob", "issue_date", "expiry_date", "nationality", "passport_number", "place_of_birth", "gender"],
    "id_card": ["name", "dob", "issue_date", "expiry_date", "father_name", "cnic_number", "address", "gender"],
    "license": ["name", "dob", "issue_date", "expiry_date", "license_number", "vehicle_class", "blood_group", "address"],
    "birth_certificate": ["name", "dob", "father_name", "mother_name", "place_of_birth", "registration_number", "issue_date", "gender"]
}

# ---------- RANDOM VALUE GENERATORS ----------
def generate_random_id():
    """Return a document identifier made of 12 random decimal digits."""
    digits = random.choices(string.digits, k=12)
    return "".join(digits)

def generate_random_date(start_year=1990, end_year=2025):
    """Return a random date formatted as ``YYYY-MM-DD``.

    The year is drawn from ``start_year``..``end_year`` (both inclusive).
    The day is capped at 28 so the result is a valid calendar date in
    every month.
    """
    # Draw order (year, month, day) matters for reproducibility under a fixed seed.
    parts = (
        random.randint(start_year, end_year),
        random.randint(1, 12),
        random.randint(1, 28),
    )
    return "{:04d}-{:02d}-{:02d}".format(*parts)

def random_value_for_field(field):
    """Return a random synthetic value appropriate for ``field``.

    The field name is matched by substring and the first matching rule wins,
    so rule order is significant (for example ``"father_name"`` is handled by
    the ``"name"`` rule and ``"passport_number"`` by the ``"number"`` rule).

    Args:
        field: Field name such as ``"issue_date"`` or ``"vehicle_class"``.

    Returns:
        A string value for the field, or ``"N/A"`` when no rule matches.
    """
    if "date" in field:
        return generate_random_date()
    elif "dob" in field:
        # Birth dates use a narrower, older year range than issue/expiry dates.
        return generate_random_date(1970, 2010)
    elif "name" in field:
        return random.choice(["Ghulam Ahmad", "Ali Khan", "Sara Malik", "Hassan Raza"])
    elif "number" in field:
        return "".join(random.choices(string.digits, k=random.randint(8, 12)))
    elif "gender" in field:
        return random.choice(["Male", "Female"])
    elif "nationality" in field:
        return random.choice(["Pakistani", "Indian", "British", "Canadian"])
    elif "place" in field:
        return random.choice(["Lahore", "Karachi", "Islamabad", "London", "Toronto"])
    elif "blood" in field:
        return random.choice(["A+", "B+", "O+", "AB+"])
    elif "address" in field:
        return random.choice(["123 Main Street", "45 Park Avenue", "House #21, DHA", "Street 7, Gulberg"])
    elif "vehicle" in field:
        # "vehicle_class" is a declared license field; without this rule it
        # always received the constant "N/A", giving no variation to extract.
        return random.choice(["Motorcycle", "Car", "LTV", "HTV"])
    else:
        return "N/A"

def generate_text_from_response(doctype, id_val, response_data):
    """Render a plain-text document body from the structured response fields.

    The text has a two-line header (document type and ID), one
    ``Title Cased Key: value`` line per entry of ``response_data`` in
    insertion order, and a closing certification sentence that names the
    holder (empty when ``response_data`` has no ``"name"`` key).
    """
    header = [
        f"This is an official {doctype.upper()} document.",
        f"Document ID: {id_val}",
    ]
    field_lines = [
        f"{field.replace('_', ' ').title()}: {content}"
        for field, content in response_data.items()
    ]
    holder = response_data.get('name', '')
    footer = f"This document certifies that {holder} is the rightful holder of this {doctype}."
    return "\n".join(header + field_lines + [footer])

# ---------- MAIN SCRIPT ----------
output_base = "synthetic_documents"
os.makedirs(output_base, exist_ok=True)

# One sub-folder per document type, each holding four numbered JSON files
# (<doctype>_1.json .. <doctype>_4.json).
for doc_kind, field_names in document_types.items():
    kind_dir = os.path.join(output_base, doc_kind)
    os.makedirs(kind_dir, exist_ok=True)

    for seq in (1, 2, 3, 4):
        doc_id = generate_random_id()

        # Ground-truth field values, generated in the declared field order.
        answers = {}
        for field_name in field_names:
            answers[field_name] = random_value_for_field(field_name)

        # The text embeds the same key/value pairs that form the ground truth.
        rendered_text = generate_text_from_response(doc_kind, doc_id, answers)

        record = {
            "id": doc_id,
            "doctype": doc_kind,
            "text": rendered_text,
            "response": answers
        }

        target_path = os.path.join(kind_dir, f"{doc_kind}_{seq}.json")
        with open(target_path, "w", encoding="utf-8") as handle:
            json.dump(record, handle, indent=4)

print(f"✅ Synthetic JSONs with up to 8 response fields generated in '{output_base}' folder!")


✅ Synthetic JSONs with up to 8 response fields generated in 'synthetic_documents' folder!


In [4]:
import os
import json

input_base = "synthetic_documents"  # Folder containing document type folders
output_file = "extraction_labels.py"

doctype_keys = {}

# Collect the union of "response" keys per document-type folder.
# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to make the generated file identical from run to run.
for doctype_folder in sorted(os.listdir(input_base)):
    folder_path = os.path.join(input_base, doctype_folder)
    if not os.path.isdir(folder_path):
        continue

    keys_set = set()
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if "response" in data:
            keys_set.update(data["response"].keys())
    doctype_keys[doctype_folder] = sorted(keys_set)

# Write extraction_labels.py as an importable Python module.
# json.dumps() quotes and escapes each name, so a quote or backslash in a
# folder or key name can no longer produce invalid Python source;
# ensure_ascii=False keeps non-ASCII names readable in the UTF-8 file.
with open(output_file, "w", encoding="utf-8") as f:
    f.write("extraction_labels = {\n\n")
    for doctype, keys in doctype_keys.items():
        f.write(f"    {json.dumps(doctype, ensure_ascii=False)}: [\n")
        for key in keys:
            f.write(f"        {json.dumps(key, ensure_ascii=False)},\n")
        f.write("    ],\n\n")
    f.write("}\n")

print("✅ extraction_labels.py created successfully with keys from all documents!")


✅ extraction_labels.py created successfully with keys from all documents!


In [1]:
# [API key redacted: never store credentials in a notebook. The key that was here must be treated as leaked and revoked.]

In [None]:
# import os
# import json
# import re
# import importlib.util
# import google.generativeai as genai

# # ------------- CONFIGURATION -------------
# GEMINI_API_KEY = "<REDACTED>"  # Key removed: never commit credentials; revoke the exposed key and load the new one from an environment variable
# INPUT_DIR = "copied_docs"
# OUTPUT_DIR = "gemini_predictions"

# # ------------- LOAD EXTRACTION LABELS -------------
# def load_extraction_labels(file_path="extraction_labels.py"):
#     spec = importlib.util.spec_from_file_location("extraction_labels", file_path)
#     labels = importlib.util.module_from_spec(spec)
#     spec.loader.exec_module(labels)
#     return labels.extraction_labels

# # ------------- GEMINI SETUP -------------
# def setup_gemini():
#     genai.configure(api_key=GEMINI_API_KEY)
#     return genai.GenerativeModel("gemini-1.5-flash")

# # ------------- PROMPT BUILDER -------------
# def build_prompt(text, doctype, keys):
#     keys_str = ", ".join(keys)
#     return f"""
# You are an information extraction model. 

# Task:
# Extract the following fields from the OCR text of a {doctype}: {keys_str}.
# Return ONLY a valid JSON object. 
# Do not include any explanations, text before or after JSON, or markdown formatting. 
# Ensure all keys appear exactly as provided. 
# If a value is missing, use an empty string ("").

# OCR Text:
# {text}
# """

# # ------------- SAFE JSON PARSER -------------
# def safe_parse_json(response_text, keys):
#     # Try direct JSON parse first
#     try:
#         return json.loads(response_text)
#     except:
#         pass

#     # Attempt to extract JSON substring using regex
#     match = re.search(r'\{[\s\S]*\}', response_text)
#     if match:
#         json_str = match.group(0)
#         try:
#             return json.loads(json_str)
#         except:
#             pass

#     # If all fails, return empty fields
#     return {k: "" for k in keys}

# # ------------- MAIN PROCESSING FUNCTION -------------
# def process_documents():
#     extraction_labels = load_extraction_labels()
#     model = setup_gemini()

#     os.makedirs(OUTPUT_DIR, exist_ok=True)

#     for doctype_folder in os.listdir(INPUT_DIR):
#         folder_path = os.path.join(INPUT_DIR, doctype_folder)
#         if not os.path.isdir(folder_path):
#             continue

#         output_folder = os.path.join(OUTPUT_DIR, doctype_folder)
#         os.makedirs(output_folder, exist_ok=True)

#         for file in os.listdir(folder_path):
#             if file.endswith(".json"):
#                 with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
#                     data = json.load(f)

#                 doctype = data.get("doctype")
#                 text = data.get("text")
#                 keys = extraction_labels.get(doctype, [])

#                 if not keys:
#                     print(f"⚠️ No extraction keys found for doctype {doctype}, skipping {file}")
#                     continue

#                 prompt = build_prompt(text, doctype, keys)
#                 response = model.generate_content(prompt)

#                 response_json = safe_parse_json(response.text, keys)

#                 new_data = {
#                     "id": data["id"],
#                     "doctype": doctype,
#                     "text": text,
#                     "response": response_json
#                 }

#                 with open(os.path.join(output_folder, file), "w", encoding="utf-8") as out_f:
#                     json.dump(new_data, out_f, indent=4)

#     print(f"✅ Gemini predictions saved in '{OUTPUT_DIR}' folder!")

# # ------------- RUN SCRIPT -------------
# if __name__ == "__main__":
#     process_documents()


✅ Gemini predictions saved in 'gemini_predictions' folder!


In [None]:
import os
import json
import re
import importlib.util
import google.generativeai as genai

# ------------- CONFIGURATION -------------
# The key is read from the GEMINI_API_KEY environment variable so that it is
# never stored in the notebook; the original placeholder remains the fallback.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "API Key")
INPUT_DIR = "copied_docs"  # Root folder holding one sub-folder of JSON documents per document type
OUTPUT_DIR = "gemini_predictions_v2"  # Predictions are written here, one sub-folder per input sub-folder

# ------------- LOAD EXTRACTION LABELS -------------
def load_extraction_labels(file_path="extraction_labels.py"):
    """Execute the Python file at ``file_path`` and return its ``extraction_labels``.

    The file is loaded as a standalone module via ``importlib`` rather than a
    regular ``import``, so it does not need to be on ``sys.path``.
    NOTE: the file's top-level code is executed, so only load trusted files.
    """
    module_spec = importlib.util.spec_from_file_location("extraction_labels", file_path)
    labels_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(labels_module)
    return getattr(labels_module, "extraction_labels")

# ------------- GEMINI SETUP -------------
def setup_gemini():
    """Configure the ``google.generativeai`` client and return a model handle.

    Uses the module-level ``GEMINI_API_KEY`` for authentication and requests
    the ``"gemini-1.5-flash"`` model.
    """
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel("gemini-1.5-flash")
    return model

# ------------- PROMPT BUILDER -------------
def build_prompt(text, doctype, keys):
    """Build the extraction prompt sent to the model for one document.

    Args:
        text: OCR text of the document, inserted verbatim at the end of the prompt.
        doctype: Document type name, quoted in the prompt's introduction.
        keys: Iterable of field names; rendered as a comma-separated list.

    Returns:
        The full prompt string, including guidelines and one worked example.
    """
    # Literal braces of the JSON example are doubled so str.format() leaves them intact.
    template = """
You are an advanced structured data extraction model.

You will be given OCR text for a document of type: "{doctype}".
Your job is to extract values for ALL of these fields: {field_list}.

Guidelines:
1. Always include every key exactly as provided in the list.
2. If a value is not found in the text, set it as an empty string "" (do NOT omit the key).
3. If a value is found, extract it exactly as it appears in the text.
4. Return a single valid JSON object with all keys present.
5. Do NOT include explanations, markdown, or extra text — only output valid JSON.

Example:
OCR TEXT:
"This is a Passport of Ali Khan. Date of Birth: 1995-07-21. Passport Number: PK1234567"

Expected JSON:
{{
    "name": "Ali Khan",
    "dob": "1995-07-21",
    "passport_number": "PK1234567",
    "nationality": ""
}}
OCR TEXT:
{text}
"""
    return template.format(doctype=doctype, field_list=", ".join(keys), text=text)

# ------------- SAFE JSON PARSER -------------
def safe_parse_json(response_text, keys):
    """Parse a model reply into a dict, falling back to empty fields.

    Parsing is attempted in two steps: the whole reply first, then the span
    from the first ``{`` to the last ``}`` (which copes with markdown fences
    or prose around the JSON). Only a JSON *object* is accepted, because
    callers store the result as the per-field ``response`` mapping.

    Args:
        response_text: Raw reply text; ``None`` or other non-text values are tolerated.
        keys: Field names used to build the fallback result.

    Returns:
        The parsed dict, or ``{key: ""}`` for every key when nothing parses.
    """
    # Try direct JSON parse first. Only parse-related errors are caught;
    # a bare except would also hide KeyboardInterrupt and real bugs.
    try:
        parsed = json.loads(response_text)
    except (ValueError, TypeError):  # JSONDecodeError is a ValueError; TypeError covers None
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Attempt to extract a JSON object substring. The isinstance guard keeps
    # re.search from raising on None or other non-string replies.
    if isinstance(response_text, str):
        match = re.search(r'\{[\s\S]*\}', response_text)
        if match:
            try:
                parsed = json.loads(match.group(0))
            except ValueError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed

    # If all fails, return empty fields
    return {k: "" for k in keys}

# ------------- MAIN PROCESSING FUNCTION -------------
def process_documents():
    """Run Gemini field extraction over every JSON document under ``INPUT_DIR``.

    Each sub-folder of ``INPUT_DIR`` is mirrored under ``OUTPUT_DIR``. For
    every ``.json`` file the document's ``doctype`` selects the field list
    from the extraction labels, a prompt is sent to the model, and the parsed
    reply is saved under the same file name together with the original
    ``id``, ``doctype`` and ``text``. Documents whose doctype has no labels
    are reported and skipped.
    """
    labels_by_doctype = load_extraction_labels()
    gemini = setup_gemini()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for subdir in os.listdir(INPUT_DIR):
        source_dir = os.path.join(INPUT_DIR, subdir)
        if os.path.isdir(source_dir):
            target_dir = os.path.join(OUTPUT_DIR, subdir)
            os.makedirs(target_dir, exist_ok=True)

            for filename in os.listdir(source_dir):
                # Only JSON documents are processed; anything else is ignored.
                if not filename.endswith(".json"):
                    continue

                source_path = os.path.join(source_dir, filename)
                with open(source_path, "r", encoding="utf-8") as src:
                    document = json.load(src)

                doctype = document.get("doctype")
                ocr_text = document.get("text")
                wanted_keys = labels_by_doctype.get(doctype, [])

                if not wanted_keys:
                    print(f"⚠️ No extraction keys found for doctype {doctype}, skipping {filename}")
                    continue

                reply = gemini.generate_content(build_prompt(ocr_text, doctype, wanted_keys))
                extracted = safe_parse_json(reply.text, wanted_keys)

                prediction = {
                    "id": document["id"],
                    "doctype": doctype,
                    "text": ocr_text,
                    "response": extracted
                }

                target_path = os.path.join(target_dir, filename)
                with open(target_path, "w", encoding="utf-8") as dst:
                    json.dump(prediction, dst, indent=4)

    print(f"✅ Gemini predictions saved in '{OUTPUT_DIR}' folder!")

# ------------- RUN SCRIPT -------------
if __name__ == "__main__":
    # Entry point: runs the whole extraction pipeline. The recorded cell
    # output shows this also fires when the cell is executed in the notebook.
    process_documents()


  from .autonotebook import tqdm as notebook_tqdm


✅ Gemini predictions saved in 'gemini_predictions_v2' folder!
