In [5]:
import os
import csv
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.text.paragraph import Paragraph
from docx.table import Table

# --------- CONFIGURATION -----------
# Word document that is opened and scanned for image records.
DOCX_FILE = "Coded Segments.docx"  # change to your docx file path
# Folder that process_document() creates (if missing) and writes image files into.
OUTPUT_IMAGES_FOLDER = "extracted_images"
# CSV file that the script entry point passes to save_records_to_csv().
CSV_FILE = "image_metadata.csv"
# -----------------------------------


def iter_block_items(parent):
    """
    Yield the paragraphs and tables directly under ``parent``, in document order.

    When ``parent`` exposes an ``element`` attribute, the children of
    ``parent.element.body`` are walked; otherwise the children of
    ``parent._element`` are walked. Children that are neither a paragraph
    nor a table element are silently skipped.
    (Adapted from https://github.com/python-openxml/python-docx/issues/40)
    """
    container = parent.element.body if hasattr(parent, 'element') else parent._element
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            # Wrap the raw XML element in the high-level proxy object.
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)


def paragraph_contains_image(paragraph):
    """
    Tell whether the paragraph's XML holds at least one ``w:drawing`` element.

    The search covers every descendant of the paragraph element, so any
    drawing nested anywhere inside the paragraph counts.
    """
    matches = paragraph._element.xpath('.//w:drawing')
    return True if matches else False


# Maps the content type of an embedded image part to the file extension used
# when the image is written to disk. Types not listed fall back to ''.
_IMAGE_EXTENSIONS = {
    'image/png': '.png',
    'image/jpeg': '.jpg',
    'image/jpg': '.jpg',
    'image/gif': '.gif',
    'image/bmp': '.bmp',
    'image/tiff': '.tiff',
    'image/webp': '.webp',
    'image/svg+xml': '.svg',
    'image/x-emf': '.emf',
    'image/x-wmf': '.wmf',
}


def extract_inline_images(doc):
    """
    Extract the inline images of the document in ``doc.inline_shapes`` order.

    Args:
        doc: an opened document exposing ``inline_shapes`` and
            ``part.related_parts``.

    Returns:
        A list of ``(image_blob, image_extension)`` tuples. The extension is
        looked up from the image part's content type and is ``''`` when the
        content type is not recognised.

    A shape whose image cannot be resolved (for example because it has no
    embedded picture) is reported with a warning and left out of the result,
    so the returned list can be shorter than ``doc.inline_shapes``.
    """
    images = []
    for shape in doc.inline_shapes:
        try:
            # Each inline shape points at its image part through a relationship id.
            rId = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
            image_part = doc.part.related_parts[rId]
            ext = _IMAGE_EXTENSIONS.get(image_part.content_type, '')
            images.append((image_part.blob, ext))
        except Exception as e:
            # Best effort: one unreadable shape must not abort the whole extraction.
            print("Warning: could not extract an inline image:", e)
    return images


def process_document(docx_path):
    """
    Save the images of a DOCX file to disk and collect one metadata record per image.

    A record starts at every top-level paragraph that contains a drawing.
    The n-th such paragraph is paired with the n-th entry returned by
    ``extract_inline_images``; the image is written to
    ``OUTPUT_IMAGES_FOLDER`` as ``image_<n><ext>``. The blocks that follow
    are scanned for a paragraph starting with ``"Code:"``; the block right
    after it is taken as the article name.

    NOTE(review): the pairing assumes one inline image per image paragraph
    and that every inline shape could be extracted -- confirm for documents
    with several images in one paragraph or floating images.

    Args:
        docx_path: path of the DOCX file to open.

    Returns:
        A list of dicts with the keys ``image_file``, ``code`` and
        ``article``. ``code`` and ``article`` are ``""`` when not found.
    """
    doc = Document(docx_path)
    # Make sure the folder for saving images exists.
    os.makedirs(OUTPUT_IMAGES_FOLDER, exist_ok=True)
    # Pre-extract all inline images in order.
    inline_images = extract_inline_images(doc)
    image_index = 0  # 1-based number of the current record / image

    records = []

    # Materialise the blocks so we can look ahead by index.
    blocks = list(iter_block_items(doc))
    num_blocks = len(blocks)
    idx = 0

    while idx < num_blocks:
        block = blocks[idx]
        if not (isinstance(block, Paragraph) and paragraph_contains_image(block)):
            idx += 1
            continue

        # This paragraph marks the beginning of a record.
        image_index += 1

        if image_index <= len(inline_images):
            image_blob, ext = inline_images[image_index - 1]
            image_filename = f"image_{image_index}{ext}"
            image_path = os.path.join(OUTPUT_IMAGES_FOLDER, image_filename)
            with open(image_path, "wb") as img_file:
                img_file.write(image_blob)
            print(f"Extracted image saved to: {image_path}")
        else:
            # No image data is available, so there is no extension to use.
            # (Previously this read `ext`, which was unbound or left over
            # from an earlier image.)
            image_filename = f"image_{image_index}"
            print("Warning: No inline image found for this record!")

        # Look ahead for metadata: an optional description, then a block
        # starting with "Code:" and finally a block with the article name.
        code = ""
        article = ""
        j = idx + 1
        while j < num_blocks:
            next_block = blocks[j]
            if isinstance(next_block, Paragraph):
                # Another image paragraph means the current record is finished.
                if paragraph_contains_image(next_block):
                    break
                text = next_block.text.strip()
                if text.startswith("Code:"):
                    code = text[len("Code:"):].strip()
                    following = j + 1
                    if following < num_blocks:
                        candidate = blocks[following]
                        # Only paragraphs carry `.text`; a table here would
                        # otherwise raise AttributeError.
                        if isinstance(candidate, Paragraph):
                            article = candidate.text.strip()
                        j = following
                    break  # metadata found; stop scanning further
            j += 1

        records.append({
            "image_file": image_filename,
            "code": code,
            "article": article
        })
        # Resume the outer scan from where the look-ahead stopped; j is
        # always greater than idx, so the loop makes progress.
        idx = j

    return records


def save_records_to_csv(records, csv_filename):
    """
    Append metadata records to a CSV file, writing the header when needed.

    The header row is written when the file does not exist yet or exists
    but is empty, so an empty placeholder file still ends up with a header.
    The file is encoded as ``utf-8-sig``.

    Args:
        records: iterable of dicts with the keys ``image_file``, ``code``
            and ``article``.
        csv_filename: path of the CSV file to append to.
    """
    needs_header = (
        not os.path.isfile(csv_filename) or os.path.getsize(csv_filename) == 0
    )
    with open(csv_filename, "a", newline="", encoding="utf-8-sig") as csvfile:
        fieldnames = ["image_file", "code", "article"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(records)
    print(f"Metadata saved to CSV file: {csv_filename}")


if __name__ == "__main__":
    # Extract the images and their metadata, then persist the metadata
    # unless the document produced no records at all.
    records = process_document(DOCX_FILE)
    if not records:
        print("No image records were found in the document.")
    else:
        save_records_to_csv(records, CSV_FILE)


Extracted image saved to: extracted_images\image_1.png
Extracted image saved to: extracted_images\image_2.png
Extracted image saved to: extracted_images\image_3.png
Extracted image saved to: extracted_images\image_4.png
Extracted image saved to: extracted_images\image_5.png
Extracted image saved to: extracted_images\image_6.png
Extracted image saved to: extracted_images\image_7.png
Extracted image saved to: extracted_images\image_8.png
Extracted image saved to: extracted_images\image_9.png
Extracted image saved to: extracted_images\image_10.png
Extracted image saved to: extracted_images\image_11.png
Extracted image saved to: extracted_images\image_12.png
Extracted image saved to: extracted_images\image_13.png
Extracted image saved to: extracted_images\image_14.png
Extracted image saved to: extracted_images\image_15.png
Extracted image saved to: extracted_images\image_16.png
Extracted image saved to: extracted_images\image_17.png
Extracted image saved to: extracted_images\image_18.png
E

In [None]:
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ----------------- CONFIGURATION ------------------
# Folder where the images extracted from the Word document are stored.
# Each record's image_file value is joined onto this folder to locate the image.
IMAGES_FOLDER = r"C:\Users\Myrosalva Volosko\Documents\scraper_google_image\extracted_images"
# CSV file created by your DOCX processing code.
# Default input of load_metadata_records().
METADATA_CSV = r"C:\Users\Myrosalva Volosko\Documents\scraper_google_image\image_metadata.csv"
# CSV file where search results will be saved.
# Default output of save_search_results_to_csv().
SEARCH_RESULTS_CSV = "image_search_results.csv"

# Set up Chrome options (adjust if needed)
# These options are passed to every webdriver.Chrome instance started by
# search_exact_image_matches().
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
# --------------------------------------------------

def search_exact_image_matches(image_path):
    """
    Run a Google Images search for one local image and collect the user's notes.

    A fresh Chrome session is started, the image at ``image_path`` is
    uploaded through the "Search by image" dialog, the 'Exact matches' link
    is clicked when it shows up, and the user is then prompted via
    ``get_user_input``. The browser is always closed before returning.

    Returns:
        The dict produced by ``get_user_input``, or ``None`` when any
        unhandled error occurred during the search.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://www.google.com/imghp")

        # Cookie banner: optional, so a timeout here is not an error.
        try:
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[normalize-space()='Reject all']"))
            ).click()
            print("Cookie prompt: Rejected cookies.")
        except Exception:
            print("Cookie prompt not found or already handled.")

        # Open the "Search by image" dialog (the camera icon).
        camera_locator = (By.CSS_SELECTOR, "div[aria-label='Search by image']")
        try:
            camera = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(camera_locator)
            )
            driver.execute_script("arguments[0].click();", camera)
            print("Clicked the camera button.")
        except Exception:
            print("Camera button not clickable via aria-label. Trying alternative method...")
            # Fallback lookup by class name; a failure here aborts the search.
            driver.find_element(By.CLASS_NAME, "nDcEnd").click()

        # The file input only exists once the upload dialog is present.
        upload_locator = (By.NAME, "encoded_image")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(upload_locator)
        )
        driver.find_element(*upload_locator).send_keys(image_path)
        print("Image uploaded. Waiting for search results...")

        # The results page is considered loaded once the "search" element exists.
        results_locator = (By.ID, "search")
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(results_locator)
        )

        # Narrow down to exact matches when the link is offered.
        try:
            exact_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[contains(., 'Exact matches')]"))
            )
            driver.execute_script("arguments[0].click();", exact_link)
            print("Clicked 'Exact Matches' button.")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(results_locator)
            )
        except Exception:
            print("No 'Exact Matches' button found. Proceeding with available results.")

        # The results are on screen: ask the user while the browser is still open.
        return get_user_input(driver.current_url)

    except Exception as e:
        print(f"Error during search for image {image_path}: {e}")
        return None
    finally:
        driver.quit()  # Ensure the browser is closed

def get_user_input(current_url):
    """
    Ask the user about the search results currently on screen.

    Typing "skip" (any letter case) as the first answer bypasses the
    remaining questions; the user is asked for a reason instead, which is
    stored under 'comments'. Every answer is stripped of surrounding
    whitespace, and ``current_url`` is stored under 'search_url'.
    """
    first_answer = input("How many articles are there? (or type 'skip' to skip): ").strip()

    if first_answer.lower() != "skip":
        topics_answer = input("What are the topics discussed in the search? ").strip()
        details_answer = input("Is it international or ukrainian shared image? Additional detail: ").strip()
        return {
            "num_articles": first_answer,
            "topics": topics_answer,
            "image_details": details_answer,
            "comments": "",
            "search_url": current_url
        }

    # Skipped: record placeholders plus the user's explanation.
    reason = input("Please explain why you skipped: ").strip()
    return {
        "num_articles": "N/A",
        "topics": "N/A",
        "image_details": "image was analysed before",
        "comments": reason,
        "search_url": current_url
    }

def save_search_results_to_csv(record, csv_file=SEARCH_RESULTS_CSV):
    """
    Append one combined metadata + search-result record to a CSV file.

    The header row is written when the file does not exist yet or exists
    but is empty. Uses 'utf-8-sig' encoding so Excel can properly open
    Ukrainian text.

    Args:
        record: dict with the keys 'image_file', 'code', 'article',
            'search_url', 'num_articles', 'topics', 'image_details' and
            'comments'.
        csv_file: path of the CSV file to append to.
    """
    # An existing zero-byte file still needs its header.
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    with open(csv_file, 'a', newline='', encoding='utf-8-sig') as csvfile:
        # Combined field names: metadata from the DOCX + search result details.
        fieldnames = ['image_file', 'code', 'article', 'search_url', 'num_articles', 'topics', 'image_details', 'comments']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(record)
    print(f"Data for image {record.get('image_file')} saved to CSV ({csv_file}).")

def load_metadata_records(metadata_csv=METADATA_CSV):
    """
    Read every row of the metadata CSV into a list of dicts.

    Each row is expected to carry at least the columns:
       - image_file
       - code
       - article

    A missing file is reported on stdout and yields an empty list.
    """
    loaded = []
    try:
        with open(metadata_csv, newline='', encoding='utf-8-sig') as handle:
            loaded.extend(csv.DictReader(handle))
        print(f"Loaded {len(loaded)} metadata record(s) from {metadata_csv}.")
    except FileNotFoundError:
        print(f"Metadata CSV file not found: {metadata_csv}")
    return loaded

if __name__ == "__main__":
    # Load metadata records.
    metadata_records = load_metadata_records()

    if not metadata_records:
        print("No metadata records to process. Exiting.")
        # `exit` is an interactive helper that is not guaranteed to exist
        # (and behaves differently under IPython); raise SystemExit directly.
        raise SystemExit(1)

    # Process each image using the metadata records.
    for record in metadata_records:
        image_file = record.get("image_file")
        if not image_file:
            print("No image file specified in record. Skipping.")
            continue

        image_path = os.path.join(IMAGES_FOLDER, image_file)
        if not os.path.exists(image_path):
            print(f"Image file does not exist: {image_path}. Skipping.")
            continue

        print(f"\nProcessing image: {image_path}")
        user_data = search_exact_image_matches(image_path)
        if user_data:
            # csv.DictReader fills fields missing from a short row with None,
            # so normalise to "" before any string operation.
            # Keep only the text after the last ">" (the whole value when
            # there is no ">").
            code_field = (record.get("code") or "").split(">")[-1].strip()

            # Combine metadata from the DOCX (code, article) with the search results.
            combined_record = {
                "image_file": image_file,
                "code": code_field,
                "article": (record.get("article") or "").strip(),
                "search_url": user_data.get("search_url", ""),
                "num_articles": user_data.get("num_articles", ""),
                "topics": user_data.get("topics", ""),
                "image_details": user_data.get("image_details", ""),
                "comments": user_data.get("comments", "")
            }
            save_search_results_to_csv(combined_record)
        else:
            print(f"Search failed for image: {image_path}")

        # Pause briefly before processing the next image.
        time.sleep(2)


Loaded 259 metadata record(s) from C:\Users\Myrosalva Volosko\Documents\scraper_google_image\image_metadata.csv.

Processing image: C:\Users\Myrosalva Volosko\Documents\scraper_google_image\extracted_images\image_1.png
Cookie prompt: Rejected cookies.
Clicked the camera button.
Image uploaded. Waiting for search results...
Clicked 'Exact Matches' button.
Data for image image_1.png saved to CSV (image_search_results.csv).

Processing image: C:\Users\Myrosalva Volosko\Documents\scraper_google_image\extracted_images\image_2.png
Cookie prompt: Rejected cookies.
Clicked the camera button.
Image uploaded. Waiting for search results...
Clicked 'Exact Matches' button.
