In [None]:
import os
import re
import cv2
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
from concurrent.futures import ThreadPoolExecutor

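# Python dependencies: pip install pytesseract pandas pillow openpyxl opencv-python
# (pytesseract is only a wrapper; the Tesseract OCR program itself must be installed separately.)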

# Location of the Tesseract OCR executable.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, if set;
#   2. the standard Windows install location below, if that file exists;
#   3. otherwise pytesseract's own default is left untouched, so a `tesseract`
#      binary available on PATH keeps working on other machines.
_DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if _tesseract_cmd is None and os.path.isfile(_DEFAULT_TESSERACT_CMD):
    _tesseract_cmd = _DEFAULT_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

In [None]:
# Pull phone-number-like substrings out of a block of text using a regex
def extract_phone_numbers(text):
    """Return every phone-number-like substring found in ``text``.

    A match consists of an optional ``+``, 1-3 digits, a single space,
    3-6 digits, a single space and another 3-6 digits.

    Args:
        text: The string to search.

    Returns:
        A list of the matched substrings in order of appearance; empty when
        nothing matches.
    """
    phone_regex = re.compile(r'\+?\d{1,3} \d{3,6} \d{3,6}')
    return [match.group(0) for match in phone_regex.finditer(text)]


In [None]:
# Image preprocessing function
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    The image is read in colour, converted to grayscale, smoothed with a 5x5
    Gaussian blur and then adaptively thresholded (Gaussian-weighted
    neighbourhood, block size 11, constant 2).

    Args:
        image_path: Path of the image file to read.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold``; every pixel is either 0 or 255.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV cannot decode it as an image.
    """
    # Read the image using OpenCV
    image_cv = cv2.imread(image_path, cv2.IMREAD_COLOR)
    # cv2.imread reports failure by returning None rather than raising; check
    # here so the caller sees which file was the problem instead of an opaque
    # assertion error from cvtColor.
    if image_cv is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path!r}")
        raise ValueError(f"Could not decode image file: {image_path!r}")

    gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # THRESH_BINARY: pixels above their local threshold become 255, all others 0.
    # (This is exactly the bitwise complement of a THRESH_BINARY_INV result.)
    return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

In [None]:
# OCR a single image and collect the phone numbers found in it
def process_image(image_path):
    """Run preprocessing, OCR and phone-number extraction on one image file.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A ``(filename, numbers)`` tuple: the base name of ``image_path`` and
        the list returned by ``extract_phone_numbers`` for the OCR text.
    """
    ocr_text = pytesseract.image_to_string(preprocess_image(image_path))
    found_numbers = extract_phone_numbers(ocr_text)
    file_name = os.path.basename(image_path)
    return file_name, found_numbers

In [None]:
def main(image_directory="screenshots", output_path="output.xlsx", max_workers=10):
    """OCR every image in a folder and write the phone numbers found to Excel.

    Args:
        image_directory: Folder scanned (non-recursively) for ``.png``,
            ``.jpg`` and ``.jpeg`` files. Defaults to ``"screenshots"``.
        output_path: Path of the Excel workbook to write. Defaults to
            ``"output.xlsx"``.
        max_workers: Number of threads used to process images concurrently.
            Defaults to 10.

    The workbook contains one row per detected number, with the columns
    ``Filename`` and ``Phone Number``. Images in which no number was found
    contribute no rows.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # List all image files in the folder. Extensions are compared
    # case-insensitively so files such as "IMG_001.PNG" are not skipped, and the
    # list is sorted because os.listdir returns entries in arbitrary order,
    # which would otherwise make the row order of the output vary between runs.
    image_files = sorted(
        os.path.join(image_directory, f)
        for f in os.listdir(image_directory)
        if f.lower().endswith(image_extensions)
    )

    # Process the images on a thread pool; executor.map yields results in the
    # same order as image_files.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        intermediate_results = list(executor.map(process_image, image_files))

    # Flatten the results so that each number is in a separate row with its filename
    results = [
        [filename, number]
        for filename, numbers in intermediate_results
        for number in numbers
    ]

    # Save results to Excel
    df = pd.DataFrame(results, columns=['Filename', 'Phone Number'])
    df.to_excel(output_path, index=False, engine='openpyxl')



In [None]:
# Run the full pipeline with its default settings, but only when this code is
# executed as the main program (not when it is imported as a module).
if __name__ == "__main__":
    main()