<a href="https://colab.research.google.com/github/emiliawisnios/Social-and-Public-Policy-python/blob/main/Notebooks/Social_and_Public_Policy_Coding_Python_05_12_12_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In today's class we will talk about OCR - Optical Character Recognition.


Optical Character Recognition (OCR) is the technology that enables computers to extract text from images.
Common applications in political science:
- Digitizing historical political documents and archives
- Processing campaign materials and political advertisements
- Analyzing scanned policy documents
- Converting protest signs and banners to text
- Processing voting ballots and election materials

In [None]:
!apt-get install tesseract-ocr

In [None]:
!pip install pytesseract opencv-python pillow numpy matplotlib --q

In [None]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd

In [None]:
def perform_basic_ocr(image_path):
    """
    Performs basic OCR on an image file.

    Args:
        image_path (str): Path to the image file

    Returns:
        str: Extracted text from the image
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR has finished, instead of waiting for garbage
    # collection.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image)

    return text

In [None]:
from urllib.request import urlretrieve

# Download the sample scanned document into the current working directory.
# NOTE(review): the URL points to a .png file while the local copy is named
# "image.jpg"; the later cells open it under that name, so the name is kept.
file_url = 'https://raw.githubusercontent.com/emiliawisnios/Social-and-Public-Policy-python/refs/heads/main/Documents/nr057532-1.png'
urlretrieve(file_url, "image.jpg")

In [None]:
# Run plain OCR on the sample image and print the recognized text.
# NOTE(review): the image was downloaded with a relative file name, so this
# absolute path presumably relies on the working directory being /content
# (the Colab default) -- adjust the path when running elsewhere.
sample_text = perform_basic_ocr('/content/image.jpg')
print("Extracted text:")
print(sample_text)

In [None]:
def get_detailed_ocr_info(image_path):
    """
    Gets detailed OCR information including confidence scores and bounding boxes.

    Args:
        image_path (str): Path to the image file

    Returns:
        pandas.DataFrame: Table built from the dictionary that
            pytesseract.image_to_data returns; it includes (among others)
            the 'text' and 'conf' columns.
    """
    # Close the image file as soon as Tesseract has processed it.
    with Image.open(image_path) as image:
        # Get detailed OCR data as a dict of equally long lists
        ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    # Convert to DataFrame for easier analysis
    return pd.DataFrame(ocr_data)

In [None]:
df = get_detailed_ocr_info('/content/image.jpg')

# Rows with conf == -1 are placeholders that carry no recognized word;
# leave them out so they do not drag the average confidence down.
word_conf = df.loc[df['conf'] != -1, 'conf']
print("\nAverage confidence score:", word_conf.mean())
print("\nWords with confidence > 90%:")
print(df[df['conf'] > 90][['text', 'conf']])

In [None]:
# Show the full OCR table; as the last expression of the cell, the notebook
# renders it.
df

## Task

Find your own document on the web and run OCR on it.

In [None]:
#### YOUR CODE GOES HERE #####

# Image Preprocessing Functions

In [None]:
def preprocess_image(image_path):
    """
    Applies various preprocessing techniques to improve OCR accuracy.

    The pipeline is: grayscale conversion, Otsu binarization, a 3x3
    dilation and a 3x3 median blur.

    Args:
        image_path (str): Path to the image file

    Returns:
        numpy.ndarray: Preprocessed image

    Raises:
        FileNotFoundError: If the image cannot be read from image_path
    """
    # Read image; cv2.imread reports failure by returning None rather than
    # raising, so check explicitly to get a clear error message.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(
            f"Could not read image (missing file or unsupported format): {image_path}"
        )

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarize; with THRESH_OTSU the threshold is chosen automatically and
    # the 0 passed as threshold value is ignored.
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Apply dilation with a 3x3 rectangular kernel.
    # NOTE(review): dilation grows the white (255) regions. For dark text on
    # a light background this thins the strokes rather than connecting them;
    # confirm this is the intended effect for the documents being processed.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    gray = cv2.dilate(gray, kernel, iterations=1)

    # Apply noise reduction
    gray = cv2.medianBlur(gray, 3)

    return gray


In [None]:
def display_image_comparison(original_path, preprocessed_image):
    """
    Displays original and preprocessed images side by side.

    Args:
        original_path (str): Path to original image
        preprocessed_image (numpy.ndarray): Preprocessed image

    Raises:
        FileNotFoundError: If the original image cannot be read
    """
    # cv2.imread returns None on failure; fail early with a clear message.
    original = cv2.imread(original_path)
    if original is None:
        raise FileNotFoundError(
            f"Could not read image (missing file or unsupported format): {original_path}"
        )

    # OpenCV loads colour images as BGR, matplotlib expects RGB.
    original_rgb = cv2.cvtColor(original, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(12, 6))

    # Left panel: the untouched input
    plt.subplot(121)
    plt.imshow(original_rgb)
    plt.title('Original Image')
    plt.axis('off')

    # Right panel: the preprocessed result, rendered with a gray colormap
    plt.subplot(122)
    plt.imshow(preprocessed_image, cmap='gray')
    plt.title('Preprocessed Image')
    plt.axis('off')

    plt.show()

In [None]:
def _mean_word_confidence(conf_values):
    """Returns the mean of all confidences other than the -1 placeholder, or NaN if none remain."""
    word_confs = [conf for conf in conf_values if conf != -1]
    if not word_confs:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')
    return np.mean(word_confs)


def perform_advanced_ocr(image_path):
    """
    Performs OCR with preprocessing steps for better accuracy.

    OCR is run on both the original image and the output of
    preprocess_image so that the two results can be compared.

    Args:
        image_path (str): Path to the image file

    Returns:
        tuple: (preprocessed_text, original_text, confidence_comparison),
            where confidence_comparison is a dict with the keys
            'original_mean_conf' and 'preprocessed_mean_conf'. A mean is
            NaN when no entry with a confidence other than -1 was found.
    """
    # Original OCR
    original_text = perform_basic_ocr(image_path)

    # Preprocess and perform OCR
    preprocessed_image = preprocess_image(image_path)
    preprocessed_text = pytesseract.image_to_string(preprocessed_image)

    # Collect per-entry confidence data for both variants; the original
    # image file is closed as soon as Tesseract is done with it.
    with Image.open(image_path) as original_image:
        original_conf = pytesseract.image_to_data(
            original_image, output_type=pytesseract.Output.DICT
        )
    preprocessed_conf = pytesseract.image_to_data(
        preprocessed_image, output_type=pytesseract.Output.DICT
    )

    conf_comparison = {
        'original_mean_conf': _mean_word_confidence(original_conf['conf']),
        'preprocessed_mean_conf': _mean_word_confidence(preprocessed_conf['conf']),
    }

    return preprocessed_text, original_text, conf_comparison

In [None]:
def improve_image_quality(image_path):
    """
    Applies additional preprocessing techniques for challenging images.

    Contrast is increased with CLAHE on the lightness channel of the LAB
    representation, then the colour image is denoised.

    Args:
        image_path (str): Path to the image file

    Returns:
        numpy.ndarray: Enhanced image (BGR channel order)

    Raises:
        FileNotFoundError: If the image cannot be read from image_path
    """
    # cv2.imread returns None on failure; fail early with a clear message.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(
            f"Could not read image (missing file or unsupported format): {image_path}"
        )

    # Increase contrast: equalize only the lightness channel so the colour
    # channels (a, b) are left untouched.
    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    lightness, a_channel, b_channel = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    equalized = clahe.apply(lightness)
    enhanced = cv2.merge((equalized, a_channel, b_channel))
    enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)

    # Denoise
    enhanced = cv2.fastNlMeansDenoisingColored(enhanced, None, 10, 10, 7, 21)

    return enhanced


# 5. Evaluation Functions
def evaluate_ocr_quality(predicted_text, ground_truth):
    """
    Evaluates OCR quality using basic metrics.

    Args:
        predicted_text (str): OCR output text
        ground_truth (str): Correct text

    Returns:
        dict: Dictionary containing evaluation metrics:
            'similarity_ratio': difflib.SequenceMatcher ratio of the two
                strings (case-sensitive, character level).
            'word_accuracy': share of the distinct lower-cased ground-truth
                words that also occur in the prediction. When the ground
                truth contains no words, this is 1.0 if the prediction is
                also empty of words and 0.0 otherwise.
    """
    from difflib import SequenceMatcher

    # Calculate similarity ratio
    similarity = SequenceMatcher(None, predicted_text, ground_truth).ratio()

    # Word accuracy (simple implementation): set-based, so word order and
    # repeated words are ignored.
    pred_words = set(predicted_text.lower().split())
    true_words = set(ground_truth.lower().split())
    if true_words:
        word_accuracy = len(pred_words & true_words) / len(true_words)
    else:
        # Avoid dividing by zero when the ground truth has no words.
        word_accuracy = 0.0 if pred_words else 1.0

    return {
        'similarity_ratio': similarity,
        'word_accuracy': word_accuracy
    }