In [12]:
import os 
import numpy as np 
import cv2
from pdf2image import convert_from_path
from IPython.display import display
import pytesseract
import unicodedata
import re
import csv
from datetime import datetime

pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'

In [None]:
# Extracts the year from the filename after 'Thèses_'.
def extract_year_from_filename(filename):
    match = re.search(r'Theses_(.*?)(\.pdf)?$', filename)  # Capture everything after "Thèses_" and remove .pdf
    if match:
        extracted = match.group(1)  # Extract the part after "Thèses_"
        return extracted
    return "Erreur"

# Converts a PDF file into images, saving each page as a PNG.
# # Images are stored in the specified output folder.
def convert_pdf_to_images(pdf_path, output_folder):
    images = convert_from_path(pdf_path)
    
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"page_{i+1}.png")
        image.save(image_path, "PNG")
        print(f"Page {i+1} saved: {image_path}")

# Processes all PDFs in the input directory.
# Extracts the year from the filename and creates a subfolder for each year.
# Converts each PDF into images and stores them in the corresponding subfolder.
def process_all_pdfs(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)  # Ensure the parent directory exists
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(input_dir, filename)  # Full path to PDF
            year = extract_year_from_filename(filename)
            print(f"Extracted year: {year}")  # Debugging
            if not year or year == "Erreur":  # Skip if invalid year extracted
                print(f" Invalid year extracted: {year} for {filename}")
                continue

            output_folder = os.path.join(output_dir, year)
            print(f"Creating subfolder: {output_folder}")  # Debugging
            os.makedirs(output_folder, exist_ok=True)  # Ensure the subfolder is created
            
            convert_pdf_to_images(pdf_path, output_folder)



In [None]:
current_dir = os.getcwd()

# Define the input and output paths relative to the current directory
input_dir = os.path.join(current_dir, '..', 'data', 'pdf_Tables_theses_Paris_1870_1939')  # Relative path to 'lib/data/pdf_Tables_thèses_Paris_1870_1939'
output_dir = os.path.join(current_dir, '..', 'data', 'pdfs_en_images_png')  # Output path within 'lib/data/pdfs_en_images_png'

os.makedirs(output_dir, exist_ok = True)
print(f"Output directory: {output_dir}")  # Debugging line to check the output path

process_all_pdfs(input_dir, output_dir)

In [None]:
# Extracts text from an image
def extract_text_from_image(image_path):
    image = cv2.imread(image_path)
    return pytesseract.image_to_string(image)

# Remonving the accents
def remove_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

# Post processing
def post_process_text(extracted_text):
    extracted_text = remove_accents(extracted_text)  # Remove accents
    extracted_text = re.sub(r'\n+', '\n', extracted_text) # Replace multiple line breaks (\n\n\n...) with a single \n
    extracted_text = re.sub(r'[\x00-\x1F\x7F]', '', extracted_text) # Remove unwanted control characters
    extracted_text = re.sub(r'\s+', ' ', extracted_text)  # Replace multiple spaces with a single space
    extracted_text = re.sub(r'-\n', '', extracted_text)  # Remove hyphen+newline
    extracted_text = re.sub(r'\n', ' ', extracted_text)  # Replace remaining newlines with spaces
    extracted_text = re.sub(r'\s+([.,!?;:])', r'\1', extracted_text)  # Fix space before punctuation
    extracted_text = re.sub(r'([.,!?;:])(\S)', r'\1 \2', extracted_text)  # Ensure space after punctuation
    extracted_text = re.sub(r'\bPage \d+\b', '', extracted_text)  # Removes "Page X"
    extracted_text = re.sub(r'^\d+$', '', extracted_text, flags=re.MULTILINE)  # Removes lines that are only numbers
    
    return extracted_text.strip()

def process_all_pngs(input_dir, base_output_dir):
    # Generate a timestamped folder 
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_dir = os.path.join(base_output_dir, f"output_{timestamp}")
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory (and parent dirs if needed)
    
    for year_folder in sorted(os.listdir(input_dir)):
        year_path = os.path.join(input_dir, year_folder)

        if os.path.isdir(year_path):  # Check if it's a directory
            text_output_path = os.path.join(output_dir, f"{year_folder}.txt")

            with open(text_output_path, "w", encoding="utf-8") as output_file:
                for image_file in sorted(os.listdir(year_path)):
                    image_path = os.path.join(year_path, image_file)

                    if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):  # Check for image formats
                        print(f"Processing {image_path}...")
                        extracted_text = extract_text_from_image(image_path)
                        processed_text = post_process_text(extracted_text)

                        output_file.write(processed_text + "\n\n")  # Append text to the file

    print(f"✅ OCR completed! Results saved in: {output_dir}")


In [14]:
current_dir = os.getcwd()

input_dir = os.path.join(current_dir, '..', 'data', 'pdfs_en_images_png')  # Relative path to 'lib/data/pdfs_en_images_png'
output_dir = os.path.join(current_dir, '..', 'data', 'texte_extrait_par_pdfs')  # Output path within 'texte_extrait_par_pdfs'

process_all_pngs(input_dir, output_dir)


Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_1.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_10.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_11.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_12.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_13.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_14.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_15.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_16.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_17.png...
Processing /Users/georges/Desktop/EPFL/PDB/lib/code/../data/pdfs_en_images_png/1870/page_18.png...
Processing 

KeyboardInterrupt: 

In [None]:
# List of possible titles/statuses
POSSIBLE_TITLES = ["Mlle", "Mme", "M"]

#Extracts name and thesis from brut text
def extract_name_and_thesis(text):
    structured_data = []
    
    # Regex to detect last names in uppercase, followed by an optional first name in parentheses and an optional title
    pattern = re.compile(r"([A-ZÉÈÀÛÎÔ\-]+)(?: \(([A-Za-zÉÈÀÛÎÔéèàùâêîôûäëïöüç\-]+)\))?(?:\s+([A-Za-z\.\-]+))?\.\s*(.*?)\.\s*\d+", re.DOTALL)

    matches = pattern.findall(text)
    
    for match in matches:
        last_name = match[0].strip()
        first_name = match[1].strip() if match[1] else ""  # Optional first name
        potential_title = match[2].strip() if match[2] else ""

        # Check if the word after the first name is a valid title
        title = potential_title if potential_title in POSSIBLE_TITLES else ""

        thesis_title = match[3].strip()  # Thesis title

        structured_data.append([last_name, first_name, title, thesis_title])

    return structured_data

# Function to save extracted data to a CSV file
def save_to_csv(data, output_path):
    with open(output_path, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Last Name", "First Name", "Title", "Thesis Title"])
        writer.writerows(data)

In [None]:
save_to_csv(extract_name_and_thesis(text), output_path)