In [21]:
import os 
import numpy as np 
import cv2
from pdf2image import convert_from_path
import pytesseract
import unicodedata
import re
import pandas as pd
import natsort

# Tell pytesseract which Tesseract executable to run.
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'

# Pandas display settings: do not truncate rows, columns or cell contents;
# the width is set to 0 (intended as "auto-detect" by the notebook author).
for option_name, option_value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.width", 0),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)


In [10]:
# Parses the year(s) and the page range encoded in a 'Theses_...' filename.
def extract_year_from_filename(filename):
    """Parse a filename of the form 'Theses_<year>[_<year>]_<start>_<end|blank>[.pdf]'.

    Args:
        filename: Name of the PDF file (the '.pdf' suffix is optional).

    Returns:
        A tuple ``(year, start, end)`` where ``year`` is ``'YYYY'`` or
        ``'YYYY_YYYY'``, ``start`` is an int and ``end`` is kept as a string
        (digits or the literal ``'blank'``). Returns ``None`` (after printing
        a message) when the filename does not match the pattern.
    """
    # A single pattern covers both naming schemes (one year or a year range).
    parsed = re.search(r'Theses_(\d{4})(?:_(\d{4}))?_(\d+)_(\d+|blank)(?:\.pdf)?$', filename)

    if parsed is None:
        print(f" No match for filename: {filename}")
        return None

    # The second group is None when the filename carries a single year.
    first_year, second_year, start, end = parsed.groups()
    year = f"{first_year}_{second_year}" if second_year else first_year

    return (year, int(start), end)

# Checks whether a page is (almost) entirely white.
def is_blank(image, threshold=0.99):
    """Tell whether the top 90% of a page consists almost only of white pixels.

    Args:
        image: Page image; it is converted with ``np.array`` and treated as RGB.
        threshold: Minimum fraction of white pixels for the page to count as blank.

    Returns:
        True when the white-pixel ratio is strictly greater than ``threshold``.
    """
    grayscale = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

    # Drop the bottom 10% of the page so that a page number does not
    # influence the measurement.
    kept_rows = int(0.9 * grayscale.shape[0])
    body = grayscale[:kept_rows, :]

    # Binarise: pixels brighter than 240 become 255, all others 0.
    binary = cv2.threshold(body, 240, 255, cv2.THRESH_BINARY)[1]
    white_pixels = (binary == 255).sum()

    return white_pixels / binary.size > threshold

# Converts a page range of a PDF file into images, saving each page as a PNG.
# Images are stored in the specified output folder.
def convert_pdf_to_images(pdf_path, output_folder, start_page, end_page):
    """Render pages of a PDF to ``page_<n>.png`` files in ``output_folder``.

    Only the requested page range is rendered (instead of rendering the whole
    document and slicing afterwards), which avoids holding every page of a
    large scan in memory.

    Args:
        pdf_path: Path of the PDF to convert.
        output_folder: Destination folder; created if it does not exist.
        start_page: 0-based index of the first page to convert.
        end_page: 0-based exclusive end index (int or digit string), or the
            literal ``'blank'`` to convert up to the end of the document.

    Saved files are numbered from 1 relative to ``start_page``. Conversion
    stops after the first page detected as blank by ``is_blank`` (that page
    is still saved).
    """
    last_page = None if end_page == 'blank' else int(end_page)

    # The 0-based slice [start_page:end_page] is empty in this case.
    if last_page is not None and last_page <= start_page:
        return

    # pdf2image page numbers are 1-based and inclusive, so the slice
    # [start_page:end_page] maps to first_page=start_page + 1, last_page=end_page.
    images = convert_from_path(pdf_path, first_page=start_page + 1, last_page=last_page)

    os.makedirs(output_folder, exist_ok=True)

    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"page_{i+1}.png")
        image.save(image_path, "PNG")
        print(f"Page {i+1} saved: {image_path}")

        if is_blank(image):
            print(f"Page {i+1} est blanche. Arrêt du traitement.")
            break

# Processes all PDFs in the input directory.
# Extracts the year from the filename and creates a subfolder for each year.
# Converts each PDF into images and stores them in the corresponding subfolder.
def process_all_pdfs(input_dir, output_dir):
    """Convert every '.pdf' file of ``input_dir`` into PNG pages.

    Args:
        input_dir: Folder containing the source PDFs.
        output_dir: Parent folder for the images; one subfolder per year
            (as parsed from the filename) is created inside it.

    Files whose name cannot be parsed by ``extract_year_from_filename`` are
    reported and skipped instead of aborting the whole batch.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure the parent directory exists

    # Sorted so that the processing order does not depend on the filesystem.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_dir, filename)  # Full path to PDF

        # extract_year_from_filename returns None when the name does not match,
        # so the result must be checked before it is unpacked.
        parsed = extract_year_from_filename(filename)
        if parsed is None:
            print(f" Invalid year extracted: {None} for {filename}")
            continue

        year, start_page, end_page = parsed
        print(f"Extracted year: {year}")  # Debugging
        if not year or year == "Erreur":  # Skip if invalid year extracted
            print(f" Invalid year extracted: {year} for {filename}")
            continue

        output_folder = os.path.join(output_dir, year)
        print(f"Creating subfolder: {output_folder}")  # Debugging
        os.makedirs(output_folder, exist_ok=True)  # Ensure the subfolder is created

        convert_pdf_to_images(pdf_path, output_folder, start_page, end_page)



In [None]:
# Resolve the PDF input folder and the PNG output folder, both located under
# '../data' relative to the current working directory.
current_dir = os.getcwd()
input_dir, output_dir = (
    os.path.join(current_dir, os.pardir, 'data', folder_name)
    for folder_name in ('pdf_Tables_theses_Paris_1870_1939', 'pdfs_en_images_png')
)

# Create the output folder up front and show where the images will be written.
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")  # Debugging: check the output path

# Convert every PDF found in the input folder.
process_all_pdfs(input_dir, output_dir)

FIN DE LA CONVERSION DES PDF EN IMAGES ;
DÉBUT DE L'EXTRACTION ET DU TRAITEMENT DU TEXTE.

























In [3]:
def extract_informations(text):
    """Split OCR text into (last name, first name, subject) entries.

    An entry starts on a line of the form ``NOM (Prénom). subject...``; every
    following line that does not start a new entry is appended to the subject
    of the current one. Lines appearing before the first entry are ignored.

    Args:
        text: Raw OCR text, lines separated by ``"\\n"``.

    Returns:
        A DataFrame with the columns ``"Nom"``, ``"Prénom"`` and ``"Sujet"``.
    """
    # Lines are stripped before matching, so a header line whose subject only
    # starts on the next line ends right after the period. The trailing part
    # therefore accepts either end-of-line or a space followed by the subject.
    pattern = re.compile(r"([A-ZÉÀÈÙÂÊÎÔÛÄËÏÖÜÇ]{2,}) \(([^)]+)\)\.(?:$| (.+))")

    data = []
    current_nom = None
    current_prenom = None
    sujet_parts = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        match = pattern.match(line)

        if match:
            # A new entry begins: store the one collected so far.
            if current_nom is not None:
                data.append([current_nom, current_prenom, " ".join(sujet_parts).strip()])

            current_nom = match.group(1)
            current_prenom = match.group(2)
            sujet_parts = [match.group(3) or ""]
        elif current_nom is not None:
            # Continuation line of the current subject.
            sujet_parts.append(line)

    # Store the last entry, which is not followed by another header line.
    if current_nom is not None:
        data.append([current_nom, current_prenom, " ".join(sujet_parts).strip()])

    return pd.DataFrame(data, columns=["Nom", "Prénom", "Sujet"])


In [16]:
def contains_tome_marker(image):
    """Return True when the OCR text of ``image`` contains 'TOME' followed by a number.

    The OCR output (French language model) is upper-cased before the search,
    so the marker is found regardless of its original case.
    """
    ocr_text = pytesseract.image_to_string(image, lang='fra').upper()
    marker = re.search(r'TOME\s+\d+', ocr_text)
    return marker is not None

def extract_columns(image):
    """OCR a two-column page: left half first, then right half.

    Args:
        image: 2-D (single-channel) image array.

    Returns:
        The OCR text of the left column, a single space, then the OCR text of
        the right column. A 10-pixel band around the vertical middle is
        excluded from both halves.
    """
    _, width = image.shape
    middle = width // 2

    # Leave out 5 pixels on each side of the centre line.
    halves = (image[:, :middle - 5], image[:, middle + 5:])
    texts = [pytesseract.image_to_string(half, lang='fra') for half in halves]

    return " ".join(texts)


In [None]:

def process_image(image, first, header_rows=580):
    """OCR one two-column page image and return its text.

    Args:
        image: Page image as loaded by OpenCV (converted from BGR to grayscale).
        first: When true, the top ``header_rows`` pixel rows are cropped away
            before OCR.
        header_rows: Number of pixel rows removed from the top of the first
            page. Defaults to 580, the value previously hard-coded here.

    Returns:
        OCR text of the left column, a single space, then the OCR text of the
        right column.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Contrast enhancement (CLAHE) before OCR.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    gray = clahe.apply(gray)

    if first:
        gray = gray[header_rows:, :]

    # Split at the vertical middle, leaving out 5 pixels on each side of it.
    middle = gray.shape[1] // 2
    left_part = gray[:, :middle - 5]
    right_part = gray[:, middle + 5:]

    left_text = pytesseract.image_to_string(left_part, lang='fra')
    right_text = pytesseract.image_to_string(right_part, lang='fra')

    return left_text + " " + right_text

In [10]:
def process_folder(input_dir):
    """OCR every PNG page of a folder and parse the combined text.

    Files are visited in natural sort order. Only files whose name ends with
    '.png' (case-insensitive) are read; a PNG that OpenCV cannot load is
    reported and skipped instead of crashing the run. The file named exactly
    'page_1.png' is processed as the first page.

    Args:
        input_dir: Folder containing the page images of one year.

    Returns:
        The DataFrame produced by ``extract_informations`` on the concatenated
        OCR text of all pages.
    """
    page_texts = []

    for image_file in natsort.natsorted(os.listdir(input_dir)):
        # Check the extension first so that unrelated files are never opened.
        if not image_file.lower().endswith('.png'):
            continue

        image_path = os.path.join(input_dir, image_file)
        print(image_path)

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            print(f" Could not read image, skipping: {image_path}")
            continue

        first = image_file == "page_1.png"
        page_texts.append(process_image(image, first))

    return extract_informations("".join(page_texts))

In [12]:
def remove_accents(text):
    """Strip diacritics from a string; non-string values are returned unchanged.

    The text is decomposed (Unicode NFD) and every character of category
    'Mn' (non-spacing mark) is dropped.
    """
    if not isinstance(text, str):
        return text

    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def cleanup_up_subject_column(df) :
    """Normalise the text columns of ``df`` in place (nothing is returned).

    'Nom', 'Prénom' and 'Sujet' are cast to ``str``; 'Sujet' then goes through
    a fixed sequence of replacements and both 'Sujet' and 'Prénom' have their
    accents removed with ``remove_accents``.
    """
    for column in ('Nom', 'Prénom', 'Sujet'):
        df[column] = df[column].astype(str)

    sujet = df['Sujet']

    # Re-join words split by a hyphen followed by a space.
    sujet = sujet.str.replace('- ', '', regex=False)
    # Replace numbers (with an optional leading comma) by a single space.
    sujet = sujet.str.replace(r',?\s*\d+\s*', ' ', regex=True)
    sujet = sujet.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Remove the space between a lowercase consonant and a following vowel.
    sujet = sujet.str.replace(r"([bcdfghjklmnpqrstvwxyz]) (?=[aeiouy])", r"\1", regex=True)
    # Delete stray symbols, then turn typographic marks into spaces.
    sujet = sujet.str.replace(r"[\"'#%&*\[\]{}<>|\\/^¤§°@=+\~`]", "", regex=True)
    sujet = sujet.str.replace(r"[•●▪■♦◊¤§°@©®™‰¨«»„”“†‡‚‘’]", " ", regex=True)
    # Delete isolated single ASCII letters and a fixed list of short tokens.
    sujet = sujet.str.replace(r"\b[a-zA-Z]\b", "", regex=True)
    sujet = sujet.str.replace(r"\b(TOM|TON|H|U|UVIL|co|tu|ot|mw|vf|En|El|N)\b", "", regex=True, flags=re.IGNORECASE)
    # Every punctuation mark becomes ". ", then whitespace is collapsed again.
    sujet = sujet.str.replace(r"\s*[\.,;:!?]\s*", ". ", regex=True)
    sujet = sujet.str.replace(r"\s+", " ", regex=True).str.strip()

    df['Sujet'] = sujet.apply(remove_accents)
    df['Prénom'] = df['Prénom'].apply(remove_accents)

In [13]:
def rescue_missing_entries(df):
    """Split names that were merged into a 'Sujet' value out into new rows.

    For each row, the first occurrence of the rescue pattern (an upper-case
    word followed by another word) in 'Sujet' is taken as the start of a
    missed entry: the row's 'Sujet' is truncated in place to the text before
    the match, and a new row is built from the matched name parts and the
    text after the match.

    Args:
        df: DataFrame with string 'Sujet' values; it is modified in place.

    Returns:
        ``df`` itself when nothing was rescued, otherwise a new DataFrame
        with the rescued rows appended at the end (index reset).
    """
    rescue_pattern = re.compile(
        r"\b([A-ZÉÀÈÙÂÊÎÔÛÄËÏÖÜÇ]{2,})[ ,]*([A-ZÉÀÈÙÂÊÎÔÛÄËÏÖÜÇa-zéàèùâêîôûäëïöüç\-]+)\)?[ .,-]"
    )

    extra_rows = []

    for row_label, subject in df["Sujet"].items():
        found = rescue_pattern.search(subject)
        if found is None:
            continue

        # Keep only the text preceding the rescued name in the current row.
        df.at[row_label, "Sujet"] = subject[:found.start()].strip()

        # Everything after the matched name becomes the rescued subject.
        extra_rows.append({
            "Nom": found.group(1),
            "Prénom": found.group(2),
            "Sujet": subject[found.end():].strip()
        })

    if not extra_rows:
        return df

    return pd.concat([df, pd.DataFrame(extra_rows)], ignore_index=True)


In [27]:
def process_all_folders(input_dir) :
    """Run the OCR pipeline on every sub-folder of ``input_dir``.

    Each sub-folder (visited in natural sort order) is processed on its own:
    OCR and parsing with ``process_folder``, recovery of merged entries with
    ``rescue_missing_entries``, then in-place cleaning with
    ``cleanup_up_subject_column``. The per-folder results are concatenated once
    at the end, so every row is rescued and cleaned exactly once.

    Args:
        input_dir: Folder containing one sub-folder of page images per year.

    Returns:
        A DataFrame with the columns "Nom", "Prénom" and "Sujet" (empty when
        ``input_dir`` contains no sub-folder).
    """
    year_frames = []

    for year_folder in natsort.natsorted(os.listdir(input_dir)):
        year_path = os.path.join(input_dir, year_folder)

        if not os.path.isdir(year_path):  # Only directories are processed
            continue

        year_df = process_folder(year_path)
        # rescue_missing_entries returns a new frame holding the rescued rows;
        # its result must be kept, otherwise those rows are lost.
        year_df = rescue_missing_entries(year_df)
        cleanup_up_subject_column(year_df)  # cleans year_df in place
        year_frames.append(year_df)

    if not year_frames:
        return pd.DataFrame(columns=["Nom", "Prénom", "Sujet"])

    return pd.concat(year_frames, ignore_index=True)

# Run the OCR pipeline on the image folders found in '../data/test'
# (relative to the current working directory) and print the resulting table.
current_dir = os.getcwd()
input_dir = os.path.join(current_dir, os.pardir, 'data', 'test')

df = process_all_folders(input_dir)

# Print at most 250 rows and 210 columns of the result as plain text.
print(df.to_string(max_rows=250, max_cols=210))


/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_1.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_2.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_3.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_4.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_5.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_6.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_7.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_8.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_9.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_10.png
/Users/georges/Desktop/EPFL/PDB/lib/code/../data/test/1870/page_11.png
             Nom                Prénom                                                                                                                                      Sujet
0         ABADIE    Jean-Marie-Charles            