# Manifestos: Text extraction

This notebook extracts raw text from party manifestos.  

**Important: Not all manifestos are available or finalized yet.**

Manifestos were retrieved on Mar 5, 2024, from the following URLs:  
- https://www.spd.de/fileadmin/Dokumente/EuroDel/20240128_Europaprogramm.pdf
- https://cms.gruene.de/uploads/assets/Europawahlprogramm-2024-Bu%CC%88ndnis90Die-Gru%CC%88nen_Wohlstand_Gerechtigkeit_Frieden_Freiheit.pdf
- https://www.fdp.de/sites/default/files/2024-01/fdp_europawahlprogramm-2024_vorabversion.pdf
- https://www.die-linke.de/fileadmin/user_upload/Europawahlprogramm_2023_neu2.pdf
- [https://www.afd.de/wp-content/uploads/2023/11/2023-11-16-\_-AfD-Europawahlprogramm-2024-\_-web.pdf](https://www.afd.de/wp-content/uploads/2023/11/2023-11-16-_-AfD-Europawahlprogramm-2024-_-web.pdf)

In [6]:
import fitz  # belongs to PyMuPDF


def pdf_to_text(
    pdf_origin_path: str, txt_target_path: str, margins: list, page_range: list = None
) -> None:
    """
    Extracts text from a PDF file and saves it to a UTF-8 encoded .txt file.

    Args:
    pdf_origin_path (str): The path to the PDF file.
    txt_target_path (str): The path to the .txt file where the extracted text will be saved.
    margins (list): The margins of the PDF file (top, right, bottom, left) within which the text will be extracted.
    page_range (list): The range of pages to extract the text from (start, end), starting with 1, both ends inclusive. If None, the entire document will be extracted.

    Raises:
    ValueError: If page_range starts before page 1, ends after the last page, or has its start after its end.
    """

    # Read the margins before opening the PDF so that a malformed margins
    # list fails without leaving a document handle open
    top_margin = margins[0]
    right_margin = margins[1]
    bottom_margin = margins[2]
    left_margin = margins[3]

    # Open the PDF; the try/finally guarantees it is closed even if the
    # range check or the extraction raises
    document = fitz.open(pdf_origin_path)
    try:
        page_count = len(document)

        # Define the page range (1-based, both ends inclusive)
        if page_range is None:
            first_page, last_page = 1, page_count
        else:
            first_page, last_page = page_range[0], page_range[1]
            # Reject out-of-bounds and reversed ranges instead of silently
            # writing an empty file
            if first_page < 1 or last_page > page_count or first_page > last_page:
                raise ValueError("Invalid page range")

        # Collect the text page by page and join once at the end
        page_texts = []
        for page_num in range(first_page - 1, last_page):  # load_page is 0-based

            # Get the page and its dimensions
            page = document.load_page(page_num)
            rect = page.rect

            # Define the viewport excluding margins
            # NOTE(review): assumes the page rectangle starts at (0, 0) - confirm for cropped PDFs
            viewport = fitz.Rect(
                left_margin,
                top_margin,
                rect.width - right_margin,
                rect.height - bottom_margin,
            )

            # Extract text within the viewport
            page_texts.append(page.get_text("text", clip=viewport))
    finally:
        document.close()

    # Write with an explicit encoding: the platform default is not UTF-8
    # everywhere and the manifestos contain non-ASCII characters
    with open(txt_target_path, "w", encoding="utf-8") as f:
        f.write("".join(page_texts))

In [11]:
# One row per manifesto: (source PDF, target .txt, margins, page range).
# Margins are (top, right, bottom, left); page ranges are 1-based and inclusive.
manifesto_jobs = [
    (
        "../data/manifestos/01_pdf_originals/spd_wahlprogramm_europawahl_2024.pdf",
        "../data/manifestos/02_txt_converted/spd_wahlprogramm_europawahl_2024.txt",
        [0, 0, 50, 0],
        [3, 40],
    ),
    (
        "../data/manifestos/01_pdf_originals/linke_wahlprogramm_europawahl_2024.pdf",
        "../data/manifestos/02_txt_converted/linke_wahlprogramm_europawahl_2024.txt",
        [0, 0, 50, 0],
        [5, 96],
    ),
    (
        "../data/manifestos/01_pdf_originals/gruene_wahlprogramm_europawahl_2024.pdf",
        "../data/manifestos/02_txt_converted/gruene_wahlprogramm_europawahl_2024.txt",
        [0, 0, 50, 0],
        [4, 113],
    ),
    (
        "../data/manifestos/01_pdf_originals/fdp_wahlprogramm_europawahl_2024.pdf",
        "../data/manifestos/02_txt_converted/fdp_wahlprogramm_europawahl_2024.txt",
        [0, 0, 60, 0],
        [1, 21],
    ),
    (
        "../data/manifestos/01_pdf_originals/afd_wahlprogramm_europawahl_2024.pdf",
        "../data/manifestos/02_txt_converted/afd_wahlprogramm_europawahl_2024.txt",
        [0, 0, 50, 0],
        [5, 26],
    ),
]

# Convert the manifestos in the order listed above
for source_pdf, target_txt, job_margins, job_pages in manifesto_jobs:
    pdf_to_text(
        pdf_origin_path=source_pdf,
        txt_target_path=target_txt,
        margins=job_margins,
        page_range=job_pages,
    )