In [1]:
# --- Load Libraries ---
import numpy as np
import pandas as pd
import os
from mistralai import Mistral
import re
import spacy
from openai import OpenAI
import time
from IPython.display import Markdown, display

In [None]:
# --- Setup ---
## Working directory
## Fill in the project folder. An empty value keeps the current directory
## (calling os.chdir("") would raise FileNotFoundError).
work_dir = ""
if work_dir:
    os.chdir(work_dir)

## API key Mistral
## Read from the environment so the key is not stored in the notebook;
## falls back to an empty string when the variable is not set.
api_key = os.environ.get("MISTRAL_API_KEY", "")
client = Mistral(api_key=api_key)

## API key for GPT (read the same way; used when the OpenAI client is built)
api_key_gpt = os.environ.get("OPENAI_API_KEY", "")

## Folder of data
folder = "1. manifesto_original"

# Choose language
## Get only files that end with "_<language>.pdf"
## (replace <language> with the language code used in the file names)
files = sorted(
    f for f in os.listdir(folder)
    if f.endswith("_<language>.pdf")
)

## Fail with a clear message instead of a bare IndexError when nothing matches
if not files:
    raise FileNotFoundError(
        f"No file ending with '_<language>.pdf' found in '{folder}'"
    )

## Only the first matching file (alphabetical order) is processed
filename = files[0]
filepath = os.path.join(folder, filename)

  os.chdir("c:\\Users\\IPP\\\Dropbox\\Test Quebec")


In [3]:
# --- Upload PDF ---
## Open the PDF inside a context manager so the file handle is closed as soon
## as the upload call returns (a bare open() would leave the handle open).
with open(filepath, "rb") as pdf_file:
    uploaded_pdf = client.files.upload(
        file={
            "file_name": filename,
            "content": pdf_file,
        },
        purpose="ocr"
    )

## Get uploaded file id
file_id = uploaded_pdf.id

In [4]:
# --- Get PDF url ---
## Ask for a signed URL of the uploaded file and keep its address for the OCR call
signed_url = client.files.get_signed_url(
    file_id=uploaded_pdf.id,
)
file_url = signed_url.url

In [5]:
# --- Extract text from PDF url ---
## Send the signed URL to the OCR model; base64 images are not requested
ocr_result = client.ocr.process(
    model="mistral-ocr-latest",
    document=dict(type="document_url", document_url=file_url),
    include_image_base64=False,
)

In [None]:
# ---- Transform to DF ----
## Region, year, party and language still have to be added (not done in this cell)
### Join the markdown of every OCR page, separated by a blank line
all_text = "\n\n".join(page.markdown for page in ocr_result.pages)

## Save markdown
### Specify output folder and filename
### (splitext swaps only the extension, never a ".pdf" inside the name)
out_folder = "2. manifesto_ocr_md"
md_filename = os.path.splitext(filename)[0] + ".md"
md_path = os.path.join(out_folder, md_filename)

### Create the output folder if needed so the write cannot fail on a missing directory
os.makedirs(out_folder, exist_ok=True)

## Display markdown (Optional)
#display(Markdown(all_text))

### Write to file
with open(md_path, "w", encoding="utf-8") as f:
    f.write(all_text)

## Transform to DF (one row holding the whole document)
df_doc = pd.DataFrame([{
    "filename": filename,
    "text": all_text
}])

In [7]:
# ---- Clean text ----
## Every step is a regex substitution applied in the listed order; the only
## non-regex step (trimming) sits between the two lists.
steps_before_trim = [
    ## Remove titles (markdown headings).
    ## [ \t]+ instead of \s+: \s can cross a line break, so a heading marker
    ## with no text after it would otherwise delete the following line too.
    (r'(?m)^#+[ \t]+.*$', ''),
    ## Remove single words and lines with 3 or less words
    ## (\s* may span line breaks, so consecutive short lines go together)
    (r'(?m)^(?:\S+\s*){1,3}$', ''),
    ## Remove images: ![alt](target)
    (r"!\[.*?\]\(.*?\)", ""),
    ## Remove every dollar sign (this drops the $...$ math wrappers)
    (r"\$", ""),
    ## Replace LaTeX \% with %
    (r"\\%", "%"),
    ## Remove stray backslashes
    (r"\\", ""),
    ## Remove numbers
    (r"\d+", ""),
    ## Other symbols
    ## Keep only letters, digits, accented letters in À-ÿ (without × and ÷),
    ## the punctuation . , ; : ! ? ' ( ) - and whitespace; drop everything else
    ## (@, #, &, *, +, =, <, >, [, ], {, }, |, ~, ^, _, emojis, ©, ®, €, £, ...)
    (r"[^a-zA-Z0-9À-ÖØ-öø-ÿ.,;:!?'()\-\s]", ""),
    ## Collapse runs of two or more whitespace characters into one space
    (r"\s{2,}", " "),
]

steps_after_trim = [
    ## Remove the remaining (single) line breaks
    (r"(?m)\n", " "),
    ## Collapse repeated punctuation: the same token among ".%", "()", ".",
    ## ",", "-", "(" or ")" repeated (optionally separated by whitespace)
    ## is reduced to a single occurrence
    (r"(\.\%|\(\)|[.,\-()])(?:\s*\1){1,}", r"\1"),
]

text_clean = df_doc["text"]
for regex, repl in steps_before_trim:
    text_clean = text_clean.str.replace(regex, repl, regex=True)

## Trim leading/trailing spaces
text_clean = text_clean.str.strip()

for regex, repl in steps_after_trim:
    text_clean = text_clean.str.replace(regex, repl, regex=True)

df_doc["text_clean"] = text_clean

In [8]:
# ---- Save cleaned text to file ----
out_folder = "3. manifesto_ocr_txt_clean"
## Create the output folder if needed so the write cannot fail on a missing directory
os.makedirs(out_folder, exist_ok=True)

## Build the output name from the first (only) row; splitext swaps just the extension
txt_filename = os.path.splitext(df_doc["filename"].iloc[0])[0] + "_txt_clean.txt"
txt_path = os.path.join(out_folder, txt_filename)

## Write the cleaned text of that row as UTF-8
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(df_doc["text_clean"].iloc[0])

In [9]:
# ---- Add Region, Year and Party ----
## Expected file name: <UPPERCASE>_<4 digits>_<UPPERCASE>_<lowercase>.pdf
pattern = r"^([A-Z]+)_([0-9]{4})_([A-Z]+)_([a-z]+)\.pdf$"
match = re.match(pattern, filename)

## Stop here when the file name cannot be parsed
if match is None:
    raise ValueError("Filename does not match expected pattern")

region, year, party, language = match.groups()

## Store each captured part (kept as a string) in its own column
for column, value in (
    ("region", region),
    ("year", year),
    ("party", party),
    ("language", language),
):
    df_doc[column] = value

In [None]:
# ---- Divide DF by lines using spaCy----
## Load the spaCy pipeline a single time so every call below reuses it
## (replace "<language>" with the pipeline to load)
nlp = spacy.load("<language>")

## Divide lines
def split_sentences(text):
    """Return the sentences spaCy finds in `text`, stripped, without empty ones."""
    stripped = (sent.text.strip() for sent in nlp(text).sents)
    return [sentence for sentence in stripped if sentence]

## One row per sentence.
## A document with no sentences explodes to a single missing value; dropna
## removes it so the word-count steps below never see a non-string.
df_lines = (
    df_doc
    .assign(line=df_doc["text_clean"].apply(split_sentences))
    .explode("line")
    .dropna(subset=["line"])
    .reset_index(drop=True)
)

## Trim leading/trailing spaces
df_lines["line"] = df_lines["line"].str.strip()

## Remove sentences with 3 or fewer words
## (blank sentences have zero words, so they are removed here as well)
word_counts = df_lines["line"].str.split().str.len()
df_lines = df_lines[word_counts > 3].reset_index(drop=True)

## Add number of words column
df_lines["num_words"] = df_lines["line"].str.split().str.len()

In [11]:
# ---- Save dataframe before translate ----
## Select columns
df_lines = df_lines[["filename", "region","year","party","language","line","num_words"]]

## Create the output folder if needed so to_csv cannot fail on a missing directory
df_folder = "4. manifesto_df"
os.makedirs(df_folder, exist_ok=True)

## File name without its extension (splitext never touches a ".pdf" inside the name)
base_name = os.path.splitext(filename)[0]
df_lines.to_csv(os.path.join(df_folder, f"{base_name}_df.csv"), index=False)

In [None]:
# ---- Translate code set up ----
## Prepare the dataframe to translate: number the rows from 1, then keep only
## the id and the text of the rows that have a line
df_lines["id_translate"] = range(1, df_lines.shape[0] + 1)
has_line = df_lines["line"].notna()
translate = df_lines.loc[has_line, ["id_translate", "line"]].copy()

## OpenAI client built with the GPT API key
## NOTE: this rebinds `client`, which referred to the Mistral client until here
client = OpenAI(api_key=api_key_gpt)

In [None]:
# ---- Loop to translate each row ----
results = []

## One chat-completion request per line; a failed request is reported and
## stored as None so the loop keeps going
for line_id, original_text in zip(translate["id_translate"], translate["line"]):
    messages = [
        {"role": "system", "content": "You are a translation assistant."},
        {"role": "user", "content": f"Translate this text to English:\n{original_text}"},
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        translated = response.choices[0].message.content
    except Exception as e:
        print(f"Error at row {line_id}: {e}")
        translated = None

    results.append({"id_translate": line_id, "translated_text": translated})

    ## Short pause between two requests
    time.sleep(0.2)

## Convert results to dataframe
translated_df = pd.DataFrame(results)

## Merge back into main df
## Left merge keeps every sentence, including those whose translation failed.
## Only the helper id is dropped: "line" must stay because the original
## sentence is selected next to its translation when the final columns are
## chosen (dropping it made that selection fail with a KeyError).
df_final = (
    df_lines
    .merge(translated_df, on="id_translate", how="left")
    .drop(columns=["id_translate"])
)

In [None]:
# ---- Select columns & save it ----
## Select columns
## NOTE(review): this selection expects df_final to contain "line" - verify
## that the merge step does not drop it.
df_final = df_final[["filename", "region","year","party","language","line","translated_text","num_words"]]

## Save
## Create the output folder if needed so to_csv cannot fail on a missing directory
translated_folder = "5. manifesto_translated"
os.makedirs(translated_folder, exist_ok=True)

## File name without its extension (splitext never touches a ".pdf" inside the name)
base_name = os.path.splitext(filename)[0]
df_final.to_csv(
    os.path.join(translated_folder, f"{base_name}_df_translated.csv"),
    index=False,
)