# Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import re
import warnings
import pdfplumber
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')
# Display floats with two decimals
pd.set_option('display.float_format', '{:.2f}'.format)

# Utils

In [113]:
# Locate the first row that carries no "NN-NNNNN"-style code
def column_info(tables):
    """Return the index of the first row without any code-like cell.

    A cell is code-like when it contains two digits (not preceded by another
    digit), a hyphen, then one to five digits ending on a word boundary.
    Empty or ``None`` cells are ignored.

    Args:
        tables: Sequence of rows, each row being a sequence of cell strings
            (or ``None``).

    Returns:
        The position of the first row in which no cell matches, or ``None``
        when every row contains at least one matching cell (including when
        ``tables`` is empty).
    """
    code_re = re.compile(r"(?<!\d)\d{2}-\d{1,5}\b")
    for row_pos, row in enumerate(tables):
        # Every cell is inspected; falsy cells are skipped before matching.
        hits = [cell for cell in row if cell and code_re.search(cell)]
        if not hits:
            return row_pos
    return None

# Build the column names from the header rows
def column_names_generate(tables, index_col_table):
    """Build one flat column name per column from the header rows.

    The rows ``tables[0:index_col_table]`` are treated as header rows. For
    each column, its header cells are read top to bottom, newlines are
    replaced by spaces, and the cells are joined with single spaces.

    A column whose top header cell is empty is treated as sitting under a
    header that spans from the column on its left: each empty cell that still
    has a non-missing cell at or below it is filled with the value found at
    the same level in the left-hand column (after that column's own filling).

    Args:
        tables: Sequence of rows, each row being a sequence of cell strings
            (or ``None``).
        index_col_table: Number of leading rows to use as header rows.

    Returns:
        List of column names, stripped of leading and trailing whitespace.
        A column with no header text yields an empty string.
    """
    # Transpose: one list of header cells (top to bottom) per column.
    header_cols = [list(cells) for cells in zip(*tables[0:index_col_table])]

    col_name_list = []
    for index, col in enumerate(header_cols):
        # Column 0 has no left neighbour; indexing `index - 1` there would
        # wrap around and copy header text from the last column.
        if index > 0 and not col[0]:
            left_col = header_cols[index - 1]
            for index_e, e in enumerate(col):
                # Trailing missing cells are left empty so the name does not
                # pick up sub-headers that belong only to the left column.
                rest_is_missing = all(pd.isna(v) for v in col[index_e:])
                if not e and not rest_is_missing:
                    col[index_e] = left_col[index_e]
        col_name_list.append(
            " ".join(x.replace("\n", " ") if x else "" for x in col)
        )
    return [col_name.strip() for col_name in col_name_list]

def flag_depense_recette(df, current_ope):
    """Tag every row as expense or revenue and remove the marker rows.

    Marker rows are the rows whose first column equals "DEPENSES" or
    "RECETTES"; only the first marker of each kind is considered. A new
    column "type_opération" is added to a copy of ``df``:

    * no marker: every row gets ``current_ope``;
    * only "DEPENSES": every row gets "Dépense";
    * only "RECETTES": rows with an index below the marker get "Dépense",
      the others "Recette";
    * both: rows with an index strictly between the two markers get
      "Dépense", the others "Recette".

    Args:
        df: Table to tag; it is not modified.
        current_ope: Operation type carried over from the previous table.

    Returns:
        Tuple ``(tagged_copy, operation)`` where ``operation`` is
        "Dépense" when only "DEPENSES" was found, "Recette" when
        "RECETTES" was found, and ``current_ope`` otherwise.
    """
    flag_col = "type_opération"
    label_dep = "Dépense"
    label_rec = "Recette"

    out = df.copy()
    first_col = out.iloc[:, 0]
    dep_idx = out.index[(first_col == "DEPENSES").to_numpy()]
    rec_idx = out.index[(first_col == "RECETTES").to_numpy()]
    has_dep = len(dep_idx) > 0
    has_rec = len(rec_idx) > 0

    # No marker on this table: keep the operation type seen so far.
    if not has_dep and not has_rec:
        out[flag_col] = current_ope
        return out, current_ope

    # Expense marker only: the whole table is tagged as expense.
    if not has_rec:
        out[flag_col] = label_dep
        return out.drop(index=dep_idx[0]), label_dep

    # A revenue marker is present, with or without an expense marker.
    if has_dep:
        is_expense = (out.index > dep_idx[0]) & (out.index < rec_idx[0])
        marker_labels = [dep_idx[0], rec_idx[0]]
    else:
        is_expense = out.index < rec_idx[0]
        marker_labels = rec_idx[0]
    out[flag_col] = np.where(is_expense, label_dep, label_rec)
    return out.drop(index=marker_labels), label_rec
            
# Settings for table extraction with pdfplumber (passed as `table_settings`)
params = {
    # Both strategies are "explicit"; presumably only the lines listed in
    # explicit_*_lines are then used as cell borders - confirm in pdfplumber docs.
    "vertical_strategy": "explicit",
    "horizontal_strategy": "explicit",
    # Placeholders: the actual lines are supplied per page by
    # main_tome1_annexes_depenses_recettes.
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    # NOTE(review): for snap/join/intersection/text, the generic *_tolerance
    # looks like a fallback for its *_x_/*_y_ variants; since both variants are
    # set explicitly below, the generic values (100, 3, 15, 1) may have no
    # effect - confirm against the pdfplumber version in use.
    "snap_tolerance": 100,
    "snap_x_tolerance": 3,
    "snap_y_tolerance": 3,
    "join_tolerance": 3,
    "join_x_tolerance": 2000,
    "join_y_tolerance": 0,
    "edge_min_length": 3,
    # NOTE(review): the min_words_* settings presumably apply only to the
    # "text" strategy, which is not used here - TODO confirm.
    "min_words_vertical": 3,
    "min_words_horizontal": 3,
    "intersection_tolerance": 15,
    "intersection_x_tolerance": 3,
    "intersection_y_tolerance": 3,
    "text_tolerance": 1,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
}


# Main

## Tome 1 - Annexes hors dettes

### Main function

In [216]:
def main_tome1_annexes_depenses_recettes(folder_path, year, files_dict, min_page, max_page):
    """Extract the expense/revenue annex tables from one Tome 1 PDF.

    Each page whose 0-based index lies in ``range(min_page, max_page)`` is
    read with pdfplumber, using the page's own curves and edges as explicit
    table lines. For every page that yields a table:

    1. a leading "RECETTES"/"DEPENSES" title row is removed;
    2. the header rows are located with ``column_info`` and flattened into
       column names with ``column_names_generate``;
    3. rows are tagged with ``flag_depense_recette``;
    4. pages whose second column is entirely empty are skipped;
    5. all columns from the third one up to (but excluding) the operation-type
       column are converted to floats (spaces and newlines removed, decimal
       comma replaced by a point);
    6. the columns "Année", "Fichier_source" and "Numéro_page" (1-based) are
       inserted at the front.

    Progress is printed for the first page, the last page and every page
    index divisible by 5.

    Args:
        folder_path: Folder containing the PDF.
        year: Key of ``files_dict`` selecting the file; also written to the
            "Année" column.
        files_dict: Mapping from year to PDF file name.
        min_page: 0-based index of the first page to read.
        max_page: 0-based index one past the last page to read.

    Returns:
        dict mapping a page index to a DataFrame. Consecutive kept pages with
        identical columns are concatenated into a single DataFrame stored
        under the index of the last page merged into it.
    """
    path_pdf = os.path.join(folder_path, files_dict[year])

    # Operation type carried across pages until a new marker row is met.
    current_ope = "Dépense"
    final_df = dict()

    with pdfplumber.open(path_pdf) as pdf:
        for page_num in range(min_page, max_page):
            if page_num in (min_page, max_page - 1) or page_num % 5 == 0:
                print(page_num)

            # Build per-page settings instead of mutating the shared
            # module-level `params` dict.
            page = pdf.pages[page_num]
            page_lines = page.curves + page.edges
            table_settings = {
                **params,
                "explicit_vertical_lines": page_lines,
                "explicit_horizontal_lines": page_lines,
            }
            rows = page.extract_table(table_settings=table_settings)
            if not rows:  # Some pages hold no table
                continue

            if rows[0][0] in ("RECETTES", "DEPENSES"):
                rows.pop(0)
                if not rows:  # The title row was the only row
                    continue

            # Number of header rows, then the flattened column names
            col_index = column_info(rows)
            col_names = column_names_generate(rows, col_index)
            df = pd.DataFrame(rows[col_index:], columns=col_names)

            # Tag rows as expense/revenue; the carried state is updated even
            # when the page is skipped just below.
            df, current_ope = flag_depense_recette(df, current_ope)

            if all(df.iloc[:, 1].isna()):
                continue

            # Numeric conversion; the last column (operation type) is excluded.
            for num_col in range(2, df.shape[1] - 1):
                df.iloc[:, num_col] = (
                    df.iloc[:, num_col]
                    .str.replace(" ", "")
                    .str.replace(",", ".")
                    .str.replace("\n", "")
                    .astype(float)
                )

            # Provenance columns; inserted in reverse so the final order is
            # Année, Fichier_source, Numéro_page.
            df.insert(0, "Numéro_page", page_num + 1)
            df.insert(0, "Fichier_source", files_dict[year])
            df.insert(0, "Année", year)

            # Merge with the previous table when the columns are identical.
            # Testing `final_df` (rather than `page_num == min_page`) keeps
            # this safe when the first pages produced no usable table.
            if final_df:
                precedent_key = next(reversed(final_df))
                previous = final_df[precedent_key]
                if df.shape[1] == previous.shape[1] and all(df.columns == previous.columns):
                    merged = pd.concat([final_df.pop(precedent_key), df])
                    final_df[page_num] = merged.reset_index(drop=True)
                    continue
            final_df[page_num] = df
    return final_df

In [None]:
# Folder containing the Tome 1 PDF files
folder_path = os.path.join("sources", "Tome_1")

# Map each year to the file name of its Tome 1 PDF
files_dict = {
    2023: "CA 2023 - Budget général Compte sur chiffres et annexes Tome 1.pdf",
    2022: "02 - CA 2022 Budget général - Compte sur chiffres et annexes - Tome 1.pdf",
    2021: "CA 2021 - Tome 1.pdf",
    2020: "04 Document budgétaire (Tome 1).pdf",
    2019: "04 - Document budgétaire (Tome 1.1).pdf"
}

### Extraction

In [214]:
# Extract the 2023 tables from 0-based page indices 159 to 246
year = 2023
tome1_2023 = main_tome1_annexes_depenses_recettes(folder_path, year, files_dict, min_page = 159, max_page = 247)

159
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245


In [217]:
# Extract the 2022 tables from 0-based page indices 110 to 154
year = 2022
tome1_2022 = main_tome1_annexes_depenses_recettes(folder_path, year, files_dict, min_page = 110, max_page = 155)

110
115
120
125
130
135
140
145
150
154
