# Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import re
import warnings
import pdfplumber
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

# Utils

In [105]:
# Fonction pour générer les noms de colonnes
def column_names_generate(tables, index_col_table):
    """Build one flat column name per table column from the header rows.

    The first ``index_col_table`` rows of ``tables`` are treated as header
    rows. They are transposed so that the header cells of each column can be
    joined, top to bottom, into a single space-separated name (newlines inside
    a cell are replaced by spaces, surrounding whitespace is stripped).

    When the top header cell of a column is empty, every empty cell of that
    column that still has a non-missing cell at or below it is filled with the
    cell found at the same level in the column to its left. Because columns
    are processed left to right, a group header is thereby repeated over all
    of its sub-columns.

    Parameters
    ----------
    tables : list of list
        Rows of one extracted table; cells are strings or None.
    index_col_table : int
        Number of leading rows that make up the header.

    Returns
    -------
    list of str
        One name per table column, in column order.
    """
    header_columns = [list(cells) for cells in zip(*tables[:index_col_table])]
    names = []
    for position, cells in enumerate(header_columns):
        # The leftmost column has no left neighbour to inherit from. Without
        # the `position > 0` guard, index -1 would silently borrow the header
        # of the LAST column.
        if position > 0 and not cells[0]:
            left_cells = header_columns[position - 1]
            for level, cell in enumerate(cells):
                rest_is_missing = all(pd.isna(value) for value in cells[level:])
                if not cell and not rest_is_missing:
                    cells[level] = left_cells[level]
        parts = [part.replace("\n", " ") if part else "" for part in cells]
        names.append(" ".join(parts).strip())
    return names

def correct_bold_text_double(df):
    """Return a copy of ``df`` with doubled characters collapsed.

    Rows are selected when the cell at column position 5 contains ",,"
    (the original comments describe this as bold text whose characters were
    extracted twice, and column 5 as the first numeric column). In those rows,
    every string cell from position 5 onwards loses its spaces and keeps every
    second character. ``df`` itself is not modified.
    """
    return _collapse_repeated_chars(df, repeat=2)


def correct_bold_text_triple(df):
    """Return a copy of ``df`` with tripled characters collapsed.

    Same as ``correct_bold_text_double`` but the marker is ",,," and every
    third character is kept. ``df`` itself is not modified.
    """
    return _collapse_repeated_chars(df, repeat=3)


def _collapse_repeated_chars(df, repeat, first_col=5):
    """Collapse ``repeat``-fold duplicated characters in the affected rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least ``first_col + 1`` columns.
    repeat : int
        How many times each character is assumed to be duplicated; also the
        length of the comma run used to detect an affected row.
    first_col : int, default 5
        Position of the probe column and of the first column to correct.

    Returns
    -------
    pandas.DataFrame
        A corrected copy of ``df``.
    """
    fixed = df.copy()
    marker = "," * repeat
    probe = fixed.iloc[:, first_col]
    # enumerate() yields positions, so the .iloc writes below are correct for
    # any index, not only a default RangeIndex. Non-string cells (None, NaN)
    # can never carry the marker and are skipped instead of raising.
    positions = [
        pos for pos, cell in enumerate(probe)
        if isinstance(cell, str) and marker in cell
    ]
    for pos in positions:
        fixed.iloc[pos, first_col:] = [
            cell.replace(" ", "")[::repeat] if isinstance(cell, str) else cell
            for cell in fixed.iloc[pos, first_col:]
        ]
    return fixed
    
# Table-extraction settings handed to pdfplumber through `table_settings=`.
# The two explicit_*_lines entries are placeholders: main_tome2 provides the
# actual lines for every page it processes.
params = dict(
    vertical_strategy="explicit",
    horizontal_strategy="explicit",
    explicit_vertical_lines=[],
    explicit_horizontal_lines=[],
    # Snap / join tolerances
    snap_tolerance=100,
    snap_x_tolerance=3,
    snap_y_tolerance=3,
    join_tolerance=3,
    join_x_tolerance=2000,
    join_y_tolerance=0,
    # Edge and word thresholds
    edge_min_length=3,
    min_words_vertical=3,
    min_words_horizontal=3,
    # Intersection tolerances
    intersection_tolerance=15,
    intersection_x_tolerance=3,
    intersection_y_tolerance=3,
    # Text tolerances
    text_tolerance=1,
    text_x_tolerance=3,
    text_y_tolerance=3,
)

# Main

## Tome 2 - Etat des emprunts

### Main function

In [114]:
def main_tome2(folder_path, year, files_dict, min_page, max_page):
    """Extract the tables of a page range of one PDF into one cleaned DataFrame.

    For every page index in ``range(min_page, max_page)`` the page's table is
    extracted with pdfplumber, using the module-level ``params`` completed
    with the page's own curves and edges as explicit lines. The first two
    extracted rows are turned into column names by ``column_names_generate``,
    the remaining rows become data, rows with tripled/doubled characters are
    corrected, and a "Numéro_page" column (page index + 1) is prepended.
    Pages on which no table is found are skipped.

    After all pages are concatenated, the columns are cleaned **by position**
    (position 0 being "Numéro_page"): positions 1, 4 and 5 get text clean-up,
    positions 6, 7, 8, 12, 15, 18 and 19 are converted to float (spaces and
    newlines removed, decimal comma turned into a dot). Column names are
    cleaned as well and a "Fichier_source" column is prepended.

    Parameters
    ----------
    folder_path : str
        Directory that contains the PDF.
    year : hashable
        Key into ``files_dict``.
    files_dict : dict
        Maps a year to the PDF file name.
    min_page : int
        First zero-based page index to process.
    max_page : int
        Zero-based page index at which to stop (exclusive).

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with a fresh 0..n-1 index. If no table was found on
        any page, an empty DataFrame with only the "Fichier_source" column.
    """
    source_file = files_dict[year]
    path_pdf = os.path.join(folder_path, source_file)

    header_rows = 2  # number of leading table rows that hold the column headers
    page_frames = []

    with pdfplumber.open(path_pdf) as pdf:
        for page_num in range(min_page, max_page):
            # Light progress trace: first page, last page and every tenth page.
            if page_num in (min_page, max_page - 1) or page_num % 10 == 0:
                print(page_num)

            page = pdf.pages[page_num]
            page_lines = page.curves + page.edges
            # Build the settings per page instead of writing into the shared
            # module-level `params`, so no state leaks between calls or cells.
            table_settings = {
                **params,
                "explicit_vertical_lines": page_lines,
                "explicit_horizontal_lines": page_lines,
            }
            rows = page.extract_table(table_settings=table_settings)
            if not rows:
                continue

            col_names = column_names_generate(rows, header_rows)
            df = pd.DataFrame(rows[header_rows:], columns=col_names)

            # Triple first: a run of three commas also contains a run of two,
            # so the double correction must only see what the triple one left.
            df = correct_bold_text_triple(df)
            df = correct_bold_text_double(df)

            df.insert(0, "Numéro_page", page_num + 1)
            page_frames.append(df)

    if not page_frames:
        # Nothing extracted: return an empty result rather than failing on the
        # positional clean-up below.
        return pd.DataFrame(columns=["Fichier_source"])

    # Single concat (instead of one per page) with a fresh 0..n-1 index.
    final_df = pd.concat(page_frames, ignore_index=True)

    text_col = [1, 4, 5]
    num_col = [6, 7, 8, 12, 15, 18, 19]
    for col in text_col:
        final_df.iloc[:, col] = _replace_literals(final_df.iloc[:, col], _CELL_REPLACEMENTS)
    for col in num_col:
        final_df.iloc[:, col] = final_df.iloc[:, col].apply(_to_float)
    final_df.columns = _replace_literals(final_df.columns, _HEADER_REPLACEMENTS)

    final_df.insert(0, "Fichier_source", source_file)
    return final_df


# Literal substitutions applied to the text cells (mis-decoded characters and
# "(cid:N)" placeholders found in the extracted text).
_CELL_REPLACEMENTS = (
    ("\n", " "),
    ("(cid:176)", "°"),
    ("Ø", "é"),
    ("(cid:244)", "ô"),
    ("(cid:224)", "à"),
    ("(cid:226)", "â"),
    ("(cid:231)", "ç"),
    ("Ł", "è"),
)

# Literal substitutions applied to the column names.
# NOTE(review): the last three entries are applied to headers only, not to
# text cells — confirm whether text cells should get them as well.
_HEADER_REPLACEMENTS = (
    ("(cid:176)", "°"),
    ("Ø", "é"),
    ("(cid:244)", "ô"),
    ("(cid:224)", "à"),
    ("(cid:226)", "â"),
    ("(cid:231)", "ç"),
    ("Ł", "è"),
    ("(cid:146)", "'"),
    ("Œ", "ê"),
    ("ß", "û"),
)


def _replace_literals(strings, replacements):
    """Apply ``(old, new)`` substitutions in order to a Series or Index of strings.

    ``regex=False`` is passed explicitly: patterns such as "(cid:176)" contain
    parentheses, which a regex interpretation would treat as a group and
    therefore leave the literal "(" and ")" in the text.
    """
    for old, new in replacements:
        strings = strings.str.replace(old, new, regex=False)
    return strings


def _to_float(cell):
    """Convert a number written with a decimal comma to float.

    Spaces and newlines are removed first. Empty strings and non-string cells
    (None, NaN) are returned unchanged.
    """
    if not isinstance(cell, str) or not cell:
        return cell
    return float(cell.replace(" ", "").replace(",", ".").replace("\n", ""))

### Params

In [27]:
# Folder that contains the PDF files (main_tome2 joins it with a file name)
folder_path = os.path.join("sources", "Tome_2")

# PDF file name for each year
files_dict = {
    2023: "CA 2023 - Budget général Compte sur chiffres et annexes Tome 2.pdf",
    2022: "03 - CA 2022 Budget général - Compte sur chiffres et annexes - Tome 2.pdf",
    2021: "CA 2021 - Tome 2.pdf",
    2020: "CA 2020 Document budgétaire (Tome 2.1).pdf",
    2019: "04 - Document budgétaire (Tome 1.2).pdf"
}

### Extraction

In [29]:
# Results of the extraction cells below, keyed by a label that includes the year
all_df = {}

In [46]:
# 2023 file: zero-based page indices 6 to 545 (max_page is exclusive).
year = 2023
all_df[f"Tome 2_{year}_Etat_emprunts_garantis"] = main_tome2(
    folder_path,
    year,
    files_dict,
    min_page=6,
    max_page=546,
)

6
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
545


In [58]:
# 2022 file: zero-based page indices 43 to 45 (max_page is exclusive).
# NOTE(review): only 3 pages are processed here, versus several hundred for
# the other years — confirm this range is intentional.
year = 2022
all_df[f"Tome 2_{year}_Etat_emprunts_garantis"] = main_tome2(
    folder_path,
    year,
    files_dict,
    min_page=43,
    max_page=46,
)

43
45


In [118]:
# 2021 file: zero-based page indices 8 to 434 (max_page is exclusive).
year = 2021
all_df[f"Tome 2_{year}_Etat_emprunts_garantis"] = main_tome2(
    folder_path,
    year,
    files_dict,
    min_page=8,
    max_page=435,
)

8
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
434


In [119]:
# 2020 file: zero-based page indices 9 to 386 (max_page is exclusive).
year = 2020
all_df[f"Tome 2_{year}_Etat_emprunts_garantis"] = main_tome2(
    folder_path,
    year,
    files_dict,
    min_page=9,
    max_page=387,
)

9
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
386


In [120]:
# 2019 file: zero-based page indices 0 to 416 (max_page is exclusive).
year = 2019
all_df[f"Tome 2_{year}_Etat_emprunts_garantis"] = main_tome2(
    folder_path,
    year,
    files_dict,
    min_page=0,
    max_page=417,
)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
416


### Stacking

In [159]:
# Reshape every DataFrame to long format: the five identifier columns are
# moved to the index, the remaining columns are stacked into
# ("Nom_Colonne", "Valeur") pairs, and all results are appended to df_all_stack.
#
# NOTE(review): the loops call .items() on each value of all_df and then
# .set_index(...) on each yielded item, i.e. they expect all_df[key] to be a
# mapping whose values are DataFrames. The extraction cells of this notebook
# store the return value of main_tome2 (a single DataFrame) under each key, so
# the two do not appear to match — confirm which structure is intended.
# NOTE(review): main_tome2 itself only adds "Fichier_source" and "Numéro_page";
# whether "Année", "Fonction", "Section" and "Libellé" exist depends on the
# extracted PDF headers — verify before running on a fresh kernel.
df_all_stack = pd.DataFrame()
for file_name, dict_year in all_df.items():
    for _, df in dict_year.items():
        # "level_5" is the name the stacked level receives after reset_index
        # (it follows the five named index levels); 0 is the stacked values column.
        df_stack = pd.DataFrame(
            df.set_index(["Année","Fichier_source","Fonction","Section","Libellé"]
                          )\
            .stack()
        )\
        .reset_index()\
        .rename(
            columns={
                "level_5":"Nom_Colonne", 
                0:"Valeur"
            }
        )
        df_all_stack = pd.concat([df_all_stack, df_stack])
# Replace the repeated per-frame indices with a single 0..n-1 index
df_all_stack.reset_index(drop=True, inplace=True)
df_all_stack.head()

Unnamed: 0,Année,Fichier_source,Fonction,Section,Libellé,Nom_Colonne,Valeur
0,2023,CA 2023 - Budget général Compte sur chiffres e...,FONCTION 0 – Services généraux,Section d'investissement,"Subv. non transf. Etat, établ. nationaux",Numéro_page,111.0
1,2023,CA 2023 - Budget général Compte sur chiffres e...,FONCTION 0 – Services généraux,Section d'investissement,"Subv. non transf. Etat, établ. nationaux",Article / compte nature (1),1321.0
2,2023,CA 2023 - Budget général Compte sur chiffres e...,FONCTION 0 – Services généraux,Section d'investissement,"Subv. non transf. Etat, établ. nationaux",90-02 Administration générale 90-020 Admin. gé...,0.0
3,2023,CA 2023 - Budget général Compte sur chiffres e...,FONCTION 0 – Services généraux,Section d'investissement,"Subv. non transf. Etat, établ. nationaux",90-02 Administration générale 90-021 Personnel...,0.0
4,2023,CA 2023 - Budget général Compte sur chiffres e...,FONCTION 0 – Services généraux,Section d'investissement,"Subv. non transf. Etat, établ. nationaux",90-02 Administration générale 90-022 Informati...,0.0


### Excel extraction

In [160]:
# Write one workbook per entry of all_df.
for file_name, frames in all_df.items():
    # An entry may be a single DataFrame (what main_tome2 returns) or a mapping
    # of sheet name -> DataFrame. Calling .items() on a DataFrame would iterate
    # its columns and write one sheet per column, so wrap it in a
    # one-sheet mapping instead.
    sheets = {"Data": frames} if isinstance(frames, pd.DataFrame) else frames
    with pd.ExcelWriter(f"{file_name}.xlsx", engine='xlsxwriter') as writer:
        for sheet_name, df in sheets.items():
            df.to_excel(writer, sheet_name=str(sheet_name), index=False)

In [161]:
# Export the stacked long-format table to a single-sheet workbook.
# NOTE(review): the file name says "Tome 1" while the keys built in this
# notebook start with "Tome 2" — confirm the intended output name.
sheet_name = "Data"
with pd.ExcelWriter("Tome 1_2019-2023.xlsx", engine="xlsxwriter") as writer:
    df_all_stack.to_excel(writer, index=False, sheet_name=sheet_name)