# Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import re
import warnings
import pdfplumber
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages — confirm that is intended.
warnings.filterwarnings('ignore')
# Show floats with two decimals whenever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

# Utils

In [92]:
# Fonction pour obtenir les informations des colonnes
def column_info(tables):
    pattern = r"(?<!\d)\d{2}-\d{1,5}\b"
    for index, list_info in enumerate(tables):
        matches_list = []
        for element in list_info:
            if element:
                matches = re.findall(pattern, element)
                if len(matches):
                    matches_list.append(element)
        if not len(matches_list):
            return index

def column_names_generate(tables, index_col_table):
    """Build one flat column name per column from the stacked header rows.

    The first ``index_col_table`` rows of ``tables`` are treated as header
    rows. For each column, the header cells are joined top to bottom with a
    space, after replacing line breaks inside a cell with a space.

    When the top header cell of a column is empty, the column is assumed to
    sit under a header spanning several columns: each empty cell is filled
    with the cell at the same level of the column on its left, as long as
    some later level of the current column is not missing (``None``/NaN).
    The left column has already been filled at that point, so a spanning
    header propagates across any number of sub-columns.

    The first column has no left neighbour and is never filled. Previously
    the lookup wrapped around to the last column because of the ``-1`` index.

    Args:
        tables: Rows of the extracted table (lists of ``str`` or ``None``).
        index_col_table: Number of leading rows that make up the header.

    Returns:
        A list of column names, stripped of leading and trailing whitespace.
    """
    # Transpose the header rows: one list of header cells per column.
    header_columns = [list(cells) for cells in zip(*tables[:index_col_table])]

    col_name_list = []
    for position, cells in enumerate(header_columns):
        # Repeat the spanning header / sub-header names from the left column.
        if position > 0 and not cells[0]:
            left_cells = header_columns[position - 1]
            for level, cell in enumerate(cells):
                if cell:
                    continue
                # Only fill when something below (or this level itself) is
                # not missing; a fully missing tail is left untouched.
                tail_is_missing = all(pd.isna(value) for value in cells[level:])
                if not tail_is_missing:
                    cells[level] = left_cells[level]
        joined = " ".join(
            cell.replace("\n", " ") if cell else "" for cell in cells
        )
        col_name_list.append(joined.strip())
    return col_name_list

def lambda_correct(text, repeat_n):
    """Keep one character out of every ``repeat_n``, starting with the first.

    Args:
        text: String whose characters are each repeated ``repeat_n`` times.
        repeat_n: Step between the characters that are kept.

    Returns:
        The string made of ``text[0]``, ``text[repeat_n]``, ``text[2 * repeat_n]``...
    """
    return "".join(text[position] for position in range(0, len(text), repeat_n))


def _collapse_repeated_chars(df, repeat_n):
    """Return a copy of ``df`` with repeated characters collapsed in flagged columns.

    A column is flagged when at least one of its string cells contains a run
    of ``repeat_n`` commas. Every string cell of a flagged column is then
    passed through ``lambda_correct``. Non-string cells (``None``, NaN,
    numbers) are ignored during detection and left untouched, instead of
    raising ``TypeError``.
    """
    marker = "," * repeat_n
    result = df.copy()
    flagged_columns = [
        col
        for col in result.columns
        if any(isinstance(cell, str) and marker in cell for cell in result[col])
    ]
    # NOTE(review): the correction is applied to every string cell of a flagged
    # column, including cells that do not contain the marker — confirm that
    # whole columns are always affected together.
    for col in flagged_columns:
        result[col] = result[col].apply(
            lambda cell: lambda_correct(cell, repeat_n) if isinstance(cell, str) else cell
        )
    return result


def correct_bold_text_double(df):
    """Collapse doubled characters in every column holding a ``",,"`` cell.

    Args:
        df: DataFrame of extracted cells; it is not modified.

    Returns:
        A corrected copy of ``df``.
    """
    return _collapse_repeated_chars(df, 2)


def correct_bold_text_triple(df):
    """Collapse tripled characters in every column holding a ``",,,"`` cell.

    Args:
        df: DataFrame of extracted cells; it is not modified.

    Returns:
        A corrected copy of ``df``.
    """
    return _collapse_repeated_chars(df, 3)
    
def flag_depense_recette(df, current_ope):
    """Label each row as expense or revenue from the marker rows of ``df``.

    Marker rows are the rows whose first column equals exactly ``"DEPENSES"``
    or ``"RECETTES"``; only the first marker of each kind is used. The label
    is written to a ``"type_opération"`` column and the marker rows that were
    used are dropped. Positions are compared through index labels.

    * Only ``"DEPENSES"``: every row is ``"Dépense"``.
    * Both markers: rows strictly between the two markers are ``"Dépense"``,
      every other row (including rows before ``"DEPENSES"``) is ``"Recette"``.
    * Only ``"RECETTES"``: rows before the marker are ``"Dépense"``, the
      others are ``"Recette"``.
    * No marker: every row receives ``current_ope``.

    Args:
        df: Input table; it is not modified.
        current_ope: Label carried over from the previously processed table.

    Returns:
        ``(labelled_copy, operation)`` where ``operation`` is the label to
        carry over to the next table.
    """
    label_col = "type_opération"
    expense, revenue = "Dépense", "Recette"

    out = df.copy()
    first_column = out.iloc[:, 0]
    expense_marks = out.index[(first_column == "DEPENSES").to_numpy()]
    revenue_marks = out.index[(first_column == "RECETTES").to_numpy()]
    has_expense = len(expense_marks) > 0
    has_revenue = len(revenue_marks) > 0

    # No marker on this table: keep the label inherited from the caller.
    if not has_expense and not has_revenue:
        out[label_col] = current_ope
        return out, current_ope

    # Only the expense marker: the whole table is expenses.
    if not has_revenue:
        out[label_col] = expense
        return out.drop(index=expense_marks[0]), expense

    # A revenue marker exists, with or without an expense marker before it.
    revenue_start = revenue_marks[0]
    if has_expense:
        expense_start = expense_marks[0]
        is_expense = (out.index > expense_start) & (out.index < revenue_start)
        markers = [expense_start, revenue_start]
    else:
        is_expense = out.index < revenue_start
        markers = [revenue_start]
    out[label_col] = np.where(is_expense, expense, revenue)
    return out.drop(index=markers), revenue
            
# Settings for the pdfplumber table extraction.
# The dict is passed as `table_settings` to `page.extract_table` in
# `main_tome1_dettes`, which overwrites the two `explicit_*_lines` entries
# for every page with that page's curves and edges.
params = {
    "vertical_strategy": "explicit",
    "horizontal_strategy": "explicit",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 100,
    "snap_x_tolerance": 3,
    "snap_y_tolerance": 3,
    "join_tolerance": 3,
    "join_x_tolerance": 2000,
    "join_y_tolerance": 0,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 3,
    "intersection_tolerance": 15,
    "intersection_x_tolerance": 3,
    "intersection_y_tolerance": 3,
    "text_tolerance": 1,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
}

# Short section key -> full French section label.
# NOTE(review): not referenced anywhere in the visible part of this file.
section_dict = {
    "Invest": "Section d'investissement",
    "Fonct": "Section de fonctionnement"
}

# Main

## Tome 1 - Annexes hors dettes

### Main function

In [88]:
# Literal (old, new) substitutions applied, in order, to the text columns.
_TEXT_CELL_REPLACEMENTS = (
    ("\n", " "),
    ("(cid:176)", "°"),
    ("Ø", "é"),
    ("(cid:244)", "ô"),
    ("(cid:224)", "à"),
    ("(cid:226)", "â"),
    ("(cid:231)", "ç"),
    ("Ł", "è"),
)

# Literal (old, new) substitutions applied, in order, to the column names.
_COLUMN_NAME_REPLACEMENTS = (
    ("(cid:176)", "°"),
    ("Ø", "é"),
    ("(cid:244)", "ô"),
    ("(cid:224)", "à"),
    ("(cid:226)", "â"),
    ("(cid:231)", "ç"),
    ("Ł", "è"),
    ('(cid:146)', "'"),
    ("Œ", "ê"),
    ("ß", "û"),
)


def _replace_literals(strings, replacements):
    """Apply (old, new) literal substitutions in order to a string Series or Index."""
    for old, new in replacements:
        # regex=False is explicit: before pandas 2.0 the default was regex=True,
        # which would read "(cid:176)" as a capture group and keep the parentheses.
        strings = strings.str.replace(old, new, regex=False)
    return strings


def _parse_amount(cell):
    """Convert a number written with spaces and a decimal comma to ``float``.

    Empty strings and non-string cells (``None``, NaN) are returned unchanged.
    """
    if not isinstance(cell, str) or not cell:
        return cell
    return float(cell.replace(" ", "").replace(",", ".").replace("\n", ""))


def main_tome1_dettes(folder_path, year, files_dict, min_page, max_page, text_col, num_col, col_index=3):
    """Extract the tables of a page range of one PDF into a single DataFrame.

    For every page, the table is extracted with pdfplumber using the page's
    own curves and edges as explicit lines. The first ``col_index`` rows
    become the column names, bold-text artefacts are corrected, and a
    ``"Numéro_page"`` column (1-based) is inserted in first position. The
    pages are then concatenated and cleaned.

    Args:
        folder_path: Folder containing the PDF.
        year: Key into ``files_dict``; also written to the ``"Année"`` column.
        files_dict: Mapping from year to PDF file name.
        min_page: First page to read, as a zero-based index into ``pdf.pages``.
        max_page: Zero-based page index at which to stop (excluded).
        text_col: Positions of the text columns to clean. Positions are
            counted with ``"Numéro_page"`` at position 0 and before
            ``"Fichier_source"`` / ``"Année"`` are added.
        num_col: Positions (same convention) of the columns to convert to
            ``float``.
        col_index: Number of leading table rows that form the header.

    Returns:
        The cleaned DataFrame with ``"Année"`` and ``"Fichier_source"`` as
        first columns and a fresh 0..n-1 index.

    Raises:
        IndexError: If a position in ``text_col`` / ``num_col`` does not
            exist, e.g. when no page yielded a table.
    """
    path_pdf = os.path.join(folder_path, files_dict[year])

    # One DataFrame per page; concatenated once after the loop.
    page_frames = []

    with pdfplumber.open(path_pdf) as pdf:
        for page_num in range(min_page, max_page):
            # Progress trace: first page, last page and every page index divisible by 5.
            if page_num in (min_page, max_page - 1) or page_num % 5 == 0:
                print(page_num)

            page = pdf.pages[page_num]
            # Per-page copy of the shared settings, so the module-level
            # `params` dict is not mutated and keeps no reference to the page.
            table_settings = {
                **params,
                "explicit_vertical_lines": page.curves + page.edges,
                "explicit_horizontal_lines": page.curves + page.edges,
            }
            tables = page.extract_table(table_settings=table_settings)

            # Some pages have no table.
            if not tables:
                continue

            col_names = column_names_generate(tables, col_index)
            df = pd.DataFrame(tables[col_index:], columns=col_names)

            # Fix characters repeated three times first, then twice.
            df = correct_bold_text_triple(df)
            df = correct_bold_text_double(df)

            df.insert(0, "Numéro_page", page_num + 1)
            page_frames.append(df)

    final_df = pd.concat(page_frames) if page_frames else pd.DataFrame()

    # Cleaning
    for col in text_col:
        final_df.iloc[:, col] = _replace_literals(final_df.iloc[:, col], _TEXT_CELL_REPLACEMENTS)
    for col in num_col:
        final_df.iloc[:, col] = final_df.iloc[:, col].apply(_parse_amount)
    final_df.columns = _replace_literals(final_df.columns, _COLUMN_NAME_REPLACEMENTS)

    # Source file name and year, inserted so that "Année" ends up first.
    final_df.insert(0, "Fichier_source", files_dict[year])
    final_df.insert(0, "Année", year)

    final_df.reset_index(drop=True, inplace=True)

    return final_df

### Params

In [99]:
# Folder that holds the "Tome 1" PDF files
folder_path = os.path.join("sources", "Tome_1")

# Year -> PDF file name inside folder_path
files_dict = {
    2023: "CA 2023 - Budget général Compte sur chiffres et annexes Tome 1.pdf",
    2022: "02 - CA 2022 Budget général - Compte sur chiffres et annexes - Tome 1.pdf",
    2021: "CA 2021 - Tome 1.pdf",
    2020: "04 Document budgétaire (Tome 1).pdf",
    2019: "04 - Document budgétaire (Tome 1.1).pdf"
}

# Year -> PDF file name for the 2022 extraction, which is read from the
# "sources/Tome_2" folder instead of folder_path
files_dict_spec = {
    2022: "03 - CA 2022 Budget général - Compte sur chiffres et annexes - Tome 2.pdf",
}

### Extraction

In [40]:
# Extracted DataFrames, keyed by "<tome>_<year>_Dettes_<n>"
all_df = {}

In [95]:
# 2023: two page ranges of the "Tome 1" file. min_page / max_page are
# zero-based indexes into the PDF pages; max_page is excluded.
year = 2023
all_df[f"Tome 1_{year}_Dettes_1"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=248,
    max_page=254,
    text_col=[1, 2, 8],
    num_col=[6, 9, 10],
)
all_df[f"Tome 1_{year}_Dettes_2"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=254,
    max_page=259,
    text_col=[1, 8],
    num_col=[3, 5, 6, 9, 10, 11, 12, 13],
)

248
250
253
254
255
258


In [100]:
# 2022: the tables are read from the "Tome 2" file, hence the dedicated
# folder and file dictionary. max_page is excluded.
year = 2022
all_df[f"Tome 2_{year}_Dettes_1"] = main_tome1_dettes(
    os.path.join("sources", "Tome_2"),
    year,
    files_dict_spec,
    min_page=7,
    max_page=13,
    text_col=[1, 2, 8],
    num_col=[6, 9, 10],
)
all_df[f"Tome 2_{year}_Dettes_2"] = main_tome1_dettes(
    os.path.join("sources", "Tome_2"),
    year,
    files_dict_spec,
    min_page=13,
    max_page=18,
    text_col=[1, 8],
    num_col=[3, 5, 6, 9, 10, 11, 12, 13],
)

7
10
12
13
15
17


In [96]:
# 2021: two page ranges of the "Tome 1" file (max_page is excluded).
year = 2021
all_df[f"Tome 1_{year}_Dettes_1"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=239,
    max_page=244,
    text_col=[1, 2, 8],
    num_col=[6, 9, 10],
)
all_df[f"Tome 1_{year}_Dettes_2"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=244,
    max_page=249,
    text_col=[1, 8],
    num_col=[3, 5, 6, 9, 10, 11, 12, 13],
)

239
240
243
244
245
248


In [97]:
# 2020: two page ranges of the "Tome 1" file (max_page is excluded).
year = 2020
all_df[f"Tome 1_{year}_Dettes_1"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=241,
    max_page=246,
    text_col=[1, 2, 8],
    num_col=[6, 9, 10],
)
all_df[f"Tome 1_{year}_Dettes_2"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=246,
    max_page=251,
    text_col=[1, 8],
    num_col=[3, 5, 6, 9, 10, 11, 12, 13],
)

241
245
246
250


In [98]:
# 2019: two page ranges of the "Tome 1" file (max_page is excluded).
# NOTE(review): page index 268 is covered by neither range (the first stops
# before 268, the second starts at 269) — confirm this gap is intentional.
year = 2019
all_df[f"Tome 1_{year}_Dettes_1"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=263,
    max_page=268,
    text_col=[1, 2, 8],
    num_col=[6, 9, 10],
)
all_df[f"Tome 1_{year}_Dettes_2"] = main_tome1_dettes(
    folder_path,
    year,
    files_dict,
    min_page=269,
    max_page=274,
    text_col=[1, 8],
    num_col=[3, 5, 6, 9, 10, 11, 12, 13],
)

263
265
267
269
270
273


### Concatenating

In [111]:
# Split the extracted tables into two groups: keys ending in "_1" feed
# final_df_1, every other key feeds final_df_2.
frames_1 = []
frames_2 = []
for key, df in all_df.items():
    # Normalise apostrophes in the column names (presumably so that the same
    # header matches across files). regex=False is explicit because every
    # pattern here is a literal: before pandas 2.0 the default was regex=True.
    df.columns = df.columns.str.replace("’", "'", regex=False)
    df.columns = df.columns.str.replace("''", "'", regex=False)
    if key.endswith("_1"):
        frames_1.append(df)
    else:
        # Remove the "(10)" ... "(17)" markers from the column names. As a
        # regex, "(10)" would match a bare "10" and leave the parentheses.
        for num in range(10, 18):
            df.columns = df.columns.str.replace(f"({num})", "", regex=False)
        frames_2.append(df)

# One concatenation per group, with a fresh 0..n-1 index.
final_df_1 = pd.concat(frames_1, ignore_index=True) if frames_1 else pd.DataFrame()
final_df_2 = pd.concat(frames_2, ignore_index=True) if frames_2 else pd.DataFrame()

### Excel extraction

In [114]:
# Write both concatenated tables to one workbook, one sheet each, without
# the row index.
file_name = "Tome 1_2019-2023_Répartition_Dettes.xlsx"
sheets = {
    "Dette_Repart": final_df_1,
    "Dette_Repart_B1_2": final_df_2,
}
with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
    for sheet_name, sheet_df in sheets.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)