In [None]:
import camelot, fitz, ghostscript, matplotlib.pyplot as plt, os, pandas as pd, re

In [1]:
class _Service_Report:
    def __init__(self, name, folder_path=None):
        self.folder_path = folder_path
        self.name = name
        
        self.doc = self._open()
        self.doc_length = len(self.doc)
        
        self._close()
        
        
    def _open(self):
        self.doc = fitz.open(os.path.join(self.path, self.name) if self.path else self.name)
 
    
    def _get_first_page(self):
        return self.doc[0].get_text()
       
    
    def _get_header_informations(self):
        first_page = self._get_first_page()
        
        header_informations = {}
        
        return header_informations
    
    def _get_header_informations(self):
        text = self.doc[0].get_text()
       
        turbine_number = re.search(r'Turbine No\./Id:\s*(\d+)', text).group(1) if re.search(r'Turbine No\./Id:\s*(\d+)', text) else None
        service_order = re.search(r'Service Order:\s*(\d+)', text).group(1) if re.search(r'Service Order:\s*(\d+)', text) else None
        pad_no = (match.group(1).strip() if (match := re.search(r'PAD No\.\s*([^\n]+)', text)) else None)
        turbine_type = re.search(r'Turbine Type:\s*([\w\d]+)', text).group(1) if re.search(r'Turbine Type:\s*([\w\d]+)', text) else None
        start_date = re.search(r'Start Date:\s*([\d\.]+)', text).group(1) if re.search(r'Start Date:\s*([\d\.]+)', text) else None
        end_date = (match.group(1) if (match := re.search(r'End Date:\s*([\d\.]+)', text)) else None)
        date_and_time_of_receipt = (match.group(1).strip() if (match := re.search(r'Date & Time of Receipt\s*([\d\.\s:]+)', text)) else None)
        reason_for_call_out = (match.group(1) if (match := re.search(r'Reason for Call Out:\s*([^\n]+)', text)) else None)
                
        customer_address = ([line.strip() for line in match.group(1).split('\n') if line.strip()] 
                       if (match := re.search(r"Customer's Address:\s*(.*?)Site's Address:", text, re.DOTALL)) 
                       else None)
        
        header_informations = {
            'turbine_number': turbine_number,
            'service_order': service_order,
            'pad_no': pad_no,
            'turbine_type': turbine_type,
            'start_date': start_date,
            'customer_address': customer_address,
            'date_and_time_of_receipt': date_and_time_of_receipt,
            'reason_for_call_out': reason_for_call_out
        }
        
        return header_informations
        
    def _get_sif_page_number(self):
        for page_num in range(self.page_numbers):
            if "Service Inspection Form" in self.doc[page_num].get_text():
                return page_num
        raise ValueError("'Service Inspection Form' non trouvé dans le document")

    def _get_table(self, page_number):
        params = {'pages': str(page_number)}
        
        if self.stream:
            params['stream'] = self.stream
        if self.edge_tol is not None:
            params['camelot_edge_tol']
        if self.row_tol is not None:
            params['row_tol'] = self.camelot_row_tol
        if self.camelot_columns is not None:
            params['columns'] = self.camelot_columns
        
        tables = camelot.read_pdf(
            os.path.join(self.folder_path, self.name) if self.folder_path else self.name,
            **params  # Décompresse le dictionnaire en arguments nommés
        )
        
        if len(tables) > 0:
            return tables[0].df
        else:
            raise ValueError(f"Aucune table trouvée à la page {page_number}")
        
        
    def _get_full_table(self):
        sif_page_number = self._get_sif_page_number()
        all_tables = []
        
        for page_num in range(self.sif_page, self.page_numbers):
            try:
                # +1 car Camelot commence à 1
                table = self.get_page_table(page_num + 1)
                all_tables.append(table)
            except ValueError:
                continue
        
        if not all_tables:
            raise ValueError("Aucune table trouvée dans le document")
        
        final_table = pd.concat(all_tables, ignore_index=True)
        return final_table

        
    def _close(self):
        if doc is not None:
            self.doc.close()
            self.doc = None


class Vestas_Report(_Service_Report):
    def __init__(self, name, folder_path=None):
        super().__init__(name, folder_path)
        self._set_camelot_params()
        
    def _set_camelot_params(self):   
        self.camelot_params = {
            'flavor': 'stream',
            'edge_tol': 500,
            'row_tol': 10,
            'columns': ['65,330,350'] 
        }
        

    def _get_header_informations(self):
        first_page = super().get_first_page()
        
        turbine_number = re.search(r'Turbine No\./Id:\s*(\d+)', first_page).group(1) if re.search(r'Turbine No\./Id:\s*(\d+)', first_page) else None
        service_order = re.search(r'Service Order:\s*(\d+)', first_page).group(1) if re.search(r'Service Order:\s*(\d+)', first_page) else None
        pad_no = (match.group(1).strip() if (match := re.search(r'PAD No\.\s*([^\n]+)', first_page)) else None)
        turbine_type = re.search(r'Turbine Type:\s*([\w\d]+)', first_page).group(1) if re.search(r'Turbine Type:\s*([\w\d]+)', first_page) else None
        start_date = re.search(r'Start Date:\s*([\d\.]+)', first_page).group(1) if re.search(r'Start Date:\s*([\d\.]+)', first_page) else None
        end_date = (match.group(1) if (match := re.search(r'End Date:\s*([\d\.]+)', first_page)) else None)
        date_and_time_of_receipt = (match.group(1).strip() if (match := re.search(r'Date & Time of Receipt\s*([\d\.\s:]+)', first_page)) else None)
        reason_for_call_out = (match.group(1) if (match := re.search(r'Reason for Call Out:\s*([^\n]+)', first_page)) else None)
                
        customer_address = ([line.strip() for line in match.group(1).split('\n') if line.strip()] 
                       if (match := re.search(r"Customer's Address:\s*(.*?)Site's Address:", first_page, re.DOTALL)) 
                       else None)
        
        header_informations = {
            'turbine_number': turbine_number,
            'service_order': service_order,
            'pad_no': pad_no,
            'turbine_type': turbine_type,
            'start_date': start_date,
            'customer_address': customer_address,
            'date_and_time_of_receipt': date_and_time_of_receipt,
            'reason_for_call_out': reason_for_call_out
        }
        
        return header_informations

SyntaxError: invalid syntax (3177591441.py, line 56)

In [None]:
test_folder_path = 'data'
test_name = 