In [64]:
import requests
import os
import pymupdf
import re
import csv
from datetime import datetime, timedelta
import pandas as pd

In [None]:
def url_existance(url, timeout=10):
    """Report whether ``url`` answers an HTTP HEAD request with status 200.

    Redirects are followed before the status code is inspected.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would freeze a crawl that probes thousands of URLs.

    Returns:
        True if the final response status is 200. False for any other
        status, or when the request raises ``requests.RequestException``
        (the error is printed, not re-raised).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        return response.status_code == 200
    except requests.RequestException as e:
        print(f"Error checking URL {url}: {e}")
        return False

In [65]:
def construct_urls(year, region_code, max_attempts=8):
    """Discover which boe.es BORME PDF URLs exist for every day of ``year``.

    The URL embeds a running number (``version_number`` in the template)
    that is not derivable from the date, so it is found by probing. The
    number carries over from one date to the next: after a hit on number
    ``n`` the following date starts probing at ``n + 1``.

    Args:
        year: Calendar year to scan, from 1 January to 31 December.
        region_code: Suffix placed at the end of the PDF file name
            (e.g. ``"46"``).
        max_attempts: How many consecutive numbers to probe per date before
            concluding that nothing exists for that date.

    Returns:
        List of URLs that ``url_existance`` confirmed, in date order. At
        most one URL is recorded per date.
    """
    url_base = 'https://www.boe.es/borme/dias/{year}/{month:02}/{day:02}/pdfs/BORME-A-{year}-{version_number}-{region_code}.pdf'
    one_day = timedelta(days=1)
    current_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)

    urls = []
    version_number = 1  # Next number expected to exist

    while current_date <= end_date:
        first_candidate = version_number

        # Probe first_candidate, first_candidate + 1, ... for this date.
        for candidate in range(first_candidate, first_candidate + max_attempts):
            url = url_base.format(year=current_date.year, month=current_date.month,
                                  day=current_date.day, version_number=candidate,
                                  region_code=region_code)
            print(f"Trying URL: {url}")

            if url_existance(url):
                print(f"URL exists: {url}")
                urls.append(url)
                version_number = candidate + 1  # Next date resumes after the hit
                break

            print(f"URL does not exist: {url}")
        else:
            # No break: every probe missed. version_number was never advanced,
            # so the next date starts again from the same number.
            print(f"No versions found for {current_date}. Resetting version number to {first_candidate}.")

        current_date += one_day

    return urls

In [67]:
def download_pdf(url, timeout=30):
    """Download ``url`` into the fixed scratch file ``borme_temp.pdf``.

    The same file name is reused on every call, so each download overwrites
    the previous one; the caller is expected to finish with (and remove) the
    file before requesting the next.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The scratch file name on success. None if the response status is
        not 200, the request raises ``requests.RequestException``, or the
        file cannot be written (each failure is printed).
    """
    temp_filename = 'borme_temp.pdf'
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download PDF from {url}. Status code: {response.status_code}")
            return None

        with open(temp_filename, 'wb') as f:
            f.write(response.content)
        return temp_filename
    # RequestException derives from IOError, so it must be tested before the
    # broader OSError handler or network errors would be reported as disk errors.
    except requests.RequestException as e:
        print(f"Error downloading PDF from {url}: {e}")
        return None
    except OSError as e:
        print(f"Error saving PDF from {url} to {temp_filename}: {e}")
        return None

In [68]:
def extract_text(pdf_file):
    """Return the concatenated text of every page in ``pdf_file``.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The page texts joined in page order (an empty string if the
        document yields no text), or None if the file cannot be opened or
        read. Failures are printed rather than raised.
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so the caller can delete the file afterwards without a
        # lingering open handle.
        with pymupdf.open(pdf_file) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)

    except FileNotFoundError:
        print(f"Error: The file '{pdf_file}' was not found.")
        return None
    # A single catch-all is used instead of a library-specific exception
    # class: an `except` expression is only evaluated once an error is
    # already in flight, so naming an attribute that the installed PyMuPDF
    # does not provide would raise AttributeError here and escape the
    # function instead of returning None.
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


In [69]:
def split_into_segments(text):
    """Cut ``text`` at every "<3-6 digit number> - " marker.

    Because the number sits inside a capturing group, ``re.split`` keeps it:
    the returned list alternates between the marker numbers and the text
    that follows each one (preceded by any text before the first marker).
    Every item is whitespace-trimmed and blank items are dropped.

    Args:
        text: Raw text to split.

    Returns:
        List of non-empty, stripped strings. A notice is printed when the
        list is empty.
    """
    entry_marker = re.compile(r'\b(\d{3}|\d{4}|\d{5}|\d{6})\s-\s')

    segments = []
    for piece in entry_marker.split(text):
        trimmed = piece.strip()
        if trimmed:
            segments.append(trimmed)

    if len(segments) == 0:
        print("No segments found. Please check the input text format.")

    return segments

In [70]:
def filter_segments(segments):
    """Keep only the segments that contain the literal text "Extinción.".

    The trailing period is part of the match. Any error raised while
    scanning (for example a non-iterable argument or a non-string item) is
    printed and results in an empty list rather than a partial one.

    Args:
        segments: Iterable of text segments.

    Returns:
        List of matching segments in their original order, or ``[]`` on error.
    """
    keyword = "Extinción."
    kept = []
    try:
        for candidate in segments:
            if keyword in candidate:
                kept.append(candidate)
    except Exception as e:
        print(f"Error filtering segments: {e}")
        return []
    return kept

In [71]:
def extract_information_from_segment(segment):
    """Pull company name, dissolution flag/date and registry number from a segment.

    Args:
        segment: Text of one entry.

    Returns:
        Dict with four keys:

        * ``company_name`` - text before the first act heading, stripped.
          The heading is "Ceses/Dimisiones" if present, else
          "Situación concursal" if present, else the keyword itself.
          None when the segment contains neither keyword.
        * ``disolucion`` - 1 when the segment contains "Disolución" or
          "Extinción", otherwise None.
        * ``disolucion_date`` - the first ``d.mm.yy`` / ``dd.mm.yy`` token
          found anywhere in the segment (only looked up when a keyword is
          present). NOTE(review): this is simply the first date-like token;
          confirm it is always the date of the dissolution act.
        * ``registration_number`` - the 4-6 digits following "H V", or the
          string "Not found". Looked up for every segment, keyword or not.

        Any exception is printed and the dict is returned as filled so far.
    """
    details = {
        'company_name': None,
        'disolucion': None,
        'disolucion_date': None,
        'registration_number': None
    }

    try:
        # "Disolución" takes precedence over "Extinción" when both appear.
        keyword = next(
            (word for word in ("Disolución", "Extinción") if word in segment),
            None,
        )

        if keyword is not None:
            # The company name ends where the first recognised heading starts;
            # fall back to the keyword when neither heading is present.
            name_end_marker = next(
                (heading for heading in ("Ceses/Dimisiones", "Situación concursal")
                 if heading in segment),
                keyword,
            )
            # The marker is known to be in the segment, so partition always
            # yields the text before its first occurrence.
            details['company_name'] = segment.partition(name_end_marker)[0].strip()
            print(f"Extracted company name: {details['company_name']}")  # Debug statement

            details['disolucion'] = 1
            date_match = re.search(r'(\d{1,2}\.\d{2}\.\d{2})', segment)
            details['disolucion_date'] = date_match.group(1) if date_match else None

        # Extract registration number (4 to 6 digits)
        reg_num_match = re.search(r'H\s*V\s*(\d{4,6})', segment, re.DOTALL)
        details['registration_number'] = reg_num_match.group(1) if reg_num_match else "Not found"

    except Exception as e:
        print(f"Error extracting information from segment: {e}")

    return details

In [72]:
def _extract_records_from_pdf(pdf_file):
    """Turn one downloaded PDF into a list of detail dicts (possibly empty)."""
    extracted_text = extract_text(pdf_file)
    if not extracted_text:
        print("Failed to extract text from the PDF")
        return []
    print("Extracted Text Successfully Downloaded")

    segments = split_into_segments(extracted_text)
    if not segments:
        print("No segments found after splitting the text.")
        return []

    filtered_segments = filter_segments(segments)
    if not filtered_segments:
        print("No filtered segments found. Please check the extraction logic.")
        return []

    return [extract_information_from_segment(segment) for segment in filtered_segments]


def main(years=None,
         output_dir='/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Raw/Disolucion'):
    """Crawl the PDFs for region "46" year by year and write one CSV per year.

    For each year: discover the PDF URLs, download each PDF to a scratch
    file, extract the matching segments, and save the collected records as
    ``Valencia_disolucion_<year>_data.csv`` inside ``output_dir``.

    Args:
        years: Iterable of years to process. Defaults to 2016 through 2024.
        output_dir: Directory that receives the CSV files. It is created up
            front so a missing directory is reported before any crawling
            starts instead of after a full year has been processed.
    """
    if years is None:
        years = range(2016, 2025)  # 2016 to 2024 inclusive
    region_code = "46"

    os.makedirs(output_dir, exist_ok=True)

    for year in years:
        year_records = []  # Records collected from every PDF of this year

        for url in construct_urls(year, region_code):
            print(f"Processing URL: {url}")  # Debug message
            pdf_file = download_pdf(url)
            if not pdf_file:
                print("Failed to download the PDF.")
                continue  # Skip to the next URL if download fails
            print(f"PDF saved as: {pdf_file}")

            try:
                year_records.extend(_extract_records_from_pdf(pdf_file))
            finally:
                # Remove the scratch file whether or not the PDF produced
                # records (or raised), so no stale copy is left behind.
                try:
                    os.remove(pdf_file)
                    print(f"Deleted temporary file: {pdf_file}")
                except OSError as e:
                    print(f"Error deleting temporary file: {e}")

        # Create a DataFrame for the current year
        if year_records:
            df = pd.DataFrame(year_records)
            file_path = os.path.join(output_dir, f'Valencia_disolucion_{year}_data.csv')
            # NOTE: 'latin1' cannot represent every Unicode character; a value
            # outside that range makes to_csv raise UnicodeEncodeError.
            df.to_csv(file_path, index=False, encoding='latin1')  # Save DataFrame to CSV file
            print(f'DataFrame saved to: {file_path}')
        else:
            print(f"No data to save for year {year}.")

    print("Processing completed for all years.")

if __name__ == "__main__":
    main()

Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-1-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-1-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-2-46.pdf


URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-2-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-3-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-3-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-4-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-4-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-5-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-5-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-6-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-6-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-7-46.pdf
URL does not exist: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-7-46.pdf
Trying URL: https://www.boe.es/borme/dias/2016/01/01/pdfs/BORME-A-2016-8