In [1]:
import os
import PyPDF2
import pandas as pd
import fitz
import re

In [2]:
def pdf_to_text(pdf_path, start_section, exclude_section, custom_exclusion):
    """Extract the text of a PDF that lies between two section markers.

    The text of every page is concatenated. Everything up to and including
    the first occurrence of ``start_section`` is dropped, everything from the
    first occurrence of ``exclude_section`` onwards is dropped, and finally
    each phrase in ``custom_exclusion`` is removed from what remains.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; passed to ``fitz.open``.
    start_section : str
        Marker after which text is kept. An exact-case occurrence is
        preferred; if there is none, the first case-insensitive occurrence
        is used.
    exclude_section : str
        Marker from which on text is discarded (case-sensitive, first
        occurrence after the start marker). A falsy value disables the cut.
    custom_exclusion : iterable of str or None
        Literal phrases to strip from the result, matched case-insensitively.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``start_section`` does not occur
        anywhere in the document.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        # Read *all* pages: stopping at the page that contains the start
        # marker would drop the rest of the document and make the
        # exclude_section cut below pointless.
        full_text = "".join(
            pdf_document[page_num].get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        # Always release the file handle, even if text extraction fails.
        pdf_document.close()

    # Locate the end of the start marker. Prefer the exact-case spelling and
    # only fall back to a case-insensitive search, so a heading written as
    # e.g. "ABSTRACT" no longer raises IndexError.
    start_index = full_text.find(start_section)
    if start_index != -1:
        start_end = start_index + len(start_section)
    else:
        start_match = re.search(re.escape(start_section), full_text, flags=re.IGNORECASE)
        if start_match is None:
            return ""
        start_end = start_match.end()
    text_content = full_text[start_end:]

    # Cut everything from the excluded section onwards. The truthiness guard
    # matters because "".find("") == 0 would otherwise erase the whole text.
    if exclude_section:
        section_index = text_content.find(exclude_section)
        if section_index != -1:
            text_content = text_content[:section_index]

    # Remove the longest phrases first so that a short phrase which is a
    # substring of a longer one cannot destroy the longer match.
    for phrase in sorted(custom_exclusion or (), key=len, reverse=True):
        if phrase:
            text_content = re.sub(re.escape(phrase), '', text_content, flags=re.IGNORECASE)

    return text_content

In [15]:
"Extract Text from IEEE Papers"

# Folder containing the IEEE Xplore PDFs.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\IEEE_Xplore'

# Boilerplate phrases (copyright notices, download stamps, conference banners)
# to strip from the extracted text, plus the section markers that delimit the
# part of each paper that is kept.
custom_exclusion = [
    'XXX-X-XXXX-XXXX-X/XX/$XX.00 ©20XX IEEE',
    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:12:06 UTC from IEEE Xplore. Restrictions apply.',
    '2023 International Conference on Computer, Electronics & Electrical Engineering & their Applications (IC2E3) | 979-8-3503-3800-3/23/$31.00 ©2023 IEEE | DOI: 10.1109/IC2E357697.2023.10262765',
    '979-8-3503-0485-5/23/$31.00 ©2023 IEEE',
    'DOI 10.1109/ICWS60048.2023.00101',
    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:16:27 UTC from IEEE Xplore. Restrictions apply.',
    '2023 IEEE International Conference on Web Services (ICWS) | 979-8-3503-0485-5/23/$31.00 ©2023 IEEE | DOI: 10.1109/ICWS60048.2023.00101',
    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:12:37 UTC from IEEE Xplore. Restrictions apply.',
    '978-1-6654-9058-0/22/$31.00 ©2022 IEEE',
    '2022 International Conference on Data Analytics for Business and Industry (ICDABI) |978-1-6654-9058-0/22/$31.00 ©2022 IEEE |DOI: 10.1109/ICDABI56818.2022.10041672',
    '2022 International Conference on Data Analytics for Business and Industry (ICDABI)',
    '978-1-6654-3902-2/21/$31.00 ©2021 IEEE',
    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:15:32 UTC from IEEE Xplore. Restrictions apply.',
    '2021 IEEE International Conference on Big Data (Big Data) | 978-1-6654-3902-2/21/$31.00 ©2021 IEEE | DOI: 10.1109/BigData52589.2021.9671862',
]
# pdf_to_text removes the phrases in list order. Several short phrases (the
# bare ISBN/price stamps) are substrings of longer banner phrases, so the
# longest phrases must come first or the longer ones can never match.
custom_exclusion = sorted(custom_exclusion, key=len, reverse=True)
start_section = 'Abstract'
exclude_section = 'References'

IEEE_pdf_content_dict = {}

# Iterate over the PDF files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order, and the extension check is
# case-insensitive so that files named "*.PDF" are not skipped.
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        # Extract the cleaned text of this PDF with pdf_to_text
        IEEE_pdf_content = pdf_to_text(pdf_path, start_section, exclude_section, custom_exclusion)

        # Store the content in the dictionary, keyed by the base file name
        IEEE_pdf_content_dict[base_name] = IEEE_pdf_content

In [16]:
# Tabulate the extracted IEEE texts: one row per paper, with the base file
# name in 'Filename' and the cleaned text in 'Content'.
IEEE_pdf_df = pd.DataFrame(
    [
        (paper_name, paper_text)
        for paper_name, paper_text in IEEE_pdf_content_dict.items()
    ],
    columns=['Filename', 'Content'],
)
IEEE_pdf_df

Unnamed: 0,Filename,Content
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,— Data localization laws are becoming more \nc...
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,—Data mesh is a socio-technical approac...
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,— As the enterprises experience exponential gr...
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,—This paper presents a systematic review of\nc...


In [17]:
"Extract Text from Springer Link Papers"

# Folder containing the Springer Link PDFs.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\Springer_Link'

# Boilerplate phrases (copyright lines, running heads, DOIs, chapter titles)
# to strip from the extracted text.
custom_exclusion = [
    '© Eberhard Hechler, Maryela Weihrauch, Yan (Catherine) Wu 2023',
    # The comma after the next entry was missing in the original list, which
    # silently concatenated it with the first DOI so that neither was removed.
    'E. Hechler et al., Data Fabric and Data Mesh Approaches with AI,',
    'https://doi.org/10.1007/978-1-4842-9253-2_1',
    'https://doi.org/10.1007/978-1-4842-9253-2_2',
    'https://doi.org/10.1007/978-1-4842-9253-2_3',
    'https://doi.org/10.1007/978-1-4842-9253-2_4',
    'https://doi.org/10.1007/978-1-4842-9253-2_5',
    'https://doi.org/10.1007/978-1-4842-9253-2_6',
    'https://doi.org/10.1007/978-1-4842-9253-2_7',
    'https://doi.org/10.1007/978-1-4842-9253-2_8',
    'https://doi.org/10.1007/978-1-4842-9253-2_9',
    'https://doi.org/10.1007/978-1-4842-9253-2_10',
    'https://doi.org/10.1007/978-1-4842-9253-2_11',
    'https://doi.org/10.1007/978-1-4842-9253-2_12',
    'https://doi.org/10.1007/978-1-4842-9253-2_13',
    'https://doi.org/10.1007/978-1-4842-9253-2_14',
    'https://doi.org/10.1007/978-1-4842-9253-2_15',
    'https://doi.org/10.1007/978-1-4842-9253-2_16',
    'https://doi.org/10.1007/978-1-4842-9253-2_17',
    'Chapter 1 Evolution of Data Architecture',
    'Chapter 2 Terminology: Data Fabric and Data Mesh',
    'Chapter 3 Data Fabric Data Mesh Use Case Scenarios',
    'Chapter 4 Data Fabric and Data Mesh Business Benefits',
    'Chapter 5 Key Data Fabric and Data Mesh Capabilities',
    'Chapter 6 Relevant ML and DL Concepts',
    'Chapter 7 AI and ML for a Data Fabric and Data Mesh',
    'Chapter 8 AI for Entity Resolution',
    'Chapter 9 Data Fabric and Data Mesh for the AI Lifecycle',
    'Chapter 10 Data Fabric Architecture Patterns',
    'Chapter 11 Data Fabric Within an Enterprise Architecture',
    'Chapter 12 Data Fabric and Data Mesh in a Hybrid Cloud Landscape',
    'Chapter 13 Intelligent Cataloging and Metadata Management',
    'Chapter 14 Automated Data Fabric and Data Mesh Aspects',
    'Chapter 15 Data Governance in the Context of Data Fabric and Data Mesh',
    'Chapter 16 Sample Vendor Offerings',
    'Chapter 17 Data Fabric and Data Mesh Research Areas',
    ' The Author(s), under exclusive license to Springer Nature Switzerland AG 2022',
    'C. Strauss et al. (Eds.): DEXA 2022, LNCS 13426, pp. 85–102, 2022.',
    'https://doi.org/10.1007/978-3-031-12423-5_7',
    'N. J. Podlesny et al.',
    'CoK: A Survey of Privacy Challenges in Relation to Data Meshes',
    ' The Author(s), under exclusive license to Springer Nature Switzerland AG 2023',
    'C. Strauss et al. (Eds.): DEXA 2023, LNCS 14146, pp. 3–18, 2023.',
    'https://doi.org/10.1007/978-3-031-39847-6_1',
    'R. Wrembel',
    'Data Integration: From Data Warehouse to Data Mesh',
    'A. Burduk et al. (Eds.): ISPEM 2023, LNNS 790, pp. 300–314, 2023.',
    'https://doi.org/10.1007/978-3-031-45021-1_23',
    'Converging Data Mesh and Microservice Principles',
    'F. Morais et al.',
]
# pdf_to_text removes the phrases in list order. The DOI ending in "_1" is a
# prefix of those ending in "_10" to "_17", so the longest phrases must be
# applied first; otherwise a stray digit would be left behind in those
# chapters.
custom_exclusion = sorted(custom_exclusion, key=len, reverse=True)

# Section markers that delimit the part of each document that is kept.
start_section_abs = 'Abstract'
start_section_intro = 'Introduction'
exclude_section = 'References'

springer_pdf_content_dict = {}

# Iterate over the PDF files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order, and the extension check is
# case-insensitive so that files named "*.PDF" are not skipped.
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        if '978-1-4842-9253-2' in filename:
            # Files carrying this identifier are extracted starting at
            # 'Introduction' instead of 'Abstract'
            springer_pdf_content = pdf_to_text(pdf_path, start_section_intro, exclude_section, custom_exclusion)

        else:
            # All other files are extracted starting at 'Abstract'
            springer_pdf_content = pdf_to_text(pdf_path, start_section_abs, exclude_section, custom_exclusion)

        # Store the content in the dictionary, keyed by the base file name
        springer_pdf_content_dict[base_name] = springer_pdf_content

In [18]:
# Tabulate the extracted Springer texts: one row per document, with the base
# file name in 'Filename' and the cleaned text in 'Content'.
springer_pdf_df = pd.DataFrame(
    [
        (doc_name, doc_text)
        for doc_name, doc_text in springer_pdf_content_dict.items()
    ],
    columns=['Filename', 'Content'],
)
springer_pdf_df

Unnamed: 0,Filename,Content
0,978-1-4842-9253-2_1,"\nWhen you look back in time, data architectur..."
1,978-1-4842-9253-2_10,\nIn this chapter we provide a high-level over...
2,978-1-4842-9253-2_11,\nIn Chapter 1 we saw that a Data Fabric archi...
3,978-1-4842-9253-2_12,"\nIn this chapter, we look at the intersection..."
4,978-1-4842-9253-2_13,"to Metadata Management\nIn the digital era, e..."
5,978-1-4842-9253-2_14,\nApplying AI to metadata management and intel...
6,978-1-4842-9253-2_15,\nData is constantly generated while we live a...
7,978-1-4842-9253-2_16,\nThe goal of Data Fabric1 is to provide a fle...
8,978-1-4842-9253-2_17,\nIt must have become obvious in this book tha...
9,978-1-4842-9253-2_2,\nThe terms Data Fabric and Data Mesh are ofte...
