In [1]:
import os
import PyPDF2
import pandas as pd
import fitz
import re

In [2]:
def pdf_to_text(pdf_path, start_sections, exclude_section, custom_exclusion, page_range = None):

    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    # Extract text from each page
    page_text = ""
    text_content = ""

    start_page, end_page = page_range if page_range else (0, pdf_document.page_count)
    
    for page_num in range(start_page, min(end_page, pdf_document.page_count)):
        page = pdf_document[page_num]
        page_text += page.get_text()

        # Check if any of the possible_start_sections is found
        for start_section in start_sections:
            if start_section.lower() in page_text.lower():
                # Include text only from the start_section onwards
                split_result = page_text.split(start_section, 1)
                if len(split_result) > 1:
                    text_content += split_result[1]

    # Find the index of the section to be excluded
    section_index = text_content.find(exclude_section)
    if section_index != -1:
        # Exclude content after the section
        text_content = text_content[:section_index]

    # Apply custom exclusion
    for phrase in custom_exclusion:
        text_content = re.sub(re.escape(phrase), '', text_content, flags=re.IGNORECASE)

    return text_content

## IEEE_Xplore

In [3]:
"Extract Text from IEEE Papers"

# Replace 'your_folder_path' with the path to the folder containing PDFs
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\IEEE_Xplore'
# folder_path = r'C:\Users\sitas\OneDrive\Desktop\data_mesh_topics_and_sentiment\dataset\Data_Mesh_Literature\IEEE_Xplore'

# Specify page range, custom exclusion, and exclude section as needed
custom_exclusion = ['XXX-X-XXXX-XXXX-X/XX/$XX.00 ©20XX IEEE',
                    '\n',
                    'chieve',
                    'Department of  Computer Science & Engineering',
                    'Breaking Down Data Silos: Data Mesh to Achieve',
                    'Effective Aggregation in Data Localization',
                    'Jaganmohan Reddy Kancharla',
                    'Department of Computer Science & Engineering',
                    'National Institute of Technology',
                    'Calicut',
                    'jaganmohan_p220051cs@nitc.ac.in',
                    'Dr. S. D. Madhu Kumar',
                    'Department of Computer Science & Engineering',
                    'National Institute of Technology',
                    'Calicut',
                    'madhu@nitc.ac.in',
                    'Arif Wider 1,2 , Sumedha Verma 1 , and Atif Akhtar 1',
                    '1 Thoughtworks Germany',
                    'Caffamacherreihe 7, 20355 Hamburg, Germany',
                    '{awider,sumedhav,syedatif}@thoughtworks.com',
                    '2 Hochschule für Technik und Wirtschaft Berlin',
                    'Treskowallee 8, 10318 Berlin, Germany',
                    'wider@htw-berlin.de',
                    'rchitecturesTorsten Priebe, Sebastian NeumaierData Intelligence Research GroupSt. Pölten University of Applied Sciences, Austria{torsten.priebe, sebastian.neumaier}@fhstp.ac.atStefan MarkusSimplity AT GmbHVienna, Austriastefan.markus@simplity.ai',
                    ' Decentralized Data MeshApproachVijay Kumar Butte1, Sujata Butte*2*University of Idaho,  Idaho Falls, ID, USA2suja2591@vandals.uidaho.edu1hellovkb@gmail.com',
                    'pproaches  Arif Wider  1,2  , Sumedha Verma  1  , and Atif Akhtar  1  1  Thoughtworks Germany      2  Hochschule für Technik und Wirtschaft Berlin      ',
                    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:12:06 UTC from IEEE Xplore. Restrictions apply.',
                    '2023 International Conference on Computer, Electronics & Electrical Engineering & their Applications (IC2E3) | 979-8-3503-3800-3/23/$31.00 ©2023 IEEE | DOI: 10.1109/IC2E357697.2023.10262765',
                    '979-8-3503-0485-5/23/$31.00 ©2023 IEEE',
                    'DOI 10.1109/ICWS60048.2023.00101',
                    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:16:27 UTC from IEEE Xplore. Restrictions apply.',
                    '2023 IEEE International Conference on Web Services (ICWS) | 979-8-3503-0485-5/23/$31.00 ©2023 IEEE | DOI: 10.1109/ICWS60048.2023.00101',
                    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:12:37 UTC from IEEE Xplore. Restrictions apply.',
                    '978-1-6654-9058-0/22/$31.00 ©2022 IEEE',
                    '2022 International Conference on Data Analytics for Business and Industry (ICDABI) |978-1-6654-9058-0/22/$31.00 ©2022 IEEE |DOI: 10.1109/ICDABI56818.2022.10041672',
                    '2022 International Conference on Data Analytics for Business and Industry (ICDABI)',
                    '978-1-6654-3902-2/21/$31.00 ©2021 IEEE',
                    'Authorized licensed use limited to: Technische Hochschule Ingolstadt. Downloaded on October 24,2023 at 14:15:32 UTC from IEEE Xplore. Restrictions apply.',
                    '2021 IEEE International Conference on Big Data (Big Data) | 978-1-6654-3902-2/21/$31.00 ©2021 IEEE | DOI: 10.1109/BigData52589.2021.9671862'
                   ]
start_section = 'Abstract'
exclude_section = 'References'

IEEE_pdf_content_dict = {}

# Iterate over PDF files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        # Call the pdf_to_dataframe function for each PDF
        IEEE_pdf_content = pdf_to_text(pdf_path, start_section, exclude_section, custom_exclusion)

        # Store the filename and content in the dictionary
        IEEE_pdf_content_dict[base_name] = IEEE_pdf_content

In [4]:
IEEE_pdf_df = pd.DataFrame(list(IEEE_pdf_content_dict.items()), columns=['Filename', 'Content'])
IEEE_pdf_df["Publisher"] = "IEEE"
IEEE_pdf_df

Unnamed: 0,Filename,Content,Publisher
0,Breaking Down Data Silos Data Mesh to Achieve ...,Abstract— Data localization laws...,IEEE
1,Decentralized Data Governance as Part of a Dat...,Abstract —Data mesh is a socio-technical ...,IEEE
2,Enterprise Data Strategy A Decentralized Data ...,Abstract— As the enterprises experience expone...,IEEE
3,Finding Your Way Through the Jungle of Big Dat...,Abstract—This paper presents a systematic revi...,IEEE


## Springer_Link

In [6]:
"Extract Text from Springer Link Papers"

# Replace 'your_folder_path' with the path to the folder containing PDFs
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\Springer_Link'
# folder_path = r'C:\Users\sitas\OneDrive\Desktop\data_mesh_topics_and_sentiment\dataset\Data_Mesh_Literature\Springer_Link'

# Specify page range, custom exclusion, and exclude section as needed
custom_exclusion = ['\n',
                    'https://doi.org/10.1007/978-1-4842-9253-2_1',
                    '© Eberhard Hechler, Maryela Weihrauch, Yan (Catherine) Wu 2023', 'E. Hechler et al., Data Fabric and Data Mesh Approaches with AI,'
                    'https://doi.org/10.1007/978-1-4842-9253-2_1', 'https://doi.org/10.1007/978-1-4842-9253-2_2', 'https://doi.org/10.1007/978-1-4842-9253-2_3', 
                    'https://doi.org/10.1007/978-1-4842-9253-2_4', 'https://doi.org/10.1007/978-1-4842-9253-2_5', 'https://doi.org/10.1007/978-1-4842-9253-2_6', 
                    'https://doi.org/10.1007/978-1-4842-9253-2_7', 'https://doi.org/10.1007/978-1-4842-9253-2_8', 'https://doi.org/10.1007/978-1-4842-9253-2_9', 
                    'https://doi.org/10.1007/978-1-4842-9253-2_10', 'https://doi.org/10.1007/978-1-4842-9253-2_11', 'https://doi.org/10.1007/978-1-4842-9253-2_12',
                    'https://doi.org/10.1007/978-1-4842-9253-2_13', 'https://doi.org/10.1007/978-1-4842-9253-2_14', 'https://doi.org/10.1007/978-1-4842-9253-2_15', 
                    'https://doi.org/10.1007/978-1-4842-9253-2_16', 'https://doi.org/10.1007/978-1-4842-9253-2_17', 
                    'Chapter 1 Evolution of Data Architecture', 
                    'Chapter 2 Terminology: Data Fabric and Data Mesh', 
                    'Chapter 3 Data Fabric Data Mesh Use Case Scenarios', 
                    'Chapter 4 Data Fabric and Data Mesh Business Benefits', 
                    'Chapter 5 Key Data Fabric and Data Mesh Capabilities',
                    'Chapter 6 Relevant ML and DL Concepts',
                    'Chapter 7 AI and ML for a Data Fabric and Data Mesh',
                    'Chapter 8 AI for Entity Resolution',
                    'Chapter 9 Data Fabric and Data Mesh for the AI Lifecycle',
                    'Chapter 10 Data Fabric Architecture Patterns',
                    'Chapter 11 Data Fabric Within an Enterprise Architecture',
                    'Chapter 12 Data Fabric and Data Mesh in a Hybrid Cloud Landscape',
                    'Chapter 13 Intelligent Cataloging and Metadata Management',
                    'Chapter 14 Automated Data Fabric and Data Mesh Aspects',
                    'Chapter 15 Data Governance in the Context of Data Fabric and Data Mesh',
                    'Chapter 16 Sample Vendor Offerings',
                    'Chapter 17 Data Fabric and Data Mesh Research Areas',
                    ' The Author(s), under exclusive license to Springer Nature Switzerland AG 2022',
                    'C. Strauss et al. (Eds.): DEXA 2022, LNCS 13426, pp. 85–102, 2022.',
                    'https://doi.org/10.1007/978-3-031-12423-5_7',
                    'N. J. Podlesny et al.',
                    'CoK: A Survey of Privacy Challenges in Relation to Data Meshes',
                    ' The Author(s), under exclusive license to Springer Nature Switzerland AG 2023',
                    'C. Strauss et al. (Eds.): DEXA 2023, LNCS 14146, pp. 3–18, 2023.',
                    'https://doi.org/10.1007/978-3-031-39847-6_1',
                    'R. Wrembel', 'Data Integration: From Data Warehouse to Data Mesh',
                    'A. Burduk et al. (Eds.): ISPEM 2023, LNNS 790, pp. 300–314, 2023.',
                    'https://doi.org/10.1007/978-3-031-45021-1_23',
                    'Converging Data Mesh and Microservice Principles',
                    'F. Morais et al.'
                   ]

# start_section_abs = 'Abstract'
start_sections = ['Abstract', 'Introduction']
exclude_section = 'References'

springer_pdf_content_dict = {}

# Iterate over PDF files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        # # Specify the page range for certain PDFs (e.g., pages 1 to 5)
        # page_range = (200, 210) if base_name == "978-3-031-36118-0" else None

        # Call the pdf_to_dataframe function for each PDF
        springer_pdf_content = pdf_to_text(pdf_path, start_sections, exclude_section, custom_exclusion)

        # Store the filename and content in the dictionary
        springer_pdf_content_dict[base_name] = springer_pdf_content

In [7]:
springer_pdf_df = pd.DataFrame(list(springer_pdf_content_dict.items()), columns=['Filename', 'Content'])
springer_pdf_df["Publisher"] = 'Springer'
springer_pdf_df

Unnamed: 0,Filename,Content,Publisher
0,01. Evolution of Data Architecture_book chapter,"When you look back in time, data architectures...",Springer
1,02. Terminology Data Fabric and Data Mesh_book...,The terms Data Fabric and Data Mesh are often ...,Springer
2,03. Data Fabric and Data Mesh Use Case Scenari...,Many organizations realized the importance of ...,Springer
3,04. Data Fabric and Data Mesh Business Benefit...,"After defining a Data Fabric, also as an under...",Springer
4,05. Key Data Fabric and Data Mesh Capabilities...,This chapter introduces key capabilities for b...,Springer
5,06. Relevant ML and DL Concepts_book chapter,"to AI, ML, and DLBroadly speaking, AI is a sy...",Springer
6,07. AI and ML for a Data Fabric and Data Mesh_...,Today’s data-rich enterprises are frequently c...,Springer
7,08. AI for Entity Resolution_book chapter,The data used to describe the core business en...,Springer
8,09. Data Fabric and Data Mesh for the AI Lifec...,to the AI LifecycleAI and software engineerin...,Springer
9,10. Data Fabric Architecture Patterns_book cha...,In this chapter we provide a high-level overvi...,Springer


## Research_Gate

In [8]:
"Extract Text from Research_gate"

# Replace 'your_folder_path' with the path to the folder containing PDFs
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\Research_Gate'
# folder_path = r'C:\Users\sitas\OneDrive\Desktop\data_mesh_topics_and_sentiment\dataset\Data_Mesh_Literature\Research_Gate'

# Specify page range, custom exclusion, and exclude section as needed
custom_exclusion = ['\n',
                    'Sarah Choudhary, Prospect Ph.D. from Capitol University in Data Science',
                    ' The Author(s), under exclusive license to Springer Nature Switzerland AG 2022',
                    'Frontiers in Artificial Intelligence','frontiersin.org','Pakrashi et al.',
                    '10.3389/frai.2023.1209507', 'School of Computer Science, University College Dublin, Dublin, Ireland,',
                    'Insight Centre for Data Analytics, Dublin, Ireland, VistaMilk SFI Research Centre, Teagasc Moorepark, Fermoy, Ireland, Accenture Labs, Dublin, Ireland'
                    'Arjun Pakrashi, Duncan Wallace, Brian Mac Namee,',
                    'Derek Greene and Christophe Guéret',
                    'The University of Melbourne, Australia',
                    'Tampere University of Technology, Finland',
                    'The Hong Kong University of Science and Technology, China',
                    'arXiv:2304.01062v1',
                    'Abel and Indika, et al.',
                    'https://agrovoc.fao.org/browse/agrovoc/en/ (visited August 17, 2023).',
                    'http://agrold.org (visited August 17, 2023).',
                    'https://creativecommons.org/licenses/ (visited August 17, 2023).',
                    'https://www.doi.org/ (visited August 17, 2023).',
                    'https://www.icbf.com/ (visited August 17, 2023).',
                    'https://www.ornua.com/ (visited August 17, 2023).',
                    'https://www.teagasc.ie/ (visited August 17, 2023).',
                    'https://pasturebase.teagasc.ie (visited August 17, 2023).',
                    'https://www.icbf.com/the-icbf-database/ (visited August 17, 2023).',
                    'https://www.vistamilk.ie/ (visited August 17, 2023).',
                    '13 https://solidproject.org/ (visited August 17, 2023).',
                    '14 https://www.inrupt.com/ (visited August 17, 2023).',
                    'https://osduforum.org/ (visited August 17, 2023).',
                    'https://www.opengroup.org/ (visited August 17, 2023).',
                    'https://www.insight-centre.org/ (visited September 19, 2023).',
                    'Comma Separated Value file format.',
                    'Manuscript submitted to ACM',
                    'Authors’ addresses: Abel Goedegebuure, a.a.goedegebuure@tilburguniversity.edu; Indika Kumara, i.p.k.weerasinghadewage@tilburguniversity.edu; Stefan Driessen, s.w.driessen@tilburguniversity.edu; Willem-Jan van den Heuvel, w.j.a.m.vdnHeuvel@tilburguniversity.edu, Tilburg University, Warandelaan 2, Tilburg, North Brabant, Netherlands, 5037 AB; Geert Monsieur, g.monsieur@tue.nl; Damian Andrew Tamburri, d.a.tamburri@tue.nl, Eindhoven University of Technology, Groene Loper 3, Eindhoven, North Brabant, Netherlands, 5612 AZ; Dario Di Nucci, ddinucci@unisa.it, University of Salerno, Via Giovanni Paolo II, 132, Fisciano SA, Salerno, Italy, 84084.',
                    'arXiv:2302.01713v1 [cs.AI] 3 Feb 2023',
                    '∗This manuscript does not represent an official IBM statement.',
                    '0VERSION 1.0 as of 02/02/2023',
                    'Authors’ addresses: Jan Bode, jan.bode1@ibm.com, IBM, Germany; Niklas Kühl, kuehl@kit.edu, KIT, Germany; Dominik Kreuzberger, dominik.',
                    'kreuzberger@ibm.com, IBM, Germany; Sebastian Hirschl, sebastian.hirschl@de.ibm.com, IBM, Germany.',
                    'XXX-X-XXXX-XXXX-X/XX/$XX.00 ©20XX IEEE',
                    '1 Hochschule für angewandte Wissenschaften München, Fakultät für Informatik und Mathematik, Lothstr. 34, 80335 München, clara.voss97@gmail.com',
                    'This work is licensed under a Creative Commons Attribution 4.0 International License',
                    'IJARCCE',
                    'ISSN (O) 2278-1021, ISSN (P) 2319-5940',
                    'International Journal of Advanced Research in Computer and Communication Engineering',
                    'DOI: 10.17148/IJARCCE.2023.124121',
                    'ISO 3297:2007 CertifiedImpact Factor 8.102Vol. 12, Issue 4, April 2023',
                    'doi:10.20944/preprints202309.2113.v1',
                    '© 2023 by the author(s). Distributed under a Creative Commons CC BY license.',
                    'Preprints (www.preprints.org) | NOT PEER-REVIEWED | Posted: 3 October 2023',
                    'TYPE Methods',
                    'PUBLISHED 04 October 2023' ,
                    'DOI 10.3389/frai.2023.1209507',
                    'DOI: 10.17148/IJARCCE.2023.12412'
                   ]

# start_section_abs = 'Abstract'
start_sections = ['Accenture Labs, Dublin, Ireland','Abstract', 'Introduction', 'INTRODUCTION', 'The user has requested enhancement of the downloaded file.']
exclude_section = 'References'

research_gate_pdf_content_dict = {}

# Iterate over PDF files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        research_gate_pdf_content = pdf_to_text(pdf_path, start_sections, exclude_section, custom_exclusion)

        # Store the filename and content in the dictionary
        research_gate_pdf_content_dict[base_name] = research_gate_pdf_content

In [9]:
research_gate_pdf_df = pd.DataFrame(list(research_gate_pdf_content_dict.items()), columns=['Filename', 'Content'])
research_gate_pdf_df["Publisher"] = "ResearchGate"
research_gate_pdf_df

Unnamed: 0,Filename,Content,Publisher
0,A Comparative Study of Data Architectures_phd ...,"In this digital age, the quantity of data pro...",ResearchGate
1,Cow Mesh a data-mesh architecture to unify dai...,Dairy is an economically signiﬁcant industry t...,ResearchGate
2,Data Mesh a Systematic Gray Literature Review_...,The world is living in the golden age of data....,ResearchGate
3,Data Mesh Motivational Factors Challenges and ...,"As the volume of data continues to grow, organ...",ResearchGate
4,From Data Mess to Data Mesh Solution for Futur...,": As technology advances, data volume and velo...",ResearchGate
5,Identifying Alternatives and Deciding Factors ...,: The data mesh was introduced in 2019 as a ne...,ResearchGate
6,Navigating the Data Architecture Landscape_pre...,: In the rapidly evolving ﬁeld of data managem...,ResearchGate


## Science_Direct

In [83]:
"Extract Text from Science_direct"

# Replace 'your_folder_path' with the path to the folder containing PDFs
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\Science_Direct'
# folder_path = r'C:\Users\sitas\OneDrive\Desktop\data_mesh_topics_and_sentiment\dataset\Data_Mesh_Literature\Science_Direct'

# Specify page range, custom exclusion, and exclude section as needed
custom_exclusion = ['\n',
                    'The 6th International Conference on Emerging Data and Industry (EDI40),March 15-17, 2023, Leuven, Belgium',
                    ' © 2023 The Authors. Published by Elsevier B.V.',
                    '585 ',
                    '1877-0509   10.1016/j.procs.2023.03.074',
                    '10.1016/j.procs.2023.03.074',
                    '1877-0509',
                    'Keywords: Big Data; Data Mesh; Data Architectures; Data Lake.',
                    'Keywords: cloud computing; data mesh; data-driven architecture; domain concept;edge computing; event-based analytic; fog computing; IoT ; Real-time; Smart Monitoring',
                    '* . E-mail address: inesamachado98@gmail.com ',
                    'E-mail address: inesamachado98@gmail.com',
                    '© 2021 The Authors. Published by Elsevier B.V.',
                    'Peer-review under responsibility of the scientific committee of the Conference Program Chairs',
                    '1877-0509 © 2023 The Authors. Published by Elsevier B.V. This is an open access article under the CC BY-NC-ND license (https://creativecommons.org/licenses/by-nc-nd/4.0) Peer-review under responsibility of the scientific committee of the Conference Program Chairs 10.1016/j.procs.2023.03.074',
                    '∗ Corresponding author. Tel.: +085-919-9090',
                    '© 2020 The Authors. Published by Elsevier B.V.This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)Peer-review under responsibility of the Conference Program Chairs.',
                    'E-mail address: worapol.pon@nida.ac.th',
                    'Corresponding author', 'Inês Araújo Machado et al. / Procedia Computer Science 196 (2022) 263–271','1877-0509 © 2021 The Authors. Published by Elsevier B.V',
                    'This is an open access article under the CC BY-NC-ND license (https://creativecommons.org/licenses/by-nc-nd/4.0)',
                    'Peer-review under responsibility of the scientific committee of the CENTERIS –International Conference on ENTERprise Information',
                    'Systems / ProjMAN - International Conference on Project MANagement / HCist - International Conference on Health and Social Care ',
                    'Information Systems and Technologies 2021',
                    '10.1016/j.procs.2021.12.013',
                    'Worapol Alex Pongpech et al. / Procedia Computer Science 220 (2023) 584–591',
                    'Available online at www.sciencedirect.com',
                    'Procedia Computer Science 196 (2022) 263–271',
                    '1877-0509 © 2021 The Authors. Published by Elsevier B.V.']

start_section = ['Introduction','Abstract']
exclude_section = 'References'


Science_direct_pdf_content_dict = {}

# Iterate over PDF files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]
        
        Science_direct_content = pdf_to_text(pdf_path, start_section, exclude_section, custom_exclusion)

        # Store the filename and content in the dictionary
        Science_direct_pdf_content_dict[base_name] = Science_direct_content

In [84]:
Science_direct_pdf_df = pd.DataFrame(list(Science_direct_pdf_content_dict.items()), columns=['Filename', 'Content'])
Science_direct_pdf_df["Publisher"] = "Elsevier"
Science_direct_pdf_df

Unnamed: 0,Filename,Content,Publisher
0,A Distributed Data Mesh Paradigm for an Event-...,A smart system is a system capable of indepe...,Elsevier
1,Data Mesh Concepts and Principles of a Paradig...,Inherent to the growing use of the most vari...,Elsevier


## Google_Scholar

In [108]:
"Extract Text from Google_scholar"

# Replace 'your_folder_path' with the path to the folder containing PDFs
folder_path = r'C:\Users\mondal\OneDrive - Technische Hochschule Ingolstadt\Work\Projects\Papers\Data Mesh Success Factors\Google_Scholar'
# folder_path = r'C:\Users\sitas\OneDrive\Desktop\data_mesh_topics_and_sentiment\dataset\Data_Mesh_Literature\Google_Scholar'

# Specify page range, custom exclusion, and exclude section as needed
custom_exclusion = ['\n',
                    'Authors’ addresses: Jan Bode, jan.bode1@ibm.com, IBM, Germany; Niklas Kühl,kuehl@uni-bayreuth.de, University of Bayreuth, Germany; Dominik Kreuzberger,dominik.kreuzberger@ibm.com, IBM, Germany; Sebastian Hirschl, sebastian.hirschl@de.ibm.com, IBM, Germany.; Carsten Holtmann, carsten.holtmann@de.ibm.com, IBM,Germany.',
                    'URI: https://hdl.handle.net/10125/103468', '978-0-9981331-6-4', '(CC BY-NC-ND 4.0)',
                   'The research was conducted as part of the Polish Ministry of Education and Science’s Industrial Doctorate Program implemented from 2020 to 2024 (Contract No. DWD/4/24/2020)',
                    '© The Author(s), under exclusive license to Springer Nature Switzerland AG 2023', 'Z. Hu et al. (Eds.): ICCSEEA 2023, LNDECT 181, pp. 3–12, 2023.', 'https://doi.org/10.1007/978-3-031-36118-0_1',
                    ' Y. Tang and G. Li',
                    'arXiv:2302.01713v2',
                   'arXiv:2302.01713v2 [cs.AI] 5 Apr 2023',
                    '[cs.AI]  5 Apr 2023',
                   '∗This manuscript does not represent an official IBM statement',
                   '0VERSION 1.1 as of 05/04/2023',
                   '• Bode and Kühl, et al.',
                   'Authors’ addresses: Jan Bode, jan.bode1@ibm.com, IBM, Germany; Niklas Kühl, kuehl@uni-bayreuth.de, University of Bayreuth, Germany; Dominik Kreuzberger, dominik.kreuzberger@ibm.com, IBM, Germany; Sebastian Hirschl, sebastian.hirschl@ de.ibm.com, IBM, Germany.; Carsten Holtmann, carsten.holtmann@de.ibm.com, IBM, Germany.',
                   'Hochschule für angewandte Wissenschaften München, Fakultät für Informatik und Mathematik, Lothstr. 34, 80335 München, clara.voss97@gmail.com ',
                   'Lappeenranta–Lahti University of Technology LUT',
                   'School of Engineering Science',
                   'Industrial Engineering and Management',
                   'Oskar Vinnikainen',
                   'Master’s thesis',
                   'Examiners: Associate professor Kalle Elfvengren and Post-doctoral researcher Antti YläKujala'
                   ]

start_section_intro = ['Abstract', 'Introduction', 'INTRODUCTION', '1 Introduction']
exclude_section = 'References'

Google_scholar_pdf_content_dict = {}

# Iterate over PDF files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)

        # Get the base name (without extension) of the file
        base_name = os.path.splitext(filename)[0]

        if base_name == "Data Mesh A Holistic Examination Of Its Principles, Practices, And Potential_master thesis":
            page_range = (9, 90) 

        elif base_name == "Utilization of Data Mesh Framework as a Part of Organization's Data Management_master thesis":
            page_range = (6, 56) 
        
        else: 
            page_range = None

        Google_scholar_pdf_content = pdf_to_text(pdf_path, start_section_intro, exclude_section, custom_exclusion, page_range)

        # Store the filename and content in the dictionary
        Google_scholar_pdf_content_dict[base_name] = Google_scholar_pdf_content

In [109]:
Google_scholar_pdf_df = pd.DataFrame(list(Google_scholar_pdf_content_dict.items()), columns=['Filename', 'Content'])
Google_scholar_pdf_df["Publisher"] = "miscellaneous"
Google_scholar_pdf_df

Unnamed: 0,Filename,Content,Publisher
0,Data Mesh A Holistic Examination Of Its Princi...,Today we live in a world where one of the mos...,miscellaneous
1,Introducing Data Mesh paradigm for Smart City ...,The concept of a smart city imposes a uniq...,miscellaneous
2,Towards avoiding the data mess Industry insigh...,"As the volume of data continues to grow, organ...",miscellaneous
3,Utilization of Data Mesh Framework as a Part o...,Digital technology is present in almost every...,miscellaneous


## Merge and Export

In [113]:
combined_df = pd.concat([IEEE_pdf_df,
                         springer_pdf_df,
                         research_gate_pdf_df, 
                         Science_direct_pdf_df, 
                         Google_scholar_pdf_df], ignore_index=True)


combined_df[['Filename', 'Publication_type']] = combined_df['Filename'].str.split('_', expand=True)
combined_df

Unnamed: 0,Filename,Content,Publisher,Publication_type
0,Breaking Down Data Silos Data Mesh to Achieve ...,Abstract— Data localization laws...,IEEE,conference
1,Decentralized Data Governance as Part of a Dat...,Abstract —Data mesh is a socio-technical ...,IEEE,conference
2,Enterprise Data Strategy A Decentralized Data ...,Abstract— As the enterprises experience expone...,IEEE,conference
3,Finding Your Way Through the Jungle of Big Dat...,Abstract—This paper presents a systematic revi...,IEEE,conference
4,01. Evolution of Data Architecture,"When you look back in time, data architectures...",Springer,book chapter
5,02. Terminology Data Fabric and Data Mesh,The terms Data Fabric and Data Mesh are often ...,Springer,book chapter
6,03. Data Fabric and Data Mesh Use Case Scenarios,Many organizations realized the importance of ...,Springer,book chapter
7,04. Data Fabric and Data Mesh Business Benefits,"After defining a Data Fabric, also as an under...",Springer,book chapter
8,05. Key Data Fabric and Data Mesh Capabilities,This chapter introduces key capabilities for b...,Springer,book chapter
9,06. Relevant ML and DL Concepts,"to AI, ML, and DLBroadly speaking, AI is a sy...",Springer,book chapter


In [115]:
combined_df.to_csv('Data_mesh_publications.csv', index=False)