In [1]:
# import PyPDF2
import docx
import pandas as pd

# Display settings for cell output: no limit on shown rows or columns,
# and up to 500 characters per cell value.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(option_name, option_value)

In [2]:
def extract_text_from_docx(docx_path): # extract text
    """Return the text of all non-empty paragraphs of a .docx file as one string.

    Paragraph texts are joined with a single space; leading and trailing
    whitespace of the combined string is removed.
    """
    document = docx.Document(docx_path)
    # paragraphs with empty text are skipped so they do not add extra spaces
    non_empty_texts = (para.text for para in document.paragraphs if para.text)
    return " ".join(non_empty_texts).strip()


def extract_data_from_table(docx_path, marker): # create contents
    """Collect the cell texts of every table row in a .docx file.

    Rows of all tables are gathered in the order `doc.tables` yields them,
    each as a list of cell strings. If a row equal to `marker` is present,
    that row and everything after its first occurrence are dropped;
    otherwise all rows are returned.
    """
    document = docx.Document(docx_path)
    rows = [
        [cell.text for cell in row.cells]
        for table in document.tables
        for row in table.rows
    ]

    # cut the list at the first row matching the marker (marker row included)
    if marker in rows:
        del rows[rows.index(marker):]

    return rows


def process_contents_df(contents_data):
    df = pd.DataFrame(contents_data, columns=['Article_Name', 'Page'])

    # split 'Article_Name' into 'Authors' and 'Article_Name'
    df['Authors'], df['Article_Name'] = df['Article_Name'].str.rsplit('.', 1).str
    # clean spaces
    df['Article_Name'] = df['Article_Name'].str.strip()
    # add period
    df['Authors'] = df['Authors'] + '.'
    # remove rows where 'Article_Name' is NaN
    df = df.dropna(subset=['Article_Name'])
    # replace '\n' with spaces
    df['Article_Name'] = df['Article_Name'].replace('\n', ' ', regex=True)
    # reset index
    df.reset_index(drop=True, inplace=True)

    return df


def find_article_text(text, start, end=None):
    """Return the part of `text` that follows `start`, up to `end` if given.

    Parameters:
        text:  the full document text to search.
        start: string marking the beginning of the article; the returned
               text begins right after its first occurrence.
        end:   optional string marking the end of the article; it is searched
               for only after `start`. When omitted (or empty), everything
               after `start` is returned.

    Returns:
        The extracted substring, or None when `start` is not found, or when
        `end` is given but does not occur after `start`.
    """
    start_index = text.find(start)
    if start_index == -1:
        return None # no article found

    start_index += len(start)

    if not end:
        return text[start_index:]

    end_index = text.find(end, start_index)
    if end_index == -1:
        # No end of the article found. This branch used to evaluate
        # `start_index + 'ERROR'` (int + str), which raised TypeError;
        # report the failed extraction as None instead.
        return None
    return text[start_index:end_index]
    

def articles_text_to_df(contents_df, text_from_docx, end_marker='References:'):
    """Return a copy of `contents_df` with a 'Text' column of article bodies.

    For every row, the upper-cased 'Article_Name' is used as the start
    string and `end_marker` as the end string for `find_article_text`, which
    searches `text_from_docx`. Whatever it returns (text or None) is stored
    in 'Text'. The input frame is not modified.
    """
    df = contents_df.copy()

    # Iterate over the values rather than over range(len(df)) with label
    # lookups, so the result is correct for any index, not only 0..n-1.
    texts = [
        find_article_text(text_from_docx, article_name.upper(), end_marker)
        for article_name in df['Article_Name']
    ]
    # object dtype keeps None entries as None, as the cell-by-cell fill did
    df['Text'] = pd.Series(texts, index=df.index, dtype=object)

    return df

In [5]:
# Source documents to parse (paths are relative).
all_docx = ["data/2022_satellite.docx", "data/2021_satellite.docx"]

# Table row that ends the contents listing in each document, selected by a
# substring of the file path. extract_data_from_table drops this row and
# every row after it. Paths matching no key get no marker (all rows kept).
end_of_contents_markers = {
    '2022': ['Authors', '180'],
    '2021': ['Figure 1: Centralized network', 'Figure 2: Decentralized network'],
}

# Collect one frame per document and concatenate once after the loop,
# instead of repeatedly concatenating onto a growing (initially empty) frame.
articles_frames = []

for docx_path in all_docx:
    text_from_docx = extract_text_from_docx(docx_path)

    # first key contained in the path wins, in the order listed above
    marker = next(
        (row for key, row in end_of_contents_markers.items() if key in docx_path),
        None,
    )

    contents_data = extract_data_from_table(docx_path, marker)
    contents_df = process_contents_df(contents_data)
    articles_df = articles_text_to_df(contents_df, text_from_docx)

    articles_frames.append(articles_df)

# pd.concat rejects an empty list, so fall back to an empty frame
if articles_frames:
    total_df = pd.concat(articles_frames, ignore_index=True)
else:
    total_df = pd.DataFrame()


  df['Authors'], df['Article_Name'] = df['Article_Name'].str.rsplit('.', 1).str


In [8]:
# preview the first five rows of the combined articles table
total_df.head(n=5)

Unnamed: 0,Article_Name,Page,Authors,Text
0,Internet of Things Eco-System Concept for Healthy Eating,7,"Avramenko К., Latysheva Т.","Abstract. A healthy lifestyle is a way of living that lower the risk of being ill or dying early. The article describes how the Internet of Things can improve everyday life, including nutrition and health monitoring. Keywords: Healthy Eating, nutrition, іnternet of Things, information technology, lifestyle INTRODUCTION Ukraine is among the top ten countries with the highest mortality risk from an unbalanced diet, according to a study by The Lancet magazine [1]. Among the main reasons are ex..."
1,Development of a Mathematical Model Determination of the Proximity of Vectors in the Project-Vector Space,11,"Biloshchytskyi A., Kuchansky A., Biloshchytska S., Andrashko Yu., Faizullin A.","As a result of the decomposition of the subject area, the classification of projects of educational environments is made, and the task of optimal management in the design-vector space is formulated. The research enables us to proceed to formalization of educational environment management processes. We will define the basic definition of this part of the research. Definition 1. By project-vector management of educational environments, we will understand the implementation of functions that e..."
2,Investigation of the Algorithm of Near Search of the Compromise Median in the Problem of Collective Ranking of Objects,14,"Bovsunovska M., Hnatiienko H.","Abstract. Many works are devoted to the problem of creating efficient algorithms for solving problems of high dimensionality, when the brute-force search methods become too time-consumin. This work is also devoted to the development of an efficient algorithm in the field of expert evaluation problems, namely, the algorithm for solving the problem of collective evaluation coordination. The main goal of this work is to implement in software the algorithm for the near search of the compromice ..."
3,Analysis of Host Detection Methods,18,"Buchyk S., Saroka S.","Abstract. New malicious software fills the Internet every day. In this regard, the protection of personal data is currently an important task. The first stage of many computer attacks is intelligence, and one of its mechanisms is port scanning, which allows an attacker to find out which services are running on the desired system. It means that in the future it can prepare and conduct an attack against the detected services and their vulnerabilities. This article provides an information and ..."
4,Symmetric Encryption of Messages Based on the Use of Images,21,"Buchyk S., Tsapro D.","Abstract. This work is devoted to the problem of message transmission protection in an unsecured communication channel. It presents possible enhancements of message transmission protection by symmetric encryption creating image arrays using one-way math operations to mix public and private keys and a modified Diffie-Hellman method as a way to exchange private keys in an insecure channel. Keywords: images, Diffie–Hellman method, symmetric encryption, rgb. Nowadays the development of informat..."
