In [1]:
# import PyPDF2
import docx
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [2]:
def extract_text_from_docx(docx_path): # extract text
    text = ""
    doc = docx.Document(docx_path)
    for paragraph in doc.paragraphs:
        if paragraph.text: 
            text += paragraph.text + " "  
    return text.strip() 


def extract_data_from_table(docx_path, marker): # create contents
    doc = docx.Document(docx_path)
    table_data = []
    
    for table in doc.tables:
        for row in table.rows:
            row_data = [cell.text for cell in row.cells]
            table_data.append(row_data)

    # remove elements after marker
    try:
        index_to_remove = table_data.index(marker)
        table_data[index_to_remove:] = []
    except ValueError:
        pass

    return table_data


def process_contents_df(contents_data):
    df = pd.DataFrame(contents_data, columns=['Article_Name', 'Page'])

    # split 'Article_Name' into 'Authors' and 'Article_Name'
    df['Authors'], df['Article_Name'] = df['Article_Name'].str.rsplit('.', 1).str
    # clean spaces
    df['Article_Name'] = df['Article_Name'].str.strip()
    # add period
    df['Authors'] = df['Authors'] + '.'
    # remove rows where 'Article_Name' is NaN
    df = df.dropna(subset=['Article_Name'])
    # replace '\n' with spaces
    df['Article_Name'] = df['Article_Name'].replace('\n', ' ', regex=True)
    # reset index
    df.reset_index(drop=True, inplace=True)

    return df


def find_article_text(text, start, end=None):
    start_index = text.find(start)
    if start_index == -1:
        return None # no article found

    start_index += len(start)
    
    if end:
        end_index = text.find(end, start_index)
        if end_index == -1:
            return start_index + 'ERROR' # no end of the article found
        return text[start_index:end_index]
    else:
        return text[start_index:]
    

def articles_text_to_df(contents_df, text_from_docx, end_marker='References:'):
    df = contents_df.copy()
    df['Text'] = None

    for i in range(len(df)):
        start = df['Article_Name'][i].upper()
        df.at[i, 'Text'] = find_article_text(text_from_docx, start, end_marker)

    return df

In [3]:
# docx_path = "data/2022_satellite.docx"
docx_path = "data/2021_satellite.docx"
# docx_path = "data/2020_satellite_1.docx"
# docx_path = "data/2020_satellite_2.docx"

text_from_docx = extract_text_from_docx(docx_path)

if '2022' in docx_path:
    marker = ['Authors', '180']
elif '2021' in docx_path:
    marker = ['Figure 1: Centralized network', 'Figure 2: Decentralized network']
else:
    marker = None

In [4]:
contents_data = extract_data_from_table(docx_path, marker)
# text_from_docx
# contents_data
contents_df = process_contents_df(contents_data)
articles_df = articles_text_to_df(contents_df, text_from_docx)

  df['Authors'], df['Article_Name'] = df['Article_Name'].str.rsplit('.', 1).str


In [5]:
articles_df.head()

Unnamed: 0,Article_Name,Page,Authors,Text
0,Development and Research of the Intelligent Technology for Predicting the Popularity of Animals From the Shelter,6,"Antonevych M., Snytyuk V.","Abstract. Millions of stray animals are injured on the streets every day around the world. So, what can be done, to help these animals to get new home? Scientists and researchers can suggest improvements that will help shelter animals to have a higher chance to be adopted with the help of data science. One of the solutions for this problem can be called one that will also use artificial intelligence. We intend to use a computer vision apparatus based on neural networks. Keywords: Computer v..."
1,Identification of Critical Information Infrastructure Objects,9,"Antoniuk V., Parkhomenko I.","Abstract. To date, natural and man-made threats, the level of terrorism, the scale and complexity of cyberattacks have increased significantly. And the number of cyberattacks aimed at impressing various areas of critical infrastructure is growing steadily. These situations have increased the urgency of the problem of protection of critical infrastructure, especially information and communication technologies, which are strategically important for the existence and functioning of our state, ..."
2,Classification Models in E-Commerce Projects,11,"Bakukha N., Khlevna Iu.","Abstract. Reporting the relevance of the application of opportunities, modern analytics technologies in e-commerce projects. The classification model for the online store has been developed. The model is based on the method of k-nearest neighbors. There are 4 classes of customers: customers with numerous transactions compared to other customers, regular customers, customers who made their transactions much earlier than other customers, regular customers with medium and low check amounts. Ke..."
3,Decentralized Network in IoT Systems,13,"Bidochka V., Paliy S.","Keywords: IoT, decentralized networks, centralized networks, networks, P2P. Introduction The Internet of Things (IoT) describes the network of physical objects — “things” — that are embedded with sensors, software, and other technologies for the purpose of connecting and exchanging data with other devices and systems over the internet. IoT systems are becoming more and more popular these days. They are used in different ways and do different tasks. There’s a lot of noise at the moment about..."
4,BadUSB: Overview of the Possible Attacks With the Usage of Arduino Board,16,"Buchyk S., Kosse A.","Abstract. The introduced work covers the most feasible attacks that can be executed against the personal computers that have been left unblocked. As in most other cases: badUSB attacks still requires a lot of social engineering to work properly. Keywords: BadUSB, Arduino, HID attack, Kali Linux. Over the years, social engineering and the use of USB sticks for an attack have shown to be quite effective. People have been repeatedly warned not to connect mass storage devices that they have not..."
