In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Function to split one corpus line into its type marker and text
def process_line(line):
    """Split a ``<identifier>|<type>|<content>`` line into ``(type, content)``.

    Args:
        line: One raw line of text, with or without its trailing newline.

    Returns:
        A ``(type, content)`` tuple of strings when the line has at least two
        ``|`` separators, otherwise ``None``. ``content`` has its trailing
        line terminator removed; any ``|`` characters inside it are preserved.
    """
    # Split at most twice: only the first two pipes are field separators, so
    # a '|' that appears inside the content must not make the line look
    # malformed (which previously caused such lines to be dropped).
    parts = line.split('|', 2)
    if len(parts) != 3:
        return None
    # Avoid naming a local `type`, which would shadow the builtin.
    _identifier, line_type, content = parts
    # Remove only the line terminator left over from reading the file.
    return line_type, content.rstrip('\r\n')


In [None]:
# Load the corpus into memory as a list of raw lines (line endings kept)
file_path = 'NCBItestset_corpus.txt'

with open(file_path, mode='r') as file:
    lines = list(file)

In [None]:
# Build column-wise lists of titles, abstracts, and a running abstract number.
# Each abstract is paired with the most recently seen title line.
data = {'Title': [], 'Abstract': [], 'Abstract Number': []}
current_title = ''
abstract_number = 1  # Abstracts are numbered from 1 in file order

for line in lines:
    result = process_line(line)
    if not result:
        # Line did not have the <identifier>|<type>|<content> shape
        continue
    # Named `line_type` rather than `type`: binding `type` at notebook level
    # would replace the builtin for every cell executed afterwards.
    line_type, content = result
    if line_type == 't':
        current_title = content
    elif line_type == 'a':
        data['Title'].append(current_title)
        data['Abstract'].append(content)
        data['Abstract Number'].append(abstract_number)
        abstract_number += 1  # Next abstract gets the next number

In [None]:
# Turn the collected column lists into a DataFrame (one row per abstract)
df1 = pd.DataFrame(data=data)

In [None]:
def process_file_for_tables_with_abstract_number(file_path):
    """Extract the tab-separated rows that follow each abstract in a corpus file.

    The file is read line by line. A line of the form ``<id>|t|<text>`` starts
    a new document and increments the abstract counter; a line of the form
    ``<id>|a|<text>`` marks the start of that document's table section. Every
    following non-blank line with at least two tab-separated fields is
    collected as a table row until the next title line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per table line. Field columns are
        labelled ``0 .. k-1`` (all values are strings) and the last column,
        ``'Abstract Number'``, holds the 1-based index of the document the row
        belongs to. If rows have different numbers of fields, shorter rows are
        padded with ``None``. If no table rows are found, an empty DataFrame
        with only the ``'Abstract Number'`` column is returned.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    rows = []              # (fields, abstract_number) pairs
    abstract_number = 0    # Incremented on each title, so the first document is 1
    in_table = False       # True once the current document's abstract has been seen

    for line in lines:
        # A title/abstract line is "<id>|t|..." or "<id>|a|...". Checking the
        # marker position (instead of searching for '|t|' / '|' anywhere)
        # keeps table rows whose fields contain '|' and stops abstract text
        # containing '|t|' from being counted as a new title.
        head = line.split('|', 2)
        is_header = (
            len(head) == 3
            and '\t' not in head[0]      # a table row has tabs before any '|'
            and head[1] in ('t', 'a')
        )

        if is_header and head[1] == 't':
            in_table = False
            abstract_number += 1
        elif is_header:
            in_table = True
        elif in_table and line.strip():
            fields = line.strip().split('\t')
            if len(fields) > 1:  # Ignore stray single-field lines
                rows.append((fields, abstract_number))

    if not rows:
        # Nothing to tabulate; avoid indexing into an empty list.
        return pd.DataFrame(columns=['Abstract Number'])

    # Pad to the widest row so the abstract number always lands in its own
    # column even when some rows have fewer fields than others.
    width = max(len(fields) for fields, _ in rows)
    table_data = [
        fields + [None] * (width - len(fields)) + [number]
        for fields, number in rows
    ]

    df_tables = pd.DataFrame(table_data, columns=[*range(width), 'Abstract Number'])

    return df_tables

# Parse the corpus file into a DataFrame of table rows tagged with abstract numbers
df_tables = process_file_for_tables_with_abstract_number(file_path=file_path)


In [None]:
# Give the positional columns 3, 4 and 5 descriptive names.
# Reassign instead of using inplace=True so the cell produces a new frame
# rather than mutating an object created by an earlier cell.
df_tables = df_tables.rename(
    columns={3: 'Entity_name', 4: 'Entity_class', 5: 'Mesh_info'}
)


In [None]:
# `df` is a second name for the same DataFrame object as `df_tables` (no copy is made)
df = df_tables

In [None]:
# Write the table to an Excel workbook, omitting the DataFrame index column
df.to_excel(excel_writer='NER_answers.xlsx', index=False)