In [25]:
import pandas as pd
import re

In [26]:
def parse_medical_abstact_data(filepath: str, encoding: str = 'utf-8'):
    """Parse a labelled medical-abstract text file into a DataFrame.

    The file is read line by line. Blank lines are skipped. A line made up
    solely of one or more ``#`` characters followed by digits (for example
    ``###24491034``) starts a new abstract, and its full text is used as the
    abstract name for the labelled lines that follow. A labelled line is an
    upper-case label, some whitespace, and then the sentence text.

    A labelled line is kept only when its label is one of ``BACKGROUND``,
    ``CONCLUSIONS``, ``METHODS``, ``OBJECTIVE`` or ``RESULTS`` and its text
    splits into at least five whitespace-separated tokens.

    Parameters
    ----------
    filepath : str
        Path of the text file to parse.
    encoding : str, default 'utf-8'
        Text encoding used to open the file. Passing it explicitly keeps the
        result independent of the platform's locale encoding.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``'Abstract Name'``, ``'Text'`` and
        ``'Label'`` (one row per kept line), or ``None`` when an exception
        occurs while reading or parsing; in that case the exception is printed.
        Labelled lines that appear before any abstract header get an empty
        string as their abstract name.
    """
    # Hoisted out of the loop so they are built once per call.
    valid_labels = frozenset({'BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'})
    min_tokens = 5  # the trailing '.' counts as a token, so this means at least four words

    # Raw strings: '\d' and '\s' in a normal string literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    abstract_id_pattern = re.compile(r'#+\d+')
    labelled_line_pattern = re.compile(r'(?P<label>[A-Z]+)\s+(?P<text>.+)')

    text_list = []
    label_list = []
    abstract_list = []
    abstract_num = ''

    try:
        with open(filepath, 'r', encoding=encoding) as file:
            for line in file:
                # Strip surrounding whitespace (including the newline) and skip blank lines
                stripped_line = line.strip()
                if not stripped_line:
                    continue

                # An abstract header updates the current abstract name; it can never
                # also be a labelled line because it starts with '#'.
                if abstract_id_pattern.fullmatch(stripped_line) is not None:
                    abstract_num = stripped_line
                    continue

                # Otherwise try to split the line into its label and its text
                result = labelled_line_pattern.fullmatch(stripped_line)
                if result is None:
                    continue

                label = result['label']
                text = result['text']

                # Keep only known abstract sections whose text is long enough
                if label in valid_labels and len(text.split()) >= min_tokens:
                    abstract_list.append(abstract_num)
                    text_list.append(text)
                    label_list.append(label)

        # Build the dataframe from the collected columns
        data_df = pd.DataFrame({'Abstract Name': abstract_list, 'Text': text_list, 'Label': label_list})

        return data_df

    except Exception as e:
        # Best-effort contract kept for existing callers: report the problem and return None
        print(e)

        return None
        

In [27]:
# Parse the raw training split and preview its first five rows
train_df = parse_medical_abstact_data('Dataset/train.txt')
train_df.head(n=5)

Unnamed: 0,Abstract Name,Text,Label
0,###24491034,The emergence of HIV as a chronic condition me...,BACKGROUND
1,###24491034,This paper describes the design and evaluation...,BACKGROUND
2,###24491034,This study is designed as a randomised control...,METHODS
3,###24491034,The intervention group will participate in the...,METHODS
4,###24491034,The program is based on self-efficacy theory a...,METHODS


In [29]:
# Write the parsed training split to CSV, leaving out the index column
train_df.to_csv('Dataset/train.csv', index=False)

In [30]:
# Parse the raw test split and preview its first five rows
test_df = parse_medical_abstact_data('Dataset/test.txt')
test_df.head(n=5)

Unnamed: 0,Abstract Name,Text,Label
0,###24562799,Many pathogenic processes and diseases are the...,BACKGROUND
1,###24562799,It was recently demonstrated that supraphysiol...,BACKGROUND
2,###24562799,"In the present study , we examined the associa...",BACKGROUND
3,###24562799,"In addition , we compared plasma levels of MAp...",BACKGROUND
4,###24562799,A total of 192 MI patients and 140 control per...,METHODS


In [31]:
# Write the parsed test split to CSV, leaving out the index column
test_df.to_csv('Dataset/test.csv', index=False)

In [32]:
# Parse the raw dev (validation) split and preview its first five rows
dev_df = parse_medical_abstact_data('Dataset/dev.txt')
dev_df.head(n=5)

Unnamed: 0,Abstract Name,Text,Label
0,###15337700,Adrenergic activation is thought to be an impo...,BACKGROUND
1,###15337700,Systemic venous norepinephrine was measured at...,RESULTS
2,###15337700,Baseline norepinephrine level was associated w...,RESULTS
3,###15337700,"On multivariate analysis , baseline norepineph...",RESULTS
4,###15337700,"In contrast , the relation of the change in no...",RESULTS


In [33]:
# Write the parsed dev split to CSV, leaving out the index column
dev_df.to_csv('Dataset/dev.csv', index=False)