## **PUBMED Dataset Preprocessing**

Cleaning the PubMed Dataset

In [None]:
# NOTE(review): wildcard import — any name used in later cells without an
# explicit import presumably comes from here; confirm and import explicitly.
from src.training_utils import *
import transformers
from datasets import load_dataset
# AutoTokenizer / AutoModelForSeq2SeqLM are not referenced in the cells shown here.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Print the installed transformers version.
print(transformers.__version__)

Load the PubMed summarization dataset from the Hugging Face Hub (`load_dataset` reuses the local copy once it has been downloaded).

In [None]:
pubmed_dataset = load_dataset("ccdv/pubmed-summarization") # Dataset (not a model) loaded from the Hugging Face Hub by its id
print(pubmed_dataset)

Load and inspect the extractive JSONL corpus, cleaning out two examples in the validation set.

In [None]:
import json
from tqdm import tqdm

def load_corpus(fname, is_training):
    """Read a JSON Lines corpus and keep only the usable records.

    Each non-blank line of ``fname`` is parsed as one JSON object. A record is
    dropped when its ``"text"`` or ``"summary"`` field is empty (its line index
    is printed so the dropped rows can be traced). When ``is_training`` is
    true, a record is also dropped, silently, when its ``"indices"`` or
    ``"score"`` field is empty.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        is_training: When true, additionally require non-empty ``"indices"``
            and ``"score"`` fields.

    Returns:
        list: The parsed records that passed the filters, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the fields that is checked.
    """
    corpus = []
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(fname, "r", encoding="utf-8") as f:
        lines = f.readlines()  # Materialized so the progress bar knows the total

    # The file is already closed here; parsing works on the in-memory lines.
    for i, line in tqdm(enumerate(lines), total=len(lines)):
        # Blank lines (e.g. a trailing newline at the end of the file) used to
        # crash json.loads; they carry no record, so skip them.
        if not line.strip():
            continue
        data = json.loads(line)
        if len(data["text"]) == 0 or len(data["summary"]) == 0:
            print(i)  # line index of the dropped record
            continue
        if is_training and (len(data["indices"]) == 0 or len(data["score"]) == 0):
            continue

        corpus.append(data)
    return corpus

# Extractive corpora: `val` is read from the test-split file, `train` from the
# labelled training file (training records additionally need indices/score).
val = load_corpus("src/data/PubMed/test_PUBMED.jsonl", is_training=False)
train = load_corpus("src/data/PubMed/train_PUBMED_labelled.jsonl", is_training=True)


## **Clean Dataset Class**

In [None]:
from torch.utils.data import Dataset

class Clean_Dataset(Dataset):
    """Pair abstractive samples with their extractive counterparts.

    Used to clean the noisy samples from PubMed train: abstractive rows with an
    empty article are removed, and (outside validation mode) every extractive
    sample is matched to the abstractive article that starts with the
    extractive sample's first text entry.

    Indexing returns the tuple ``(abstractive_sample, extractive_sample)``.

    Args:
        abs_dataset: Indexable collection of rows with ``'article'`` and
            ``'abstract'`` keys.
        ext_dataset: Indexable collection of rows whose ``'text'`` entry is a
            non-empty sequence of strings.
        val: When true, skip the alignment step and only filter the
            abstractive rows (see ``preprocess``).
    """

    # Abstractive rows removed in validation mode even though their article is
    # non-empty. NOTE(review): presumably these are the test-split rows whose
    # extractive record was dropped while loading the corpus — confirm.
    VAL_SKIP_INDICES = frozenset({4923, 2320})

    def __init__(self, abs_dataset, ext_dataset, val = False):
        self.abs_dataset = abs_dataset
        self.ext_dataset = ext_dataset
        # Both attributes are replaced by their cleaned versions.
        self.abs_dataset, self.ext_dataset = self.preprocess(val)

    def __len__(self):
        # Only indices present in BOTH lists can be served by __getitem__; in
        # validation mode the two lists are not guaranteed to be equally long.
        return min(len(self.abs_dataset), len(self.ext_dataset))

    def __getitem__(self, idx):
        return self.abs_dataset[idx], self.ext_dataset[idx]

    def compare(self, s1, s2, red = True):
        """Tell whether article ``s1`` begins with the text ``s2``.

        Only the first ``len(s2) + 10`` characters of ``s1`` are considered,
        and both strings are stripped of surrounding whitespace first.

        Args:
            s1: Article text from the abstractive dataset.
            s2: Text expected at the start of the article.
            red: When true, print both strings if they do not match.

        Returns:
            bool: True when the prefix matches, False otherwise.
        """
        head = s1[:len(s2) + 10]
        matches = head.strip().startswith(s2.strip())
        if not matches and red:
            print("\nEXT: \n", s2)
            print("\nABS: \n", head)
        return matches

    def preprocess(self, val = False):
        """Filter the abstractive rows and align them with the extractive ones.

        Step 1 keeps every abstractive row whose article is not the empty
        string (and, when ``val`` is true, whose index is not listed in
        ``VAL_SKIP_INDICES``), reduced to its ``'article'`` and ``'abstract'``
        keys.

        Step 2 (only when ``val`` is false) walks the extractive samples. For
        sample ``i`` the kept abstractive rows are scanned from position ``i``
        onwards and the first row whose article starts with the sample's first
        text entry is paired with it. A sample with no match is dropped so the
        two returned lists stay index-aligned.

        Args:
            val: Validation mode; skips step 2 and returns the extractive
                dataset unchanged.

        Returns:
            tuple: ``(abstractive_rows, extractive_rows)``.
        """
        kept_abs = []
        for idx in tqdm(range(len(self.abs_dataset))):
            row = self.abs_dataset[idx]  # fetch once; indexing may be costly
            if row['article'] == "":
                continue
            if val and idx in self.VAL_SKIP_INDICES:
                continue
            kept_abs.append({"article": row['article'], 'abstract': row['abstract']})

        if val:
            return kept_abs, self.ext_dataset

        aligned_abs = []
        aligned_ext = []
        n_abs = len(kept_abs)
        for idx in range(len(self.ext_dataset)):
            ext_sample = self.ext_dataset[idx]
            first_text = ext_sample['text'][0]
            # Scan from the same position forward. When the extractive list is
            # longer than the kept abstractive list the range is empty and the
            # sample is dropped instead of raising IndexError.
            for pos in range(idx, n_abs):
                candidate = kept_abs[pos]
                # Only the same-position mismatch is reported, as before.
                if self.compare(candidate['article'], first_text, red = (pos == idx)):
                    # Copy so a row matched twice is not shared between pairs.
                    aligned_abs.append(dict(candidate))
                    aligned_ext.append(ext_sample)
                    break

        return aligned_abs, aligned_ext
        

Run only once: Synchronize the abstractive and extractive datasets, and get rid of the missing texts.

In [None]:
# Training pairs go through the article/extract alignment step.
train_new = Clean_Dataset(pubmed_dataset['train'], train, val=False)
# The test split is only filtered (val=True skips alignment).
val_new = Clean_Dataset(pubmed_dataset['test'], val, val=True)
# Number of pairs in each cleaned dataset, one per line.
print(len(train_new), len(val_new), sep="\n")

In [None]:
# Materialize the (abstractive, extractive) pairs so they can be dumped as JSON.
serialized_data_val = [sample for sample in val_new]
# Was built from val_new, which would have saved the validation pairs as the
# training file.
serialized_data_train = [sample for sample in train_new]

### Save the data

In [None]:
# Write each serialized split to its own JSON file.
for out_path, payload in (
    ("src/data/PubMed/Train_ExtAbs_PUBMED.json", serialized_data_train),
    ("src/data/PubMed/Test_ExtAbs_PUBMED.json", serialized_data_val),
):
    with open(out_path, "w") as out_file:
        json.dump(payload, out_file)

### Load the saved dataset by opening the .json file

In [None]:
# Read the saved training pairs back from disk.
with open("src/data/PubMed/Train_ExtAbs_PUBMED.json") as saved_file:
    training_corpus = json.load(saved_file)