In [1]:
from collections import defaultdict
import os
import random
import re
from tqdm import tqdm
from typing import List

from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import pandas as pd
from rouge import Rouge

In [2]:
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [2]:
PERMITTED_TITLES_SOURCE = "scientific-paper-summarisation/Data/Utility_Data/permitted_titles.txt"
non_content_keys = ['MAIN-TITLE', 'HIGHLIGHTS', 'KEYPHRASES', 'ABSTRACT', 'ACKNOWLEDGEMENTS', 'REFERENCES']
stop_words = set(stopwords.words('english'))

In [3]:
def preprocess_sentence(sentence, filter_sentence=True):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.
    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    
    if filter_sentence:
        sentence = sentence.lower()
        word_tokens = word_tokenize(sentence)

        # Remove stopwords from sentence
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words and w.isalnum()]
        return filtered_sentence
    
    # Remove all line endings, multiple whitespace etc. from sentence
    cleaned_sentence = ' '.join(sentence.split())    
    return cleaned_sentence

def paper_tokenize(text, sentences_as_lists=False, preserve_order=False):
    """
    Takes a paper with the sections delineated by '@&#' and splits them into a dictionary where the key is the section
    and the value is the text under that section. This could probably be a bit more efficient but it works well enough.
    :param text: the text of the paper to split
    :param sentences_as_lists: if true, returns the text of each section as a list of sentences rather than a single
                               string.
    :param preserve_order: if true, tracks the order in which the paper sections occured.
    :returns: a dictionary of the form (section: section_text)
    """
    with open(PERMITTED_TITLES_SOURCE, "r") as pt:
        permitted_titles = pt.read().split("\n")

    # Split the text into sections
    if preserve_order:
        split_text_1 = re.split("@&#", text)
        split_text = zip(split_text_1, range(len(split_text_1)))
    else:
        split_text = re.split("@&#", text)

    # The key value. This value is changed if a permitted section title is encountered in the list.
    state = ""

    # After the for loop, this dictionary will have keys relating to each permitted section, and values corresponding
    # to the text of that section
    sentences_with_states = defaultdict(str)
    sentences = defaultdict(str)

    section_counts = defaultdict(int)
    
    paper_abstract = ""

    if preserve_order:
        for text, pos in split_text:

            # Hack for proper sentence tokenization because NLTK tokeniser doesn't work properly for tokenising papers
            text = text.replace("etal.", "etal")
            text = text.replace("et al.", "etal")
            text = text.replace("Fig.", "Fig")
            text = text.replace("fig.", "fig")
            text = text.replace("Eq.", "Eq")
            text = text.replace("eq.", "eq")
            text = text.replace("pp.", "pp")
            text = text.replace("i.e.", "ie")
            text = text.replace("e.g.", "eg")
            text = text.replace("ref.", "ref")
            text = text.replace("Ref.", "Ref")
            text = text.replace("etc.", "etc")
            text = text.replace("Figs.", "Figs")
            text = text.replace("figs.", "figs")
            text = text.replace("No.", "No")
            text = text.replace("eqs.", "eqs")

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if sentences_as_lists:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = ([preprocess_sentence(x) for x in sent_tokenize(text)], pos)
                    sentences[state] = [preprocess_sentence(x, filter_sentence=False) for x in sent_tokenize(text)]
                else:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = (text, pos)
            if state == "ABSTRACT":
                paper_abstract = text.strip()

    return sentences, sentences_with_states, paper_abstract

In [4]:
def get_paper_as_words(tokenized_paper):
    all_words = []
    for key in tokenized_paper.keys():
        
        # For every paper section that contains content information,
        # retrieve words
        if key not in non_content_keys:
            section_content = tokenized_paper[key]
            [all_words.extend(s) for s in section_content[0]]
            
    return all_words

In [5]:
def compute_sentence_weight(sentence: List, text: List) -> int:
    
    sentence_score = 0
    
    # Iterate over all words in the sentence
    for word in sentence:
        
        # Get word synsets
        word_synsets = wn.synsets(word)
        
        # If word has synsets (i.e., is known by wordnet), continue
        best_synset_score = 0
        for synset in word_synsets:
            
            # Get and tokenize gloss and remove stopwords and punctuation
            filtered_gloss = preprocess_sentence(synset.definition())
            
            # Compute score
            score = sum(1 for def_word in filtered_gloss if def_word in text)
            if score > best_synset_score:
                best_synset_score = score
                
        # Update sentence score
        sentence_score += best_synset_score

    return sentence_score

In [11]:
def summarize_paper(tokenized_paper, paper_sentences, nr_sentences=5):
    sentence_weights = []
    
    # Get word representation of paper
    paper_words = get_paper_as_words(tokenized_paper)
    
    print(tokenized_paper.keys())
    
    for section in tokenized_paper.keys():
        
        if section not in non_content_keys:
            section_content = tokenized_paper[section]
            section_sentences = paper_sentences[section]
            
            for tok_sentence, orig_sentence in zip(section_content[0], section_sentences):
                
                # Compute sentence weight and store with sentence
                sentence_weight = compute_sentence_weight(tok_sentence, paper_words)
                sentence_weights.append((orig_sentence, sentence_weight))
            
    # Create a dataframe of all sentences and sort descending by weight
    sentence_weights = pd.DataFrame(sentence_weights, columns=['sentence', 'weight'])
    sentence_weights.sort_values(by=['weight'], ascending=False, inplace=True)
    
    summary = sentence_weights.head(nr_sentences)['sentence'].values
    
    string_summary = ' '.join(summary)
    
    
    return string_summary

In [7]:
rouge = Rouge()

def compute_metrics(paper_abstract: np.array, generated_summary: np.array):
    rouge_scores = rouge.get_scores(generated_summary, paper_abstract, avg=True)
    print(rouge_scores)
#     return rouge_scores['rouge-1'].values(), rouge_scores['rouge-2'].values(), rouge_scores['rouge-l'].values(),

In [12]:
PAPER_PATH = 'scientific-paper-summarisation/Data/Parsed_Papers'
paper_file_names = os.listdir('scientific-paper-summarisation/Data/Parsed_Papers')

# Define desired number of sentences for a summary
NR_OF_SENTENCES = 5

ground_truth_summaries = np.empty(len(paper_file_names[1:4]), dtype='object')
generated_summaries = np.empty(len(paper_file_names[1:4]), dtype='object')

for i, paper_file_name in tqdm(enumerate(paper_file_names[0:3])):
    print(paper_file_name)
    
    # Read paper file
    filepath = f'{PAPER_PATH}/{paper_file_name}'
    with open(filepath, "r") as paper_file:
        paper_content = paper_file.read()
    
    # Tokenize paper into sentences (and sentences into separate words) and get paper abstract
    paper_sentences, tokenized_paper, paper_abstract = paper_tokenize(paper_content, sentences_as_lists=True, preserve_order=True)
    ground_truth_summaries[i] = paper_abstract
    
    # Summarize paper
    generated_paper = summarize_paper(tokenized_paper, paper_sentences, NR_OF_SENTENCES)
    generated_summaries[i] = generated_paper
    
# Compute ROUGE scores
# compute_metrics(ground_truth_summaries, generated_summaries)
print(generated_summaries)

0it [00:00, ?it/s]

S0003687014000994.txt
dict_keys(['', 'MAIN-TITLE', 'HIGHLIGHTS', 'KEYPHRASES', 'ABSTRACT', 'INTRODUCTION', 'BACKGROUND', 'METHOD', 'RESULTS', 'DISCUSSION', 'CONCLUSION', 'ACKNOWLEDGEMENTS', 'REFERENCES'])


3it [00:06,  2.11s/it]

S0377221714005463.txt
dict_keys(['', 'MAIN-TITLE', 'HIGHLIGHTS', 'KEYPHRASES', 'ABSTRACT'])
S0377221715003756.txt
dict_keys(['', 'MAIN-TITLE', 'HIGHLIGHTS', 'KEYPHRASES', 'ABSTRACT'])
["We observed the following issues: • Patient wristband not present as it has fallen off, eg in the shower Patient wristband not present as the patient has removed it Patient wristband kept outside the room as they refused to wear it Patient barcode doesn't scan as it has been printed badly or is incomplete Patient barcode doesn't scan as the wristband has been put on awkwardly Patient slept on the arm with the wristband so they were disturbed and needed to move so staff could access it Patients sometimes manipulate their arm awkwardly so the barcode and scanner can be aligned In terms of the patient experience the last two were most noticeably, and the third issue suggests that at least some patients do not like wearing the wristband. An example of an innovation is thinking not only about individual gluc




In [61]:
hypothesis = ["hello my name is djesse. test sentence", "hello my name is djesse. test sentence" ]
reference = ["test sentence. hello my name is djesse", "test sentence. hello my name is djesse"]

hypothesis[0:2]

['hello my name is djesse. test sentence',
 'hello my name is djesse. test sentence']

In [76]:
paper_file_names[0:3]

['S0003687014000994.txt', 'S0377221714005463.txt', 'S0377221715003756.txt']