In [1]:
from collections import defaultdict
import os
import random
import re
from tqdm import tqdm
from typing import List

from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import pandas as pd
from rouge import Rouge

In [2]:
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [9]:
PERMITTED_TITLES_SOURCE = "scientific-paper-summarisation/Data/Utility_Data/permitted_titles.txt"
non_content_keys = ['MAIN-TITLE', 'HIGHLIGHTS', 'KEYPHRASES', 'ABSTRACT', 'ACKNOWLEDGEMENTS', 'REFERENCES']
stop_words = set(stopwords.words('english'))

In [13]:
def preprocess_sentence(sentence, filter_sentence=True):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.
    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    
    if filter_sentence:
        sentence = sentence.lower()
        word_tokens = word_tokenize(sentence)

        # Remove stopwords from sentence
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words and w.isalnum()]
        return filtered_sentence
    
    # Remove all line endings, multiple whitespace etc. from sentence
    cleaned_sentence = ' '.join(sentence.split())    
    return cleaned_sentence

def paper_tokenize(text, sentences_as_lists=False, preserve_order=False):
    """
    Takes a paper with the sections delineated by '@&#' and splits them into a dictionary where the key is the section
    and the value is the text under that section. This could probably be a bit more efficient but it works well enough.
    :param text: the text of the paper to split
    :param sentences_as_lists: if true, returns the text of each section as a list of sentences rather than a single
                               string.
    :param preserve_order: if true, tracks the order in which the paper sections occured.
    :returns: a dictionary of the form (section: section_text)
    """
    with open(PERMITTED_TITLES_SOURCE, "r") as pt:
        permitted_titles = pt.read().split("\n")

    # Split the text into sections
    if preserve_order:
        split_text_1 = re.split("@&#", text)
        split_text = zip(split_text_1, range(len(split_text_1)))
    else:
        split_text = re.split("@&#", text)

    # The key value. This value is changed if a permitted section title is encountered in the list.
    state = ""

    # After the for loop, this dictionary will have keys relating to each permitted section, and values corresponding
    # to the text of that section
    sentences_with_states = defaultdict(str)
    sentences = defaultdict(str)

    section_counts = defaultdict(int)
    
    paper_abstract = ""
    sentence_index = 0
    
    if preserve_order:
        for text, pos in split_text:

            # Hack for proper sentence tokenization because NLTK tokeniser doesn't work properly for tokenising papers
            text = text.replace("etal.", "etal")
            text = text.replace("et al.", "etal")
            text = text.replace("Fig.", "Fig")
            text = text.replace("fig.", "fig")
            text = text.replace("Eq.", "Eq")
            text = text.replace("eq.", "eq")
            text = text.replace("pp.", "pp")
            text = text.replace("i.e.", "ie")
            text = text.replace("e.g.", "eg")
            text = text.replace("ref.", "ref")
            text = text.replace("Ref.", "Ref")
            text = text.replace("etc.", "etc")
            text = text.replace("Figs.", "Figs")
            text = text.replace("figs.", "figs")
            text = text.replace("No.", "No")
            text = text.replace("eqs.", "eqs")

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if sentences_as_lists:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = ([preprocess_sentence(x) for x in sent_tokenize(text)], pos)
                    
                    sentence_storage = []
                    for x in sent_tokenize(text):
                        sentence_storage.append((preprocess_sentence(x, filter_sentence=False), sentence_index))
                        sentence_index+=1
                    sentences[state] = sentence_storage
            if state == "ABSTRACT":
                paper_abstract = text.strip()

    return sentences, sentences_with_states, paper_abstract

In [14]:
def get_paper_as_words(tokenized_paper):
    all_words = []
    for key in tokenized_paper.keys():
        
        # For every paper section that contains content information,
        # retrieve words
        if key not in non_content_keys:
            section_content = tokenized_paper[key]
            [all_words.extend(s) for s in section_content[0]]
            
    return all_words

In [15]:
def compute_sentence_weight(sentence: List, text: List) -> int:
    
    sentence_score = 0
    
    # Iterate over all words in the sentence
    for word in sentence:
        
        # Get word synsets
        word_synsets = wn.synsets(word)
        
        # If word has synsets (i.e., is known by wordnet), continue
        best_synset_score = 0
        for synset in word_synsets:
            
            # Get and tokenize gloss and remove stopwords and punctuation
            filtered_gloss = preprocess_sentence(synset.definition())
            
            # Compute score
            score = sum(1 for def_word in filtered_gloss if def_word in text)
            if score > best_synset_score:
                best_synset_score = score
                
        # Update sentence score
        sentence_score += best_synset_score

    return sentence_score

In [30]:
def summarize_paper(tokenized_paper, paper_sentences, nr_sentences=5):
    sentence_weights = []
    
    # Get word representation of paper
    paper_words = get_paper_as_words(tokenized_paper)
    
    for section in tokenized_paper.keys():
        
        if section not in non_content_keys:
            section_content = tokenized_paper[section]
            section_sentences = paper_sentences[section]
            
            for tok_sentence, orig_sentence in zip(section_content[0], section_sentences):
                
                # Compute sentence weight and store with sentence
                sentence_weight = compute_sentence_weight(tok_sentence, paper_words)
                sentence_weights.append((orig_sentence[0], orig_sentence[1], sentence_weight))
            
    # Create a dataframe of all sentences and sort descending by weight
    sentence_weights = pd.DataFrame(sentence_weights, columns=['sentence', 'index', 'weight'])
    sentence_weights.sort_values(by=['weight'], ascending=False, inplace=True)
    
    # Select desired number of sentences and sort by order of occurence in text
    summary = sentence_weights.head(nr_sentences).sort_values(by=['index'])['sentence'].values
    
    # Join selected strings into a summary
    string_summary = ' '.join(summary)
    
    return string_summary

In [7]:
rouge = Rouge()

def compute_metrics(paper_abstract: np.array, generated_summary: np.array):
    rouge_scores = rouge.get_scores(generated_summary, paper_abstract, avg=True)
    print(rouge_scores)
#     return rouge_scores['rouge-1'].values(), rouge_scores['rouge-2'].values(), rouge_scores['rouge-l'].values(),

In [35]:
PAPER_PATH = 'scientific-paper-summarisation/Data/Parsed_Papers'
paper_file_names = os.listdir('scientific-paper-summarisation/Data/Parsed_Papers')

# Define desired number of sentences for a summary
NR_OF_SENTENCES = 5

ground_truth_summaries = np.empty(1, dtype='object')
generated_summaries = np.empty(1, dtype='object')

for i, paper_file_name in tqdm(enumerate(paper_file_names[0:1])):
    print(paper_file_name)
    
    # Read paper file
    filepath = f'{PAPER_PATH}/{paper_file_name}'
    with open(filepath, "r") as paper_file:
        paper_content = paper_file.read()
    
    # Tokenize paper into sentences (and sentences into separate words) and get paper abstract
    paper_sentences, tokenized_paper, paper_abstract = paper_tokenize(paper_content, sentences_as_lists=True, preserve_order=True)
    ground_truth_summaries[i] = paper_abstract
    
    # Summarize paper
    generated_summary = summarize_paper(tokenized_paper, paper_sentences, NR_OF_SENTENCES)
    generated_summaries[i] = generated_paper
    
# Compute ROUGE scores
compute_metrics(ground_truth_summaries, generated_summaries)
print(generated_summaries)

0it [00:00, ?it/s]

S0142694X1500054X.txt


1it [00:03,  3.43s/it]

{'rouge-1': {'r': 0.14832535885167464, 'p': 0.22302158273381295, 'f': 0.17816091474253548}, 'rouge-2': {'r': 0.05187319884726225, 'p': 0.09045226130653267, 'f': 0.06593406130144004}, 'rouge-l': {'r': 0.1339712918660287, 'p': 0.2014388489208633, 'f': 0.16091953543219065}}
['The average age of the graphic designers surveyed was 39, ranging from 18 to 71 years old, and the average amount of graphic design experience was 13 years, ranging from 1 year or less, to 43 years. The graphic designers were asked the extent to which they agreed with the statement ‘visual accessibility is very important in my day to day graphic design work’, while clients were asked about their agreement with the statement ‘visual accessibility is very important in all graphic design work that I commission’. Inclusive design is ‘a general approach to designing in which designers ensure that their products and services address the needs of the widest possible audience, irrespective of age or ability’ (Design Council,




In [61]:
hypothesis = ["hello my name is djesse. test sentence", "hello my name is djesse. test sentence" ]
reference = ["test sentence. hello my name is djesse", "test sentence. hello my name is djesse"]

hypothesis[0:2]

['hello my name is djesse. test sentence',
 'hello my name is djesse. test sentence']

In [76]:
paper_file_names[0:3]

['S0003687014000994.txt', 'S0377221714005463.txt', 'S0377221715003756.txt']