# Title and Abstract Extraction

- Extract the titles as best we can
- Extract the abstracts (first 100 words) as best we can

The title extraction code has been shamelessly stolen from [@benjpjones](https://github.com/benjpjones) (see original [here](https://github.com/UTA-REST/SnowmassScripts/blob/master/TitleExtraction/Step%202%20-%20ExtractTitles.ipynb)).

## Configuration

In [None]:
import re
import string

# The shortest plausible title: a candidate line must be longer than this many
# characters once boilerplate words and punctuation have been removed.
min_title_len = 10
# How many leading lines of a document are scanned for a title / an abstract.
max_lines_for_title = 40
max_lines_for_abstract = 100
# Upper bound on the number of words kept for an abstract.
max_words_for_abstract = 100

# These words indicate that the sentence is not yet over.
# Entries are compared against whitespace-split tokens, so they must not carry
# leading or trailing spaces (a token produced by str.split() never has any).
hanging_words = ["of","for","at","the","with","from", "by", "using","and",
                 "neutrino","other","new","beyond","after","above","without",
                 "a","in","-",":","–","next","high", "oak"]

# These are words that indicate this may not be the title
remove_words = ["snowmass","2021","letter","of","interest","submission",
                "submitted","to","august","2020","topical","group","working",
                "intent","process","contribution","21"]

# And these will be removed from whatever we ultimately extract as the title.
# Order matters when the patterns are applied one after another: compound
# patterns come before their own substrings so they can still match, and the
# "LOI" acronym is anchored on word boundaries so it does not remove the
# letters "loi" from inside ordinary words.
skip_patterns = [re.compile("Snowmass-2021",re.IGNORECASE),
                 re.compile("Snowmass2021",re.IGNORECASE),
                 re.compile("Letter Of Interest", re.IGNORECASE),
                 re.compile("Letter Of Intent", re.IGNORECASE),
                 re.compile("Snowmass",re.IGNORECASE),
                 re.compile("2021",re.IGNORECASE),
                 re.compile(r"\bLOI\b", re.IGNORECASE),
                 re.compile(":", re.IGNORECASE)]

# Punctuation used when trimming titles; ')' is deliberately left out of the set.
punctuation = string.punctuation.replace(')', '')

## Setup

In [2]:
from config import text_file_info
from pathlib import Path
import string
import json

## Title And Abstract

In [3]:
def clean_title(title):
    """Clean the title up for final display.

    Removes every match of the `skip_patterns` regexes, collapses runs of
    whitespace into single spaces, and strips leading/trailing characters
    found in `punctuation` together with surrounding whitespace.

    Args:
        title: Raw title text; may contain newlines and repeated whitespace.

    Returns:
        The cleaned title string (possibly empty).
    """
    # Pattern.sub returns a new string (str is immutable), so the result has
    # to be assigned back for the removal to take effect. Patterns are applied
    # one after another in list order.
    for p in skip_patterns:
        title = p.sub('', title)

    # Normalise after the removals: they can leave doubled spaces in the
    # middle and stray punctuation at either end.
    title = ' '.join(title.split())
    return title.strip(punctuation).strip()


def extract_title(lines):
    """Extract a title from a set of lines.

    Scans the first `max_lines_for_title` lines for the first line that, once
    the `remove_words` boilerplate and all punctuation are dropped, is longer
    than `min_title_len` characters. If that line ends in one of the
    `hanging_words`, the next qualifying line is appended to it.

    Args:
        lines: Sliceable sequence of text lines.

    Returns:
        The title passed through `clean_title`, or None if no line qualifies.
    """
    # Loop-invariant translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    title = None
    for line in lines[:max_lines_for_title]:
        words = [w for w in line.lower().split() if w.strip(punctuation) not in remove_words]
        workstring = ' '.join(words).translate(strip_punct).strip()

        # Too short to be (part of) a title; look at the next line.
        if len(workstring) <= min_title_len:
            continue

        if title is None:
            title = line
            # A hanging last word means the title likely continues on the
            # next line. `words` is non-empty here because `workstring` is.
            if words[-1] in hanging_words:
                continue
        else:
            title += ' ' + line

        # We have the title. Clean it up a little bit before returning it.
        return clean_title(title)

    # The scan ended while still waiting for a continuation line: keep the
    # part of the title that was found instead of discarding it.
    if title is not None:
        return clean_title(title)

    # Bummer!
    return None

def extract_finder(lines, search_func):
    """Collect up to `max_words_for_abstract` words starting at a matching line.

    Only the first `max_lines_for_abstract` lines are considered. Each line is
    handed to `search_func` until it returns something other than None; that
    return value is the initial word list. Words from the following lines are
    then appended until the list holds more than `max_words_for_abstract`
    words or the lines run out.

    Args:
        lines: Sliceable sequence of text lines.
        search_func: Callable taking one line and returning a list of words,
            or None when the line is not a starting point.

    Returns:
        The collected words (at most `max_words_for_abstract`) joined by single
        spaces, or None if no starting line was found or no words were collected.
    """
    window = iter(lines[:max_lines_for_abstract])

    # Phase 1: find the line where the text starts.
    collected = None
    for text in window:
        collected = search_func(text)
        if collected is not None:
            break

    # Phase 2: keep appending words from the remaining lines until enough.
    if collected is not None:
        for text in window:
            if len(collected) > max_words_for_abstract:
                break
            collected += text.split()

    if not collected:
        # Bummer!
        return None

    return ' '.join(collected[:max_words_for_abstract]).strip()

# Markers (matched case-insensitively, in this order) after which the text starts.
start_words = ['abstract', 'introduction']


def search_for_abstract(line):
    """Return the words following the first start word found in `line`.

    The start words are tried in list order against the lower-cased line; the
    match is a plain substring match.

    Args:
        line: A single line of text.

    Returns:
        The whitespace-split words after the marker (possibly an empty list),
        or None if the line contains none of the start words.
    """
    for marker in start_words:
        before, found, _ = line.lower().partition(marker)
        if found:
            tail_start = len(before) + len(marker)
            return line[tail_start:].split()
    return None


def search_for_sentence(line):
    """Accept a long line that contains almost no punctuation.

    A line qualifies when it is longer than 90 characters and removing all
    punctuation and then stripping surrounding whitespace shortens it by
    fewer than 2 characters.

    Args:
        line: A single line of text.

    Returns:
        The whitespace-split words of the line, or None if it does not qualify.
    """
    if len(line) <= 90:
        return None

    punct_free = line.translate(str.maketrans('', '', string.punctuation)).strip()
    dropped = len(line) - len(punct_free)
    return line.strip().split() if dropped < 2 else None


def search_for_anything(line):
    """Last-resort matcher: accept any line with enough text on it.

    Args:
        line: A single line of text.

    Returns:
        The whitespace-split words of the line when, with runs of whitespace
        collapsed to single spaces, it is longer than 40 characters;
        otherwise None.
    """
    tokens = line.split()
    if len(' '.join(tokens)) > 40:
        return tokens
    return None

    
def extract_abstract(lines):
    """Extract abstract-like text, trying progressively looser matchers.

    Runs `extract_finder` with `search_for_abstract`, then
    `search_for_sentence`, then `search_for_anything`, and returns the first
    result that is not None.

    Args:
        lines: Sliceable sequence of text lines.

    Returns:
        The extracted text, or an empty string if every matcher fails.
    """
    strategies = (search_for_abstract, search_for_sentence, search_for_anything)
    for strategy in strategies:
        found = extract_finder(lines, strategy)
        if found is not None:
            return found
    return ""


def lines_from_file(f):
    """Read a text file and return its lines.

    Args:
        f: Path object pointing at the text file.

    Returns:
        List of the file's lines, each keeping its line ending.
    """
    with f.open('r') as handle:
        return list(handle)

In [4]:
# One-shot generator: a Path built from element 2 of each record yielded by text_file_info().
files = (Path(record[2]) for record in text_file_info())

In [5]:
# Build one record per file. The (path, lines) pairs come from a generator
# rather than an intermediate list, so each file's lines are read only when
# needed instead of loading every file's contents before any extraction starts.
all_files = [
    {
        'file': str(f),
        'title': extract_title(lines),
        'abstract': extract_abstract(lines)
    }
    for f, lines in ((f, lines_from_file(f)) for f in files)
]

In [6]:
# Serialise the collected records in `all_files` to a JSON file.
with open('../data/loi_info.json', 'w') as f:
    json.dump(all_files,f)