In [3]:
import math
import os
import os

import nltk
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from spacy.matcher import Matcher

In [4]:
class ArticlesExtraction:
    """Download article texts from the Botanical Studies (SpringerOpen) site.

    Parameters
    ----------
    number : int, default 20
        Maximum number of article links taken from the listing page.
    verbose : bool, default True
        When true, progress messages are printed to stdout.
    """

    BASE_URL = 'https://as-botanicalstudies.springeropen.com'

    # ``data-title`` values of <section> elements whose paragraphs are
    # excluded from the extracted article text.
    EXTRA_SECTIONS = frozenset([
        'Availability of data and materials', 'Abbreviations', 'References', 'Acknowledgements',
        'Funding', 'Author information', 'Ethics declarations', 'Additional information',
        'Rights and permissions', 'About this article',
    ])

    def __init__(self, number=20, verbose=True):
        self.number = number
        self.verbose = verbose

    def _get_links(self):
        """Return absolute URLs of at most ``self.number`` articles from the listing page.

        Raises
        ------
        requests.HTTPError
            If the listing page responds with an HTTP error status.
        """
        if self.verbose:
            print(f'Getting the links for {self.number} articles...')
        mainpage = requests.get(f'{self.BASE_URL}/articles')
        # Fail loudly instead of parsing an error page as if it were the listing.
        mainpage.raise_for_status()
        # An explicit parser keeps the result independent of which optional
        # parsers happen to be installed and avoids bs4's parser warning.
        mainsoup = BeautifulSoup(mainpage.text, 'html.parser')
        links = [self.BASE_URL + anchor['href']
                 for heading in mainsoup.find_all('h3', class_="c-listing__title")
                 for anchor in heading.find_all('a')]
        return links[:self.number]

    def extract(self):
        """Fetch every linked article and return ``{title: body text}``.

        The body text is the newline-joined text of all ``<p>`` elements found
        inside ``<section>`` elements that carry a ``data-title`` attribute not
        listed in ``EXTRA_SECTIONS``. Articles sharing a title overwrite each
        other in the returned dict.

        Raises
        ------
        requests.HTTPError
            If a page responds with an HTTP error status.
        IndexError
            If an article page has no ``<h1 class="c-article-title">`` element.
        """
        links = self._get_links()
        if self.verbose:
            print('Getting the texts...')
        texts = dict()
        for num, link in enumerate(links):
            if self.verbose:
                print(f'{num + 1}/{len(links)} links', end="\r")
            page = requests.get(link)
            page.raise_for_status()
            pagecontent = BeautifulSoup(page.text, 'html.parser')
            name = pagecontent.find_all('h1', class_="c-article-title")[0].text
            # Flatten with a nested comprehension rather than sum(lists, []),
            # which copies the accumulated list on every addition.
            paragraphs = [paragraph.text
                          for section in pagecontent.find_all('section')
                          if section.has_attr('data-title')
                          and section['data-title'] not in self.EXTRA_SECTIONS
                          for paragraph in section.find_all('p')]
            texts[name] = "\n".join(paragraphs)

        return texts

    def extract_and_save(self):
        """Run :meth:`extract`, write each text to ``articles/<title>`` and return the dict.

        ``/`` in a title is replaced by ``|`` so the title can serve as a file
        name. Files are written as UTF-8 regardless of the platform default.
        """
        texts = self.extract()
        if self.verbose:
            print('Saving the articles...')
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('articles', exist_ok=True)
        for key, value in texts.items():
            path = os.path.join('articles', key.replace('/', '|'))
            with open(path, 'w', encoding='utf-8') as file:
                file.write(value)
        return texts

In [5]:
class RuleBasedExtractor:
    """Extract candidate multi-word terms from texts with spaCy ``Matcher`` rules."""

    # Key under which the part-of-speech patterns are registered in the matcher.
    RULE_KEY = "terms"

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)

    def extract(self, texts):
        """Return the lemmatised, lower-cased text of every rule match in ``texts``.

        Parameters
        ----------
        texts : sized iterable of str
            Raw texts to scan. Iterating a dict yields its keys, so for a
            ``{title: text}`` mapping pass ``mapping.values()``.

        Returns
        -------
        list of str
            One entry per match, in document order; duplicates are kept.
        """
        self._add_rules()
        all_terms = []
        for num, text in enumerate(texts):
            doc = self.nlp(text)
            for _match_id, start, end in self.matcher(doc):
                # Re-run the pipeline on the lower-cased span so the lemmas are
                # computed for the lower-cased form of the matched text.
                lowered = doc[start:end].text.lower()
                all_terms.append(' '.join(token.lemma_ for token in self.nlp(lowered)))

            print(f'{num + 1}/{len(texts)} texts processed', end="\r")
        return all_terms

    def _add_rules(self):
        """Register the term patterns in the matcher (only once per instance)."""
        # Adding the same key again would register the patterns a second time,
        # so repeated extract() calls must not re-add them.
        if self.RULE_KEY in self.matcher:
            return
        # A token tagged as common or proper noun.
        noun_pattern = {"POS": {"IN": ["NOUN", "PROPN"]}}
        # An optional determiner or pronoun.
        det_pattern = {"POS": {"IN": ["DET", "PRON"]}, "OP": "?"}
        patterns = [
            [noun_pattern, noun_pattern],                               # noun noun
            [noun_pattern, {"POS": "ADP"}, noun_pattern],               # noun adposition noun
            [{"POS": "ADJ", "OP": "+"}, noun_pattern],                  # adjective(s) noun
            [noun_pattern, {"POS": "ADP"}, det_pattern, noun_pattern],  # noun adposition [det] noun
            [det_pattern, {"POS": "ADJ"}, {"POS": "CCONJ"}, {"POS": "ADJ"}, noun_pattern],  # [det] adj conj adj noun
        ]
        self.matcher.add(self.RULE_KEY, patterns)

In [6]:
class Annotator:
    """Mark occurrences of known terms in a text with ``<bos>``/``<eos>`` tags."""

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    def annotate_text(self, text, terms=None):
        """Return ``text`` with every found term wrapped in ``' <bos> '`` / ``' <eos> '``.

        Terms are compared against the lower-cased lemmas of the text's tokens;
        the output is rebuilt from the original tokens and their whitespace.

        Parameters
        ----------
        text : str
            Text to annotate.
        terms : iterable of str, optional
            Whitespace-separated terms to look for. When omitted, the
            notebook-level ``all_terms`` variable is used, which must then exist.

        Notes
        -----
        Relies on a notebook-level helper ``contains(term_tokens, tokens)``.
        NOTE(review): from its use here it is assumed to return a
        ``(start, end)`` pair with ``end`` exclusive when the term is found and
        a falsy value otherwise - confirm against its definition.
        """
        if terms is None:
            terms = all_terms
        doc = self.nlp(text)
        true_tokens = [token.text_with_ws for token in doc]
        tokens = [token.lemma_.lower() for token in doc]

        positions = []
        for term in terms:
            term_tokens = term.split()
            if not term_tokens:
                # An empty term cannot be masked out below, so searching for it
                # could match forever.
                continue
            match = contains(term_tokens, tokens)
            while match:
                pos1, pos2 = match
                positions.append((pos1, pos2))
                # Mask the matched tokens so the same span is not found again.
                tokens[pos1:pos2] = ['_'] * (pos2 - pos1)
                match = contains(term_tokens, tokens)

        # Build the lookup sets once instead of rebuilding lists per token.
        starts = {start for start, _ in positions}
        last_indices = {end - 1 for _, end in positions}

        new_tokens = []
        for num, original in enumerate(true_tokens):
            if num in starts:
                new_tokens.append(' <bos> ')
            new_tokens.append(original)
            if num in last_indices:
                new_tokens.append(' <eos> ')
        return "".join(new_tokens)