In [33]:
!source venv/bin/activate

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [34]:
import math
import os
import os
from collections import Counter

import nltk
import nltk
import pandas as pd
import requests
import scipy.stats
import sklearn
import sklearn_crfsuite
import spacy
from bs4 import BeautifulSoup
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from spacy.matcher import Matcher
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer

In [None]:
def custom_tokenizer(nlp = spacy.load("en_core_web_sm")):
        
    special_case = [{ORTH: "<bos>"}]
    nlp.tokenizer.add_special_case("<bos>", special_case)

    special_case = [{ORTH: "<eos>"}]
    nlp.tokenizer.add_special_case("<eos>", special_case)

    infixes = list([r"'s\b", r"(?<!\d)\.(?!\d)"]) +  nlp.Defaults.prefixes
    infix_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

In [35]:
def custom_tokenizer(nlp = spacy.load("en_core_web_sm")):
    
    special_case = [{ORTH: "<bos>"}]
    nlp.tokenizer.add_special_case("<bos>", special_case)

    special_case = [{ORTH: "<eos>"}]
    nlp.tokenizer.add_special_case("<eos>", special_case)

    infixes = list([r"'s\b", r"(?<!\d)\.(?!\d)"]) +  nlp.Defaults.prefixes
    infix_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)
nlp.tokenizer = custom_tokenizer(nlp)


In [36]:
class ArticlesExtraction:
    def __init__(self, number=20, verbose=True, save_txt=True, save_pdf=True):
        self.number = number
        self.verbose = verbose
        self.save_txt = save_txt
        self.save_pdf = save_pdf

    def _get_links(self):
        if self.verbose:
            print(f'Getting the links for {self.number} articles...')
        mainpage = requests.get('https://as-botanicalstudies.springeropen.com/articles')
        mainsoup = BeautifulSoup(mainpage.text)
        links = ['https://as-botanicalstudies.springeropen.com' + x['href'] for x in
                 sum([x.findAll('a') for x in mainsoup.findAll('h3', class_="c-listing__title")], [])]
        return links[:self.number]

    def extract(self):
        extra = ['Availability of data and materials', 'Abbreviations', 'References', 'Acknowledgements',
                 'Funding', 'Author information', 'Ethics declarations', 'Additional information',
                 'Rights and permissions', 'About this article']
        links = self._get_links()
        pdf_links = []
        if self.verbose:
            print('Getting the texts...')
        texts = dict()
        for num, link in enumerate(links):
            if self.verbose:
                print(f'{num + 1}/{len(links)} links', end="\r")
            page = requests.get(link)
            pagecontent = BeautifulSoup(page.text)
            name = pagecontent.findAll('h1', class_="c-article-title")[0].text
            text = "\n".join(sum([list(map(lambda y: y.text, x.findAll('p'))) for x in pagecontent.findAll('section') if
                                  x.has_attr('data-title') and x['data-title'] not in extra], []))
            texts[name] = text
            pdf_link = [x.findAll('a') for x in pagecontent.findAll('div', class_="c-pdf-download u-clear-both")][0][0][
                'href']
            pdf_links.append(pdf_link)
        if self.save_txt:
            if self.verbose:
                print('Saving the articles in txt...')
            if not os.path.exists('articles'):
                os.mkdir('articles')
            for key, value in texts.items():
                with open(f"articles/{key.replace('/', '|')}.txt", 'w') as file:
                    file.write(value)
        if self.save_pdf:
            if self.verbose:
                print('Saving the articles in pdf...')
            if not os.path.exists('articles_pdf'):
                os.mkdir('articles_pdf')
            for (key, value), link in zip(texts.items(), pdf_links):
                pdf = requests.get('https:' + link, allow_redirects=True)
                open(f"articles_pdf/{key.replace('/', '|')}.pdf", 'wb').write(pdf.content)
        return texts

In [37]:
class RuleBasedExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)

    def extract(self, texts):
        all_terms = []
        self._add_rules()
        for num, text in enumerate(texts):
            doc = self.nlp(text)
            matches = self.matcher(doc)
            for match_id, start, end in matches:
                string_id = self.nlp.vocab.strings[match_id]
                span = doc[start:end]
                lemma = ' '.join([n.lemma_ for n in self.nlp(span.text.lower())])
                all_terms.append(lemma)

            print(f'{num + 1}/{len(texts)} texts processed', end="\r")
        return all_terms

    def _add_rules(self):
        noun_pattern = {"POS": {"IN": ["NOUN", "PROPN"]}}
        det_pattern = {"POS": {"IN": ["DET", "PRON"]}, "OP": "?"}
        pattern = [  #[{"POS": "NOUN"}, {"POS": "NOUN"}],
            [noun_pattern, noun_pattern],
            #[{"DEP": "compound"}, {"POS": "NOUN"}],
            [noun_pattern, {"POS": "ADP"}, noun_pattern],
            [{"POS": "ADJ", "OP": "+"}, noun_pattern],
            [noun_pattern, {"POS": "ADP"}, det_pattern, noun_pattern],
            [det_pattern, {"POS": "ADJ"}, {"POS": "CCONJ"}, {"POS": "ADJ"}, noun_pattern],
        ]
        self.matcher.add("terms", pattern)

In [38]:
class Annotator:
    def __init__(self):        
        self.nlp.tokenizer = custom_tokenizer(nlp)

    def annotate_text(self, text, all_terms):
        doc = self.nlp(text)
        true_tokens = [x.text_with_ws for x in doc]
        positions = []
        tokens = [x.lemma_.lower() for x in doc]
        for term in all_terms:
            term = term.split()
            while self._contains(term, tokens):
                pos1, pos2 = self._contains(term, tokens)
                positions.append((pos1, pos2))
                tokens[pos1:pos2] = ['_' for x in range(pos1, pos2)]
        new_tokens = []
        for num, word in enumerate(tokens):
            if num in [x[0] for x in positions]:
                new_tokens.append(' <bos> ')
            new_tokens.append(true_tokens[num])
            if num in [x[1] - 1 for x in positions]:
                new_tokens.append(' <eos> ')
        return "".join(new_tokens)

    def _contains(self, small, big):
        for i in range(len(big) - len(small) + 1):
            for j in range(len(small)):
                if big[i + j] != small[j]:
                    break
            else:
                return i, i + len(small)
        return False

In [39]:
class SequenceClassifier:
    def __init__(self):
        pass

    def word2features(self, sent, i):
        word = sent[i][0]
        postag = sent[i][1]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'postag': postag,
            'postag[:2]': postag[:2],
        }
        if i > 0:
            word1 = sent[i - 1][0]
            postag1 = sent[i - 1][1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:postag': postag1,
                '-1:postag[:2]': postag1[:2],
            })
        else:
            features['BOS'] = True

        if i < len(sent) - 1:
            word1 = sent[i + 1][0]
            postag1 = sent[i + 1][1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            features['EOS'] = True

        return features

    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]

    def sent2labels(self, sent):
        return [label for token, postag, label in sent]

    def sent2tokens(self, sent):
        return [token for token, postag, label in sent]

    def convert_corpus(self, sents):
        X = [self.sent2features(s) for s in sents]
        y = [self.sent2labels(s) for s in sents]
        return X, y

    def fit(self, X_train, y_train):

        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True)
        crf.fit(X_train, y_train)
        return crf


In [40]:
def bio_annotate(annotations):
    corpus = []
    for text in annotations:
        new = []
        bos = False
        ios = False
        doc = nlp(text)
        for token in doc:
            if token.text == '<bos>':
                bos = True
            elif token.text == '<eos>':
                ios = False
            elif bos:
                new.append((token.text, token.pos_, 'B'))
                bos = False
                ios = True
            elif ios:
                new.append((token.text, token.pos_, 'I'))
            else:
                new.append((token.text, token.pos_, 'O'))
        corpus.append(new)
    return corpus

In [41]:
artextr = ArticlesExtraction(20, save_txt=False, save_pdf=False)
articles = artextr.extract()

Getting the links for 20 articles...
Getting the texts...


In [42]:
# I'm using only the first 1000 symbols here just to check if everything works, because it takes a lot of time
# to process the whole text
texts = [x[:1000] for x in list(articles.values())]
#texts = [x for x in list(articles.values())]

In [43]:
r = RuleBasedExtractor()
all_t = r.extract(texts)



In [44]:
popular_terms = Counter([x.lower() for x in all_t]).most_common(1000)
with open('popular_terms.txt', 'w') as file:
    for line in popular_terms:
        file.write(line[0] + '\n')

In [45]:
anno = Annotator()
annotations = [anno.annotate_text(x, all_t) for x in texts]

In [46]:
train = bio_annotate(annotations[:15])
test = bio_annotate(annotations[15:])

In [47]:
seq = SequenceClassifier()
X_train, y_train = seq.convert_corpus(train)
model = seq.fit(X_train, y_train)

In [48]:
labels = list(model.classes_)

In [49]:
X_test, y_test = seq.convert_corpus(test)
y_pred = model.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

1.0