# Imports

In [180]:
!source venv/bin/activate

In [181]:
import math
import os
import os
from collections import Counter

import nltk
import nltk
import pandas as pd
import requests
import scipy.stats
import sklearn
import sklearn_crfsuite
import spacy
from bs4 import BeautifulSoup
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from spacy.matcher import Matcher
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer

# Terminology Representation

## Define Tree Structure

In [182]:
class Node():
    def __init__(self, name = None):
        self.name = name
        self.points_to = dict()
        self.stored_value = None 
        
        self.visited = False #for searching

    def point_to_node(self, other_node):
        self.points_to[other_node.name] = other_node

    def is_leaf(self):
        return self.points_to == dict()
    
    def is_empty(self):
        return (self.stored_value is None)
    
    def list_children_names(self):
        return self.points_to.keys()
    
    def list_children(self):
        self.points_to.values()
    
    def list_values_in_children(self):
        stored_values = []
        
        if not self.is_empty():
            stored_values.append(self.stored_value)

        if self.is_leaf():
            return stored_values
        
        for node in self.points_to.values():
            stored_values.extend(
                node.list_values_in_children()
            )
        return stored_values
        

In [183]:

class TerminologyTree():
    def __init__(self, name = "", root = Node() ):
        self.name = name
        self.root = root

## Create Representation

In [184]:
def add_term_to_tree(term, tree):

    current_node = tree.root
    for word in term:
        #go to the next node

            # if there is no next node, create it
        if word not in current_node.points_to.keys():
            new_node = Node(word)
            current_node.points_to[word] = new_node
        else:
            new_node = current_node.points_to[word]
    
        current_node = new_node

    #now we are at the end of a path whose nodes spell the term

    #store the string in the end node
    current_node.stored_value = term


def fill_terminology_tree(term_list, tree):
    for term in term_list:
        add_term_to_tree(term, tree)
    return

## Tree Operations

In [185]:
def is_term_in_tree(term, tree):
    current_node = tree.root
    for word in term:
        try:
            current_node = current_node.points_to[word]
        except:
            return False

    if current_node.stored_value == term:
        return True
    else:
        return False

def terms_with_word(word, tree):
    """return a list of terms that contain the word"""
    # breadth first search

    #adapting the pseudocode from
    #https://en.wikipedia.org/wiki/Breadth-first_search

    queue = []
    tree.root.visited = True
    queue.append(tree.root)
    result = []

    while len(queue) != 0:
        current_node = queue.pop()

        if word == current_node.name:

            # now that we found the word in a node:
            # get all the terms stored in the children of that node
            # and append these to the results list.
            result.extend(
                current_node.list_values_in_children()
                )

        for node in current_node.points_to.values():
            if not node.visited:
                node.visited = True
                queue.append(node)
    
    _unmark_tree(tree) # visited = False for all nodes
    return result

# Tokenizer

In [186]:
def custom_tokenizer(nlp = spacy.load("en_core_web_sm")):
    
    special_case = [{ORTH: "<bos>"}]
    nlp.tokenizer.add_special_case("<bos>", special_case)

    special_case = [{ORTH: "<eos>"}]
    nlp.tokenizer.add_special_case("<eos>", special_case)

    infixes = list([r"'s\b", r"(?<!\d)\.(?!\d)"]) +  nlp.Defaults.prefixes
    infix_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)
nlp.tokenizer = custom_tokenizer(nlp)


# Article Extraction

In [187]:
class ArticlesExtraction:
    def __init__(self, number=20, verbose=True, save_txt=True, save_pdf=True):
        self.number = number
        self.verbose = verbose
        self.save_txt = save_txt
        self.save_pdf = save_pdf

    def _get_links(self):
        if self.verbose:
            print(f'Getting the links for {self.number} articles...')
        mainpage = requests.get('https://as-botanicalstudies.springeropen.com/articles')
        mainsoup = BeautifulSoup(mainpage.text)
        links = ['https://as-botanicalstudies.springeropen.com' + x['href'] for x in
                 sum([x.findAll('a') for x in mainsoup.findAll('h3', class_="c-listing__title")], [])]
        return links[:self.number]

    def extract(self):
        extra = ['Availability of data and materials', 'Abbreviations', 'References', 'Acknowledgements',
                 'Funding', 'Author information', 'Ethics declarations', 'Additional information',
                 'Rights and permissions', 'About this article']
        links = self._get_links()
        pdf_links = []
        if self.verbose:
            print('Getting the texts...')
        texts = dict()
        for num, link in enumerate(links):
            if self.verbose:
                print(f'{num + 1}/{len(links)} links', end="\r")
            page = requests.get(link)
            pagecontent = BeautifulSoup(page.text)
            name = pagecontent.findAll('h1', class_="c-article-title")[0].text
            text = "\n".join(sum([list(map(lambda y: y.text, x.findAll('p'))) for x in pagecontent.findAll('section') if
                                  x.has_attr('data-title') and x['data-title'] not in extra], []))
            texts[name] = text
            pdf_link = [x.findAll('a') for x in pagecontent.findAll('div', class_="c-pdf-download u-clear-both")][0][0][
                'href']
            pdf_links.append(pdf_link)
        if self.save_txt:
            if self.verbose:
                print('Saving the articles in txt...')
            if not os.path.exists('articles'):
                os.mkdir('articles')
            for key, value in texts.items():
                with open(f"articles/{key.replace('/', '|')}.txt", 'w') as file:
                    file.write(value)
        if self.save_pdf:
            if self.verbose:
                print('Saving the articles in pdf...')
            if not os.path.exists('articles_pdf'):
                os.mkdir('articles_pdf')
            for (key, value), link in zip(texts.items(), pdf_links):
                pdf = requests.get('https:' + link, allow_redirects=True)
                open(f"articles_pdf/{key.replace('/', '|')}.pdf", 'wb').write(pdf.content)
        return texts

In [188]:
class RuleBasedExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)

    def extract(self, texts):
        all_terms = []
        self._add_rules()
        for num, text in enumerate(texts):
            doc = self.nlp(text)
            matches = self.matcher(doc)
            for match_id, start, end in matches:
                string_id = self.nlp.vocab.strings[match_id]
                span = doc[start:end]
                lemma = ' '.join([n.lemma_ for n in self.nlp(span.text.lower())])
                all_terms.append(lemma)

            print(f'{num + 1}/{len(texts)} texts processed', end="\r")
        return all_terms

    def _add_rules(self):
        noun_pattern = {"POS": {"IN": ["NOUN", "PROPN"]}}
        det_pattern = {"POS": {"IN": ["DET", "PRON"]}, "OP": "?"}
        pattern = [  #[{"POS": "NOUN"}, {"POS": "NOUN"}],
            [noun_pattern, noun_pattern],
            #[{"DEP": "compound"}, {"POS": "NOUN"}],
            [noun_pattern, {"POS": "ADP"}, noun_pattern],
            [{"POS": "ADJ", "OP": "+"}, noun_pattern],
            [noun_pattern, {"POS": "ADP"}, det_pattern, noun_pattern],
            [det_pattern, {"POS": "ADJ"}, {"POS": "CCONJ"}, {"POS": "ADJ"}, noun_pattern],
        ]
        self.matcher.add("terms", pattern)

# Annotator

In [189]:
class Annotator:
    tagging = dict()
    def __init__(self):        
        self.nlp = custom_tokenizer(nlp)

    def _longest_term(position, word_list, tree):
        current_node  = tree.root
        term = []

        while True:
            try:
                word = word_list[position]
                current_node = current_node.points_to[word]
                if not current_node.is_empty(): 
                    aux_term = current_node.stored_value
                    if len(aux_term) > len(term): term = aux_term
                position += 1
            except:
                break
        
        return term



    def annotate(self, tree, word_list):
        """"
        returns a list of tuples (word, tag)
        """
        tagging = dict()
        position = 0

        def put_tag(position, tag):
            tagging[str(position)]= tag
            return None

        while position < len(word_list):
            #find the longest term appearing in the text at this position
            term = self._longest_term(position, word_list, tree)
            
            length = len(term)

            if length == 0: # no term was found
                put_tag(position, "O")
                position += 1
            else:
                put_tag(position, "B") #beginning of term
                for i in range(length-1):
                    put_tag(position + i +1, "I") #inside of term
                position += length

        text_tags = [
            (word_list[position], tagging[str(position)])
            for position in range(len(word_list))
        ]

        return text_tags



In [190]:
class SequenceClassifier:
    def __init__(self):
        pass

    def word2features(self, sent, i):
        word = sent[i][0]
        postag = sent[i][1]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'postag': postag,
            'postag[:2]': postag[:2],
        }
        if i > 0:
            word1 = sent[i - 1][0]
            postag1 = sent[i - 1][1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:postag': postag1,
                '-1:postag[:2]': postag1[:2],
            })
        else:
            features['BOS'] = True

        if i < len(sent) - 1:
            word1 = sent[i + 1][0]
            postag1 = sent[i + 1][1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            features['EOS'] = True

        return features

    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]

    def sent2labels(self, sent):
        return [label for token, postag, label in sent]

    def sent2tokens(self, sent):
        return [token for token, postag, label in sent]

    def convert_corpus(self, sents):
        X = [self.sent2features(s) for s in sents]
        y = [self.sent2labels(s) for s in sents]
        return X, y

    def fit(self, X_train, y_train):

        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True)
        crf.fit(X_train, y_train)
        return crf


# Experiments

## Extract Articles

In [191]:
artextr = ArticlesExtraction(20, save_txt=False, save_pdf=False)
articles = artextr.extract()

Getting the links for 20 articles...
Getting the texts...


## Create Terminology

### Extract Terms

In [192]:
r = RuleBasedExtractor()
all_t = r.extract(texts)
terminology = [t.split(' ') for t in all_t]



In [193]:
popular_terms = Counter([x.lower() for x in all_t]).most_common(1000)
with open('popular_terms.txt', 'w') as file:
    for line in popular_terms:
        file.write(line[0] + '\n')

### Create Tree Representation for Terminology

In [197]:
tree = TerminologyTree()
fill_terminology_tree(terminology, tree)
tree.root.points_to['salt stress']

KeyError: 'salt stress'

### Annotate Text

In [198]:
anno = Annotator()
annotations = [anno.annotate(x, all_t) for x in texts]

NameError: name '_longest_term' is not defined

In [None]:
train = bio_annotate(annotations[:15])
test = bio_annotate(annotations[15:])

In [None]:
seq = SequenceClassifier()
X_train, y_train = seq.convert_corpus(train)
model = seq.fit(X_train, y_train)

In [None]:
labels = list(model.classes_)

In [None]:
X_test, y_test = seq.convert_corpus(test)
y_pred = model.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

1.0