In [6]:
from annette.db import SessionManager
from annette.db.models import Citation, ManualClassification

Create a generic function to get a list of `{'attribute_name': attribute, 'class': classification}` records:

In [7]:
def get_data(attr_name):
    attr = getattr(Citation, attr_name)

    with SessionManager() as session_manager:
        citations = session_manager.session.query(attr,
                                                  ManualClassification.classification_id) \
            .join(ManualClassification, Citation.doi == ManualClassification.doi) \
            .group_by(attr, ManualClassification.classification_id).all()

    return [{
        attr_name: getattr(c, attr_name) if getattr(c, attr_name) is not None else '',
        'class': c.classification_id
        } for c in citations]

Set up two tokenisers for the `subject` attribute:

In [8]:
from nltk import RegexpParser
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


def tokenise_subjects_simple(subject):
    if subject == '' or subject is None:
        return []
    return list(set([s.strip().lower() for s in subject.split(',')]))


def tokenise_subjects_phrases(subject):
    if subject == '' or subject is None:
        return []
    split_subjects = []
    phrase_pattern = 'CHUNK:{<JJ>*<NN.?>*<VBG>*}'
    phrase_chunker = RegexpParser(phrase_pattern)
    for s in subject.split(','):
        tokens = word_tokenize(s.strip().lower())
        tags = pos_tag(tokens)
        phrases = [' '.join([leaf[0] for leaf in c.leaves()]) for c in phrase_chunker.parse(tags) if hasattr(c, 'label') and c.label() == 'CHUNK']
        for phrase in phrases:
            phrase_tokens = word_tokenize(phrase)
            phrase_tags = pos_tag(phrase_tokens)
            lemmatised_phrase = []
            for pto, pta in phrase_tags:
                wn_tag = {
                    'n': wn.NOUN,
                    'j': wn.ADJ,
                    'v': wn.VERB,
                    'r': wn.ADV
                    }.get(pta[0].lower(), None)
                if wn_tag is None:
                    continue
                lemmatised = WordNetLemmatizer().lemmatize(pto, wn_tag)
                lemmatised_phrase.append(lemmatised)
            if len(lemmatised_phrase) > 0:
                lemmatised_phrase = ' '.join(lemmatised_phrase)
                split_subjects.append(lemmatised_phrase)
    return list(set(split_subjects))



The simple tokeniser just splits the string by commas (and converts to lowercase). The tags should be fairly standardised, so this shouldn't be too much of a problem.

Just to see if it's more accurate/helpful to process the tags in a different way, we've also defined the 'phrase' tokeniser, which goes through several steps:
1. Split the whole string by commas (assumes the field is a comma-separated list of phrasal tags);
2. Tokenise the phrasal subject tags and identify parts of speech;
3. Extract sub-phrases based on those parts of speech (i.e. adjective-noun-verb combinations, ignoring conjunctives like *and*);
4. Lemmatise the words in those phrases to reduce differences between related tags (e.g. science and sciences both become *science*);
5. Join the words back together to form a lemmatised sub-phrase.

The two tokenisers will generate different length lists from the same input string.

In [9]:
print(tokenise_subjects_simple('Ecology,Aquatic Science,Nature and Landscape Conservation'))
print(tokenise_subjects_phrases('Ecology,Aquatic Science,Nature and Landscape Conservation'))

['ecology', 'aquatic science', 'nature and landscape conservation']
['ecology', 'aquatic science', 'landscape conservation', 'nature']


In [10]:
print(tokenise_subjects_simple('Ecology,Ecology, Evolution, Behavior and Systematics,Nature and Landscape Conservation'))
print(tokenise_subjects_phrases('Ecology,Ecology, Evolution, Behavior and Systematics,Nature and Landscape Conservation'))

['ecology', 'nature and landscape conservation', 'evolution', 'behavior and systematics']
['behavior', 'nature', 'systematics', 'ecology', 'evolution', 'landscape conservation']
