In [11]:
import functools as ft
import gzip
import re
from xml.sax.saxutils import escape

import gensim
import joblib
import numpy as np
import spacy
from tqdm import *

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, LabelEncoder, PolynomialFeatures

import utils



In [2]:
model = gensim.models.word2vec.KeyedVectors.load('vectors/gutenberg_en_300.vec')

logistic_models = joblib.load('logistic_models_for ensembling.pkl.gz')

# `logistic_models` is a list of (estimator, test score) tuples
clf_list = [x[0] for x in logistic_models]

nlp = spacy.load('en_core_web_lg')

le = LabelEncoder()
le.fit(['animate', 'inanimate'])

preprocessor = make_pipeline(
    Normalizer(norm='max'),
    PolynomialFeatures(include_bias=False)
)

nouns_with_vectors = {w for w in model.wv.vocab if w.endswith('|NOUN')}

In [3]:
def get_vec(word):
    return get_vecs(list(word))


def get_vecs(words):
    assert isinstance(words, list)
    _vecs = []
    
    for w in words:
        _vecs.append(model.wv.word_vec(w))
        
    return np.array(_vecs)


def avg_predict_proba(vec, estimators):
    """Adapted from https://stackoverflow.com/a/42925296.
    """
    _predictions = []
    
    for _clf in estimators:
        # `predict_proba` below is expecting a 2D array, but `vec` is only 1D.
        _expanded = np.expand_dims(vec, axis=0)
        _prediction = _clf.predict_proba(_expanded)
        _predictions.append(_prediction)
    
    _pred_array = np.asarray(_predictions)
    
    _avg_prediction = np.average(_pred_array, axis=0)
    
    return _avg_prediction[0]


def pipeline(vec, preprocessor, estimators, label_encoder):
    _preprocess_x = preprocessor.fit_transform(vec)
    
    _pred_proba = np.apply_along_axis(
        avg_predict_proba,
        axis=1,
        arr=_preprocess_x,
        **dict(estimators=estimators)
    )
    
    _pred_class = np.argmax(_pred_proba, axis=1)
    _pred_label = label_encoder.inverse_transform(_pred_class)
    
    return _pred_label


predict_animacy = ft.partial(
    pipeline,
    **dict(preprocessor=preprocessor,
           estimators=clf_list,
           label_encoder=le)
)


def make_xml_tag(span, parsed_doc, tag_lookup, ent_label):
    _xml_id = '#' + span.text_with_ws.strip().lower().replace(' ', '_')

    span.merge()
    _idx = span.start
    _token = parsed_doc[_idx]

    _token._.set('xml_tag', tag_lookup[label])
    _token._.set('xml_attrs', {'id': _xml_id})

In [4]:
with open('noun_clusters_to_keep.txt') as fo:
    denoised_nouns = {utils.clean_word(w.strip()) for w in fo.read().split()}

In [5]:
ent_lookup = {
    'PERSON': 'persName', 
    'NORP': 'personGrp', 
    'GPE': 'personGrp', 
    'LOC': 'placeName'
}

In [6]:
spacy.tokens.Token.set_extension('xml_tag')
spacy.tokens.Token.set_extension('xml_attrs')
spacy.tokens.Token.set_extension('is_person', default=False)

In [7]:
# Read the whole source text to annotate into a single string.
# NOTE(review): no explicit encoding is passed, so the platform default
# is used — confirm it matches the file.
with open(r'data/texts/cato_minor.txt') as fo:
    text = fo.read()

In [8]:
# Parse the full text with the spaCy pipeline; later cells read noun chunks,
# entities and sentences from `doc`.
doc = nlp(text)

In [9]:
for _chunk in doc.noun_chunks:
    is_person = False
    
    for _word in _chunk:
        _norm_word = utils.clean_word(_word.text + '|' + _word.pos_)
        
        if (_word.pos_ == 'NOUN'
                and _norm_word in denoised_nouns
                and _norm_word in model.wv.vocab):
            _word_vec = np.expand_dims(model.wv.word_vec(_norm_word), axis=0)
            _prediction = predict_animacy(_word_vec)

            if _prediction == 'animate':
                is_person = True
                break
    
    if is_person:
        xml_id = '#' + _chunk.text_with_ws.strip().lower().replace(' ', '_')
        
        _chunk.merge()
        idx = _chunk.start
        token = doc[idx]
        
        token._.set('xml_tag', ent_lookup['PERSON'])
        token._.set('xml_attrs', {'id': xml_id})

In [10]:
ents = (ent for ent in doc.ents if ent.label_ in ent_lookup)

for e in ents:
    xml_id = '#' + e.text_with_ws.strip().lower().replace(' ', '_')
    
    e.merge()
    
    idx = e.start
    token = doc[idx]
    
    token._.set('xml_tag', ent_lookup[e.label_])
    token._.set('xml_attrs', {'id': xml_id})

In [11]:
text_with_markup = []

for i, sent in enumerate(doc.sents):
    for token in sent:
        display = ''
        attr_repr = ''

        tag = token._.get('xml_tag')
        attrs = token._.get('xml_attrs')

        if attrs:
            attr_repr = ''.join([' ' + a + '="' + attrs[a] + '"' for a in attrs])

        if tag and token.text.strip():
            markup = '<' + tag + attr_repr + '>' + token.text_with_ws + '</' + tag + '>'
        else:
            markup = token.text_with_ws

        text_with_markup.append(markup)

In [12]:
space_tag_re = re.compile(r' (</[A-Za-z]+>)')
space_re = re.compile(r' +')

clean_xml = space_tag_re.sub(r'\1 ', ''.join(text_with_markup))
clean_xml = space_re.sub(r' ', clean_xml)

with open('data/texts/cato_minor.xml', 'w') as xml_out:
    xml_out.write(clean_xml)