# spaCy experiments

## Imports & initialization

Import the required modules.

In [None]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import spacy

Create a language model, English in this case.

In [None]:
# Load the pipeline named 'en_core_web_sm'; `en_nlp` is reused by the cells below.
en_nlp = spacy.load('en_core_web_sm')

## Part of speech tagging (POS)

Read a text file into a string variable.

In [None]:
# Read the whole file in a single call.  The encoding is passed explicitly so
# that decoding does not depend on the platform's default locale
# (assumes the file is UTF-8 encoded -- TODO confirm).
with open('Data/frost.txt', encoding='utf-8') as file:
    text = file.read()

Parse the text using the language model.

In [None]:
# Run the full text through the English pipeline; `doc` is iterated in the cells below.
doc = en_nlp(text)

Show the part of speech tags, as well as the context of the words.

In [None]:
# One line per token: its text and part-of-speech tag, followed by the token's
# left edge, head and right edge.
for token in doc:
    left = token.left_edge.text
    head = token.head.text
    right = token.right_edge.text
    print(f'{token.text!r}: {token.pos_}, {left!r} <- {head!r} -> {right!r}')

Since backslashes are not allowed inside f-string expressions (before Python 3.12), we define a constant that holds the newline character.

In [None]:
# Newline character kept in a variable so it can be referenced inside f-string expressions.
newline = '\n'

To split a text into sentences, a statistical model is used that was obtained from the training corpus.

In [None]:
# Print every detected sentence on one line, prefixed with its index;
# newlines inside a sentence are replaced by spaces.
for index, sent in enumerate(doc.sents):
    flattened = sent.text.replace(newline, " ")
    print(f'{index:3d} {flattened}')

For poetry, sentences seem somewhat hard to detect.  However, it is possible to define a language model for English and add a rule-based sentencizer to it.

In [None]:
# Import the language class explicitly: `spacy.lang.en` is a submodule and is
# only reachable as an attribute of `spacy` if something else imported it first.
from spacy.lang.en import English

# English pipeline created directly from the language class, to which a
# rule-based sentencizer is added.
# NOTE(review): create_pipe() followed by add_pipe(component) is the spaCy 2.x
# calling convention; spaCy 3.x expects add_pipe('sentencizer') -- confirm the
# installed version.
en_nlp_alt = English()
sentencizer = en_nlp_alt.create_pipe('sentencizer')
en_nlp_alt.add_pipe(sentencizer)

# Re-parse the text with the rule-based pipeline (this rebinds `doc`) and print
# the sentences, one per line, with embedded newlines replaced by spaces.
doc = en_nlp_alt(text)
for i, sentence in enumerate(doc.sents):
    print(f'{i:3d} {sentence.text.replace(newline, " ").strip()}')

## Named entity recognition (NER)

Named entity recognition is supported as well.

In [None]:
# Example input for the named entity recognition cells below.
sentence = 'Music by Johann Sebastian Bach is better than that by Friederich Buxtehude. Both lived in Germany'

In [None]:
# Parse the example sentence; this rebinds `doc` for the cells below.
doc = en_nlp(sentence)

In [None]:
# One line per token: index, text, part-of-speech tag and entity type.
for position, token in enumerate(doc):
    tag, entity_type = token.pos_, token.ent_type_
    print(f'{position:3d} {token.text!r}: {tag}, {entity_type}')

It is also possible to retrieve named entities from the document explicitly.

In [None]:
# One line per named entity: its text, its label, and its start/end offsets.
for ent in doc.ents:
    label, begin, end = ent.label_, ent.start, ent.end
    print(f'{ent} ({label}): {begin} -> {end}')

This can also be visualized as markup in the sentence.

In [None]:
# Render the document with the 'ent' style (named entities) inline in the notebook.
spacy.displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Render the document with the 'dep' style (dependency parse) inline in the
# notebook, passing layout options for word distance and compact mode.
spacy.displacy.render(doc, style='dep', jupyter=True,
                      options={'distance': 140, 'compact': True})

## Similarity

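Document similarity can also be computed conveniently. Note that small models such as `en_core_web_sm` do not include word vectors, so the similarity values they produce are only rough approximations; a model with vectors (e.g., `en_core_web_md`) gives more meaningful results.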

In [None]:
# Parse two short sentences and show their similarity score as the cell output.
doc1, doc2 = en_nlp('The book is nice'), en_nlp('The novel is beautiful')
doc1.similarity(doc2)

In [None]:
# Same comparison as before, this time against a sentence on a different topic.
doc1, doc2 = en_nlp('The book is nice'), en_nlp('The house is on fire')
doc1.similarity(doc2)

In [None]:
words = ['queen', 'lady', 'girl', 'king', 'lord', 'boy', 'cat', 'dog', 'lion']

# Run each word through the pipeline once, rather than twice per matrix cell
# inside the double loop.
word_docs = [en_nlp(word) for word in words]

# similarity[i, j] holds the similarity between words[i] and words[j].
similarity = np.empty((len(words), len(words)))
for i, doc_i in enumerate(word_docs):
    for j, doc_j in enumerate(word_docs):
        similarity[i, j] = doc_i.similarity(doc_j)

The similarity matrix can be visualized as a heat map using the following function:

In [None]:
def plot_similarity_matrix(sim, words, cmap=plt.cm.Blues):
    """Draw a similarity matrix as an annotated heat map.

    Each cell is coloured according to its value and annotated with the
    value formatted to two decimals.

    Parameters
    ----------
    sim
        Two-dimensional array of similarity values; it is indexed as
        ``sim[i, j]`` and must provide ``shape``, ``max()`` and ``min()``.
    words
        Tick labels, used for both the x axis and the y axis.
    cmap
        Colour map passed to ``imshow``.
    """
    figure, axes = plt.subplots(figsize=(6, 6))
    axes.imshow(sim, interpolation='nearest', cmap=cmap)

    tick_positions = range(len(words))
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(words, rotation=45)
    axes.set_yticks(tick_positions)
    axes.set_yticklabels(words)

    fmt = '{0:.2f}'
    # Midpoint of the value range: cells above it get white text, the others
    # black text, so the annotation contrasts with the cell colour.
    thresh = 0.5*(sim.max() + sim.min())
    for i, j in itertools.product(range(sim.shape[0]), range(sim.shape[1])):
        # imshow puts row i on the y axis and column j on the x axis.
        axes.text(j, i, fmt.format(sim[i, j]),
                  horizontalalignment="center",
                  verticalalignment="center",
                  color="white" if sim[i, j] > thresh else "black",
                  fontsize=8)

    axes.set_xlabel('word 1')
    axes.set_ylabel('word 2')
    # Called last so that the layout accounts for the axis labels as well.
    figure.tight_layout()

In [None]:
# Draw the heat map for the word similarity matrix computed above.
plot_similarity_matrix(similarity, words)