In [1]:
import geovpylib.database as db
import spacy


# Connect to Yellow database
db.connect_yellow('switzerland_and_beyond')

# Fetch corpus
persons = db.query("select * from hls.person")
person = persons.sample(1).iloc[0]

[DB] Requests will not be executed
[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


# Chapter 1: Finding words, phrases, names and concepts

https://course.spacy.io/en/chapter1

## Introduction

**Create a `doc` in a language**

In [2]:
nlp = spacy.blank('fr')
doc = nlp(person.notice)
print(doc.text)

ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'NoneType'>

**Get tokens out of a `doc`**

In [None]:
nlp = spacy.blank('fr')
doc = nlp(person.notice)
token = doc[0]
print(token.text)

**Get a slice of the doc**

In [None]:
nlp = spacy.blank('fr')
doc = nlp(person.notice)
a_slice = doc[2:10]
print(a_slice.text)

**Find dates (births and deaths) in `doc`**

In [None]:
nlp = spacy.blank('fr')
doc = nlp(person.notice)
lendoc = len(doc)

for token in doc:
    if token.text == 'Naît' and doc[token.i + 1].text == "le" and doc[token.i + 2].like_num:
        print('Birth date found:', doc[token.i + 2])
    if token.text == 'meurt' and doc[token.i + 1].text == "le" and doc[token.i + 2].like_num:
        print('Death date found:', doc[token.i + 2])


## Pipelines

"Fundamentally, a spaCy pipeline package consists of three components: the weights, i.e. binary data loaded in from a directory, a pipeline of functions called in order, and language data like the tokenization rules and language-specific settings. For example, a Spanish NER pipeline requires different weights, language data and components than an English parsing and tagging pipeline. This is also why the pipeline state is always held by the Language class. spacy.load puts this all together and returns an instance of Language with a pipeline set and access to the binary data" (https://spacy.io/usage/processing-pipelines)

Vocabulary:
- POS: Part Of Speech
- DEP: DEPendency label

**Load a pipeline**

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)
print(doc)

**Predict language annotation**

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)

for token in doc[0:15]:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

**All kinds of POS found, with explaination**

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)

POSs = []
DEPs = []
for token in doc:
    if token.pos_ not in POSs: POSs.append(token.pos_)
    if token.dep_ not in DEPs: DEPs.append(token.dep_)

print('===== Part of Speech: =====')
for pos in POSs:
    print(pos, "-->", spacy.explain(pos))

print('\n===== Dependency labels: =====')
for dep in DEPs:
    print(dep, "==>", spacy.explain(dep))

**All entities found in a text (NER)**

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, "==>", ent.label_)

## Rule Based Matching

To write rules to find words and phrases in texts

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)

matcher = spacy.matcher.Matcher(nlp.vocab)
pattern_birth = [{'TEXT': 'Naît'}, {'TEXT': 'le'}, {'LIKE_NUM': True}]
pattern_death = [{'TEXT': 'meurt'}, {'TEXT': 'le'}, {'LIKE_NUM': True}]
pattern_son = [{'TEXT': 'Fils'}, {'TEXT': 'de'}, {'POS': 'PROPN'}]
pattern_daughter = [{'TEXT': 'Fille'}, {'TEXT': 'de'}, {'POS': 'PROPN'}]

matcher.add("BIRTH", [pattern_birth])
matcher.add("DEATH", [pattern_death])
matcher.add("SON", [pattern_son])
matcher.add("DAUGHTER", [pattern_daughter])

matches = matcher(doc)
print("Total matches found:", len(matches))

for id, start, end in matches:
    print(doc[start:end].text)

# Chapter 2: Large-scale data analysis with spaCy

"In this chapter, you'll use your new skills to extract specific information from large volumes of text. You'll learn how to make the most of spaCy's data structures, and how to effectively combine statistical and rule-based approaches for text analysis." https://course.spacy.io/en/chapter2

## Data Structures

**Word hashes (in vocab)**

In [None]:
nlp = spacy.load("fr_core_news_sm")
text = person.notice
doc = nlp(text)
word = 'meurt'
hash = nlp.vocab.strings[word]
word_from_hash = nlp.vocab.strings[hash]

print(hash, word_from_hash)

**Manually create a `doc`**

In [None]:
words = ['Hello', 'world', '!']
spaces = [True, False, False]
doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)

print(doc.text)


**Add a new entity to the existing entities of a `doc`**

In [None]:
from textwrap import wrap
for txt in wrap(text): print(txt)

In [None]:
# nlp = spacy.load('fr_core_news_sm')
nlp = spacy.blank('fr')
text = person.notice
doc = nlp(text)

span = spacy.tokens.Span(doc, 33, 35, label="PERSON")
doc.ents = [span]

for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
text.index('Marion Cave')