# Named-entity recognition with SpaCy

La reconnaissance d'entités nommées (NER, *named-entity recognition*) permet de détecter les entités nommées dans un texte et de référencer les informations qui s'y rapportent.

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle français de spaCy : `python -m spacy download fr_core_news_sm`

In [2]:
# Load the French pipeline `fr_core_news_sm`; every later cell uses this `nlp` object
nlp = spacy.load('fr_core_news_sm')

In [4]:
## La documentation est accessible ici : https://spacy.io/api/doc

## Détecter les entités nommées

In [3]:
def show_ents(doc):
    """Print one line per named entity of ``doc``, or a notice when there are none.

    Each printed line contains, joined by ' - ': the entity text, its start
    and end character offsets, its label, and the value returned by
    ``spacy.explain`` for that label (rendered with ``str``).

    Args:
        doc: Object exposing an ``ents`` sequence whose items have ``text``,
            ``start_char``, ``end_char`` and ``label_`` attributes.

    Returns:
        None. Output goes to stdout.
    """
    # Guard clause: nothing to list, print the notice and stop.
    if not doc.ents:
        print('Aucune entité nommée trouvée')
        return
    for entity in doc.ents:
        fields = (
            entity.text,
            str(entity.start_char),
            str(entity.end_char),
            entity.label_,
            str(spacy.explain(entity.label_)),
        )
        print(' - '.join(fields))

In [4]:
# NOTE(review): this is an English sentence, but `nlp` is the French pipeline
# loaded earlier; that is presumably why the recorded output is one long,
# badly delimited MISC span. Confirm whether an English example is intended here.
doc1 = nlp("Apple is looking at buying U.K. startup for $1 billion") 
show_ents(doc1)

Apple is looking at buying - 0 - 26 - MISC - Miscellaneous entities, e.g. events, nationalities, products or works of art


In [5]:
# Two-word noun phrase; the recorded output reports no named entity
doc4 = nlp("Les trottoirs") 
show_ents(doc4)

Aucune entité nommée trouvée


In [6]:
# Same phrase extended with "de la ville"; the recorded output still reports no named entity
doc5 = nlp("Les trottoirs de la ville") 
show_ents(doc5)

Aucune entité nommée trouvée


In [7]:
# Phrase starting with a city name; the recorded output labels "Bruxelles" as LOC
doc6 = nlp("Bruxelles, ma belle") 
show_ents(doc6)

Bruxelles - 0 - 9 - LOC - Non-GPE locations, mountain ranges, bodies of water


In [8]:
# Full sentence without any proper noun; the recorded output reports no named entity
doc7 = nlp("La voie publique n'est pas une déchetterie") 
show_ents(doc7)

Aucune entité nommée trouvée


In [9]:
# Two-word fragment; the recorded output labels only "Commission" (as ORG)
doc8 = nlp("Commission assistance") 
show_ents(doc8)

Commission - 0 - 10 - ORG - Companies, agencies, institutions, etc.


In [10]:
# Longer sentence; in the recorded output "Bulletins" is labelled LOC
# (NOTE(review): looks like a false positive) next to "Ville de Bruxelles"
doc9 = nlp("Dans le cadre de ce travail, les Bulletins communaux de la Ville de Bruxelles sont analysés à l'aide d'une série de scripts.") 
show_ents(doc9)

Bulletins - 33 - 42 - LOC - Non-GPE locations, mountain ranges, bodies of water
Ville de Bruxelles - 59 - 77 - LOC - Non-GPE locations, mountain ranges, bodies of water


In [11]:
# Longer sentence; in the recorded output the acronym "NER" is labelled ORG,
# as is the full name of the municipal council
doc10 = nlp("Le recours à la méthode NER permettrait une meilleure organisation de la documentation relative aux séances du Conseil communal de la Ville de Bruxelles.") 
show_ents(doc10)

NER - 24 - 27 - ORG - Companies, agencies, institutions, etc.
Conseil communal de la Ville de Bruxelles - 111 - 152 - ORG - Companies, agencies, institutions, etc.


## Application sur le corpus pour tous les Bulletins communaux des années 1847 à 1978


In [12]:
# Load the text

# Number of leading characters of the corpus to keep.
# NOTE(review): presumably chosen to stay within spaCy's default `nlp.max_length` — confirm.
n = 1000000

# Use a context manager so the file handle is closed right after reading
# instead of staying open for the lifetime of the kernel.
with open("../data/allyear.txt", encoding='utf-8') as corpus_file:
    text = corpus_file.read()[:n]

In [13]:
%%time
# Process the text: run the loaded pipeline over it (%%time reports how long the cell takes)

doc = nlp(text)

Wall time: 44.8 s


In [14]:
# Count the entities: tally every ORG entity whose text is longer than three characters

organisations = defaultdict(int)

for ent in doc.ents:
    # Skip anything that is not an organisation or whose text is too short.
    if ent.label_ != "ORG" or len(ent.text) <= 3:
        continue
    organisations[ent.text] += 1

In [15]:
# Display the raw counts, in insertion order (not yet sorted by frequency)
organisations

defaultdict(int,
            {'Dl!\n\n': 1,
             'VILLE': 1,
             'IMPRIMERIE': 1,
             'FAUBOURG': 1,
             'Conseil': 374,
             '^espourVica.lém': 1,
             'SOMMAIRE': 17,
             '— Discussion': 1,
             'Messageries': 2,
             'Académie des beaux': 2,
             'CONSTRUCTIONS': 1,
             'Location du marché de la Madeleine': 1,
             'Conseil communal': 8,
             'Philanthropie j o u i r': 1,
             'Société de Philanthrophie': 1,
             'Aveugles indigens': 1,
             'Sociélé Philanlropique': 1,
             'Société Philanthropique': 14,
             'Société Royale de Philanthropie': 7,
             "Conseil général de l'administration des hospices": 1,
             'Conseil général des hospices': 6,
             'Société de Commerce des Pays-Bas': 1,
             'Société de Commerce': 1,
             "Conseil général d'administration des hospices": 3,
             'Société 

In [16]:
# Sort by descending frequency and print the 30 most frequent entries

sorted_organisations = sorted(organisations.items(), key=lambda kv: kv[1], reverse=True)

# The loop variable is deliberately NOT named `organisations`: reusing that name
# would rebind the counts dictionary to a plain string, so re-running this cell
# (or displaying `organisations` again) would fail on `.items()`.
for organisation, freq in sorted_organisations[:30]:
    print(f"{organisation} appears {freq} times in the corpus")

Conseil appears 374 times in the corpus
Ranwet appears 45 times in the corpus
Collège appears 32 times in the corpus
Bas-fonds appears 22 times in the corpus
Société appears 19 times in the corpus
SOMMAIRE appears 17 times in the corpus
Conseil de salubrité appears 15 times in the corpus
Société Philanthropique appears 14 times in the corpus
Constitution appears 14 times in the corpus
Collège des bourgmestre appears 10 times in the corpus
Chambre appears 9 times in the corpus
Conseil communal appears 8 times in the corpus
Conservatoire appears 8 times in the corpus
Administration communale appears 8 times in the corpus
Société Royale de Philanthropie appears 7 times in the corpus
Conseil général des hospices appears 6 times in the corpus
M. Ranwet appears 5 times in the corpus
Administration des Hospices appears 5 times in the corpus
Régence appears 4 times in the corpus
A R T appears 4 times in the corpus
Adhésion appears 4 times in the corpus
Code pénal appears 4 times in the corpus


In [17]:
# Re-read the first `n` characters of the corpus and check the resulting length.
# A context manager closes the file handle as soon as the text has been read.
with open("../data/allyear.txt", encoding='utf-8') as corpus_file:
    text = corpus_file.read()[:n]

len(text)

1000000

In [18]:
# Raise the pipeline's `max_length` (spaCy's cap on the number of characters
# accepted in a single text) to 15 million.
nlp.max_length = 15_000_000

In [19]:
def search_organisations(n=1000000):
    """Print the 30 most frequent ORG entities found at the start of the corpus.

    Reads ``../data/allyear.txt``, keeps its first ``n`` characters, runs the
    module-level ``nlp`` pipeline on them and counts every entity labelled
    ``ORG`` whose text is longer than two characters.

    Args:
        n: Upper slice bound applied to the file content, i.e. the number of
            leading characters to analyse.

    Returns:
        None. One line per entity is printed, most frequent first.
    """
    # Context manager: the file handle is closed as soon as the text is read.
    with open("../data/allyear.txt", encoding='utf-8') as corpus_file:
        text = corpus_file.read()[:n]
    doc = nlp(text)

    counts = defaultdict(int)
    for ent in doc.ents:
        if ent.label_ == "ORG" and len(ent.text) > 2:
            counts[ent.text] += 1

    # sorted() is stable, so entities with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    for name, freq in ranked[:30]:
        print(f"{name} appears {freq} times in the corpus")

# Run with the default `n` (first 1,000,000 characters of the corpus)
search_organisations()

Conseil appears 374 times in the corpus
Ranwet appears 45 times in the corpus
Collège appears 32 times in the corpus
Bas-fonds appears 22 times in the corpus
Société appears 19 times in the corpus
SOMMAIRE appears 17 times in the corpus
Conseil de salubrité appears 15 times in the corpus
Société Philanthropique appears 14 times in the corpus
Constitution appears 14 times in the corpus
Collège des bourgmestre appears 10 times in the corpus
Chambre appears 9 times in the corpus
Conseil communal appears 8 times in the corpus
Conservatoire appears 8 times in the corpus
Administration communale appears 8 times in the corpus
Société Royale de Philanthropie appears 7 times in the corpus
Conseil général des hospices appears 6 times in the corpus
M. Ranwet appears 5 times in the corpus
Administration des Hospices appears 5 times in the corpus
Régence appears 4 times in the corpus
M M appears 4 times in the corpus
A R T appears 4 times in the corpus
Adhésion appears 4 times in the corpus
Code pé

In [20]:
def search_miscellaneous(n=1000000):
    """Print the 20 most frequent MISC entities found at the start of the corpus.

    Reads ``../data/allyear.txt``, keeps its first ``n`` characters, runs the
    module-level ``nlp`` pipeline on them and counts every entity labelled
    ``MISC`` whose text is longer than two characters.

    Args:
        n: Upper slice bound applied to the file content, i.e. the number of
            leading characters to analyse.

    Returns:
        None. One line per entity is printed, most frequent first.
    """
    # Context manager: the file handle is closed as soon as the text is read.
    with open("../data/allyear.txt", encoding='utf-8') as corpus_file:
        text = corpus_file.read()[:n]
    doc = nlp(text)

    counts = defaultdict(int)
    for ent in doc.ents:
        if ent.label_ == "MISC" and len(ent.text) > 2:
            counts[ent.text] += 1

    # sorted() is stable, so entities with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    for name, freq in ranked[:20]:
        print(f"{name} appears {freq} times in the corpus")

# Run with the default `n` (first 1,000,000 characters of the corpus)
search_miscellaneous()

S e c r é t a i r e appears 9 times in the corpus
échevin O appears 8 times in the corpus
Considérant appears 8 times in the corpus
A R T appears 8 times in the corpus
« V appears 6 times in the corpus
Cans appears 5 times in the corpus
M. le bourgmestre appears 5 times in the corpus
Code appears 5 times in the corpus
X I appears 5 times in the corpus
Longue-Vie appears 5 times in the corpus
échevin D o u c e t appears 4 times in the corpus
Constitution appears 4 times in the corpus
Waterloo appears 3 times in the corpus
succession de M appears 3 times in the corpus
— Discussion appears 3 times in the corpus
Pâques appears 3 times in the corpus
coup-d'œil appears 3 times in the corpus
marché de la Madeleine appears 3 times in the corpus
« Voilà des terrains appears 3 times in the corpus
Marché-auxHerbes appears 3 times in the corpus
