# Named-entity recognition with SpaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle français de spaCy : `python -m spacy download fr_core_news_sm`

In [None]:
# Load the French pipeline `fr_core_news_sm` (it must be installed beforehand)
nlp = spacy.load('fr_core_news_sm')

# Exemple sur un corpus de Spacy

In [None]:
# Display the French example sentences imported from spacy.lang.fr.examples

sentences

In [None]:
# Take the first example sentence and display it

sent = sentences[0]

sent

In [None]:
# Run the spaCy pipeline on the sentence

doc = nlp(sent)

In [None]:
# Inspect the class of the object returned by the pipeline
type(doc)

### La documentation est accessible ici : https://spacy.io/api/doc

In [None]:
# Display the text held by the processed document
doc.text

In [None]:
# Display the result of converting the processed document with to_json()
doc.to_json()

## Appliquer l'approche sur toutes les phrases

In [None]:
# For every example sentence, report the entities found by the pipeline
# as "text (LABEL)" pairs, or state that none were found.
for sent in sentences:
    doc = nlp(sent)
    entities = [f"{ent.text} ({ent.label_})" for ent in doc.ents]
    if not entities:
        print(f"'{doc.text}' contains no entities")
        continue
    print(f"'{doc.text}' contains the following entities: {', '.join(entities)}")

# Application sur le corpus

In [None]:
# Load the corpus, keeping at most the first n characters

n = 1_000_000
with open("../data/all.txt", encoding="utf-8") as corpus_file:
    # read(n) returns at most n characters, so the rest of the file is never
    # loaded into memory, and the handle is closed when the block exits.
    text = corpus_file.read(n)

In [None]:
%%time
# Process the loaded text with the spaCy pipeline; the %%time cell magic
# (which must stay on the first line of the cell) times the whole cell.

doc = nlp(text)

In [None]:
# Compter les entités

people = defaultdict(int)

for ent in doc.ents:
    if ent.label_ == "PER" and len(ent.text) > 3:
        people[ent.text] += 1

In [None]:
# Trier et imprimer

sorted_people = sorted(people.items(), key=lambda kv: kv[1], reverse=True)

for person, freq in sorted_people[:50]:
    print(f"{person} appears {freq} times in the corpus")