# Named-entity recognition with spaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle spaCy français utilisé dans ce notebook : `python -m spacy download fr_core_news_lg`

In [2]:
# Load the French pipeline once; the functions defined below share this `nlp` object.
# The 'fr_core_news_lg' package must already be installed
# (python -m spacy download fr_core_news_lg), otherwise spacy.load fails.
nlp = spacy.load('fr_core_news_lg')

## Fonctions

In [3]:
def test():
    """Run the pipeline over the imported French example sentences.

    For every sentence, prints either the entities that were detected, each
    rendered as ``text (LABEL)``, or a message stating that none were found.
    Relies on the module-level ``nlp`` pipeline and ``sentences`` list.
    """
    for sample in sentences:
        parsed = nlp(sample)
        found = [f"{entity.text} ({entity.label_})" for entity in parsed.ents]
        if not found:
            print(f"'{parsed.text}' contains no entities")
            continue
        print(f"'{parsed.text}' contains the following entities: {', '.join(found)}")

In [4]:
def search(n=1000000, path="../data/1945.txt", min_length=8, top=10):
    """Print the most frequent people, places and organisations found in a text file.

    The text is processed with the module-level ``nlp`` pipeline; entities
    labelled ``PER``, ``LOC`` and ``ORG`` are counted by their exact surface
    text, and the most frequent ones of each category are printed.

    Parameters
    ----------
    n : int or None
        Upper bound of the slice taken from the file content (``text[:n]``),
        i.e. the number of leading characters analysed. ``None`` keeps the
        whole file.
    path : str
        Path of the UTF-8 encoded text file to analyse.
    min_length : int
        Only entities whose text is strictly longer than this many
        characters are counted.
    top : int
        Maximum number of entries printed per category.

    Returns
    -------
    None
        Results are written to standard output only.
    """
    # Context manager guarantees the file handle is closed, even if reading fails.
    with open(path, encoding='utf-8') as handle:
        text = handle.read()[:n]
    doc = nlp(text)

    # (entity label, printed header) pairs, in the order they are reported.
    sections = (
        ("PER", '\n-------------PERSONNES------------\n'),
        ("LOC", '\n-------------LIEUX------------\n'),
        ("ORG", '\n-------------ORGANISATIONS------------\n'),
    )
    counts = {label: defaultdict(int) for label, _ in sections}
    for ent in doc.ents:
        # Entities with other labels (e.g. MISC) are ignored.
        if ent.label_ in counts and len(ent.text) > min_length:
            counts[ent.label_][ent.text] += 1

    for label, header in sections:
        # sorted() is stable, so ties keep their order of first appearance.
        ranked = sorted(counts[label].items(), key=lambda kv: kv[1], reverse=True)
        print(header)
        for name, freq in ranked[:top]:
            print(f"{name} appears {freq} times in the corpus")

## NER sur des données de test

In [5]:
# Sanity check: run the pipeline on the example sentences before using the real corpus.
test()

'Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars' contains the following entities: Apple (ORG)
'Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs' contains no entities
'San Francisco envisage d'interdire les robots coursiers sur les trottoirs' contains the following entities: San Francisco (LOC)
'Londres est une grande ville du Royaume-Uni' contains the following entities: Londres (LOC), Royaume-Uni (LOC)
'L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe' contains the following entities: ArcelorMittal (MISC), Europe (LOC)
'Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon' contains the following entities: Apple (ORG), HomePod (MISC), Echo (MISC)
'La France ne devrait pas manquer d'électricité cet été, même en cas de canicule' contains the following entities: La France (LOC)
'Nouvelles attaques de Trump contre le maire de Londres' contains the following entities: Trump (LOC), Lo

## NER sur le corpus des bulletins communaux

In [6]:
# Only the first 10,000 characters of the file are analysed here (search slices the text to n).
search(n=10000)


-------------PERSONNES------------

Ebénisterie appears 2 times in the corpus
Typographie appears 1 times in the corpus
Lithographie appears 1 times in the corpus
Menuiserie appears 1 times in the corpus
Sculpture appears 1 times in the corpus
Secours d'Hiver appears 1 times in the corpus
Ecole Bischoffsheim appears 1 times in the corpus
Henri Werrie appears 1 times in the corpus
John Solvay appears 1 times in the corpus
— Solidarité appears 1 times in the corpus

-------------LIEUX------------

Menuiserie appears 2 times in the corpus
Ebénistes appears 2 times in the corpus
Industrielle appears 1 times in the corpus
Plomberie appears 1 times in the corpus
Bijouterie appears 1 times in the corpus
Ebénisterie appears 1 times in the corpus
rue Haute appears 1 times in the corpus
CS

I. Nombre appears 1 times in the corpus
rue du Marais appears 1 times in the corpus
rue Guimard appears 1 times in the corpus

-------------ORGANISATIONS------------

Section normale appears 1 times in the c