# TP1 - NER

### En utilisant vos connaissances en programmation, codez un système NER qui reconnaît les noms de personnes (PERSON) et les lieux (LOCATION) à partir de l'ensemble de données fourni (europarl).

#### 1 - On récupère toutes nos données

In [1]:
import os
import glob
import json
import re
import nltk

from nltk.corpus import stopwords
nltk.download('stopwords')

def getListOfFiles(dirName):
    """Recursively collect the files found under ``dirName``.

    Sub-directories are descended into; every entry that is not a directory
    is added to the result. Each collected path is truncated to the text
    that follows its last ``'/'`` character, so a path containing no ``'/'``
    is kept unchanged.

    Args:
        dirName: Path of the directory to walk.

    Returns:
        A list of strings, one per file, in the order ``os.listdir`` yields
        the entries.
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if os.path.isdir(candidate):
            # Descend and merge the nested directory's files into ours.
            collected.extend(getListOfFiles(candidate))
            continue
        # Keep only what follows the last forward slash.
        collected.append(candidate.rsplit('/', 1)[-1])
    return collected

def readFile(filename):
    """Return the whole content of ``filename``, read as UTF-8 text."""
    with open(filename, mode="r", encoding="utf8") as source:
        contents = source.read()
    return contents

# Every file found (recursively) under the "europarl" directory.
files = getListOfFiles('europarl')

[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 2 - Division du texte en phrases et en mots (c'est-à-dire en jetons)

In [2]:
# Step 1 - strip the punctuation
def enlever_ponctuation(chaine):
    """Replace each character that is neither a word character (``\\w``)
    nor whitespace (``\\s``) with a single space.

    The string length is preserved: characters are substituted, not removed.
    """
    motif_ponctuation = re.compile(r"[^\w\s]")
    return motif_ponctuation.sub(" ", chaine)

# Step 2 - drop the words that belong to the stop-word list
def enlever_mots_vides(chaine):
    """Remove French stop words from a whitespace-separated string.

    The comparison is an exact, case-sensitive match against the entries of
    ``stopwords.words("french")``; a word that differs only by case from an
    entry is kept.

    Args:
        chaine: Text whose words are separated by whitespace.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    # Build the lookup once per call instead of once per word; a set also
    # makes each membership test constant-time rather than a list scan.
    mots_vides = set(stopwords.words("french"))
    return ' '.join(mot for mot in chaine.split() if mot not in mots_vides)

# Step 3 - tokenisation: cut the text into words
def tokenisation(chaine):
    """Split ``chaine`` on runs of whitespace and return the list of words."""
    mots = chaine.split()
    return mots

# Chain the three previous steps
def preprocess(chaine):
    """Run punctuation removal, stop-word removal and tokenisation, in that
    order, and return the resulting list of words."""
    sans_ponctuation = enlever_ponctuation(chaine)
    sans_mots_vides = enlever_mots_vides(sans_ponctuation)
    return tokenisation(sans_mots_vides)

### 3 - Repérer les localisations et les personnes
Résultat :
Les phrases sont séparées par une nouvelle ligne, et chaque ligne contient un mot suivi de son libellé : O s'il n'appartient à aucune entité, B-PER s'il est le premier mot d'une entité, et I-PER s'il se trouve à l'intérieur ou à la fin de l'entité.


In [3]:
from operator import length_hint
import requests

LOCATIONS = "https://nominatim.openstreetmap.org"

# LOCATIONS
# The geocoding service at LOCATIONS is queried to decide whether a word
# names a place.

# Answers already obtained from the service, keyed by word, so that a word
# repeated in the corpus triggers a single HTTP request.
_CACHE_LOCALISATIONS = {}


def est_localisation(mot, timeout=10):
    """Tell whether ``mot`` looks like a place name.

    A word qualifies only if it is at least three characters long, starts
    with an upper-case letter, and the first result returned by the
    ``/search`` endpoint of ``LOCATIONS`` has an ``importance`` above 0.5.

    Args:
        mot: The candidate word.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        True when the word is accepted as a location. False otherwise,
        including for empty input, a non-200 status, a network failure or
        an unparsable response (none of these failures is cached).
    """
    # Cheap local filters first; checking the length before indexing keeps
    # the empty string from raising IndexError.
    if len(mot) < 3 or not mot[0].isupper():
        return False

    if mot in _CACHE_LOCALISATIONS:
        return _CACHE_LOCALISATIONS[mot]

    try:
        # `params` lets requests percent-encode the word; `timeout` prevents
        # a stalled connection from blocking the whole run.
        response = requests.get(
            LOCATIONS + "/search",
            params={"q": mot, "format": "json"},
            timeout=timeout,
        )
        if response.status_code != 200:
            return False
        resultats = response.json()
    except (requests.RequestException, ValueError):
        return False

    # Only the first result decides, exactly as the previous loop did (it
    # returned during its first iteration).
    # NOTE(review): confirm that looking at the top hit only is intended.
    premier = resultats[0] if isinstance(resultats, list) and resultats else {}
    est_lieu = premier.get("importance", 0) > 0.5
    _CACHE_LOCALISATIONS[mot] = est_lieu
    return est_lieu


In [4]:
# NAMES
# API that stores every firstnames and lastnames of the world
NAMES = "https://api.genderize.io"

# Answers already obtained from the names API, keyed by word, so that a word
# repeated in the corpus triggers a single HTTP request.
_CACHE_NOMS = {}


# Ask the API whether a word (e.g. "Adrien") is a name
def est_nom(mot, timeout=10):
    """Tell whether ``mot`` looks like a person name.

    A word qualifies when it is at least three characters long and the
    ``count`` field returned by the ``NAMES`` API for it exceeds 1200.

    Args:
        mot: The candidate word.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        True when the word is accepted as a name. False otherwise,
        including for a non-200 status, a network failure or an unparsable
        response (none of these failures is cached).
    """
    if len(mot) < 3:
        return False

    if mot in _CACHE_NOMS:
        return _CACHE_NOMS[mot]

    try:
        # `params` lets requests percent-encode the word; `timeout` prevents
        # a stalled connection from blocking the whole run.
        response = requests.get(
            NAMES,
            params={"name": mot, "format": "json"},
            timeout=timeout,
        )
        if response.status_code != 200:
            return False
        donnees = response.json()
    except (requests.RequestException, ValueError):
        return False

    # A missing or null "count" is treated as zero occurrences.
    occurrences = donnees.get("count") if isinstance(donnees, dict) else None
    est_prenom = (occurrences or 0) > 1200
    _CACHE_NOMS[mot] = est_prenom
    return est_prenom




# Named-entity recognition (NER)
# Rule-based tagger built on two lookup predicates
# Input: a list of words
# Output: a list of tuples (word, tag)
def ner(file, detecter_lieu=None, detecter_nom=None):
    """Tag each word as a location, a person name, or neither.

    For every word the location predicate is tried first, then the name
    predicate. A name that directly follows another name continues the
    same entity (``I-PER``); otherwise it opens a new one (``B-PER``).
    Locations are always tagged ``B-LOC``. Each (word, tag) pair is printed
    as it is produced.

    Args:
        file: Iterable of words (despite its name, not a file object).
        detecter_lieu: Optional callable ``word -> bool`` used to detect
            locations; defaults to ``est_localisation``.
        detecter_nom: Optional callable ``word -> bool`` used to detect
            person names; defaults to ``est_nom``.

    Returns:
        A list of ``(word, tag)`` tuples, one per input word, where tag is
        one of ``"B-LOC"``, ``"B-PER"``, ``"I-PER"`` or ``"O"``.
    """
    if detecter_lieu is None:
        detecter_lieu = est_localisation
    if detecter_nom is None:
        detecter_nom = est_nom

    tags = []
    precedent = "O"
    for word in file:
        if detecter_lieu(word):
            # No leading space in the tag: the writer already inserts the
            # separator between word and tag.
            tag = "B-LOC"
        elif detecter_nom(word):
            # Continue the entity when the previous word was part of a name;
            # each word is appended exactly once.
            tag = "I-PER" if precedent in ("B-PER", "I-PER") else "B-PER"
        else:
            tag = "O"
        tags.append((word, tag))
        print(tags[-1])
        precedent = tag
    return tags


In [46]:
# Walk through the europarl files and write one .iob file per input file.
# The output directory is created up front so the first write cannot fail
# because it is missing.
os.makedirs("europarl/iob", exist_ok=True)
for file in files:
    # The output lives under "europarl", which is the directory that was
    # listed recursively: skip results left by an earlier run so they are
    # not tagged again.
    if file.endswith(".iob"):
        continue
    # Read the file
    contenu = readFile(file)
    # Pre-process it into a list of words
    jetons = preprocess(contenu)
    # Apply the NER algorithm
    etiquettes = ner(jetons)
    # Write the result, one "word tag" pair per line. Only the base name is
    # used so that a directory component in `file` cannot end up inside the
    # output file name.
    nom_sortie = os.path.basename(file) + ".iob"
    with open("europarl/iob/" + nom_sortie, "w", encoding="utf8") as f:
        f.writelines(word + " " + tag + "\n" for word, tag in etiquettes)


('Reprise', 'O')
('session', 'O')
('Je', 'O')
('déclare', 'O')
('reprise', 'O')
('session', 'O')
('Parlement', 'O')
('européen', 'O')
('interrompue', 'O')
('vendredi', 'O')
('17', 'O')
('décembre', 'O')
('dernier', 'O')
('renouvelle', 'O')
('tous', 'O')
('vux', 'O')
('espérant', 'O')
('passé', 'O')
('bonnes', 'O')
('vacances', 'O')
('Comme', 'O')
('pu', 'O')
('constater', 'O')
('grand', 'O')
('bogue', 'O')
('an', 'O')
('2000', 'O')
('produit', 'O')
('En', 'O')
('revanche', 'O')
('citoyens', 'O')
('certain', 'O')
('nombre', 'O')
('pays', 'O')
('victimes', 'O')
('catastrophes', 'O')
('naturelles', 'O')
('vraiment', 'O')
('terribles', 'O')
('Vous', 'O')
('souhaité', 'O')
('débat', 'O')
('sujet', 'O')
('prochains', 'O')
('jours', 'O')
('cours', 'O')
('cette', 'O')
('période', 'O')
('session', 'O')
('En', 'O')
('attendant', 'O')
('souhaiterais', 'O')
('comme', 'O')
('certain', 'O')
('nombre', 'O')
('collègues', 'O')
('demandé', 'O')
('observions', 'O')
('minute', 'O')
('silence', 'O')
('tou

KeyboardInterrupt: 

#### En utilisant le système que vous avez créé la dernière fois, générez un fichier au format IOB pour les phrases du fichier fourni “ep-01-05-30-fr.txt” et évaluez les résultats à l’aide de Conlleval ou Nervaluate.


In [16]:
from conlleval import evaluate

# Raw text of the .iob file; it is parsed below with a space separator and
# supplies the predicted labels.
with open("test/ep-01-05-30-fr.iob", "r", encoding="utf-8") as f:
    testNous = f.read()

# Raw text of the .txt file; it is parsed below with the default tab
# separator and supplies the reference labels.
with open("test/ep-01-05-30-fr.txt", "r", encoding="utf-8") as f:
    testProf = f.read()

def file_to_dict(file: str, separator: str = "\t") -> dict:
    """Parse "token<separator>label" lines into a ``{token: label}`` dict.

    Lines that do not split into exactly two fields are ignored. When a
    token occurs on several lines, the label of its last occurrence wins.

    Args:
        file: Full text to parse (the content, not a path).
        separator: String placed between the token and its label.

    Returns:
        A dict mapping each token to its label.
    """
    mapping = {}
    for line in file.split("\n"):
        fields = line.split(separator)
        if len(fields) != 2:
            continue
        token, label = fields
        mapping[token] = label
    return mapping


# Reference labels (tab-separated) and our predictions (space-separated),
# each as a {token: label} dict.
iob_prof = file_to_dict(testProf)
iob_nous = file_to_dict(testNous, " ")

truth_values = []
predict_values = []

# Write every comparison row to "comparaison.txt": token, reference label,
# predicted label. A token missing from our predictions counts as "O".
with open("test/comparaison.txt", "w", encoding="utf-8") as f:
    for key, value in iob_prof.items():
        prediction = iob_nous.get(key, "O")
        f.write(f"{key}\t{value}\t{prediction}\n")
        truth_values.append(value)
        predict_values.append(prediction)

evaluate(truth_values, predict_values)

processed 7218 tokens with 179 phrases; found: 279 phrases; correct: 96.
accuracy:  47.89%; (non-O)
accuracy:  96.09%; precision:  34.41%; recall:  53.63%; FB1:  41.92
              LOC: precision:  56.82%; recall:  67.57%; FB1:  61.73  88
              PER: precision:  24.08%; recall:  43.81%; FB1:  31.08  191


(34.40860215053764, 53.63128491620112, 41.92139737991267)

### À l'aide de spaCy ou BERT, créez un système NER et comparez les performances avec votre système basé sur des règles.
#### Nous utilisons spaCy

In [5]:
# Train the pipeline described by config/config.cfg on ./spacy/train.spacy,
# with ./spacy/test.spacy as the dev set; the output goes to ./output.
!python -m spacy train config/config.cfg --output ./output --paths.train ./spacy/train.spacy --paths.dev ./spacy/test.spacy

2022-10-15 13:14:58.992592: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-10-15 13:14:58.993581: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-15 13:15:03.733885: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2022-10-15 13:15:03.734275: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-15 13:15:03.737873: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: DESKTOP-C1I471H
2022-10-15 13:15:03.738326: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: DESKTOP-C1I471H
[2022-10-15 13:15:11,882] [INFO] Set up nlp object from config
[2022-10-15 13:15:11,900] [INFO] Pipeline: 

✔ Created output directory: output
ℹ Saving to output directory: output
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     36.78    2.43    2.28    2.59    0.02
  0     200         71.56   2473.16   33.85   46.53   26.60    0.34
  0     400        103.43   1969.47   41.06   48.26   35.73    0.41
  0     600        325.15   2435.40   45.76   52.97   40.28    0.46
  0     800        219.20   2554.78   52.98   60.76   46.97    0.53
  0    1000        308.37   3103.96   53.37   60.28   47.88    0.53
  0    1200        492.90   3825.84   58.81   66.67   52.61    0.59
  0    1400        569.37   4177.04   60.58   66.63   55.53    0.61
  0    1600        693.51   4850.34   60.09   64.67   56.11    0.60
  0    1800        881.62   5824.60   66.67   71.53   62.43    0.67
  0    2

2022-10-15 13:37:55.516821: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-10-15 13:37:55.518063: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-15 13:38:07.101668: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2022-10-15 13:38:07.102240: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-15 13:38:07.117288: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: DESKTOP-C1I471H
2022-10-15 13:38:07.118695: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: DESKTOP-C1I471H
[2022-10-15 13:38:21,617] [INFO] Set up nlp object from config
[2022-10-15 13:38:21,653] [INFO] Pipeline: 

In [6]:
# Score ./output/model-best on ./spacy/test.spacy.
# NOTE(review): the training command in this notebook passes the same file
# as --paths.dev, so these scores may be optimistic — confirm whether a
# separate held-out set exists.
!python -m spacy evaluate ./output/model-best ./spacy/test.spacy

ℹ Using CPU
[1m

TOK     99.65
NER P   85.89
NER R   82.08
NER F   83.94
SPEED   13599

[1m

           P       R       F
PER    91.71   90.61   91.16
LOC    86.07   84.71   85.38
MISC   76.06   64.58   69.85
ORG    79.74   71.48   75.39



2022-10-15 21:12:29.839491: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-10-15 21:12:29.840042: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-15 21:12:38.273135: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2022-10-15 21:12:38.273638: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-15 21:12:38.282494: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: DESKTOP-C1I471H
2022-10-15 21:12:38.282933: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: DESKTOP-C1I471H
