In [1]:
import pandas as pd

In [2]:
import nltk

# Fetch the NLTK data packages used by the tokenizing, tagging and chunking
# cells below (same packages, same order).
for resource in ('maxent_ne_chunker', 'punkt', 'averaged_perceptron_tagger', 'words'):
    nltk.download(resource)

from nltk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/nelth/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /home/nelth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nelth/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /home/nelth/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Example sentence used by the NLTK demos below.
# Built with implicit literal concatenation: a backslash continuation *inside*
# the quotes would embed the next line's indentation into the sentence itself.
ex = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market and ordered the company '
    'to alter its practices'
)

## Tokenize and POS-tag the text

In [4]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: The sentence as a plain string.

    Returns:
        The result of ``nltk.pos_tag`` applied to the ``nltk.word_tokenize``
        tokens, i.e. a list of ``(token, tag)`` pairs.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [5]:
# Tokenize and POS-tag the example sentence; the bare name on the last line
# makes the notebook display the resulting (token, tag) list.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

## Chunk noun phrases

In [6]:
# Chunk grammar: an NP is an optional determiner (<DT>?), any number of
# adjectives (<JJ>*), then a single noun (<NN>).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [7]:
# Build a regex-based chunk parser from the grammar, apply it to the tagged
# sentence, and print the resulting chunk tree.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


> Convert the chunk tree to IOB tags, one token per line

In [8]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


## Using named entity recognition

### First gen with NLTK

In [9]:
# Same tokenize -> POS-tag -> NE-chunk pipeline, spelled out one stage per line.
ex_tokens = word_tokenize(ex)
ex_tagged = pos_tag(ex_tokens)
ne_tree = ne_chunk(ex_tagged)
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


### Second gen with spaCy

In [10]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline; `nlp` is reused by every spaCy cell below.
nlp = en_core_web_sm.load()

In [11]:
# Run the spaCy pipeline on the example sentence and list every entity span
# with its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google a record $5.1 billion', 'ORG'),
 ('Wednesday', 'DATE')]


In [12]:
# Token-level view: each token with its IOB entity flag and entity type.
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'I', 'ORG'),
 (record, 'I', 'ORG'),
 ($, 'I', 'ORG'),
 (5.1, 'I', 'ORG'),
 (billion, 'I', 'ORG'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


## Ingest data and recognize entities — attempt on a French article (results are poor: the loaded model is English)

In [19]:
from bs4 import BeautifulSoup
import requests
import re
import html5lib

def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    The page is parsed with the html5lib parser, every <script>, <style> and
    <aside> element is removed, and each run of newlines/tabs in the remaining
    text is replaced by a single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a silent host.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never analysed as if it were the article.
        requests.RequestException: On connection failure or timeout.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their text does not end up in the returned string.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download a Le Monde live-blog page, run it through the spaCy pipeline, and
# count the detected entities.
# NOTE(review): `nlp` is the en_core_web_sm pipeline while this page is in
# French, so the entity labels below should not be trusted.
ny_bb = url_to_string('https://www.lemonde.fr/elections-legislatives-2022/live/2022/05/03/legislatives-2022-en-direct-l-accord-entre-lfi-et-les-communistes-soumis-au-conseil-national-du-pcf-la-ceremonie-d-investiture-d-emmanuel-macron-aura-lieu-samedi_6124539_6104324.html')
article = nlp(ny_bb)
len(article.ents)

676

In [20]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 222,
         'PERSON': 249,
         'EVENT': 3,
         'QUANTITY': 24,
         'GPE': 58,
         'DATE': 8,
         'PRODUCT': 17,
         'CARDINAL': 19,
         'WORK_OF_ART': 4,
         'LOC': 15,
         'NORP': 42,
         'ORDINAL': 3,
         'LAW': 5,
         'MONEY': 2,
         'FAC': 2,
         'PERCENT': 2,
         'LANGUAGE': 1})

In [21]:
# The three most frequent entity strings in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('dans', 30), ('un', 24), ('La France', 20)]

In [25]:
# Materialize the sentence spans so one can be picked by index.
sentences = list(article.sents)
print(sentences[17])

          Il n’est pas question du nucléaire dans ce communiqué, point d’achoppement majeur entre Jean-Luc Mélenchon et Fabien Roussel au cours de la campagne présidentielle.


In [26]:
# Re-run the pipeline on the text of sentence 17 alone and render its
# entities inline in the notebook.
displacy.render(nlp(str(sentences[17])), jupyter=True, style='ent')

## Ingest data and recognize entities — English article, for comparison with the French attempt above

In [None]:
# !pip install html5lib

In [38]:
from bs4 import BeautifulSoup
import requests
import re
import html5lib

def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    The page is parsed with the html5lib parser, every <script>, <style> and
    <aside> element is removed, and each run of newlines/tabs in the remaining
    text is replaced by a single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a silent host.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never analysed as if it were the article.
        requests.RequestException: On connection failure or timeout.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their text does not end up in the returned string.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download a Guardian article, run it through the spaCy pipeline, and count
# the detected entities. This rebinds `ny_bb` and `article`.
ny_bb = url_to_string('https://www.theguardian.com/world/2022/may/03/russia-accuses-israel-backing-neo-nazis-kyiv-diplomatic-row-grows')
article = nlp(ny_bb)
len(article.ents)

123

In [39]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'GPE': 40,
         'NORP': 31,
         'PERSON': 15,
         'ORG': 18,
         'PRODUCT': 1,
         'CARDINAL': 4,
         'DATE': 11,
         'EVENT': 2,
         'TIME': 1})

In [40]:
# The three most frequent entity strings in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Israel', 14), ('Russia', 11), ('Ukraine', 9)]

In [51]:
# Materialize the sentence spans and keep sentence 17 for the cells below.
sentences = list(article.sents)
sentence = sentences[17]
print(sentence)

As the conflict drags on and more evidence of atrocities committed by Russian forces against civilians has come to light, Israeli criticism has become more vocal, with Lapid last month accusing Russia of war crimes.


In [52]:
# Re-run the pipeline on the text of the selected sentence alone and render
# its entities inline in the notebook.
displacy.render(nlp(str(sentence)), jupyter=True, style='ent')

> Dependencies of the sentence

In [53]:
# Dependency-parse view of the selected sentence.
displacy.render(
    nlp(str(sentence)),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

> Lemmatization

In [54]:
# (surface form, coarse POS, lemma) for every token that is neither a stop
# word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentence))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('conflict', 'NOUN', 'conflict'),
 ('drags', 'VERB', 'drag'),
 ('evidence', 'NOUN', 'evidence'),
 ('atrocities', 'NOUN', 'atrocity'),
 ('committed', 'VERB', 'commit'),
 ('Russian', 'ADJ', 'russian'),
 ('forces', 'NOUN', 'force'),
 ('civilians', 'NOUN', 'civilian'),
 ('come', 'VERB', 'come'),
 ('light', 'ADJ', 'light'),
 ('Israeli', 'ADJ', 'israeli'),
 ('criticism', 'NOUN', 'criticism'),
 ('vocal', 'ADJ', 'vocal'),
 ('Lapid', 'PROPN', 'Lapid'),
 ('month', 'NOUN', 'month'),
 ('accusing', 'VERB', 'accuse'),
 ('Russia', 'PROPN', 'Russia'),
 ('war', 'NOUN', 'war'),
 ('crimes', 'NOUN', 'crime')]

In [55]:
# Map each entity's text to its label for the selected sentence.
{str(ent): ent.label_ for ent in nlp(str(sentence)).ents}

{'Russian': 'NORP', 'Israeli': 'NORP', 'last month': 'DATE', 'Russia': 'GPE'}

In [56]:
# Token-level IOB flags and entity types for the selected sentence.
token_tags = [(token, token.ent_iob_, token.ent_type_) for token in sentence]
print(token_tags)

[(As, 'O', ''), (the, 'O', ''), (conflict, 'O', ''), (drags, 'O', ''), (on, 'O', ''), (and, 'O', ''), (more, 'O', ''), (evidence, 'O', ''), (of, 'O', ''), (atrocities, 'O', ''), (committed, 'O', ''), (by, 'O', ''), (Russian, 'B', 'NORP'), (forces, 'O', ''), (against, 'O', ''), (civilians, 'O', ''), (has, 'O', ''), (come, 'O', ''), (to, 'O', ''), (light, 'O', ''), (,, 'O', ''), (Israeli, 'B', 'NORP'), (criticism, 'O', ''), (has, 'O', ''), (become, 'O', ''), (more, 'O', ''), (vocal, 'O', ''), (,, 'O', ''), (with, 'O', ''), (Lapid, 'O', ''), (last, 'B', 'DATE'), (month, 'I', 'DATE'), (accusing, 'O', ''), (Russia, 'B', 'GPE'), (of, 'O', ''), (war, 'O', ''), (crimes, 'O', ''), (., 'O', '')]


### Rendering entire article

In [57]:
# Render entities for the whole article. `article` is already a parsed Doc, so
# it is passed directly: str(sentences) would feed the *repr of a Python list*
# (brackets and ", " separators included) back through the pipeline instead of
# the article text.
displacy.render(article, jupyter=True, style='ent')