# Entity Analysis with the Python Libraries NLTK and spaCy

In [43]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# NLTK Example

In [20]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
from nltk import ne_chunk

In [22]:
# Fetch the NLTK resources used by the cells below. 'punkt' is required by
# word_tokenize; without it a fresh environment raises LookupError on the
# first tokenize call. The other three back pos_tag and ne_chunk.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [4]:
# Read only the free-text brand question from the survey export and expose it
# under the short column name 'Response'.
QUESTION_COL = 'Name a brand that really ‘gets you’ when it comes to their advertising.'
df = (
    pd.read_csv('raw_data_short_text.csv', usecols=[QUESTION_COL])
    .rename(columns={QUESTION_COL: 'Response'})
)

In [5]:
# Preview the first rows of the loaded responses.
df.head()

Unnamed: 0,Response
0,There isn't one
1,Clarks shoes
2,Not sure
3,
4,


In [46]:
# pandas reads empty CSV fields as NaN. Blank them before casting so a missing
# answer becomes '' instead of the literal text 'nan', then make every value a
# str so it can be passed to the NLP pipeline.
df = df.fillna('').astype('str')

In [34]:
# Example news text used by the NLTK and spaCy demonstrations.
ex = (
    'President Donald Trump says US-China relations remain very strong '
    'despite the two countries being embroiled in a trade war. '
    'Mr Trump said talks between Washington and Beijing will continue '
    'even after a deadline passed on Friday for them to reach an agreement. '
    'As a result the US raised tariffs to 25% from 10% on goods from China.'
)

In [10]:
def preprocess(sent):
    """Tokenize a piece of text and tag each token with its part of speech.

    Args:
        sent: The raw text to process, as a single string.

    Returns:
        The result of nltk.pos_tag applied to the word tokens of `sent`.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [11]:
# Tokenize and POS-tag the example text; the bare name on the last line displays it.
# NOTE(review): the output saved with this cell shows a different sentence than
# the current value of `ex`, so it appears to come from an earlier run - re-run to refresh.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [12]:
# Chunk grammar: an NP is an optional determiner (DT), any number of
# adjectives (JJ), then a token tagged NN.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [13]:
# Build a regular-expression chunk parser from the grammar, apply it to the
# POS-tagged tokens, and print the resulting tree.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [15]:
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [23]:
# Tokenize, POS-tag, then run NLTK's named-entity chunker over the example text.
ex_tokens = word_tokenize(ex)
ex_tagged = pos_tag(ex_tokens)
ne_tree = ne_chunk(ex_tagged)
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


What are the different entity types? The LDC English entity annotation guidelines linked below describe them:

https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/english-entities-guidelines-v5.6.6.pdf

In [19]:
# Load NLTK's multiclass NE chunker and list the labels its classifier can emit.
# NOTE(review): this reaches into underscore-prefixed (private) attributes,
# which may change between NLTK versions - confirm against the installed release.
chunker=nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)
sorted(chunker._tagger._classifier.labels())

['B-FACILITY',
 'B-GPE',
 'B-GSP',
 'B-LOCATION',
 'B-ORGANIZATION',
 'B-PERSON',
 'I-FACILITY',
 'I-GPE',
 'I-GSP',
 'I-LOCATION',
 'I-ORGANIZATION',
 'I-PERSON',
 'O']

# spaCy Example

In [66]:
from collections import Counter

import en_core_web_sm
import spacy
from spacy import displacy
from spacy.tokens import Span

# Load the en_core_web_sm pipeline once; the cells below reuse `nlp`.
nlp = en_core_web_sm.load()

In [35]:
# Run the pipeline on the example text and list each entity with its label.
doc = nlp(ex)
ex_entities = [(ent.text, ent.label_) for ent in doc.ents]
pprint(ex_entities)

[('Donald Trump', 'PERSON'),
 ('US', 'GPE'),
 ('China', 'GPE'),
 ('two', 'CARDINAL'),
 ('Mr Trump', 'PERSON'),
 ('Washington', 'GPE'),
 ('Beijing', 'GPE'),
 ('Friday', 'DATE'),
 ('US', 'GPE'),
 ('25%', 'PERCENT'),
 ('10%', 'PERCENT'),
 ('China', 'GPE')]


In [36]:
# Render displaCy's entity ('ent') visualisation of the example text inline in the notebook.
displacy.render(nlp(ex), jupyter=True, style='ent')

In [37]:
# Render displaCy's dependency ('dep') visualisation of the example text inline.
displacy.render(
    nlp(ex),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [38]:
# (surface form, POS tag, lemma) for every token in the example text that is
# neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(ex)
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('President', 'PROPN', 'President'),
 ('Donald', 'PROPN', 'Donald'),
 ('Trump', 'PROPN', 'Trump'),
 ('says', 'VERB', 'say'),
 ('China', 'PROPN', 'China'),
 ('relations', 'NOUN', 'relation'),
 ('remain', 'VERB', 'remain'),
 ('strong', 'ADJ', 'strong'),
 ('despite', 'ADP', 'despite'),
 ('countries', 'NOUN', 'country'),
 ('embroiled', 'VERB', 'embroil'),
 ('trade', 'NOUN', 'trade'),
 ('war', 'NOUN', 'war'),
 ('Mr', 'PROPN', 'Mr'),
 ('Trump', 'PROPN', 'Trump'),
 ('said', 'VERB', 'say'),
 ('talks', 'NOUN', 'talk'),
 ('Washington', 'PROPN', 'Washington'),
 ('Beijing', 'PROPN', 'Beijing'),
 ('continue', 'VERB', 'continue'),
 ('deadline', 'NOUN', 'deadline'),
 ('passed', 'VERB', 'pass'),
 ('Friday', 'PROPN', 'Friday'),
 ('reach', 'VERB', 'reach'),
 ('agreement', 'NOUN', 'agreement'),
 ('result', 'NOUN', 'result'),
 ('raised', 'VERB', 'raise'),
 ('tariffs', 'NOUN', 'tariff'),
 ('25', 'NUM', '25'),
 ('%', 'NOUN', '%'),
 ('10', 'NUM', '10'),
 ('%', 'NOUN', '%'),
 ('goods', 'NOUN', 'good'),
 ('Ch

In [63]:
# Re-check the response frame before running entity recognition over it.
df.head()

Unnamed: 0,Response
0,There isn't one
1,Clarks shoes
2,Not sure
3,
4,


In [69]:
# Run the spaCy pipeline over every response and keep each document's
# (entity text, entity label) pairs: one list per response, in row order.
# Iterating the column's values, rather than looking rows up by the labels
# 0..n-1, keeps this correct when the frame's index is not the default range.
# nlp.pipe processes the texts as a stream instead of one call per row.
responses = df['Response']
entities = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in tqdm(nlp.pipe(responses), total=len(responses))
]

100%|████████████████████████████████████████████████████████████████████████████████| 852/852 [00:13<00:00, 62.83it/s]


In [70]:
# Collect the text of every entity labelled 'ORG', flattened across all responses.
entity_list = [
    text
    for row_entities in tqdm(entities)
    for text, label in row_entities
    if label == 'ORG'
]

100%|████████████████████████████████████████████████████████████████████████████████████████| 852/852 [00:00<?, ?it/s]


In [71]:
# Display the collected ORG entity texts.
entity_list

['Lush Cosmetics',
 'Cadbury',
 'Tena',
 'N/a',
 'Marks',
 'Yours Clothing',
 'Yours Clothing',
 'ASOS',
 'Nike',
 'England Sporting',
 'ASOS',
 'Pepsi',
 'Gordon',
 'Amazon',
 'M &',
 'MARKS',
 'SPENCER',
 'McMillan',
 'Addidas',
 'Marks',
 'Addidas',
 'Marks & Spencer',
 'Marks',
 'Hair',
 'Marks',
 'M&S',
 'Marks & Spencer',
 'Muller',
 'Betty & Co',
 'Rimmell',
 'Nivea',
 'M&S',
 'ASOS',
 'Gucci',
 'Coca-Cola',
 'Aerie',
 'Tena',
 'Aerie',
 'Marks',
 'Diet',
 'Vogue',
 'Marks',
 'N/a',
 "McDonald's",
 'Tesco',
 'RSPCA',
 'Tesco',
 'JOHN',
 'M & S',
 'Actimel',
 'Nutrisse',
 'Nike Miss Dior Channel']

In [76]:
# Check whether the brand 'Garnier' was picked up as an ORG entity.
'Garnier' in entity_list

False

In [64]:
# Show row 18 as a one-row frame (the double brackets keep it a DataFrame).
df.loc[[18]]

Unnamed: 0,Response
18,Garnier


In [78]:
# One training example: the entire string "Garnier" (characters 0 to 7) is an ORG.
brand = "Garnier"
train_data = [(brand, [(0, len(brand), 'ORG')])]

In [84]:
import random
from spacy.gold import GoldParse

# Update the loaded pipeline on the hand-labelled examples in train_data.
# The gold annotations carry entity offsets only, so every component other
# than 'ner' is disabled for the duration of the loop; they are restored
# automatically when the `with` block exits.
# NOTE(review): updating on very few examples can make the model forget what
# it already knew - consider mixing in examples it currently labels correctly.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    for itn in tqdm(range(100)):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            # make_doc only tokenizes; the gold parse attaches the entity spans.
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.update([doc], [gold], drop=0.5)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:27<00:00,  3.55it/s]


In [85]:
# Re-run the (now updated) spaCy pipeline over every response and keep each
# document's (entity text, entity label) pairs: one list per response.
# Iterating the column's values, rather than looking rows up by the labels
# 0..n-1, keeps this correct when the frame's index is not the default range.
# nlp.pipe processes the texts as a stream instead of one call per row.
responses = df['Response']
entities2 = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in tqdm(nlp.pipe(responses), total=len(responses))
]

100%|████████████████████████████████████████████████████████████████████████████████| 852/852 [00:13<00:00, 65.23it/s]


In [86]:
# Collect the text of every entity labelled 'ORG' from the second pass,
# flattened across all responses.
entity_list2 = [
    text
    for row_entities in tqdm(entities2)
    for text, label in row_entities
    if label == 'ORG'
]

100%|████████████████████████████████████████████████████████████████████████████| 852/852 [00:00<00:00, 170363.61it/s]


In [87]:
# Display the ORG entity texts from the second pass.
entity_list2

['There',
 'Clarks',
 'None',
 'None',
 'Boots',
 'None',
 'Dior',
 'h&m',
 'Dove',
 'Idk',
 'Dove',
 'campaigns',
 'sizes',
 'None',
 'Dove',
 "L'Oréal",
 'None',
 'Dove',
 'Garnier',
 'New',
 "L'Oréal",
 'Dove',
 'New',
 'Tampax',
 'Venus',
 'Primary',
 'Lush',
 'Dove',
 'Always',
 'Sure',
 'Very',
 'sizes',
 'Nivea',
 'Simply',
 'Compare',
 'market',
 'Nescafe',
 'None',
 'Always',
 'Unknown',
 'George',
 'None',
 'Nobody',
 'Dove',
 'Always',
 'Cadbury',
 'Simply',
 'Victories',
 'Art',
 'Any',
 'Nike',
 'Apple',
 'Dove',
 'Pantene',
 'conditioner',
 'Coop',
 'Garnier',
 'None',
 'Natural',
 'None',
 'None',
 'Dove',
 'Venus',
 'Dove',
 'Always',
 'Always',
 'Boden',
 'Loriel',
 "L'0real",
 'River',
 'Ca',
 'Most',
 'me',
 'Evans',
 'Marks',
 'Spencer',
 'Nivea',
 'Dove',
 'Garnier',
 'Tena',
 'All',
 'Cars',
 'Persil',
 'Chat',
 'Simply',
 'Always',
 'Nivea',
 'Lush',
 'Dove',
 'None',
 'All',
 'Coca',
 'Dominos',
 'Always',
 'Dove',
 'Always',
 'Food',
 'Loreal',
 'None',
 'Dove'

In [88]:
# Check whether 'Garnier' appears among the ORG entities of the second pass.
'Garnier' in entity_list2

True

In [91]:
# Count how often each ORG text occurs and show the ten most frequent.
org_counts = pd.DataFrame(entity_list2)[0].value_counts()
org_counts.head(10)

Dove       84
Nike       37
None       33
Always     29
Next       20
L'Oréal    15
Sure       15
Nivea      14
M&S        14
Loreal     13
Name: 0, dtype: int64