In [1]:
import pandas as pd
import spacy
from spacy import displacy
import matplotlib.pyplot as plt
from pyvis.network import Network

In [2]:
# pip install networkx pyvis

# 01 - Load Data

In [3]:
# data source: Kaggle - https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus
df = pd.read_csv("../Data/ner.csv")
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities from a text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [5]:
df100 = df.head(100)

In [6]:
# Apply NER on the text column
df100['entities'] = df100['Sentence'].apply(extract_entities)

# Save the results to a new CSV
df100.to_csv("data_with_entities.csv", index=False)

print("NER extraction completed. Results saved to 'data_with_entities.csv'.")


NER extraction completed. Results saved to 'data_with_entities.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df100['entities'] = df100['Sentence'].apply(extract_entities)


In [7]:
df100.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,entities
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...","[(Thousands, CARDINAL), (London, GPE), (Iraq, ..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(Stop the Bombings, WORK_OF_ART)]"
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(the Houses of Parliament, ORG), (Hyde Park, ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(10,000, CARDINAL), (1,00,000, CARDINAL)]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(annual, DATE), (Britain, GPE), (Labor Party,..."


# Analysis

### Look at most common types of entities

In [8]:
ent_dict = {}
label_dict = {}
label_ent_dict = {}
for i in range(len(df100)):
    entlist = df100['entities'].iloc[i]
    for ent in entlist:
        if ent in ent_dict.keys():
            ent_dict[ent] += 1
        else:
            ent_dict[ent] = 1
        nent, label = ent
        if label in label_dict.keys():
            label_dict[label] += 1
        else:
            label_dict[label] = 1
        if label in label_ent_dict.keys():
            label_ent_dict[label].append(nent)
        else:
            label_ent_dict[label] = [nent]

for k in label_ent_dict.keys():
    label_ent_dict[k] = set(label_ent_dict[k])

In [9]:
# One sample entry
df100['entities'].iloc[99]

[('Saturday', 'DATE'),
 ('Orakzai', 'PERSON'),
 ('Taliban', 'ORG'),
 ('South Waziristan', 'GPE')]

In [10]:
# Count word-label frequencies
{w:ent_dict[w] for w in sorted(ent_dict, key=ent_dict.get, reverse=True)}

{('Wednesday', 'DATE'): 6,
 ('Iran', 'GPE'): 6,
 ('Iraq', 'GPE'): 5,
 ('annual', 'DATE'): 5,
 ('Mogadishu', 'GPE'): 5,
 ('Saturday', 'DATE'): 5,
 ('Pakistani', 'NORP'): 5,
 ('APEC', 'ORG'): 5,
 ('U.S.', 'GPE'): 4,
 ('one', 'CARDINAL'): 4,
 ('Sunday', 'DATE'): 4,
 ('Pakistan', 'GPE'): 4,
 ('three', 'CARDINAL'): 4,
 ('British', 'NORP'): 3,
 ('last week', 'DATE'): 3,
 ('Baghdad', 'GPE'): 3,
 ('Taleban', 'NORP'): 3,
 ('Afghanistan', 'GPE'): 3,
 ('South Waziristan', 'GPE'): 3,
 ('Halliburton', 'GPE'): 3,
 ('ABAC', 'ORG'): 3,
 ('Sudanese', 'NORP'): 3,
 ('Indonesian', 'NORP'): 3,
 ('Bali', 'GPE'): 3,
 ('London', 'GPE'): 2,
 ('Britain', 'GPE'): 2,
 ('Rome', 'GPE'): 2,
 ('this week', 'DATE'): 2,
 ('Iranian', 'NORP'): 2,
 ('Tuesday', 'DATE'): 2,
 ('Two', 'CARDINAL'): 2,
 ('Nigeria', 'GPE'): 2,
 ('German', 'NORP'): 2,
 ('Somalia', 'GPE'): 2,
 ('Mosul', 'GPE'): 2,
 ('al Qaida', 'ORG'): 2,
 ('more than 200', 'CARDINAL'): 2,
 ('American', 'NORP'): 2,
 ('Friday', 'DATE'): 2,
 ('Brotherhood', 'ORG'): 

In [11]:
# Count label frequencies
{w:label_dict[w] for w in sorted(label_dict, key=label_dict.get, reverse=True)}

{'GPE': 74,
 'DATE': 50,
 'ORG': 49,
 'NORP': 46,
 'CARDINAL': 30,
 'PERSON': 20,
 'LOC': 3,
 'WORK_OF_ART': 1,
 'LANGUAGE': 1,
 'ORDINAL': 1,
 'FAC': 1,
 'QUANTITY': 1,
 'MONEY': 1}

In [12]:
label_ent_dict

{'CARDINAL': {'1,00,000',
  '10,000',
  '21',
  '23',
  '47',
  '68-34',
  '8,500',
  'Thousands',
  'Two',
  'at least 16',
  'at least 34',
  'at least five',
  'at least nine',
  'at least one',
  'four',
  'half',
  'more than 1,300',
  'more than 200',
  'nine',
  'one',
  'six',
  'three'},
 'GPE': {'Afghanistan',
  'Alexandria',
  'Anbar',
  'Baghdad',
  'Bali',
  'Bedford',
  'Brighton',
  'Britain',
  'Darfur',
  'Halliburton',
  'Hyde Park',
  'Indonesia',
  'Iran',
  'Iraq',
  'Islamabad',
  'Khartoum',
  'Labado',
  'Libya',
  'London',
  'Madrid',
  'Mogadishu',
  'Mosul',
  'Nigeria',
  'North Korea',
  'Pakistan',
  'Paris',
  'Rome',
  'Somalia',
  'South Darfur',
  'South Waziristan',
  'Sudan',
  'Sydney',
  'Tehran',
  'U.S.',
  'Vienna',
  'Washington',
  'the United States'},
 'NORP': {'AU',
  'American',
  'Australian',
  'British',
  'Byzantine',
  'Democrats',
  'Egyptian',
  'European',
  'French',
  'German',
  'Germans',
  'Hardline',
  'Indonesian',
  'Irani

# Display Entities

In [13]:
# Display entities
text = df100['Sentence'].iloc[0]
doc = nlp(text)
displacy.render(doc, style="ent")

In [14]:
# Get relations between labels
label_list = []
for i in range(len(df100)):
    entlist = df100['entities'].iloc[i]
    label_ll = []
    for ent in entlist:
        nent, label = ent
        label_ll.append(label)
    label_list.append(label_ll)

In [15]:
label_list

[['CARDINAL', 'GPE', 'GPE', 'NORP'],
 ['WORK_OF_ART'],
 ['ORG', 'GPE'],
 ['CARDINAL', 'CARDINAL'],
 ['DATE', 'GPE', 'ORG', 'LANGUAGE', 'GPE'],
 ['GPE', 'GPE', 'CARDINAL', 'NORP'],
 ['GPE', 'DATE', 'GPE', 'GPE', 'GPE'],
 ['ORG', 'ORDINAL', 'GPE', 'DATE', 'GPE'],
 ['GPE', 'DATE', 'FAC'],
 ['NORP', 'DATE', 'ORG'],
 [],
 ['ORG', 'GPE', 'GPE', 'ORG', 'GPE', 'ORG'],
 ['GPE', 'PERSON', 'DATE', 'NORP', 'GPE', 'NORP'],
 ['CARDINAL', 'NORP', 'CARDINAL', 'NORP', 'GPE', 'LOC'],
 ['NORP', 'PERSON', 'PERSON', 'DATE', 'ORG', 'PERSON', 'ORG'],
 ['NORP', 'ORG'],
 ['LOC'],
 ['GPE', 'QUANTITY', 'DATE'],
 ['NORP', 'GPE', 'PERSON'],
 ['GPE'],
 ['CARDINAL', 'GPE', 'DATE'],
 ['CARDINAL', 'NORP'],
 ['NORP', 'GPE', 'ORG', 'GPE'],
 ['NORP', 'NORP', 'DATE', 'CARDINAL', 'CARDINAL'],
 ['GPE', 'NORP'],
 ['ORG', 'GPE', 'GPE', 'GPE'],
 ['GPE', 'GPE', 'NORP'],
 ['GPE', 'CARDINAL', 'NORP', 'GPE', 'DATE'],
 ['NORP', 'CARDINAL', 'ORG', 'DATE'],
 ['DATE', 'GPE'],
 ['ORG', 'ORG'],
 ['ORG', 'CARDINAL'],
 ['DATE', 'CARDINAL'

In [16]:
from itertools import combinations 

def item_pairs(inp_list):
    return list(combinations(sorted(inp_list), 2))    

A = ['cat', 'baby', 'apple']
item_pairs(A)

[('apple', 'baby'), ('apple', 'cat'), ('baby', 'cat')]

In [17]:
label_pairs= {}
for ll in label_list:
    pairs = item_pairs(ll)
    for pp in pairs:
        if pp in label_pairs.keys():
            label_pairs[pp] += 1
        else:
            label_pairs[pp] = 1

{w:label_pairs[w] for w in sorted(label_pairs, key = label_pairs.get, reverse=True)}

{('GPE', 'NORP'): 45,
 ('GPE', 'GPE'): 37,
 ('DATE', 'GPE'): 34,
 ('DATE', 'ORG'): 31,
 ('GPE', 'ORG'): 30,
 ('DATE', 'NORP'): 28,
 ('CARDINAL', 'NORP'): 21,
 ('CARDINAL', 'DATE'): 16,
 ('CARDINAL', 'GPE'): 15,
 ('ORG', 'ORG'): 15,
 ('DATE', 'PERSON'): 15,
 ('GPE', 'PERSON'): 14,
 ('NORP', 'PERSON'): 12,
 ('NORP', 'ORG'): 11,
 ('ORG', 'PERSON'): 11,
 ('NORP', 'NORP'): 9,
 ('DATE', 'DATE'): 8,
 ('CARDINAL', 'ORG'): 7,
 ('CARDINAL', 'CARDINAL'): 6,
 ('LOC', 'NORP'): 4,
 ('PERSON', 'PERSON'): 3,
 ('GPE', 'MONEY'): 3,
 ('CARDINAL', 'PERSON'): 3,
 ('GPE', 'LANGUAGE'): 2,
 ('GPE', 'ORDINAL'): 2,
 ('CARDINAL', 'LOC'): 2,
 ('GPE', 'LOC'): 2,
 ('DATE', 'LANGUAGE'): 1,
 ('LANGUAGE', 'ORG'): 1,
 ('DATE', 'ORDINAL'): 1,
 ('ORDINAL', 'ORG'): 1,
 ('DATE', 'FAC'): 1,
 ('FAC', 'GPE'): 1,
 ('DATE', 'QUANTITY'): 1,
 ('GPE', 'QUANTITY'): 1,
 ('DATE', 'MONEY'): 1,
 ('MONEY', 'ORG'): 1}

In [18]:
# Get relations between entities
entity_list = []
for i in range(len(df100)):
    entlist = df100['entities'].iloc[i]
    ent_ll = []
    for ent in entlist:
        nent, label = ent
        if label not in ['CARDINAL', 'DATE']:
            ent_ll.append(nent)
    entity_list.append(ent_ll)

entity_list

[['London', 'Iraq', 'British'],
 ['Stop the Bombings'],
 ['the Houses of Parliament', 'Hyde Park'],
 [],
 ['Britain', 'Labor Party', 'English', 'Brighton'],
 ['Britain', 'Iraq', 'British'],
 ['London', 'Rome', 'Paris', 'Madrid'],
 ['The International Atomic Energy Agency', 'second', 'Vienna', 'Iran'],
 ['Iran', 'Isfahan'],
 ['Iranian', 'IAEA'],
 [],
 ['The European Union',
  'U.S.',
  'Iran',
  'the U.N. Security Council',
  'Tehran',
  'the Nuclear Non-Proliferation'],
 ['Iran', 'Mahmoud Ahmadinejad', 'European', 'Iran', 'Iranian'],
 ['Germans', 'Nigerian', 'Nigeria', 'Delta'],
 ['German',
  'Bilfinger Berger',
  'Thomas Horbach',
  'Delta State',
  'Bayelsa State',
  'Royal-Dutch Shell'],
 ['German', 'Shell'],
 ['the Niger Delta'],
 ['Nigeria', '2.3 million barrels'],
 ['Suspected Islamist', 'Somalia', 'Abdullahi Yusuf Ahmad'],
 ['Mogadishu'],
 ['Mogadishu'],
 ['Somali'],
 ['Iraqi', 'Mosul', 'al Qaida', 'Iraq'],
 ['Sunni Arab', 'Kurdish'],
 ['U.S.', 'American'],
 ['al Qaida', 'Iraq',

In [19]:
# Create a dictionary 
entity_pairs= {}
for ents in entity_list:
    pairs = item_pairs(ents)
    for pp in pairs:
        if pp in entity_pairs.keys():
            entity_pairs[pp] += 1
        else:
            entity_pairs[pp] = 1

{w:entity_pairs[w] for w in sorted(entity_pairs, key = entity_pairs.get, reverse=True)}

{('Bali', 'Indonesian'): 4,
 ('Afghanistan', 'Taleban'): 3,
 ('British', 'Iraq'): 2,
 ('European', 'Iran'): 2,
 ('Iran', 'Iranian'): 2,
 ('Iran', 'Mahmoud Ahmadinejad'): 2,
 ('Iraq', 'al Qaida'): 2,
 ('American', 'U.S.'): 2,
 ('Taleban', 'al-Qaida'): 2,
 ('AU', 'Sudanese'): 2,
 ('Bali', 'Java'): 2,
 ('British', 'London'): 1,
 ('Iraq', 'London'): 1,
 ('Hyde Park', 'the Houses of Parliament'): 1,
 ('Brighton', 'Britain'): 1,
 ('Brighton', 'English'): 1,
 ('Brighton', 'Labor Party'): 1,
 ('Britain', 'English'): 1,
 ('Britain', 'Labor Party'): 1,
 ('English', 'Labor Party'): 1,
 ('Britain', 'British'): 1,
 ('Britain', 'Iraq'): 1,
 ('London', 'Madrid'): 1,
 ('London', 'Paris'): 1,
 ('London', 'Rome'): 1,
 ('Madrid', 'Paris'): 1,
 ('Madrid', 'Rome'): 1,
 ('Paris', 'Rome'): 1,
 ('Iran', 'The International Atomic Energy Agency'): 1,
 ('Iran', 'Vienna'): 1,
 ('Iran', 'second'): 1,
 ('The International Atomic Energy Agency', 'Vienna'): 1,
 ('The International Atomic Energy Agency', 'second'): 1,

In [20]:
#net = Network()
net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")

for key, vals in entity_pairs.items():
    a, b = key
    net.add_node(a)
    net.add_node(b)
    net.add_edge(a, b, weight=vals)

net.show('graph.html', notebook=False)

graph.html


In [21]:
#net = Network()
net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")

net.add_node('Cap')
net.add_node('Cap')
net.add_node('Tap')
net.add_edge('Cap', 'Tap', weight=vals)

net.show('graphz.html', notebook=False)

graphz.html


# Draw insights about specific entities

In [22]:
# Q1: Do the news articles report positively about London? 

In [23]:
# TextBlob
from textblob import TextBlob

In [24]:
df100['sentiment'] = df100['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df100['sentiment'] = df100['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [25]:
df100.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,entities,sentiment
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...","[(Thousands, CARDINAL), (London, GPE), (Iraq, ...",0.0
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(Stop the Bombings, WORK_OF_ART)]",-0.1
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(the Houses of Parliament, ORG), (Hyde Park, ...",0.0
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(10,000, CARDINAL), (1,00,000, CARDINAL)]",0.0
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(annual, DATE), (Britain, GPE), (Labor Party,...",0.0


In [26]:
dff = df100[df100['Sentence'].str.contains('Afghanistan')]

In [27]:
dff

Unnamed: 0,Sentence #,Sentence,POS,Tag,entities,sentiment
39,Sentence: 40,"The opposition has denounced the measure , com...","['DT', 'NN', 'VBZ', 'VBN', 'DT', 'NN', ',', 'V...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[(Taleban, NORP), (Afghanistan, GPE)]",0.0
45,Sentence: 46,Pakistani officials say unidentified gunmen ha...,"['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...","[(Pakistani, NORP), (three, CARDINAL), (Afghan...",-0.1
50,Sentence: 51,The area became a refuge for many al-Qaida and...,"['DT', 'NN', 'VBD', 'DT', 'NN', 'IN', 'JJ', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', '...","[(al-Qaida, ORG), (Taleban, NORP), (Taleban, N...",0.5


In [28]:
dff['sentiment'].describe()

count    3.000000
mean     0.133333
std      0.321455
min     -0.100000
25%     -0.050000
50%      0.000000
75%      0.250000
max      0.500000
Name: sentiment, dtype: float64

In [29]:
dff = df100[df100['Sentence'].str.contains('Mogadishu')]
dff['sentiment'].describe()

count    5.000000
mean    -0.028333
std      0.094942
min     -0.141667
25%     -0.100000
50%      0.000000
75%      0.000000
max      0.100000
Name: sentiment, dtype: float64

In [30]:
dff = df100[df100['Sentence'].str.contains('London')]
dff['sentiment'].describe()

count    2.000000
mean    -0.062500
std      0.088388
min     -0.125000
25%     -0.093750
50%     -0.062500
75%     -0.031250
max      0.000000
Name: sentiment, dtype: float64