In [1]:
import pandas as pd
import spacy
from spacy import displacy
import matplotlib.pyplot as plt
from pyvis.network import Network
import re

# 01 - Load Data

In [2]:
# data source: Kaggle - https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus
df = pd.read_csv("NewsText.csv")
df.head()

Unnamed: 0,NewsText
0,"As 2025 draws to a close, the global mining se..."
1,Silver rate today LIVE: Silver bulls returned ...
2,Mining shares linked to precious metals are su...
3,Silver recovered some losses after its biggest...
4,This news item displays a headline only and ha...


In [3]:
def clean_text(inptext):
    text = str(inptext)
    # replace new line
    text = re.sub(r'\n', ' ', text) 
    # replace special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return (text)

In [4]:
df['Article'] = df['NewsText'].apply(lambda x: clean_text(x))

In [5]:
df.head()

Unnamed: 0,NewsText,Article
0,"As 2025 draws to a close, the global mining se...",As 2025 draws to a close the global mining sec...
1,Silver rate today LIVE: Silver bulls returned ...,Silver rate today LIVE Silver bulls returned t...
2,Mining shares linked to precious metals are su...,Mining shares linked to precious metals are su...
3,Silver recovered some losses after its biggest...,Silver recovered some losses after its biggest...
4,This news item displays a headline only and ha...,This news item displays a headline only and ha...


In [6]:
# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities from a text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [7]:
# Apply NER on the text column
df['entities'] = df['Article'].apply(extract_entities)

# Save the results to a new CSV
df.to_csv("news_data_with_entities.csv", index=False)

print("NER extraction completed. Results saved to 'data_with_entities.csv'.")

NER extraction completed. Results saved to 'data_with_entities.csv'.


# Analysis

### Look at most common types of entities

In [8]:
ent_dict = {}
label_dict = {}
label_ent_dict = {}
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    for ent in entlist:
        if ent in ent_dict.keys():
            ent_dict[ent] += 1
        else:
            ent_dict[ent] = 1
        nent, label = ent
        if label in label_dict.keys():
            label_dict[label] += 1
        else:
            label_dict[label] = 1
        if label in label_ent_dict.keys():
            label_ent_dict[label].append(nent)
        else:
            label_ent_dict[label] = [nent]

for k in label_ent_dict.keys():
    label_ent_dict[k] = set(label_ent_dict[k])

In [9]:
# One sample entry
df['entities'].iloc[0]

[('2025', 'DATE'), ('Driven', 'ORG'), ('the last two years', 'DATE')]

In [10]:
# Count word-label frequencies
{w:ent_dict[w] for w in sorted(ent_dict, key=ent_dict.get, reverse=True)}

{('Tuesday', 'DATE'): 5,
 ('2025', 'DATE'): 4,
 ('US', 'GPE'): 4,
 ('Monday', 'DATE'): 3,
 ('Tuesdays', 'DATE'): 2,
 ('oneday', 'DATE'): 2,
 ('Reuters', 'ORG'): 2,
 ('Mondays', 'DATE'): 2,
 ('this year', 'DATE'): 2,
 ('Driven', 'ORG'): 1,
 ('the last two years', 'DATE'): 1,
 ('today', 'DATE'): 1,
 ('Silvers', 'PRODUCT'): 1,
 ('five years', 'DATE'): 1,
 ('155', 'CARDINAL'): 1,
 ('MCX', 'ORG'): 1,
 ('236000', 'CARDINAL'): 1,
 ('11570', 'DATE'): 1,
 ('516', 'CARDINAL'): 1,
 ('236907', 'DATE'): 1,
 ('231100', 'CARDINAL'): 1,
 ('1140 am', 'TIME'): 1,
 ('254174', 'DATE'): 1,
 ('37', 'CARDINAL'): 1,
 ('7485', 'CARDINAL'): 1,
 ('8362', 'DATE'): 1,
 ('Jigar Trivedi Senior Research', 'ORG'): 1,
 ('Reliance Securities', 'ORG'): 1,
 ('the final days of', 'DATE'): 1,
 ('the new year', 'DATE'): 1,
 ('Turnaround', 'WORK_OF_ART'): 1,
 ('Monday December 29 2025', 'DATE'): 1,
 ('more than five years', 'DATE'): 1,
 ('monthly', 'DATE'): 1,
 ('about a third', 'CARDINAL'): 1,
 ('74', 'CARDINAL'): 1,
 ('9', 

In [11]:
# Count label frequencies
{w:label_dict[w] for w in sorted(label_dict, key=label_dict.get, reverse=True)}

{'DATE': 43,
 'CARDINAL': 15,
 'ORG': 10,
 'GPE': 7,
 'PERSON': 4,
 'TIME': 3,
 'PRODUCT': 1,
 'WORK_OF_ART': 1,
 'FAC': 1,
 'NORP': 1,
 'PERCENT': 1}

In [12]:
label_ent_dict

{'DATE': {'11570',
  '2025',
  '2026',
  '236907',
  '254174',
  '8362',
  'January',
  'Monday',
  'Monday December 29 2025',
  'Mondays',
  'Tuesday',
  'Tuesdays',
  'a strong year',
  'five years',
  'just a month',
  'late January early February',
  'monthly',
  'months',
  'more than five years',
  'next year  ',
  'next year  Gold',
  'oneday',
  'singleday',
  'the final days of',
  'the last two years',
  'the new year',
  'this Tuesday December 30',
  'this year',
  'today',
  'two months'},
 'ORG': {'Comex',
  'Driven',
  'Jigar Trivedi Senior Research',
  'MCX',
  'Pepperstone Group Ltd',
  'Reliance Securities',
  'Reuters',
  'XTB',
  'the Federal Reserve'},
 'PRODUCT': {'Silvers'},
 'CARDINAL': {'0000',
  '100',
  '155',
  '231100',
  '236000',
  '37',
  '40',
  '516',
  '74',
  '7447',
  '7485',
  '9',
  'about a third',
  'more than 2',
  'safehaven'},
 'TIME': {'1140 am', '24 hours', 'the minutes later in the day'},
 'WORK_OF_ART': {'Turnaround'},
 'PERSON': {'Barrick

# Display Entities

In [14]:
# Display entities
text = df['Article'].iloc[1]
doc = nlp(text)
displacy.render(doc, style="ent")

In [15]:
# Get relations between labels
label_list = []
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    label_ll = []
    for ent in entlist:
        nent, label = ent
        label_ll.append(label)
    label_list.append(label_ll)

In [16]:
label_list

[['DATE', 'ORG', 'DATE'],
 ['DATE',
  'DATE',
  'PRODUCT',
  'DATE',
  'DATE',
  'ORG',
  'CARDINAL',
  'ORG',
  'CARDINAL',
  'DATE',
  'CARDINAL',
  'DATE',
  'CARDINAL',
  'TIME',
  'DATE',
  'CARDINAL',
  'CARDINAL',
  'DATE',
  'ORG',
  'ORG',
  'ORG'],
 ['DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'WORK_OF_ART',
  'DATE'],
 ['DATE',
  'DATE',
  'DATE',
  'CARDINAL',
  'CARDINAL',
  'DATE',
  'CARDINAL',
  'DATE',
  'PERSON',
  'DATE',
  'PERSON',
  'ORG',
  'ORG',
  'DATE'],
 [],
 ['CARDINAL', 'DATE', 'CARDINAL', 'DATE', 'GPE', 'DATE', 'GPE'],
 ['GPE',
  'GPE',
  'DATE',
  'TIME',
  'DATE',
  'DATE',
  'ORG',
  'CARDINAL',
  'FAC',
  'PERSON',
  'CARDINAL',
  'DATE',
  'DATE'],
 ['CARDINAL', 'DATE'],
 ['NORP',
  'DATE',
  'GPE',
  'DATE',
  'TIME',
  'GPE',
  'DATE',
  'DATE',
  'PERSON',
  'ORG',
  'DATE',
  'GPE',
  'DATE',
  'DATE',
  'PERCENT',
  'CARDINAL',
  'DATE']]

In [17]:
from itertools import combinations 

def item_pairs(inp_list):
    return list(combinations(sorted(inp_list), 2))    

A = ['cat', 'baby', 'apple']
item_pairs(A)

[('apple', 'baby'), ('apple', 'cat'), ('baby', 'cat')]

In [18]:
label_pairs= {}
for ll in label_list:
    pairs = item_pairs(ll)
    for pp in pairs:
        if pp in label_pairs.keys():
            label_pairs[pp] += 1
        else:
            label_pairs[pp] = 1

{w:label_pairs[w] for w in sorted(label_pairs, key = label_pairs.get, reverse=True)}

{('DATE', 'DATE'): 127,
 ('CARDINAL', 'DATE'): 94,
 ('DATE', 'ORG'): 69,
 ('DATE', 'GPE'): 40,
 ('CARDINAL', 'ORG'): 39,
 ('DATE', 'PERSON'): 27,
 ('DATE', 'TIME'): 21,
 ('CARDINAL', 'CARDINAL'): 20,
 ('ORG', 'ORG'): 11,
 ('CARDINAL', 'GPE'): 11,
 ('CARDINAL', 'TIME'): 9,
 ('DATE', 'WORK_OF_ART'): 9,
 ('CARDINAL', 'PERSON'): 9,
 ('DATE', 'PRODUCT'): 8,
 ('DATE', 'NORP'): 8,
 ('DATE', 'PERCENT'): 8,
 ('ORG', 'TIME'): 7,
 ('CARDINAL', 'PRODUCT'): 6,
 ('ORG', 'PERSON'): 6,
 ('ORG', 'PRODUCT'): 5,
 ('GPE', 'GPE'): 5,
 ('DATE', 'FAC'): 5,
 ('GPE', 'ORG'): 5,
 ('GPE', 'PERSON'): 5,
 ('GPE', 'TIME'): 5,
 ('GPE', 'NORP'): 3,
 ('GPE', 'PERCENT'): 3,
 ('CARDINAL', 'FAC'): 2,
 ('FAC', 'GPE'): 2,
 ('PERSON', 'TIME'): 2,
 ('PRODUCT', 'TIME'): 1,
 ('PERSON', 'PERSON'): 1,
 ('FAC', 'ORG'): 1,
 ('FAC', 'PERSON'): 1,
 ('FAC', 'TIME'): 1,
 ('CARDINAL', 'NORP'): 1,
 ('CARDINAL', 'PERCENT'): 1,
 ('NORP', 'ORG'): 1,
 ('NORP', 'PERCENT'): 1,
 ('NORP', 'PERSON'): 1,
 ('NORP', 'TIME'): 1,
 ('ORG', 'PERCENT'):

In [19]:
# Get relations between entities
entity_list = []
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    ent_ll = []
    for ent in entlist:
        nent, label = ent
        if label not in ['CARDINAL', 'ORDINAL', 'DATE']:
            ent_ll.append(nent)
    entity_list.append(ent_ll)

entity_list

[['Driven'],
 ['Silvers',
  'Reuters',
  'MCX',
  '1140 am',
  'Reuters',
  'Jigar Trivedi Senior Research',
  'Reliance Securities'],
 ['Turnaround'],
 ['Tighter', 'Dilin Wu', 'Pepperstone Group Ltd', 'Comex'],
 [],
 ['China', 'US'],
 ['NEW YORK',
  'US',
  '24 hours',
  'the Federal Reserve',
  'Newmont NYSE NEM',
  'Barrick Gold NYSE GOLD'],
 [],
 ['European',
  'Tokyo',
  'the minutes later in the day',
  'US',
  'Kathleen Brooks',
  'XTB',
  'US',
  'more than three percent']]

In [20]:
# Create a dictionary 
entity_pairs= {}
for ents in entity_list:
    pairs = item_pairs(ents)
    for pp in pairs:
        if pp in entity_pairs.keys():
            entity_pairs[pp] += 1
        else:
            entity_pairs[pp] = 1

{w:entity_pairs[w] for w in sorted(entity_pairs, key = entity_pairs.get, reverse=True)}

{('1140 am', 'Reuters'): 2,
 ('Jigar Trivedi Senior Research', 'Reuters'): 2,
 ('MCX', 'Reuters'): 2,
 ('Reliance Securities', 'Reuters'): 2,
 ('Reuters', 'Silvers'): 2,
 ('European', 'US'): 2,
 ('Kathleen Brooks', 'US'): 2,
 ('Tokyo', 'US'): 2,
 ('US', 'XTB'): 2,
 ('US', 'more than three percent'): 2,
 ('US', 'the minutes later in the day'): 2,
 ('1140 am', 'Jigar Trivedi Senior Research'): 1,
 ('1140 am', 'MCX'): 1,
 ('1140 am', 'Reliance Securities'): 1,
 ('1140 am', 'Silvers'): 1,
 ('Jigar Trivedi Senior Research', 'MCX'): 1,
 ('Jigar Trivedi Senior Research', 'Reliance Securities'): 1,
 ('Jigar Trivedi Senior Research', 'Silvers'): 1,
 ('MCX', 'Reliance Securities'): 1,
 ('MCX', 'Silvers'): 1,
 ('Reliance Securities', 'Silvers'): 1,
 ('Reuters', 'Reuters'): 1,
 ('Comex', 'Dilin Wu'): 1,
 ('Comex', 'Pepperstone Group Ltd'): 1,
 ('Comex', 'Tighter'): 1,
 ('Dilin Wu', 'Pepperstone Group Ltd'): 1,
 ('Dilin Wu', 'Tighter'): 1,
 ('Pepperstone Group Ltd', 'Tighter'): 1,
 ('China', 'US'):

In [21]:
#net = Network()
net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")

for key, vals in entity_pairs.items():
    a, b = key
    net.add_node(a)
    net.add_node(b)
    net.add_edge(a, b, weight=vals)

net.show('news_graph.html', notebook=False)

news_graph.html
