In [1]:
import pandas as pd
import spacy
from spacy import displacy
import matplotlib.pyplot as plt
from pyvis.network import Network
import re
import spacy
from transformers import pipeline

In [2]:
# pip install ipywidgets

In [3]:
# Load spaCy's pre-trained model for NER
nlp = spacy.load("en_core_web_sm")

# Alternatively, you can use Hugging Face's transformers pipeline for NER
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english")

# Load the small pre-trained NER model
#ner_pipeline = pipeline("ner", model="distilbert-base-cased", tokenizer="distilbert-base-cased")

# Load a SMALL model fine-tuned for NER
#ner_pipeline = pipeline("ner", model="distilbert-base-cased-finetuned-conll03-english, aggregation_strategy="simple"  # groups subword tokens)

# Load the small pre-trained NER model
#ner_pipeline = pipeline("ner", model="Minibase/NER-Small", tokenizer="Minibase/NER-Small")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Sample text for NER
text = "Elon Musk is the CEO of SpaceX. He was born in South Africa in 1971."

# Run NER using the Hugging Face pipeline
ner_results = ner_pipeline(text)

# Print the recognized named entities
print("NER Results:")
for result in ner_results:
    print(f"{result['word']} - {result['entity']}")

NER Results:
El - I-PER
##on - I-PER
Mu - I-PER
##sk - I-PER
Space - I-ORG
##X - I-ORG
South - I-LOC
Africa - I-LOC


In [5]:
# Sample text
text = "Barack Obama was born in Hawaii. He was elected president in 2008."

# Using spaCy for NER
print("spaCy NER Results:")
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")

# Using Hugging Face NER pipeline
print("\nHugging Face NER Results:")
ner_results = ner_pipeline(text)
for result in ner_results:
    print(f"{result['word']} - {result['entity']}")

spaCy NER Results:
Barack Obama - PERSON
Hawaii - GPE
2008 - DATE

Hugging Face NER Results:
Barack - I-PER
Obama - I-PER
Hawaii - I-LOC


# 01 - Load Data

In [6]:
# data source: Kaggle - https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus
df = pd.read_csv("NewsText.csv")
df.head()

Unnamed: 0,NewsText
0,Silver rate today: After climbing to a record ...
1,
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...
4,


In [7]:
def clean_text(inptext):
    text = str(inptext)
    # replace new line
    text = re.sub(r'\n', ' ', text) 
    # replace special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return (text)

In [8]:
df['Article'] = df['NewsText'].apply(lambda x: clean_text(x))

In [9]:
df.head()

Unnamed: 0,NewsText,Article
0,Silver rate today: After climbing to a record ...,Silver rate today After climbing to a record h...
1,,
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...,Photo ET Now Digital Venezuela Crisis Trader...
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...,XRPs Ambitions Beyond Crypto Speculation The ...
4,,


In [10]:
# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities from a text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [11]:
# Apply NER on the text column
df['entities'] = df['Article'].apply(extract_entities)
df['LLM_entities'] = df['Article'].apply(ner_pipeline)

In [12]:
df.head()

Unnamed: 0,NewsText,Article,entities,LLM_entities
0,Silver rate today: After climbing to a record ...,Silver rate today After climbing to a record h...,"[(today, DATE), (82670, CARDINAL), (Monday las...","[{'entity': 'I-ORG', 'score': 0.6305133, 'inde..."
1,,,"[(nan, PERSON)]",[]
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...,Photo ET Now Digital Venezuela Crisis Trader...,"[(Digital Venezuela Crisis Traders, ORG), (US...","[{'entity': 'I-MISC', 'score': 0.98602885, 'in..."
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...,XRPs Ambitions Beyond Crypto Speculation The ...,"[(The Metal Standard A Flawed, ORG), (Telling ...","[{'entity': 'I-MISC', 'score': 0.7757474, 'ind..."
4,,,"[(nan, PERSON)]",[]


In [13]:
# Save the results to a new CSV
df.to_csv("news_data_with_entities_2.csv", index=False)

print("NER extraction completed. Results saved to 'data_with_entities.csv'.")

NER extraction completed. Results saved to 'data_with_entities.csv'.


# Analysis

### Look at most common types of entities

In [14]:
ent_dict = {}
label_dict = {}
label_ent_dict = {}
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    for ent in entlist:
        if ent in ent_dict.keys():
            ent_dict[ent] += 1
        else:
            ent_dict[ent] = 1
        nent, label = ent
        if label in label_dict.keys():
            label_dict[label] += 1
        else:
            label_dict[label] = 1
        if label in label_ent_dict.keys():
            label_ent_dict[label].append(nent)
        else:
            label_ent_dict[label] = [nent]

for k in label_ent_dict.keys():
    label_ent_dict[k] = set(label_ent_dict[k])

In [15]:
# One sample entry
df['entities'].iloc[0]

[('today', 'DATE'),
 ('82670', 'CARDINAL'),
 ('Monday last week', 'DATE'),
 ('COMEX', 'ORG'),
 ('71300', 'DATE'),
 ('1137', 'DATE'),
 ('1375', 'DATE'),
 ('Friday', 'DATE'),
 ('around 180', 'CARDINAL'),
 ('2025', 'DATE'),
 ('Samsungs', 'ORG'),
 ('Peru', 'GPE'),
 ('Chad', 'GPE'),
 ('US', 'GPE'),
 ('Venezuela', 'GPE'),
 ('Chinas', 'ORG'),
 ('January 1 2026', 'DATE'),
 ('today', 'DATE')]

In [16]:
# Count word-label frequencies
{w:ent_dict[w] for w in sorted(ent_dict, key=ent_dict.get, reverse=True)}

{('US', 'GPE'): 5,
 ('Venezuela', 'GPE'): 3,
 ('nan', 'PERSON'): 3,
 ('today', 'DATE'): 2,
 ('XRP', 'ORG'): 2,
 ('82670', 'CARDINAL'): 1,
 ('Monday last week', 'DATE'): 1,
 ('COMEX', 'ORG'): 1,
 ('71300', 'DATE'): 1,
 ('1137', 'DATE'): 1,
 ('1375', 'DATE'): 1,
 ('Friday', 'DATE'): 1,
 ('around 180', 'CARDINAL'): 1,
 ('2025', 'DATE'): 1,
 ('Samsungs', 'ORG'): 1,
 ('Peru', 'GPE'): 1,
 ('Chad', 'GPE'): 1,
 ('Chinas', 'ORG'): 1,
 ('January 1 2026', 'DATE'): 1,
 ('Digital  Venezuela Crisis Traders', 'ORG'): 1,
 ('Venezuela        Venezuela Crisis', 'ORG'): 1,
 ('Nicolas Maduro', 'PERSON'): 1,
 ('safehaven', 'CARDINAL'): 1,
 ('The Economic Times', 'ORG'): 1,
 ('The Metal Standard A Flawed', 'ORG'): 1,
 ('Telling Comparison  Forget', 'PERSON'): 1,
 ('the late 70s', 'DATE'): 1,
 ('Influencer Chad Steingraber', 'PERSON'): 1,
 ('10x', 'DATE'): 1}

In [17]:
# Count label frequencies
{w:label_dict[w] for w in sorted(label_dict, key=label_dict.get, reverse=True)}

{'DATE': 11, 'GPE': 10, 'ORG': 9, 'PERSON': 6, 'CARDINAL': 3}

In [18]:
label_ent_dict

{'DATE': {'10x',
  '1137',
  '1375',
  '2025',
  '71300',
  'Friday',
  'January 1 2026',
  'Monday last week',
  'the late 70s',
  'today'},
 'CARDINAL': {'82670', 'around 180', 'safehaven'},
 'ORG': {'COMEX',
  'Chinas',
  'Digital  Venezuela Crisis Traders',
  'Samsungs',
  'The Economic Times',
  'The Metal Standard A Flawed',
  'Venezuela        Venezuela Crisis',
  'XRP'},
 'GPE': {'Chad', 'Peru', 'US', 'Venezuela'},
 'PERSON': {'Influencer Chad Steingraber',
  'Nicolas Maduro',
  'Telling Comparison  Forget',
  'nan'}}

# Display Entities

In [19]:
# Display entities
text = df['Article'].iloc[0]
doc = nlp(text)
displacy.render(doc, style="ent")

In [20]:
# Get relations between labels
label_list = []
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    label_ll = []
    for ent in entlist:
        nent, label = ent
        label_ll.append(label)
    label_list.append(label_ll)

In [21]:
label_list

[['DATE',
  'CARDINAL',
  'DATE',
  'ORG',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'CARDINAL',
  'DATE',
  'ORG',
  'GPE',
  'GPE',
  'GPE',
  'GPE',
  'ORG',
  'DATE',
  'DATE'],
 ['PERSON'],
 ['ORG',
  'GPE',
  'ORG',
  'GPE',
  'GPE',
  'GPE',
  'GPE',
  'PERSON',
  'CARDINAL',
  'ORG',
  'GPE'],
 ['ORG', 'PERSON', 'ORG', 'DATE', 'PERSON', 'ORG', 'DATE'],
 ['PERSON'],
 ['PERSON']]

In [22]:
from itertools import combinations 

def item_pairs(inp_list):
    return list(combinations(sorted(inp_list), 2))    

A = ['cat', 'baby', 'apple']
item_pairs(A)

[('apple', 'baby'), ('apple', 'cat'), ('baby', 'cat')]

In [23]:
label_pairs= {}
for ll in label_list:
    pairs = item_pairs(ll)
    for pp in pairs:
        if pp in label_pairs.keys():
            label_pairs[pp] += 1
        else:
            label_pairs[pp] = 1

{w:label_pairs[w] for w in sorted(label_pairs, key = label_pairs.get, reverse=True)}

{('DATE', 'DATE'): 37,
 ('DATE', 'GPE'): 36,
 ('DATE', 'ORG'): 33,
 ('GPE', 'ORG'): 30,
 ('GPE', 'GPE'): 21,
 ('CARDINAL', 'DATE'): 18,
 ('CARDINAL', 'GPE'): 14,
 ('CARDINAL', 'ORG'): 9,
 ('ORG', 'ORG'): 9,
 ('ORG', 'PERSON'): 9,
 ('GPE', 'PERSON'): 6,
 ('DATE', 'PERSON'): 4,
 ('CARDINAL', 'CARDINAL'): 1,
 ('CARDINAL', 'PERSON'): 1,
 ('PERSON', 'PERSON'): 1}

In [24]:
# Get relations between entities
entity_list = []
for i in range(len(df)):
    entlist = df['entities'].iloc[i]
    ent_ll = []
    for ent in entlist:
        nent, label = ent
        if label not in ['CARDINAL', 'ORDINAL', 'DATE']:
            ent_ll.append(nent)
    entity_list.append(ent_ll)

entity_list

[['COMEX', 'Samsungs', 'Peru', 'Chad', 'US', 'Venezuela', 'Chinas'],
 ['nan'],
 ['Digital  Venezuela Crisis Traders',
  'US',
  'Venezuela        Venezuela Crisis',
  'US',
  'Venezuela',
  'US',
  'Venezuela',
  'Nicolas Maduro',
  'The Economic Times',
  'US'],
 ['The Metal Standard A Flawed',
  'Telling Comparison  Forget',
  'XRP',
  'Influencer Chad Steingraber',
  'XRP'],
 ['nan'],
 ['nan']]

In [25]:
# Create a dictionary 
entity_pairs= {}
for ents in entity_list:
    pairs = item_pairs(ents)
    for pp in pairs:
        if pp in entity_pairs.keys():
            entity_pairs[pp] += 1
        else:
            entity_pairs[pp] = 1

{w:entity_pairs[w] for w in sorted(entity_pairs, key = entity_pairs.get, reverse=True)}

{('US', 'Venezuela'): 9,
 ('US', 'US'): 6,
 ('Digital  Venezuela Crisis Traders', 'US'): 4,
 ('Nicolas Maduro', 'US'): 4,
 ('The Economic Times', 'US'): 4,
 ('US', 'Venezuela        Venezuela Crisis'): 4,
 ('Digital  Venezuela Crisis Traders', 'Venezuela'): 2,
 ('Nicolas Maduro', 'Venezuela'): 2,
 ('The Economic Times', 'Venezuela'): 2,
 ('Venezuela', 'Venezuela        Venezuela Crisis'): 2,
 ('Influencer Chad Steingraber', 'XRP'): 2,
 ('Telling Comparison  Forget', 'XRP'): 2,
 ('The Metal Standard A Flawed', 'XRP'): 2,
 ('COMEX', 'Chad'): 1,
 ('COMEX', 'Chinas'): 1,
 ('COMEX', 'Peru'): 1,
 ('COMEX', 'Samsungs'): 1,
 ('COMEX', 'US'): 1,
 ('COMEX', 'Venezuela'): 1,
 ('Chad', 'Chinas'): 1,
 ('Chad', 'Peru'): 1,
 ('Chad', 'Samsungs'): 1,
 ('Chad', 'US'): 1,
 ('Chad', 'Venezuela'): 1,
 ('Chinas', 'Peru'): 1,
 ('Chinas', 'Samsungs'): 1,
 ('Chinas', 'US'): 1,
 ('Chinas', 'Venezuela'): 1,
 ('Peru', 'Samsungs'): 1,
 ('Peru', 'US'): 1,
 ('Peru', 'Venezuela'): 1,
 ('Samsungs', 'US'): 1,
 ('Samsu

In [26]:
#net = Network()
net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")

for key, vals in entity_pairs.items():
    a, b = key
    net.add_node(a)
    net.add_node(b)
    net.add_edge(a, b, weight=vals)

net.show('news_graph.html', notebook=False)

news_graph.html
