In [1]:
import pandas as pd
import spacy
from spacy import displacy
import matplotlib.pyplot as plt
from pyvis.network import Network
import re
import spacy
from transformers import pipeline

In [2]:
# pip install ipywidgets

In [3]:
# Load spaCy's pre-trained model for NER
nlp = spacy.load("en_core_web_sm")

# Alternatively, you can use Hugging Face's transformers pipeline for NER
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english")

# Load the small pre-trained NER model
#ner_pipeline = pipeline("ner", model="distilbert-base-cased", tokenizer="distilbert-base-cased")

# Load a SMALL model fine-tuned for NER
#ner_pipeline = pipeline("ner", model="distilbert-base-cased-finetuned-conll03-english, aggregation_strategy="simple"  # groups subword tokens)

# Load the small pre-trained NER model
#ner_pipeline = pipeline("ner", model="Minibase/NER-Small", tokenizer="Minibase/NER-Small")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Sample text for NER
text = "Elon Musk is the CEO of SpaceX. He was born in South Africa in 1971."

# Run NER using the Hugging Face pipeline
ner_results = ner_pipeline(text)

# Print the recognized named entities
print("NER Results:")
for result in ner_results:
    print(f"{result['word']} - {result['entity']}")

NER Results:
El - I-PER
##on - I-PER
Mu - I-PER
##sk - I-PER
Space - I-ORG
##X - I-ORG
South - I-LOC
Africa - I-LOC


In [5]:
# Sample text
text = "Barack Obama was born in Hawaii. He was elected president in 2008."

# Using spaCy for NER
print("spaCy NER Results:")
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")

# Using Hugging Face NER pipeline
print("\nHugging Face NER Results:")
ner_results = ner_pipeline(text)
for result in ner_results:
    print(f"{result['word']} - {result['entity']}")

spaCy NER Results:
Barack Obama - PERSON
Hawaii - GPE
2008 - DATE

Hugging Face NER Results:
Barack - I-PER
Obama - I-PER
Hawaii - I-LOC


# 01 - Load Data

In [6]:
# data source: Kaggle - https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus
df = pd.read_csv("NewsText.csv")
df.head()

Unnamed: 0,NewsText
0,Silver rate today: After climbing to a record ...
1,
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...
4,


In [7]:
def clean_text(inptext):
    text = str(inptext)
    # replace new line
    text = re.sub(r'\n', ' ', text) 
    # replace special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return (text)

In [8]:
df['Article'] = df['NewsText'].apply(lambda x: clean_text(x))

In [9]:
df.head()

Unnamed: 0,NewsText,Article
0,Silver rate today: After climbing to a record ...,Silver rate today After climbing to a record h...
1,,
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...,Photo ET Now Digital Venezuela Crisis Trader...
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...,XRPs Ambitions Beyond Crypto Speculation The ...
4,,


In [10]:
# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities from a text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [11]:
# Apply NER on the text column
df['entities'] = df['Article'].apply(extract_entities)
df['LLM_entities'] = df['Article'].apply(ner_pipeline)

In [12]:
df.head()

Unnamed: 0,NewsText,Article,entities,LLM_entities
0,Silver rate today: After climbing to a record ...,Silver rate today After climbing to a record h...,"[(today, DATE), (82670, CARDINAL), (Monday las...","[{'entity': 'I-ORG', 'score': 0.6305133, 'inde..."
1,,,"[(nan, PERSON)]",[]
2,Photo : ET Now Digital\n\nVenezuela Crisis: Tr...,Photo ET Now Digital Venezuela Crisis Trader...,"[(Digital Venezuela Crisis Traders, ORG), (US...","[{'entity': 'I-MISC', 'score': 0.98602885, 'in..."
3,XRP’s Ambitions: Beyond Crypto Speculation\n\n...,XRPs Ambitions Beyond Crypto Speculation The ...,"[(The Metal Standard A Flawed, ORG), (Telling ...","[{'entity': 'I-MISC', 'score': 0.7757474, 'ind..."
4,,,"[(nan, PERSON)]",[]


In [13]:
# Save the results to a new CSV
df.to_csv("news_data_with_entities_2.csv", index=False)

print("NER extraction completed. Results saved to 'data_with_entities.csv'.")

NER extraction completed. Results saved to 'data_with_entities.csv'.
