In [1]:
import pandas as pd

In [2]:
# Load the finance-post NER dataset; later cells read its `text`,
# `mentioned_person` and `company` columns.
df = pd.read_csv("data/ner_dataset_10k.csv")

# NER (Rule-based for Finance Posts)

## Rule-based NER description

We'll implement simple rules:
- Cashtags: `$AAPL`, `$TSLA` using regex
- Company lookup: case-insensitive string match against a list of company names
- Person lookup: case-insensitive string match against a list of known people



In [3]:
import re

In [4]:
# Lookup tables for the rule-based extractor; extend them to recognise
# more companies or people.

# Ticker symbol -> company name.
companies = {
    "AAPL": "Apple Inc",
    "MSFT": "Microsoft Corporation",
    "GOOG": "Alphabet Inc",
    "AMZN": "Amazon.com, Inc",
    "TSLA": "Tesla, Inc",
    "NVDA": "NVIDIA Corporation",
    "JPM": "JPMorgan Chase & Co.",
    "BAC": "Bank of America Corporation",
    "WMT": "Walmart Inc",
    "META": "Meta Platforms, Inc",
}

# Full names of the people the extractor should recognise.
people = [
    "Tim Cook",
    "Satya Nadella",
    "Sundar Pichai",
    "Elon Musk",
    "Jensen Huang",
    "Jamie Dimon",
    "Warren Buffett",
    "Mark Zuckerberg",
    "Andy Jassy",
    "Brian Moynihan",
]


In [5]:
def extract_entities(text, company_names=None, person_names=None):
    """Extract cashtags, company names and person names from ``text`` with simple rules.

    Parameters
    ----------
    text : str
        Post to scan. Any non-string value yields empty lists.
    company_names : iterable of str, optional
        Company names to look for. Defaults to the values of the
        module-level ``companies`` dict.
    person_names : iterable of str, optional
        Person names to look for. Defaults to the module-level ``people`` list.

    Returns
    -------
    dict
        ``{'CASHTAG': [...], 'COMPANY': [...], 'PERSON': [...]}``. Cashtags are
        listed in order of appearance; companies and people in lookup-table order.
    """
    entities = {'CASHTAG': [], 'COMPANY': [], 'PERSON': []}
    if not isinstance(text, str):
        return entities
    if company_names is None:
        company_names = companies.values()
    if person_names is None:
        person_names = people

    # Cashtag: "$", one uppercase letter, then up to four more uppercase
    # letters/digits. Requiring a leading letter keeps dollar amounts such as
    # "$100" out, and the trailing \b stops a longer token from being truncated.
    entities['CASHTAG'] = re.findall(r"\$[A-Z][A-Z0-9]{0,4}\b", text)

    def _mentioned(name):
        # Case-insensitive whole-phrase match: the name must not be glued to
        # surrounding word characters ("Apple Inc" must not fire on "Pineapple Inc").
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        return re.search(pattern, text, flags=re.IGNORECASE) is not None

    entities['COMPANY'] = [name for name in company_names if _mentioned(name)]
    entities['PERSON'] = [name for name in person_names if _mentioned(name)]
    return entities


In [6]:
# Spot-check the rule-based extractor on ten randomly drawn posts
# (fixed random_state so the same posts are drawn on every run).
text_column = df['text']
sample_texts = text_column.sample(10, random_state=3).tolist()
for sample_text in sample_texts:
    found = extract_entities(sample_text)
    print("TEXT:", sample_text)
    print("EXTRACTED:", found)
    print("---")

TEXT: Satya Nadella of Bank of America Corporation commented on strategy. $BAC reacts to the news. Analysts: hold.
EXTRACTED: {'CASHTAG': ['$BAC'], 'COMPANY': ['Bank of America Corporation'], 'PERSON': ['Satya Nadella']}
---
TEXT: Warren Buffett of Meta Platforms, Inc commented on strategy. $META reacts to the news. Analysts: upgrade.
EXTRACTED: {'CASHTAG': ['$META'], 'COMPANY': ['Meta Platforms, Inc'], 'PERSON': ['Warren Buffett']}
---
TEXT: Mark Zuckerberg of Walmart Inc commented on strategy. $WMT reacts to the news. Analysts: downgrade.
EXTRACTED: {'CASHTAG': ['$WMT'], 'COMPANY': ['Walmart Inc'], 'PERSON': ['Mark Zuckerberg']}
---
TEXT: Satya Nadella of Bank of America Corporation commented on strategy. $BAC reacts to the news. Analysts: upgrade.
EXTRACTED: {'CASHTAG': ['$BAC'], 'COMPANY': ['Bank of America Corporation'], 'PERSON': ['Satya Nadella']}
---
TEXT: Satya Nadella of JPMorgan Chase & Co. commented on strategy. $JPM reacts to the news. Analysts: upgrade.
EXTRACTED: {'CASHT

In [7]:
# Fast demonstration: tag only the first 100 posts, then preview ten of them.
df_sample = df.head(100).assign(
    entities=lambda frame: frame['text'].apply(extract_entities)
)
df_sample[['text', 'entities']].head(10)


Unnamed: 0,text,entities
0,Elon Musk of Bank of America Corporation comme...,"{'CASHTAG': ['$BAC'], 'COMPANY': ['Bank of Ame..."
1,"Jensen Huang of Tesla, Inc commented on strate...","{'CASHTAG': ['$TSLA'], 'COMPANY': ['Tesla, Inc..."
2,Jensen Huang of Apple Inc commented on strateg...,"{'CASHTAG': ['$AAPL'], 'COMPANY': ['Apple Inc'..."
3,Sundar Pichai of NVIDIA Corporation commented ...,"{'CASHTAG': ['$NVDA'], 'COMPANY': ['NVIDIA Cor..."
4,Elon Musk of Alphabet Inc commented on strateg...,"{'CASHTAG': ['$GOOG'], 'COMPANY': ['Alphabet I..."
5,"Sundar Pichai of Meta Platforms, Inc commented...","{'CASHTAG': ['$META'], 'COMPANY': ['Meta Platf..."
6,Warren Buffett of JPMorgan Chase & Co. comment...,"{'CASHTAG': ['$JPM'], 'COMPANY': ['JPMorgan Ch..."
7,Andy Jassy of JPMorgan Chase & Co. commented o...,"{'CASHTAG': ['$JPM'], 'COMPANY': ['JPMorgan Ch..."
8,Andy Jassy of Alphabet Inc commented on strate...,"{'CASHTAG': ['$GOOG'], 'COMPANY': ['Alphabet I..."
9,Brian Moynihan of Alphabet Inc commented on st...,"{'CASHTAG': ['$GOOG'], 'COMPANY': ['Alphabet I..."


# Pre-trained Statistical NER (using spaCy)

In [8]:
! pip install spacy





[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 8.4 MB/s eta 0:00:02
     -------- ------------------------------- 2.6/12.8 MB 6.3 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 5.6 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 3.8 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 3.2 MB/s eta 0:00:03
     ------------ --------------------------- 3.9/12.8 MB 3.1 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 MB 3.5 MB/s eta 0:00:03
     --------------------- ------------------ 6.8/12.8 MB 4.0 MB/s eta 0:00:02
     -------------------------- ------------- 8.4/12.8 MB 4.3 MB/s eta 0:00:02
     ------------------------------- ----


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
import spacy

In [11]:
# 1. Load the spaCy model
# `nlp` is the loaded "en_core_web_sm" pipeline; calling it on a string returns
# a document whose `.ents` holds the recognised entities (used by
# extract_entities_spacy).
nlp = spacy.load("en_core_web_sm")

In [12]:
# 3. Define a new extraction function using spaCy
def extract_entities_spacy(text, model=None):
    """Group the named entities a spaCy pipeline finds in ``text`` by label.

    Parameters
    ----------
    text : str
        Text to analyse. Non-string values return an empty dict instead of
        being handed to the pipeline, mirroring ``extract_entities``.
    model : callable, optional
        Pipeline to run. It is called with ``text`` and must return an object
        with an ``ents`` iterable whose items expose ``label_`` and ``text``.
        Defaults to the module-level ``nlp``.

    Returns
    -------
    dict
        Maps each entity label to the list of distinct entity strings carrying
        that label, in order of first appearance.
    """
    if not isinstance(text, str):
        return {}
    if model is None:
        model = nlp

    doc = model(text)

    entities = {}
    for ent in doc.ents:
        mentions = entities.setdefault(ent.label_, [])
        # Record each surface form once per label.
        if ent.text not in mentions:
            mentions.append(ent.text)
    return entities

In [13]:
 # 4. Apply the function to a sample
print("Applying spaCy NER to the first 10 rows...")
# Build the ten-row sample and its spaCy entity column in one chained expression.
df_sample = df.head(10).assign(
    spacy_entities=lambda frame: frame['text'].apply(extract_entities_spacy)
)
    

Applying spaCy NER to the first 10 rows...


In [14]:
 # 5. Show the comparison
# Dataset-provided person/company columns side by side with the spaCy entities.
print("\n--- Comparison of Ground Truth vs. spaCy NER Output ---")
print(
    df_sample.loc[:, ['text', 'mentioned_person', 'company', 'spacy_entities']]
)


--- Comparison of Ground Truth vs. spaCy NER Output ---
                                                text mentioned_person  \
0  Elon Musk of Bank of America Corporation comme...        Elon Musk   
1  Jensen Huang of Tesla, Inc commented on strate...     Jensen Huang   
2  Jensen Huang of Apple Inc commented on strateg...     Jensen Huang   
3  Sundar Pichai of NVIDIA Corporation commented ...    Sundar Pichai   
4  Elon Musk of Alphabet Inc commented on strateg...        Elon Musk   
5  Sundar Pichai of Meta Platforms, Inc commented...    Sundar Pichai   
6  Warren Buffett of JPMorgan Chase & Co. comment...   Warren Buffett   
7  Andy Jassy of JPMorgan Chase & Co. commented o...       Andy Jassy   
8  Andy Jassy of Alphabet Inc commented on strate...       Andy Jassy   
9  Brian Moynihan of Alphabet Inc commented on st...   Brian Moynihan   

                       company  \
0  Bank of America Corporation   
1                   Tesla, Inc   
2                    Apple Inc   
3  