Name: Dhruv Pithadia

Roll No: R013

Program: MBA Tech AI

Course: Natural Language Processing

Topic: Named Entity Recognition

Contact: pithadia.dhruv@gmail.com

In [5]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import conll2002

# Download necessary NLTK models and datasets
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Example Text (You can replace this with your dataset)
text = "Apple is looking at buying U.K. startup for $1 billion in 2024. Elon Musk tweeted about Tesla in Los Angeles."

# Tokenize and apply POS tagging
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Apply Named Entity Recognition using NLTK
named_entities = ne_chunk(pos_tags)

# Print Named Entities
print(named_entities)

(S
  (GPE Apple/NNP)
  is/VBZ
  looking/VBG
  at/IN
  buying/VBG
  U.K./NNP
  startup/NN
  for/IN
  $/$
  1/CD
  billion/CD
  in/IN
  2024/CD
  ./.
  (PERSON Elon/NNP Musk/NNP)
  tweeted/VBD
  about/IN
  (PERSON Tesla/NNP)
  in/IN
  (GPE Los/NNP Angeles/NNP)
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dhruvpithadia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/dhruvpithadia/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/dhruvpithadia/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
# Define a dummy dataset
dummy_dataset = [
    "John works at OpenAI in San Francisco.",
    "Elon Musk is the CEO of Tesla and SpaceX.",
    "Barack Obama was the president of the United States.",
    "Apple is based in Cupertino, California."
]

# Define a dictionary of words and their entity types (NER mapping)
ner_mapping = {
    "John": "PERSON",
    "OpenAI": "ORG",
    "San Francisco": "LOC",
    "Elon Musk": "PERSON",
    "Tesla": "ORG",
    "SpaceX": "ORG",
    "Barack Obama": "PERSON",
    "United States": "LOC",
    "Apple": "ORG",
    "Cupertino": "LOC",
    "California": "LOC"
}

# Function to perform NER mapping
def ner_mapper(sentence, ner_dict):
    entities = []
    words = sentence.split()  # Tokenize the sentence into words
    for i, word in enumerate(words):
        # Check if the word or a phrase exists in the NER dictionary
        if word in ner_dict:
            entities.append((word, ner_dict[word]))
        elif i < len(words) - 1:
            # Check for two-word phrases (e.g., 'San Francisco')
            phrase = f"{word} {words[i + 1]}"
            if phrase in ner_dict:
                entities.append((phrase, ner_dict[phrase]))
                continue  # Skip the next word since it's part of a phrase
    return entities

# Apply NER mapping to each sentence in the dataset
for sentence in dummy_dataset:
    entities = ner_mapper(sentence, ner_mapping)
    print(f"Sentence: {sentence}")
    print(f"Entities: {entities}")
    print("\n")

Sentence: John works at OpenAI in San Francisco.
Entities: [('John', 'PERSON'), ('OpenAI', 'ORG')]


Sentence: Elon Musk is the CEO of Tesla and SpaceX.
Entities: [('Elon Musk', 'PERSON'), ('Tesla', 'ORG')]


Sentence: Barack Obama was the president of the United States.
Entities: [('Barack Obama', 'PERSON')]


Sentence: Apple is based in Cupertino, California.
Entities: [('Apple', 'ORG')]


