# NLP Assignment 02

## Name: Parth Desai
## PRN: 24070149017

## Part A: Named Entity Recognition (NER)

### 1. Load any pre-trained SpaCy model and perform NER on the following text:
### "Elon Musk founded SpaceX in 2002 and later acquired Twitter, now known as X, in 2022."


* Extract all named entities along with their entity types.
* Display the entities in a tabular format (Entity, Entity Type).

In [2]:
import spacy
import pandas as pd

# Load the pre-trained small English spaCy model
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
text = "Elon Musk founded SpaceX in 2002 and later acquired Twitter, now known as X, in 2022."

# Run the sentence through the pipeline
doc = nlp(text)

# One (entity text, entity label) pair per recognised entity
entities = [(span.text, span.label_) for span in doc.ents]

# Show the pairs as a two-column table
df = pd.DataFrame(data=entities, columns=["Entity", "Entity Type"])
print(df)

      Entity Entity Type
0  Elon Musk      PERSON
1       2002        DATE
2    Twitter      PERSON
3       2022        DATE


### 2. Write a Python function that takes any text as input and highlights the following entity types:

* Person
* Organization
* Date

In [1]:
import spacy
import pandas as pd
from termcolor import colored

# Pre-trained small English spaCy model, shared by the code below
nlp = spacy.load("en_core_web_sm")

# Wrap every PERSON / ORG / DATE entity of a text in bold-cyan colour codes
def highlight_entities(text):
    """Return *text* with PERSON, ORG and DATE entities rendered in bold cyan.

    The text is run through the module-level ``nlp`` pipeline and every
    entity whose label is one of the three wanted types is replaced by the
    result of ``colored(entity_text, 'cyan', attrs=['bold'])``. All other
    characters are copied through unchanged.
    """
    wanted = ('PERSON', 'ORG', 'DATE')
    pieces = []
    cursor = 0  # index of the first character not yet copied into `pieces`

    # Walk the entities left to right, alternating untouched text with the
    # coloured entity text, then glue everything together once at the end.
    for span in sorted(nlp(text).ents, key=lambda s: s.start_char):
        if span.label_ not in wanted:
            continue
        pieces.append(text[cursor:span.start_char])
        pieces.append(colored(span.text, 'cyan', attrs=['bold']))
        cursor = span.end_char

    pieces.append(text[cursor:])
    return ''.join(pieces)

# Sentence to analyse
text = "Elon Musk founded SpaceX in 2002 and later acquired Twitter, now known as X, in 2022."

# Run the sentence through the pipeline
doc = nlp(text)

# One (entity text, entity label) pair per recognised entity
entities = [(span.text, span.label_) for span in doc.ents]

# Show the pairs as a two-column table
df = pd.DataFrame(data=entities, columns=['Entity', 'Entity Type'])
print(df)

# Colour the PERSON / ORG / DATE mentions and print the decorated sentence
highlighted_text = highlight_entities(text)
print("\nHighlighted Text:\n", highlighted_text)

      Entity Entity Type
0  Elon Musk      PERSON
1       2002        DATE
2    Twitter      PERSON
3       2022        DATE

Highlighted Text:
 [1m[36mElon Musk[0m founded SpaceX in [1m[36m2002[0m and later acquired [1m[36mTwitter[0m, now known as X, in [1m[36m2022[0m.


## Part B: Regular Expressions

### 3. Use Python's re module to extract all email addresses from the following text:
### "Please contact us at support@example.com, info@nlp.com, or feedback123@textai.org for further details."

In [6]:
import re

# Sentence containing the addresses to pull out
text = "Please contact us at support@example.com, info@nlp.com, or feedback123@textai.org for further details."

# Local part and domain are runs of word characters, dots or hyphens,
# joined by '@' and ending in ".<word characters>"
email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'

# Collect the full text of every non-overlapping match, left to right
email_addresses = [match.group(0) for match in re.finditer(email_pattern, text)]

# Print one address per line
for email in email_addresses:
    print(email)

support@example.com
info@nlp.com
feedback123@textai.org


### 4. Write a Python script to perform the following tasks:

* Replace every digit in the following text with the string '*':
"The meeting is scheduled for 10:30 AM on 25th January 2025."

* Find and extract all dates from the text:
 "Important dates are 25-01-2025, 26/01/2025, and 27-01-2025."

In [7]:
import re

# Task 1: Replace all digits with '*'
# re.sub swaps each individual digit character for '*'; every other
# character (':', letters, spaces, '.') is left in place.
text1 = "The meeting is scheduled for 10:30 AM on 25th January 2025."
replaced_text = re.sub(r'\d', '*', text1)
print("Text after replacing digits:", replaced_text)

# Task 2: Extract all dates
# A date is DD-MM-YYYY or DD/MM/YYYY. The two alternatives force both
# separators to be the same character, so mixed forms such as "25-01/2025"
# are not reported as dates. The group is non-capturing on purpose: with a
# capturing group re.findall would return the group instead of the full date.
text2 = "Important dates are 25-01-2025, 26/01/2025, and 27-01-2025."
date_pattern = r'\b\d{2}(?:-\d{2}-|/\d{2}/)\d{4}\b'
dates = re.findall(date_pattern, text2)

print("Extracted dates:", dates)

Text after replacing digits: The meeting is scheduled for **:** AM on **th January ****.
Extracted dates: ['25-01-2025', '26/01/2025', '27-01-2025']


### 5. Implement a function using regular expressions to check whether a given string is a valid Indian phone number (10 digits, starts with 7, 8, or 9). Test your function with various inputs.


In [8]:
import re

# Function to validate Indian phone numbers
def is_valid_indian_phone(number):
    """Return True if *number* is a valid 10-digit Indian phone number.

    A number is valid when the whole string consists of exactly ten ASCII
    digits and the first digit is 7, 8 or 9.

    Args:
        number: The candidate phone number as a string.

    Returns:
        bool: True when the entire string matches, False otherwise.
    """
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, so "9876543210\n" would otherwise be accepted.
    # [0-9] instead of \d: on str patterns \d matches any Unicode decimal
    # digit (e.g. Devanagari digits), not only the ASCII digits 0-9.
    pattern = r'[789][0-9]{9}'
    return bool(re.fullmatch(pattern, number))

# Sample inputs: three that should pass and three that should fail
test_numbers = [
    "9876543210",  # accepted: 10 digits, leading 9
    "8123456789",  # accepted: 10 digits, leading 8
    "7123456789",  # accepted: 10 digits, leading 7
    "6123456789",  # rejected: leading digit is 6
    "987654321",   # rejected: only 9 digits
    "998877665544" # rejected: 12 digits
]

# Print the verdict for each sample, one per line
for number in test_numbers:
    if is_valid_indian_phone(number):
        result = "Valid"
    else:
        result = "Invalid"
    print(f"Phone Number: {number} - {result}")

Phone Number: 9876543210 - Valid
Phone Number: 8123456789 - Valid
Phone Number: 7123456789 - Valid
Phone Number: 6123456789 - Invalid
Phone Number: 987654321 - Invalid
Phone Number: 998877665544 - Invalid
