## Mini Activity: RegEx

In [12]:
import re

In [13]:
# 1. Write a regex pattern to match any string that starts with a digit and ends with a letter.

# Anchor with \A ... \Z rather than ^ ... $: "$" also matches just before a
# trailing newline, so "2004April\n" would be accepted even though its last
# character is not a letter. re.DOTALL lets ".*" cross embedded newlines, so a
# multi-line string is judged only by its first and last characters.
start_digit_end_letter = re.compile(r"\A\d.*[a-zA-Z]\Z", re.DOTALL)

texts = ["2004April", "NLP"]

for text in texts:
    match_obj = start_digit_end_letter.match(text)
    # Print the matched string, or a placeholder when the pattern does not apply.
    print(match_obj.group() if match_obj else "No Match")

2004April
No Match


In [14]:
# 2. Create a regex to validate standard email addresses (e.g., user@example.com).
def validate_email(email):
    """Return True when ``email`` has the shape ``local@label.label.tld``.

    The local part may contain ASCII letters, digits and ``._%+-``. The domain
    is one or more non-empty labels (ASCII letters, digits, hyphens) separated
    by single dots, ending in an alphabetic top-level label of at least two
    characters.

    Args:
        email: The candidate address. Non-string values are reported as invalid.

    Returns:
        bool: True if the whole string matches the pattern, False otherwise.
    """
    if not isinstance(email, str):
        return False

    # Each domain label must be non-empty, which rules out "user@.com" and
    # "user@example..com".
    pattern = r"[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}"

    # fullmatch instead of match + "$": "$" would also accept a trailing
    # newline ("user@example.com\n").
    return re.fullmatch(pattern, email) is not None


# Sample addresses fed to validate_email, one result line per address.
emails = [
    "user@example.com",
    "user.name+123@example.co",
    "invalid-email@.com",
    "name@domain",
]

# Pair every address with the validator's verdict and report them in order.
for email, verdict in zip(emails, map(validate_email, emails)):
    print(f"{email}: {verdict}")

user@example.com: True
user.name+123@example.co: True
invalid-email@.com: False
name@domain: False


In [15]:
# 3. Write a regex pattern that matches US phone numbers in the format (123) 456-7890.
# \A ... \Z pins the pattern to the whole string. Without an end anchor,
# .match() only checks the prefix, so "(123) 456-78901" (one digit too many)
# would be reported as a valid number. \Z is used rather than "$" because "$"
# also matches just before a trailing newline.
phone_number_regex = re.compile(r"\A\(\d{3}\) \d{3}-\d{4}\Z")

test_numbers = ["(123) 456-7890", "(987) 654-3210", "123-456-7890"]

for number in test_numbers:
    match_obj = phone_number_regex.match(number)
    # Print the matched number, or a placeholder when the format is wrong.
    print(match_obj.group() if match_obj else "No Match")

(123) 456-7890
(987) 654-3210
No Match


In [16]:
# 4. How would you write a regex to extract dates in the format DD/MM/YYYY from a text?

# DD/MM/YYYY with day limited to 01-31 and month to 01-12. The lookarounds stop
# the pattern from matching inside a longer digit run (e.g. "125/12/20201").
# Only the field ranges are checked, not the calendar: "31/04/2023" still
# matches even though April has 30 days.
date_regex = re.compile(
    r"(?<!\d)(?:0[1-9]|[12]\d|3[01])/(?:0[1-9]|1[0-2])/\d{4}(?!\d)"
)

dates = ["25/12/2020", "01/01/1999", "31/04/2023", "12-12-2022"]

# Check for matches
for date in dates:
    # search() finds a date anywhere in the text; match() would only find one
    # at position 0 and so could not extract a date from a sentence.
    match_obj = date_regex.search(date)
    print(match_obj.group() if match_obj else "No Match")

25/12/2020
01/01/1999
31/04/2023
No Match


In [17]:
# 5. Write a regex that replaces multiple spaces in a string with a single space.


def replace_multiple_spaces(text):
    """Collapse every run of two or more space characters into one space.

    Only the space character is affected; tabs and newlines are left
    untouched, so line structure is preserved.

    Args:
        text: The string to normalise.

    Returns:
        str: ``text`` with each run of consecutive spaces replaced by a single
        space.
    """
    # " {2,}" instead of r"\s+": "\s+" also matches a single tab or newline
    # and would turn it into a space.
    return re.sub(r" {2,}", " ", text)


# Sample sentence whose words are separated by uneven runs of spaces.
text = "This   is   a   test    string with  multiple   spaces."
# Pass the sample through replace_multiple_spaces and show what it returns.
result = replace_multiple_spaces(text)
print(result)

This is a test string with multiple spaces.


In [18]:
# 6. Create a regex to match filenames that end with .jpg, .jpeg, or .png.
# Matches a whole filename ending in .jpg, .jpeg or .png.
# - re.IGNORECASE accepts upper-case extensions such as "PHOTO.JPG".
# - \A ... \Z replaces ^ ... $ because "$" also matches just before a trailing
#   newline, which would let "image.png\n" through.
# - The extension stays in capturing group 1, as before.
image_extensions = re.compile(r"\A\S.*\.(jpg|jpeg|png)\Z", re.IGNORECASE)

files = ["research.jpg", "lab-act.docx", "academic_calendar.jpeg", "image.png"]

for file in files:
    match_obj = image_extensions.match(file)
    # Print the filename when it is an accepted image, otherwise a placeholder.
    print(match_obj.group() if match_obj else "No Match")

research.jpg
No Match
academic_calendar.jpeg
image.png


In [19]:
# 7. How can you use capturing groups in regex to extract both the area code and the number from a phone number like (123) 456-7890?


def extract_phone_number(phone):
    """Split a phone number written as ``(AAA) NNN-NNNN`` into its two parts.

    Args:
        phone: The string to parse. The whole string must be the phone number;
            extra characters before or after it make the parse fail.

    Returns:
        tuple[str, str] | None: ``(area_code, number)`` taken from the two
        capturing groups, e.g. ``("123", "456-7890")``, or ``None`` when
        ``phone`` is not in the expected format.
    """
    regex = r"\((\d{3})\)\s(\d{3}-\d{4})"

    # fullmatch rather than match: match only anchors the start, so
    # "(123) 456-78901" would be silently truncated to a valid-looking result.
    match = re.fullmatch(regex, phone)
    if match is None:
        return None

    # Group 1 is the area code inside the parentheses, group 2 the local number.
    return match.group(1), match.group(2)


phone_number = "(123) 456-7890"
result = extract_phone_number(phone_number)
if result:
    # extract_phone_number returns an (area_code, number) pair on success.
    area_code, number = result
    # Display the extracted parts so the cell shows what the groups captured.
    print(f"Area code: {area_code}")
    print(f"Number: {number}")
else:
    # None means the input was not in the expected format.
    print("No Match")

In [20]:
# 8. Write a regex to find all occurrences of the word "cat" in a sentence, ensuring it only matches as a whole word (not as part of another word).
def find_whole_word_cat(text):
    """List each standalone occurrence of the lowercase word "cat" in ``text``.

    The word boundaries on both sides exclude "cat" embedded in a longer word
    such as "catalog" or "concat". Matching is case-sensitive.

    Args:
        text: The string to scan.

    Returns:
        list[str]: One "cat" entry per whole-word occurrence, in order; an
        empty list when there is none.
    """
    whole_word = re.compile(r"\bcat\b")
    return [hit.group() for hit in whole_word.finditer(text)]


# Sample sentence with the word "cat" on its own and inside "catalog".
sentence = "The cat sat on the mat, but the catalog was on the shelf."
# Collect the whole-word hits and display the resulting list.
matches = find_whole_word_cat(sentence)
print(matches)

['cat']


In [21]:
# 9. Construct a regex pattern that matches a string containing a sequence of digits followed by either a hyphen or a space, and then a sequence of letters.
def match_digits_hyphen_or_space_letters(text):
    """Find every "digits, one separator, letters" chunk in ``text``.

    A chunk is one or more digits, then exactly one hyphen or one space, then
    one or more ASCII letters (for example "123-abc" or "456 def").

    Args:
        text: The string to scan.

    Returns:
        list[str]: All non-overlapping chunks in order of appearance; an empty
        list when there is none.
    """
    # "[- ]" with no "+": the separator is a single hyphen or a single space.
    # With "[- ]+", inputs such as "12--ab" or "12 - ab" would also be accepted.
    regex = r"\d+[- ][a-zA-Z]+"
    return re.findall(regex, text)


# Sample input mixing hyphen-separated and space-separated digit/letter pairs.
text = "123-abc 456 def 789-ghijk 012 xyz"
# Collect every matching chunk and display the resulting list.
matches = match_digits_hyphen_or_space_letters(text)
print(matches)

['123-abc', '456 def', '789-ghijk', '012 xyz']


In [22]:
# 10. How would you use regex to sanitize a string by removing all non-alphanumeric characters (keeping only letters and numbers)?


def sanitize_string(text):
    """Strip every character that is not an ASCII letter or digit from ``text``.

    Spaces, punctuation, underscores and non-ASCII characters are all removed;
    the remaining characters keep their original order.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` reduced to the characters ``a-z``, ``A-Z`` and ``0-9``.
    """
    # Splitting on each disallowed character leaves only the allowed runs;
    # joining them back together drops the separators.
    kept_runs = re.split(r"[^a-zA-Z0-9]", text)
    return "".join(kept_runs)


# Sample input containing punctuation, spaces and symbols around letters and digits.
text = "Hello, World! 123 @#"
# Clean the sample with sanitize_string and display the result.
sanitized_text = sanitize_string(text)
print(sanitized_text)

HelloWorld123
