In [1]:
# https://docs.python.org/3/howto/regex.html
# https://regexone.com/
# ^           Starts with
# $           Ends with
# .           Any character
# ?           Optional character (e.g. "p?each", will match both "peach" and "each", as p is optional)
# \s          Whitespace
# \S          Non-whitespace characters
# *           Repeats a character zero or more times (greedy means it will try to match the longest string possible)
# *?          Repeats a character zero or more times (non-greedy)
# +           Repeats a character one or more times
# +?          Repeats a character one or more times(non-greedy)
# [aeiou]     a single character in the listed set
# [^XYZ]      a single character not in the listed set
# [a-z0-9]    set of characters can include a range
# ()          use () to group conditions together, e.g. r"^([1-9]|1[0-2]):[0-5][0-9]\s?(AM|PM|am|pm)$"
# /b          word boundary (e.g. r"\bcat\b" will match "cat" but not "catch")
# \d          digits
# |           or, e.g. r"cat|dog" will match "cat" or "dog"

In [2]:
import re
x = 'My 2 favorite numbers are 19 and 42'
print(re.findall('[0-9]+', x))      # ['2', '19', '42']
print(re.search(r'vorite', x))      # <re.Match object; span=(7, 13), match='vorite'>
print(x.startswith('My'))           # True
print(x.find('vorite'))             # 7     # returns the index of first occurance

['2', '19', '42']
<re.Match object; span=(7, 13), match='vorite'>
True
7


In [8]:
# Greedy matching - * and + push outward in both directions (greedy)
# to match the largest possible string

x ='From: stephen@gmail.com Sat Jan 5 09:14:16 2022'
y = re.findall('^F.+:', x)      # greedy match
print(y)                        # ['From: stephen@gmail.com Sat Jan 5 09:14:']

z = re.findall('^F.+?:', x)     # non-greedy match
print(z)                        # ['From:']

# find the substring, but extract only what's in ()
a = re.findall('^From: (\S+?@\S+)', x)   
print(a)                        # ['stephen@gmail.com']

# Dual Split pattern - cut both ways
b = re.findall('From.*@([^ ]*)', x)       # [^ ] == match non-blank char
print(b)                        # ['gmail.com']

['From: stephen@gmail.com Sat Jan 5 09:14:']
['From:']
['stephen@gmail.com']
['gmail.com']


In [12]:
def check_aei (text):
    # text contains a, e, or i in any order with exactly one occurence, sandwiched between two other characters, ignore case
    result = re.search(r'[^aei][aei][^aei]', text, re.IGNORECASE)   # r'' is raw string
    return result != None

print(check_aei("academia")) # True
print(check_aei("aerial")) # False
print(check_aei("paramedic")) # True

True
False
True


In [14]:
# ----------- OR operator -----------
print(re.search(r"cat|dog", "I like cats."))             # <re.Match object; span=(7, 10), match='cat'>
print(re.findall(r"cat|dog", "I like dogs and cats."))   # ['dog', 'cat']

<re.Match object; span=(7, 10), match='cat'>
['dog', 'cat']


In [18]:
# ----------- Capturing Groups -----------
def rearrange_name(name):
    result = re.search(r"^([\w \.-]*), ([\w \.-]*)$", name)     # () is a capturing group (will be returned as a tuple)
    if result == None:      # if no match,
        return name
    return "{} {}".format(result[2], result[1])

name=rearrange_name("Kennedy, John F.")
print(name)


def extract_pid(log_line):
  # match digits in square brackets, followed by a space, followed by one or more uppercase letters
  regex = r"\[(\d+)\]: ([A-Z]+)"    # () is a capturing group, which will be returned as a tuple
  result = re.search(regex, log_line)
  if result is None:
    return None
  return "{} ({})".format(result[1], result[2])

print(extract_pid("July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade")) # 12345 (ERROR)
print(extract_pid("99 elephants in a [cage]")) # None
print(extract_pid("A string that also has numbers [34567] but no uppercase message")) # None
print(extract_pid("July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup")) # 67890 (RUNNING)

John F. Kennedy


In [28]:
# ----------- Split -----------
re.split(r"[.?!]", "One sentence. Another one? And the last one!") # ['One sentence', ' Another one', ' And the last one', '']
# to include the punctuation in the result, use () to capture the punctuation
re.split(r"([.?!])", "One sentence. Another one? And the last one!") # ['One sentence', '.', ' Another one', '?', ' And the last one', '!', '']

# ----------- Substitution -----------
re.sub(r"[\w.%+-]+@[\w.-]+", "[REDACTED]", "Received an email for go_nuts@example.com") # 'Received an email for [REDACTED]'
re.sub(r"^([\w .-]*), ([\w .-]*)$", r"\2 \1", "Lovelace, Ada") # 'Ada Lovelace'

'Ada Lovelace'

In [41]:
import re
def transform_record(record):
    new_record = re.sub(r"(\d{3}-?\d{3}-?\d{4})", r"+1-\1", record)
    return new_record

print(transform_record("Sabrina Green,802-867-5309,System Administrator")) 
# Sabrina Green,+1-802-867-5309,System Administrator

print(transform_record("Eli Jones,684-3481127,IT specialist")) 
# Eli Jones,+1-684-3481127,IT specialist

print(transform_record("Melody Daniels,846-687-7436,Programmer")) 
# Melody Daniels,+1-846-687-7436,Programmer

print(transform_record("Charlie Rivera,698-746-3357,Web Developer")) 
# Charlie Rivera,+1-698-746-3357,Web Developer

Sabrina Green,+1-802-867-5309,System Administrator
Eli Jones,+1-684-3481127,IT specialist
Melody Daniels,+1-846-687-7436,Programmer
Charlie Rivera,+1-698-746-3357,Web Developer


In [42]:
import re
def multi_vowel_words(text):
    # find all words with >= 3 consecutive vowels (a, e, i, o, u)
    pattern = r"\w*[aeiou]{3,}\w*"
    result = re.findall(pattern, text)
    return result

print(multi_vowel_words("Life is beautiful")) 
# ['beautiful']

print(multi_vowel_words("Obviously, the queen is courageous and gracious.")) 
# ['Obviously', 'queen', 'courageous', 'gracious']

print(multi_vowel_words("The rambunctious children had to sit quietly and await their delicious dinner.")) 
# ['rambunctious', 'quietly', 'delicious']

print(multi_vowel_words("The order of a data queue is First In First Out (FIFO)")) 
# ['queue']

print(multi_vowel_words("Hello world!")) 
# []

['beautiful']
['Obviously', 'queen', 'courageous', 'gracious']
['rambunctious', 'quietly', 'delicious']
['queue']
[]


In [45]:
import re
def transform_comments(line_of_code):
    # find >=1 # collectively as one group, replace this group with "//"
    result = re.sub(r"#+", "//", line_of_code)
    return result

print(transform_comments("### Start of program")) 
# Should be "// Start of program"
print(transform_comments("  number = 0   ## Initialize the variable")) 
# Should be "  number = 0   // Initialize the variable"
print(transform_comments("  number += 1   # Increment the variable")) 
# Should be "  number += 1   // Increment the variable"
print(transform_comments("  return(number)")) 
# Should be "  return(number)"

// Start of program
  number = 0   // Initialize the variable
  number += 1   // Increment the variable
  return(number)


In [52]:
import re
def convert_phone_number(phone):
    # find 3 digits followed by a dash, followed by 3 digits, followed by a dash, followed by 4 digits, and not 5 digits
    # replace with (3 digits) space (3 digits) dash (4 digits)
    # if it is not a phone number, return the original string
    result = re.sub(r"(\d{3})-(\d{3})-(\d{4} )", r"(\1) \2-\3", phone)
    return result

print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300

My number is 212-345-9999.
Please call 888-555-1234
123-123-12345
Phone number of Buckingham Palace is +44 303 123 7300


In [None]:
# Coursera Qwiklab week 3, Using Python to Interact with the Operating System
#!/usr/bin/env python3

import csv
import re

def contains_domain(address, domain):
    """Returns True if the email address contains the given, domain, in the domain position, false if not."""
    domain_pattern = r'[\w\.-]+@'+domain+'$'
    if re.match(domain_pattern, address):
        return True
    return False

def replace_domain(address, old_domain, new_domain):
    """Replaces the old domain with the new domain in the received address."""
    old_domain_pattern = r'' + old_domain + '$'
    address = re.sub(old_domain_pattern, new_domain, address)
    return address

def main():
    """Processes the list of emails, replacing any instances of the old domain with the new domain."""
    old_domain, new_domain = 'abc.edu', 'xyz.edu'
    csv_file_location = '/home/student-03-30b89dc53895/data/user_emails.csv'
    report_file = '/home/student-03-30b89dc53895' + '/data/updated_user_emails.csv'
    user_email_list = []
    old_domain_email_list = []
    new_domain_email_list = []
    with open(csv_file_location, 'r') as f:
        user_data_list = list(csv.reader(f))
        user_email_list = [data[1].strip() for data in user_data_list[1:]]
        for email_address in user_email_list:
            if contains_domain(email_address, old_domain):
                old_domain_email_list.append(email_address)
                replaced_email = replace_domain(email_address, old_domain, new_domain)
                new_domain_email_list.append(replaced_email)
        email_key = ' ' + 'Email Address'
        email_index = user_data_list[0].index(email_key)
        for user in user_data_list[1:]:
            for old_domain, new_domain in zip(old_domain_email_list, new_domain_email_list):
                if user[email_index] == ' ' + old_domain:
                    user[email_index] = ' ' + new_domain
        f.close()
    with open(report_file, 'w+') as output_file:
        writer = csv.writer(output_file)
        writer.writerows(user_data_list)
        output_file.close()

main()