# Capturing Groups

Capturing groups - portions of patterns enclosed in parenthesis.



In [23]:
import re

In [24]:
result = re.search(r"^(\w*), (\w*)$","Lovelace, Ada")
result
#matches group of letters, comma and space, and a group of letters

<re.Match object; span=(0, 13), match='Lovelace, Ada'>

In [25]:
print(result.groups())
print(result[0])
print(result[1])
print(result[2])

('Lovelace', 'Ada')
Lovelace, Ada
Lovelace
Ada


In [26]:
print(f"{result[2]} {result[1]}")
print("{} {}".format(result[2],result[1]))

Ada Lovelace
Ada Lovelace


In [27]:
def rearrange_name(name):
    result = re.search(r"^(\w*), (\w*)$",name)
    if result is None:
        return name
    return f"{result[2]} {result[1]}"

In [28]:
rearrange_name("Lovelace, Ada")

'Ada Lovelace'

In [29]:
rearrange_name("Lovelace,Ada")

'Lovelace,Ada'

In [30]:
def rearrange_name_full(name):
    result = re.search(r"^([\w \.-]*), ([\w \.-]*)$",name)
    if result is None:
        return name
    return f"{result[2]} {result[1]}"

In [31]:
rearrange_name_full("Kennedy, John F.")

'John F. Kennedy'

## More on Repitition Qualifiers


In [33]:
re.search(r"[a-zA-Z]{5}","a ghost")
#looking for letters repeated for 5 times

<re.Match object; span=(2, 7), match='ghost'>

In [34]:
re.search(r"[a-zA-Z]{5}","a scary ghost appeared")

<re.Match object; span=(2, 7), match='scary'>

In [35]:
re.findall(r"[a-zA-Z]{5}","a scary ghost appeared")

['scary', 'ghost', 'appea']

### \b

matches word limits at beginning and end of pattern

In [36]:
re.findall(r"\b[a-zA-Z]{5}\b","a scary ghost appeared")

['scary', 'ghost']

In [37]:
re.findall(r"\w{5,10}","I really like strawberries")

['really', 'strawberri']

In [38]:
re.findall(r"\w{5,}","I really like strawberries")

['really', 'strawberries']

In [39]:
re.search(r"s\w{,20}","I really like strawberries")

<re.Match object; span=(14, 26), match='strawberries'>

## Extracting a PID Using regexes in Python

In [43]:
log = "July 31 07:51:48 mycomputer bad_process [12345]: ERROR Performing package upgrade"
regex = r"\[(\d+)\]"
result = re. search(regex, log)
print(result)

<re.Match object; span=(40, 47), match='[12345]'>


In [45]:
def extract_pid(log_line):
    regex = r"\[(\d+)\]"
    result = re. search(regex, log_line)
    if result is None:
        return ""
    return result

In [46]:
print(extract_pid(log))

<re.Match object; span=(40, 47), match='[12345]'>


# Splitting and Replacing


In [47]:
re.split(r"[.?!]","One sentence. Another one? And the last one!")
#does not include characters

['One sentence', ' Another one', ' And the last one', '']

In [48]:
re.split(r"([.?!])","One sentence. Another one? And the last one!")

['One sentence', '.', ' Another one', '?', ' And the last one', '!', '']

In [49]:
re.sub(r"[\w.%+-]+@[\w.-]+","[REDACTED]","Received an email for go_nuts95@my.example.com")
# \w - letters numbers and _


'Received an email for [REDACTED]'

In [50]:
re.sub(r"^([\w .-]*),([\w .-]*)$", r"\2 \1", "Lovelace, Ada")
# second paramenter \2 means second captured group


' Ada Lovelace'

# Practice Quiz: Advanced Regular Expressions

### Question 1

We're working with a CSV file, which contains employee information. Each record has a name field, followed by a phone number field, and a role field. The phone number field contains U.S. phone numbers, and needs to be modified to the international format, with "+1-" in front of the phone number. Fill in the regular expression, using groups, to use the transform_record function to do that.

In [51]:
import re
def transform_record(record):
  new_record = re.sub(r"(.*),(.*),(.*)",r"\1,+1-\2,\3",record)
  return new_record

print(transform_record("Sabrina Green,802-867-5309,System Administrator")) 
# Sabrina Green,+1-802-867-5309,System Administrator

print(transform_record("Eli Jones,684-3481127,IT specialist")) 
# Eli Jones,+1-684-3481127,IT specialist

print(transform_record("Melody Daniels,846-687-7436,Programmer")) 
# Melody Daniels,+1-846-687-7436,Programmer

print(transform_record("Charlie Rivera,698-746-3357,Web Developer")) 
# Charlie Rivera,+1-698-746-3357,Web Developer

Sabrina Green,+1-802-867-5309,System Administrator
Eli Jones,+1-684-3481127,IT specialist
Melody Daniels,+1-846-687-7436,Programmer
Charlie Rivera,+1-698-746-3357,Web Developer


### Question 2
The `multi_vowel_words` function returns all words with 3 or more consecutive vowels (a, e, i, o, u). Fill in the regular expression to do that.

In [52]:
def multi_vowel_words(text):
  pattern = r"\b\w*[aeiou]{3,}\w*\b"
  result = re.findall(pattern, text)
  return result

print(multi_vowel_words("Life is beautiful")) 
# ['beautiful']

print(multi_vowel_words("Obviously, the queen is courageous and gracious.")) 
# ['Obviously', 'queen', 'courageous', 'gracious']

print(multi_vowel_words("The rambunctious children had to sit quietly and await their delicious dinner.")) 
# ['rambunctious', 'quietly', 'delicious']

print(multi_vowel_words("The order of a data queue is First In First Out (FIFO)")) 
# ['queue']

print(multi_vowel_words("Hello world!")) 
# []

['beautiful']
['Obviously', 'queen', 'courageous', 'gracious']
['rambunctious', 'quietly', 'delicious']
['queue']
[]


Question 4
The transform_comments function converts comments in a Python script into those usable by a C compiler. This means looking for text that begins with a hash mark (#) and replacing it with double slashes (//), which is the C single-line comment indicator. For the purpose of this exercise, we'll ignore the possibility of a hash mark embedded inside of a Python command, and assume that it's only used to indicate a comment. We also want to treat repetitive hash marks (##), (###), etc., as a single comment indicator, to be replaced with just (//) and not (#//) or (//#). Fill in the parameters of the substitution method to complete this function: 

In [53]:
import re
def transform_comments(line_of_code):
  result = re.sub(r"#{1,}",r"//",line_of_code)
  return result

print(transform_comments("### Start of program")) 
# Should be "// Start of program"
print(transform_comments("  number = 0   ## Initialize the variable")) 
# Should be "  number = 0   // Initialize the variable"
print(transform_comments("  number += 1   # Increment the variable")) 
# Should be "  number += 1   // Increment the variable"
print(transform_comments("  return(number)")) 
# Should be "  return(number)"

// Start of program
  number = 0   // Initialize the variable
  number += 1   // Increment the variable
  return(number)


Question 5
The convert_phone_number function checks for a U.S. phone number format: XXX-XXX-XXXX (3 digits followed by a dash, 3 more digits followed by a dash, and 4 digits), and converts it to a more formal format that looks like this: (XXX) XXX-XXXX. Fill in the regular expression to complete this function.

In [None]:
import re
def convert_phone_number(phone):
  result = re.sub(r"\b(\d{3})-(\d{3})-(\d{4})\b",r"(\1)\2-\3",phone)
  return result

print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300