# Regular expressions

In [85]:
import re

### Basic regex syntax and rules
##### More at: https://regex101.com/

In [86]:
# Each assignment below overwrites `pattern`; the cell is a syntax cheat-sheet,
# not a pipeline. Only the last value survives.

# Literals and escaping
pattern = r"abc"  # ordinary characters match themselves
pattern = r"\$"  # a backslash turns the metacharacter "$" into a literal dollar sign
pattern = r"\\d"  # an escaped backslash followed by "d": the two-character text "\d"

# Character classes
pattern = r"\d"  # shorthand class: one digit
pattern = r"[aeiou]"  # explicit set: one lowercase vowel
pattern = r"[^aeiou]"  # negated set: one character that is NOT a lowercase vowel

# Quantifiers
pattern = r"colou?r"  # ?     -> 0 or 1 of the preceding item ("color" or "colour")
pattern = r"\d*"  # *     -> 0 or more digits (also matches the empty string)
pattern = r"\d+"  # +     -> 1 or more digits
pattern = r"\d{3,5}"  # {m,n} -> between 3 and 5 digits

In [87]:
# Compile the pattern once; the compiled object is reused for both queries below.
text = "What does the fox say?"
pattern = r"fox"
regex_object = re.compile(pattern)

# findall() returns the matched substrings as a plain list.
matches = regex_object.findall(text)

# finditer() yields Match objects, which also expose where the match was found
# and which string / compiled pattern produced it.
for match in regex_object.finditer(text):
    details = (
        ("Matched string:", match.group()),
        ("Start position:", match.start()),
        ("End position:", match.end()),
        ("Match span:", match.span()),
        ("Original string:", match.string),
        ("Regex object:", match.re),
    )
    for label, value in details:
        print(label, value)
    

Matched string: fox
Start position: 14
End position: 17
Match span: (14, 17)
Original string: What does the fox say?
Regex object: re.compile('fox')


In [88]:
# (?P<label>...) names a capture group, so results can be read by label
# instead of by position.
text = "John Smith: 555-1234"
pattern = r"(?P<name>\w+ \w+): (?P<phone>\d{3}-\d{4})"
regex_object = re.compile(pattern)

# search() returns None when nothing matches, hence the guard.
match = regex_object.search(text)
if match is not None:
    # groupdict() maps every group name to the text it captured
    named_groups = match.groupdict()
    print("Named groups:", named_groups)

Named groups: {'name': 'John Smith', 'phone': '555-1234'}


### Using a regex is safer

In [89]:
text = "G65_E_VL-VKBG_VS116OHVP45R_******_Param_10_m06xR"

# Approach 1: positional. Take the last two "_"-separated fields and glue
# them back together. It works here only because the value happens to sit
# in the final two fields.
parts = text.split("_")
substring = "_".join((parts[-2], parts[-1]))
print(substring)  # Output: "10_m06xR"

# Approach 2: regex. Capture whatever word characters follow the "Param_"
# marker, wherever that marker is.
pattern = r"Param_(\w+)"
regex_object = re.compile(pattern)
match = regex_object.search(text)
if match is None:
    print(match)
else:
    substring = match.group(1)
    print(substring)  # Output: "10_m06xR"

10_m06xR
10_m06xR


In [90]:
# Same two approaches as the previous cell, but the input has no "Param_" part.
text = "G65_E_VL-VKBG_VS116OHVP45R_******"

# Positional slicing cannot tell that the field is missing: it silently
# returns whatever the last two fields happen to be.
parts = text.split("_")
substring = "_".join((parts[-2], parts[-1]))
print(substring) # Output: VS116OHVP45R_****** <- NOT WHAT WE WANT!

# The regex looks for the marker itself, so a missing field shows up as None.
pattern = r"Param_(\w+)"
regex_object3 = re.compile(pattern)
match = regex_object3.search(text)
if match is None:
    print(match)  # Output: None <- IS WHAT WE WANT!
else:
    substring4 = match.group(1)
    print(substring4)

VS116OHVP45R_******
None


### Character classes


In [91]:
text = "The quick fox jumps over the lazy dog!"
pattern = r"[aeiou]"  # character class: any single lowercase vowel
regex_object = re.compile(pattern)

# Find matches: one list entry per vowel, in order of appearance
matches = regex_object.findall(text)
print(matches)  # Output: ['e', 'u', 'i', 'o', 'u', 'o', 'e', 'e', 'a', 'o']

# Replace matches: every vowel becomes "-". "y" is not in the class, so
# "lazy" turns into "l-zy", and the trailing "!" is left untouched.
new_text = regex_object.sub("-", text)
print(new_text)  # Output: Th- q--ck f-x j-mps -v-r th- l-zy d-g!


['e', 'u', 'i', 'o', 'u', 'o', 'e', 'e', 'a', 'o']
Th- q--ck f-x j-mps -v-r th- l-zy d-g!


### Quantifiers


In [92]:
text = "Are you a city mouse or a country mouse at heart?"

# \w{4,6} asks for 4 to 6 word characters. The \b on each side pins the run
# to whole words, so the 7-letter "country" is skipped rather than
# partially matched.
pattern = r"\b\w{4,6}\b"
regex_object = re.compile(pattern)

# Collect every word of 4 to 6 characters.
# Output: ['city', 'mouse', 'mouse', 'heart']
matches = regex_object.findall(text)
print(matches)

# Mask those same words with a fixed-width placeholder.
# Output: Are you a ***** ***** or a country ***** at *****?
new_text = regex_object.sub("*****", text)
print(new_text)

['city', 'mouse', 'mouse', 'heart']
Are you a ***** ***** or a country ***** at *****?


### Anchors

In [93]:
text = "The quick fox jumps over the lazy dog!"
pattern = r"^The|\bdog\!"
# Pattern walkthrough. These are comments rather than a plain string literal:
# in a non-raw string "\b" would be a backspace character and "\!" an invalid
# escape sequence.
#
# ^The      Matches "The" only at the start of the string, or at the start of
#           any line, because re.MULTILINE is passed below.
# |         Alternation: the left branch OR the right branch.
# \bdog\!   Matches "dog!" where "dog" begins at a word boundary.
#           "\!" is simply a literal "!" (the escape is optional).
regex_object = re.compile(pattern, re.MULTILINE)

matches = regex_object.findall(text)
print(matches)  # Output: ['The', 'dog!']


['The', 'dog!']


### Groups

In [94]:
import re

text = "My name is Milán, and I'm 22 years old."
pattern = r"My name is (\w+), and I'm (\d+) years old\."
# Pattern walkthrough. These are comments rather than a plain string literal:
# "\w", "\d" and "\." are invalid escape sequences in a non-raw string.
#
# My name is    Matches the literal text "My name is ".
# (\w+)         Group 1: one or more word characters. For str patterns this
#               includes accented letters, so "Milán" is captured whole.
# , and I'm     Matches the literal text ", and I'm ".
# (\d+)         Group 2: one or more digits.
#  years old\.  Matches the literal text " years old."; the escaped dot
#               matches only a real ".", not any character.
regex_object = re.compile(pattern)

# Find the match and read the captured groups by position (group 0 is the whole match)
match = regex_object.search(text)
name = match.group(1)
age = match.group(2)

print("Name: " + name)  # Output: Name: Milán
print("Age: " + age)    # Output: Age: 22

Name: Milán
Age: 22


### Lookarounds

In [95]:
text = "The quick fox jumps over the lazy dog."

# (?<=quick ) is a positive lookbehind: it requires "quick " immediately
# before the current position but is not part of the match itself, so only
# "fox" is returned.
pattern = r"(?<=quick )fox"
regex_object = re.compile(pattern)

# Collect every "fox" that is preceded by "quick ".
# Output: ['fox']
matches = regex_object.findall(text)
print(matches)

['fox']


### Flags

In [96]:
text = "Hello, world!"

# The pattern is written in lowercase; the IGNORECASE flag is what lets it
# match the capitalised "Hello" in the text.
pattern = r"hello"
regex_object = re.compile(pattern, flags=re.IGNORECASE)

# The result keeps the casing found in the text, not the casing of the pattern.
# Output: ['Hello']
matches = regex_object.findall(text)
print(matches)

['Hello']


### Different ways to match a pattern

In [97]:
text = "The quick fox jumps over the lazy dog."

# The module-level helpers below take the pattern and the text directly, so no
# explicit re.compile() call is needed.

# Match a pattern
# match only looks for the pattern at the beginning of the string,
# while search searches for the pattern throughout the entire string.
pattern = r"The"
match = re.match(pattern, text)
if match:
    print("Found match:", match.group()) # Output: Found match: The

# Search for a pattern (first occurrence anywhere in the string)
pattern = r"quick"
match = re.search(pattern, text)
if match:
    print("Found match:", match.group()) # Output: Found match: quick

# Find all occurrences of a pattern (here: whole words of exactly 4 characters)
pattern = r"\b\w{4}\b"
matches = re.findall(pattern, text)
print("Found matches:", matches) # Output: Found matches: ['over', 'lazy']

# Replace a pattern: every occurrence of either alternative is replaced,
# so both "quick" and "lazy" become "red".
pattern = r"quick|lazy"
new_text = re.sub(pattern, "red", text)
print("New text:", new_text) # Output: New text: The red fox jumps over the red dog.

# Split the text into a list of substrings using a pattern.
# The final "." is itself a separator, which leaves an empty string at the end.
pattern = r"\W+"
words = re.split(pattern, text)
print("Words:", words) # Output: Words: ['The', 'quick', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '']

Found match: The
Found match: quick
Found matches: ['over', 'lazy']
New text: The red fox jumps over the red dog.
Words: ['The', 'quick', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '']
