# Regular Expressions


Regular expressions (also written regexp, regex or re) are a tool for matching patterns in text. In Python they are provided by the 're' module. The applications for regular expressions are widespread, but they are complex - use a regex as a last resort! An example regex: r"^(From|To|Cc).*?python-list@python.org"

The caret, ^, matches at the beginning of the string (or at the beginning of every line when the re.MULTILINE flag is set). '(From|To|Cc)' indicates that the text to match must start with one of the words separated by the pipe. The '.*?' indicates that any number of characters may be un-greedily matched, up to but not including the newline \n character.

The un-greedy part means to match as few repetitions as possible. The '.' character means any non-newline character, the '*' means to repeat the preceding item zero or more times, and the '?' character after it makes the repetition un-greedy.

In [None]:
# Example: look for an "[on]" or "[off]" flag anywhere in a line of text.
import re
pattern = re.compile(r"\[(on|off)\]") # Compiled once so the same pattern object can be reused
print(re.search(pattern, "Mono: Playback 65 [75%] [-16.50dB] [on]"))
# Prints a Match object: "[on]" occurs in the string.
print(re.search(pattern, "Nada...:-("))
# Prints None: re.search returns None when the pattern occurs nowhere.
# End Example

# Exercise: make a regular expression that will match an email
def test_email(your_pattern):
    """Check *your_pattern* against a few sample e-mail addresses.

    Prints one line per sample address: ``Pass`` when the pattern matches
    at the start of the address (``re.match`` semantics, so the match does
    not have to reach the end of the address), otherwise
    ``You failed to match <address>``.

    An empty (or missing) pattern is reported once and nothing is tested:
    the empty regex matches every string, so it would "pass" trivially.

    Args:
        your_pattern: Regular expression source string to try.
    """
    # Guard first: checking this inside the loop repeated the message for
    # every address and let None crash inside re.compile.
    if not your_pattern:
        print("Forgot to enter a pattern!")
        return

    pattern = re.compile(your_pattern)
    emails = ["john@example.com", "python-list@python.org", "wha.t.`1an?ug{}ly@email.com"]
    for email in emails:
        if pattern.match(email):
            print("Pass")
        else:
            print("You failed to match %s" % (email))
pattern = r"" # Your pattern here!
# Run the checker; while the pattern above is still empty it reports
# "Forgot to enter a pattern!" instead of a pass/fail result.
test_email(pattern)

In [None]:
# Split the text at every run of digits.
pattern = r"\d+"
string = "Some text. 123"
# The digits sit at the very end, so the result ends with an empty string:
# ['Some text. ', ''].
print(re.split(pattern,string))

In [None]:
# Count how many substitutions were made.
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
replacement = "blahem"
# re.subn returns a (new_string, number_of_substitutions) tuple.
result, replaced_count = re.subn(pattern, replacement, string)
# Two runs of digits ("123" and "321") are replaced, so this prints 2.
print(replaced_count)

In [None]:
# Collect every run of digits using a compiled pattern object.
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
re_obj = re.compile(pattern)
# findall returns all non-overlapping matches as strings: ['123', '321'].
print(re_obj.findall(string))

In [None]:
# Text captured by the last group that actually took part in the match.
import re
pattern = r"(\d+) (\d+)?"
string = "123 text"
match_object = re.match(pattern, string)
# Group 2 is optional and matches nothing here, so lastindex is 1 and
# this prints "123".
print(match_object.group(match_object.lastindex))

In [None]:
# Split on either delimiter.
string = "Some text; 123, Some text, 123"
# The character class [,;] matches a single "," or a single ";".
pattern = r"[,;]"
print(re.split(pattern,string))

In [None]:
# Check that the pattern covers the string from start to end.
string = "Python"
# ^ and $ pin the pattern to both ends, so search() cannot succeed on a
# mere substring.
pattern = r"^Python$"
re_obj = re.compile(pattern)
if re_obj.search(string):
    print("Blahem")

##  Fetch all occurrences of a regex pattern

In [29]:
# Option: module-level re.findall.
import re 
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
# Returns every non-overlapping match as a list of strings.
print(re.findall(pattern,string))

['123', '321']


In [None]:
# Option: re.finditer, which yields Match objects instead of strings.
import re 
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
occurrences = re.finditer(pattern, string)
# group() extracts the matched text of each Match: ['123', '321'].
print([o.group() for o in occurrences])

In [None]:
# The wrong way: search() stops at the first match, so only "123" is
# reported and "321" is never seen.
import re 
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
re_obj = re.compile(pattern)
# Prints a single Match object, not a list of all occurrences.
print(re_obj.search(string))

In [None]:
# Option: findall called on a compiled pattern object.
import re 
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
re_obj = re.compile(pattern)
# Same result as the module-level re.findall: ['123', '321'].
print(re_obj.findall(string))

## Finding out how many occurrences of a regex pattern were replaced

In [None]:
# re.subn is re.sub that also reports how many replacements it made.
import re 
pattern = r"\d+"
string = "Some text. 123. Some text. 321"
replacement = "Some text"
# Unpack the (new_string, count) tuple returned by subn.
result, replaced_count = re.subn(pattern, replacement,string)
# "123" and "321" are both replaced, so this prints 2.
print(replaced_count)

## Get a part of the string where there was a match

In [1]:
# Option: index the Match object with [0] to get the whole match.
import re
pattern = r"\d+"
string = "Some text.3600"
# re.match only tries at the start of the string; it begins with "S",
# not a digit, so match_object is None (re.search would find "3600").
match_object = re.match(pattern,string)
# Subscripting None raises the TypeError recorded below.
print(match_object[0])

TypeError: 'NoneType' object is not subscriptable

In [3]:
# Option: group() with no argument returns the whole match.
import re
pattern = r"\d+"
string = "Some text.3600"
# re.match only tries at the start of the string, so this is None here
# (re.search would find "3600").
match_object = re.match(pattern,string)
# None has no group(), hence the AttributeError recorded below.
print(match_object.group())

AttributeError: 'NoneType' object has no attribute 'group'

In [4]:
# Option: group(0) is the explicit spelling of "the whole match".
import re
pattern = r"\d+"
string = "Some text.3600"
# re.match only tries at the start of the string, so this is None here
# (re.search would find "3600").
match_object = re.match(pattern,string)
# None has no group(), hence the AttributeError recorded below.
print(match_object.group(0))

AttributeError: 'NoneType' object has no attribute 'group'

In [5]:
# Option: group()[0] would be only the first character of the match.
import re
pattern = r"\d+"
string = "Some text.3600"
# re.match only tries at the start of the string, so this is None here
# (re.search would find "3600").
match_object = re.match(pattern,string)
# Fails on .group() before the [0] index is ever applied.
print(match_object.group()[0])

AttributeError: 'NoneType' object has no attribute 'group'

## Find a substring that matched the last capturing group of the regex

In [6]:
# Option: Match.lastgroup.
import re 
pattern = r"(\d+) (\d+)?"
string = "123 text"
match_object = re.match(pattern, string)
# lastgroup is the *name* of the last matched group; the groups in this
# pattern are unnamed, so it prints None rather than the captured text.
print(match_object.lastgroup)

None


In [7]:
# Option: look up the group numbered Match.lastindex.
import re 
pattern = r"(\d+) (\d+)?"
string = "123 text"
match_object = re.match(pattern, string)
# The optional second group matches nothing, so lastindex is 1 and
# group(1) is "123".
print(match_object.group(match_object.lastindex))

123


In [8]:
# Option: index the groups() tuple with lastindex (off by one).
import re 
pattern = r"(\d+) (\d+)?"
string = "123 text"
match_object = re.match(pattern, string)
# groups() is a 0-based tuple ('123', None) while lastindex is a 1-based
# group number, so index 1 selects the unmatched second group: None.
print(match_object.groups()[match_object.lastindex])

None


In [9]:
# Option: take the last element of group().
import re 
pattern = r"(\d+) (\d+)?"
string = "123 text"
match_object = re.match(pattern, string)
# group() is the whole matched string "123 ", so [-1] is its last
# character (the space), not the last capturing group.
print(match_object.group()[-1])

 


## Splitting a string by multiple delimiters

In [25]:
# Option: a capturing group around ",;".
import re
string = "Some text; 123, Some Text, 123"
# Parentheses only group; this still means the literal two-character
# sequence ",;", which never occurs, so nothing is split.
pattern = r"(,;)"
print(re.split(pattern,string))

['Some text; 123, Some Text, 123']


In [26]:
# Option: pass a list of delimiters to str.split.
string = "Some text; 123, Some Text, 123"
pattern = [",", ";"]
# str.split accepts a single separator string (or None), not a list,
# hence the TypeError recorded below.
print(string.split(pattern))

TypeError: must be str or None, not list

In [27]:
# Option: the delimiters written one after the other.
string = "Some text; 123, Some Text, 123"
# ",;" matches a comma immediately followed by a semicolon; the string
# has no such pair, so it comes back unsplit.
pattern = r",;"
print(re.split(pattern, string))

['Some text; 123, Some Text, 123']


In [28]:
# Option: a character class, which matches any ONE of the listed characters.
string = "Some text; 123, Some Text, 123"
pattern = r"[,;]"
# Splits at every "," and every ";"; the spaces after the delimiters stay
# attached to the following pieces.
print(re.split(pattern, string))

['Some text', ' 123', ' Some Text', ' 123']


## A pattern that captures the shortest possible match

In [40]:
# Option: two question marks after the star.
import re
string = "Some text 'a', Some text 'b'"
# "*?" is already a complete (non-greedy) quantifier; the extra "?" is a
# second repeat stacked on top of it, which the compiler rejects.
pattern = r"'(.*??)'"
# Raises the "multiple repeat" error recorded below.
re_obj = re.compile(pattern)
result = re_obj.findall(string)
print(result)

error: multiple repeat at position 5

In [41]:
# Option: the "?" placed before the "*".
import re
string = "Some text 'a', Some text 'b'"
# ".?" is itself a quantified item; following it with "*" stacks a
# second repeat on it, which the compiler rejects.
pattern = r"'(.?*)'"
# Raises the "multiple repeat" error recorded below.
re_obj = re.compile(pattern)
result = re_obj.findall(string)

error: multiple repeat at position 4

In [43]:
# Option: plain greedy ".*".
import re
string = "Some text 'a', Some text 'b'"
# Greedy: ".*" grabs as much as it can, so the match runs from the first
# quote all the way to the last one.
pattern = r"'(.*)'"
re_obj = re.compile(pattern)
result = re_obj.findall(string)
# One long capture instead of two short ones.
print(result)

["a', Some text 'b"]


In [44]:
# Option: non-greedy ".*?".
import re
string = "Some text 'a', Some text 'b'"
# "*?" repeats as few times as possible, so each match stops at the
# nearest closing quote.
pattern = r"'(.*?)'"
re_obj = re.compile(pattern)
results = re_obj.findall(string)
# With one capturing group, findall returns the captured text: ['a', 'b'].
print(results)

['a', 'b']


In [48]:
# Option: re.DOTALL.
import re 
string = '''multiline
string
'''
pattern = r"mul.+ing"
# With DOTALL, "." also matches the newline, so ".+" can run from
# "multiline" across the line break to the "ing" of "string".
re_obj = re.compile(pattern, re.DOTALL)
re_obj.findall(string)

['multiline\nstring']

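## Matching text that spans a newline: attempts that do not work (compare with the re.DOTALL cell above)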

In [45]:
# Option: no flags at all.
import re
string = '''multiline
string
'''
pattern = r"mul.+ing"
# By default "." does not match a newline, and the first line has no
# "ing" after "mul", so nothing is found.
re_obj = re.compile(pattern)
re_obj.findall(string)

[]

In [46]:
# Option: spell the newline out inside a repeated group.
import re
string = '''multiline
string
'''
# "(.\n)+" demands repeated pairs of "one character, then a newline".
# After "mul" comes "t" followed by "i", not a newline, so this fails.
pattern = r"mul(.\n)+ing"
re_obj = re.compile(pattern)
re_obj.findall(string)

[]

In [47]:
# Option: re.MULTILINE.
import re
string = '''multiline
string
'''
pattern = r"mul.+ing"
# MULTILINE only changes where ^ and $ match; "." still refuses to match
# a newline, so the result is the same empty list as with no flag.
re_obj = re.compile(pattern, re.MULTILINE)
re_obj.findall(string)

[]

## Check if a whole string matches a pattern

In [49]:
# Option: re.match with an unanchored pattern.
import re
string = "Python"
pattern = r"Python"
# re.match pins only the start of the string; it does not require the
# match to reach the end, so it does not prove the WHOLE string matches.
if re.match(pattern, string):
    print("match")

match


In [50]:
# Option: re.match with explicit ^...$ anchors.
import re
string = "Python"
# "$" matches at the end of the string (or just before a newline that
# ends the string), so both ends are pinned.
pattern = r"^Python$"
if re.match(pattern, string):
    print("match")

match


In [51]:
# Option: re.fullmatch.
import re
string = "Python"
pattern = r"Python"
# fullmatch succeeds only when the pattern matches the entire string,
# so no anchors are needed in the pattern.
if re.fullmatch(pattern, string):
    print("match")

match


In [52]:
# Option: search() on a compiled, fully anchored pattern.
import re
string = "Python"
pattern = r"^Python$"
re_obj = re.compile(pattern)
# search() scans the whole string, but the ^ and $ anchors restrict it
# to a match spanning start to end.
if re_obj.search(string):
    print("match")

match


## Perform case-insensitive matching

In [21]:
# Option: the /pattern/i literal syntax borrowed from JavaScript / Perl.
import re
string = "hello"
# Python has no /.../flags literal: the slashes and the trailing "i" are
# ordinary characters, so this pattern looks for the text "/hello/i".
pattern = r"/hello/i"
re_obj = re.compile(pattern)
if re_obj.search(string):
    # Never reached for this input: "hello" does not contain "/hello/i".
    # The message now matches the sibling options instead of a profanity.
    print("blahem")

In [22]:
# Option: the short flag name re.I passed to re.compile.
import re 
string = "hello"
pattern = r"hello"
# re.I is the abbreviation of re.IGNORECASE. The test string is already
# lowercase, so this input would match even without the flag.
re_obj = re.compile(pattern, re.I)
if re_obj.search(string):
    print("blahem")

blahem


In [23]:
# Option: pass the flag straight to a module-level function.
import re 
string = "hello"
pattern = r"hello"
# findall returns a list of matches; a non-empty list is truthy.
if re.findall(pattern, string, re.IGNORECASE):
    print("blahem")

blahem


In [24]:
# Option: the long flag name re.IGNORECASE passed to re.compile.
import re 
string = "hello"
pattern = r"hello"
re_obj = re.compile(pattern, re.IGNORECASE)
# match() tries only at the start of the string, which is where "hello"
# begins here.
if re_obj.match(string):
    print("blahem")

blahem


## Capture any 2 characters except a new line between "py" and "on"

In [38]:
# Each "." matches exactly one character that is not a newline.
import re 
string = """python,
py
on"""
pattern = r"py..on"
# "python" qualifies ("th" sits between "py" and "on"); "py\non" does
# not, since only a single newline separates "py" from "on".
print(re.findall(pattern,string))

['python']
