## Without regex

In [2]:
def is_a_phone_number(text:str)->bool:
    '''Check whether ``text`` is exactly a phone number of the form 05x-xxx-xxxx.

    Every ``x`` is a decimal digit (0-9), so a valid number is 12 characters
    long: a 3-digit prefix starting with ``05``, a 3-digit group and a
    4-digit group, separated by single dashes.

    Args:
        text: The candidate string. It must match in full, with no extra
            characters before or after the number.

    Returns:
        True if ``text`` has the expected format, False otherwise.
    '''
    expected_group_lengths = (3, 3, 4)
    if len(text) != 12 or not text.startswith('05'):
        return False
    groups = text.split('-')
    if len(groups) != len(expected_group_lengths):
        return False
    # Check the length of every group, not only the last one; otherwise
    # strings such as '05-5555-5555' or '0555-55-5555' would be accepted.
    return all(
        len(group) == expected_length and group.isdecimal()
        for group, expected_length in zip(groups, expected_group_lengths)
    )

In [3]:
import doctest
# Doctest examples for is_a_phone_number, kept in a bare string literal.
# NOTE(review): the examples are not attached to a function docstring -
# confirm doctest.testmod() still collects them after Restart & Run All.
'''
>>> is_a_phone_number('055-555-5555')
True
>>> is_a_phone_number('055-555-555x')
False
>>> is_a_phone_number('055-555-555')
False
>>> is_a_phone_number('phone number')
False
'''
doctest.testmod()

TestResults(failed=0, attempted=4)

In [4]:
from typing import List
def find_phone_numbers(text:str)-> List[str]:
    '''Collect every 12-character window of ``text`` that is a phone number.

    A window is examined at each start index of ``text`` and kept when
    ``is_a_phone_number`` accepts it, so the numbers are returned in order
    of appearance.
    '''
    window = 12  # length of a number written as 05x-xxx-xxxx
    return [
        text[start:start + window]
        for start in range(len(text))
        if is_a_phone_number(text[start:start + window])
    ]

In [5]:
import doctest
# Doctest examples for find_phone_numbers, kept in a bare string literal.
# The second example has an extra trailing digit: only the first 12
# characters of the run are reported.
# NOTE(review): the examples are not attached to a function docstring -
# confirm doctest.testmod() still collects them after Restart & Run All.
'''
>>> txt ='054-444-4444 gj fdkl055-555-5555mvkdl053-545-4545'
>>> find_phone_numbers(txt)
['054-444-4444', '055-555-5555', '053-545-4545']
>>> txt = 'hjhkd056-666-66666csjkccsd'
>>> find_phone_numbers(txt)
['056-666-6666']
'''
doctest.testmod()

TestResults(failed=0, attempted=4)

## With regex

In [6]:
import re
# r'' = raw string: the backslashes reach the regex engine untouched
phone_pattern = re.compile(r'05\d-\d\d\d-\d\d\d\d')
def find_phone_numbers_regex(text:str)-> list[str]:
    '''Return every non-overlapping match of ``phone_pattern`` in ``text``.

    Matches are returned in order of appearance, scanning left to right.
    '''
    return [match.group() for match in phone_pattern.finditer(text)]

In [7]:
import doctest
# Doctest examples for find_phone_numbers_regex, kept in a bare string literal.
# The last example starts with '04', so the pattern (which requires '05') finds nothing.
# NOTE(review): the examples are not attached to a function docstring -
# confirm doctest.testmod() still collects them after Restart & Run All.
'''
>>> txt ='054-444-4444 gj fdkl055-555-5555mvkdl053-545-4545'
>>> find_phone_numbers_regex(txt)
['054-444-4444', '055-555-5555', '053-545-4545']
>>> txt = 'hjhkd056-666-66666csjkccsd'
>>> find_phone_numbers_regex(txt)
['056-666-6666']
>>> find_phone_numbers_regex('045-454-4545')
[]
'''
doctest.testmod()

TestResults(failed=0, attempted=5)

## Regex special chars

In [67]:
#.: any single character (except a newline, unless the re.S flag is used)
# 'afz' and 'az' are not matched: exactly two characters are required between 'a' and 'z'
re.findall(r"a..z", "123 abcz adez afz az axyz")

['abcz', 'adez', 'axyz']

In [82]:
#[]: set of characters - matches exactly one character out of the set
print(re.findall(r"a[bd].z", "123 abcz adez afxz az axyz")) # [bd] = b or d
print(re.findall(r"a[^bd].z", "123 abcz adez afxz az axyz")) # ^ first in a set = not
print(re.findall(r"a[b-f].z", "123 abcz adez afxz az axyz")) # [b-f] = [bcdef]
print(re.findall(r"a[^b-f].z", "123 abcz adez afxz az axyz")) # [^b-f] = any character except b..f
print(re.findall(r"a.[.]z", "123 ab.z adez afxz az ax.z")) # inside a set, . is a literal dot

['abcz', 'adez']
['afxz', 'axyz']
['abcz', 'adez', 'afxz']
['axyz']
['ab.z', 'ax.z']


In [74]:
# \d: digit = [0-9]
# \w: word character = [a-zA-Z0-9_] (letters, digits and underscore)
# \s: whitespace = [ \t\n\r\f\v]
# (for str patterns these also match the corresponding Unicode characters unless re.ASCII is set)
print(re.findall(r"\d\w\s", "1A 2b\t a3 ab 34 "))

['1A ', '2b\t', '34 ']


In [83]:
#^ = start of string
#$ = end of string (or just before a newline that ends the string)
print(re.findall(r"^\d\w\s", "1A 2b\t a3 ab 34 "))
print(re.findall(r"\d\w\s$", "1A 2b\t a3 ab 34 "))

['1A ']
['34 ']


## Regex operators

In [10]:
#+: one or more repetitions of the preceding item (greedy - takes as many as possible)
txt = " nooooo or just no that is the question"
re.findall(r"no+", txt)

['nooooo', 'no']

In [11]:
#*: zero or more repetitions of the preceding item (greedy - takes as many as possible)
txt = "yesss we can ,ye"
re.findall("yes*", txt)

['yesss', 'ye']

In [16]:
#{m,n}: between m and n repetitions of the preceding item (greedy); {m} = exactly m
# 'alex' is not matched (only one 'l'); the seven-'l' run is cut at four
txt = "alex reallly falls in alllllll times"
re.findall("al{2,4}", txt)

['alll', 'all', 'allll']

In [15]:
#|: or - matches the whole expression on its left or the whole expression on its right
# here: "ye*s" (y, any number of e, s) or "no"
txt = "yes we can or no we can't"
re.findall("ye*s|no", txt)

['yes', 'no']

In [21]:
#?: zero or one of the preceding item
# Each ? applies to a single character, so this pattern also matches
# 'Batwman' and 'Batoman'; group the letters - Bat(wo)?man - to make "wo" optional as a unit.
batman_pattern = re.compile(r'Batw?o?man')
batwoman_txt = 'Batwoman is stronger than Batman'
print(re.findall(batman_pattern, batwoman_txt))
batman_txt = 'Batman is stronger than Batwoman'
print(re.findall(batman_pattern, batman_txt))

['Batwoman', 'Batman']
['Batman', 'Batwoman']


## Groups

In [27]:
#(): groups - capture parts of the match
txt ='abc054-444-4444 gj fdkl055-555-5555mvkdl053-545-4545'
# With several groups, findall returns one tuple of the groups per match
# (the full match itself is not included).
phone_pattern_with_groups = re.compile(r'(05\d)-(\d\d\d-\d\d\d\d)')
print(phone_pattern_with_groups.findall(txt))
# Wrapping the whole pattern in an outer group adds the full number as the first tuple item.
phone_pattern_with_entire_number =  re.compile(r'((05\d)-(\d\d\d-\d\d\d\d))')
print(phone_pattern_with_entire_number.findall(txt))

[('054', '444-4444'), ('055', '555-5555'), ('053', '545-4545')]
[('054-444-4444', '054', '444-4444'), ('055-555-5555', '055', '555-5555'), ('053-545-4545', '053', '545-4545')]


In [36]:
# search returns a match object for the first match only (or None when nothing matches).
# Uses txt and phone_pattern_with_groups defined in the previous cell.
pattern_groups = phone_pattern_with_groups.search(txt)
print(pattern_groups.group()) # Returns the entire matching text
# Equivalent to: print(pattern_groups.group(0))
print(pattern_groups.group(1)) # first group: the 05x prefix
print(pattern_groups.group(2)) # second group: the rest of the number
# print(pattern_groups.group(3)) # error - no such group

054-444-4444
054
444-4444


In [59]:
# Groups with "?" - the whole group is optional
# With a single group, findall returns that group's text for each match.
print(re.findall(r'Bat(wo)?man', 'Batman VS Superman')) # the group is empty
print(re.findall(r'Bat(wo)?man', 'Batwoman VS Superman')) # the group is 'wo'
print(re.findall(r'(Bat(wo)?man)', 'Batman VS Superman')) # outer group = full match, inner group is empty

['']
['wo']
[('Batman', '')]


## Flags

In [77]:
# Ignore case - 'a' matches both 'a' and 'A'
all_cases_a_patt = re.compile('a', re.IGNORECASE)
all_cases_a_patt.findall('And they have all they wanted AMEN')

['A', 'a', 'a', 'a', 'A']

In [87]:
# re.VERBOSE - whitespace in the pattern (spaces and newlines) is ignored and
# '#' comments are allowed inside it, so a pattern can be laid out for readability
phone_patt = re.compile(R''' 
05\d-
\d{3}-
\d{4}
''', re.VERBOSE)
phone_patt.findall('054-444-4444 gj fdkl055-555-5555mvkdl053-545-4545')

['054-444-4444', '055-555-5555', '053-545-4545']

In [88]:
# re.S (also spelled re.DOTALL) = the . char also matches newline
# re.IGNORECASE lets [a-z] match the capital 'E'; flags are combined with |
find_all_patt = re.compile('[a-z].+',re.S|re.IGNORECASE)
find_all = find_all_patt.search('Every thing I write \nwill be found')
print(find_all.group())

Every thing I write 
will be found


## Methods

In [80]:
# Span - (start, end) indices of the match in the string; end is exclusive
txt = 'abc 054-444-4444 gj fdkl055-555-5555mvkdl053-545-4545'
search_results = re.search(r'(\d\d\d)-(\d\d\d-\d\d\d\d)' , txt)
print(search_results.span())

(4, 16)


In [53]:
# Split - cut the string wherever the pattern matches
phone_number_row = 'Tom Pythonovitz,055-555-5555, Tammi Pythonovitz,  054-444-4444'
# Separator: a comma followed by any number of spaces
list_of_fields = re.split(r', *', phone_number_row)
print(list_of_fields)
# Separator: a comma, optional spaces and a phone number. The leading ', ' of the
# second item is kept, and the final '' appears because the row ends with a match.
list_of_users = re.split(r', *\d\d\d-\d\d\d-\d\d\d\d', phone_number_row)
print(list_of_users)

['Tom Pythonovitz', '055-555-5555', 'Tammi Pythonovitz', '054-444-4444']
['Tom Pythonovitz', ', Tammi Pythonovitz', '']


In [55]:
# Sub[stitute]
txt = "The rain in Spain"
# \s = any whitespace character (space, tab, newline, ...).
# The pattern is a raw string so that \s is not read as an (invalid) Python string escape.
print(re.sub(r"\s", "9", txt))
print(re.sub(r"\s", "9", txt, count=2))  # count=2 = replace at most two times

The9rain9in9Spain
The9rain9in Spain


In [5]:
# Substitute with groups
txt = "Agent Adam is going to meet Agent Yosi in the same location they met last time"
# \w = any word character (letters, digits, underscore); (\w) captures the first
# character of the name. The pattern is a raw string so that \w is not read as an
# (invalid) Python string escape.
agent_pattern = re.compile(r"Agent (\w)\w+")
re.sub(agent_pattern, r"Agent \1", txt) # \1 = group(1)

'Agent A is going to meet Agent Y in the same location they met last time'