# Regexp Basics

In [1]:
import re

In [2]:
# `.` stands for exactly one arbitrary character (newline excluded by default),
# so the "d" in "abcde" fills that slot.
pattern = 'abc.e'
assert re.match(pattern, 'abcde') is not None

In [3]:
# Two characters sit between "abc" and "e" here, but the `.` in the pattern
# assigned in the previous cell consumes only one of them.
assert re.match(pattern, 'abcdde') is None

In [4]:
# `?` makes the preceding "d" optional: it may appear once or not at all.
pattern = 'abcd?e'
assert all(re.match(pattern, text) for text in ('abcde', 'abce'))
assert re.match(pattern, 'acbe') is None

In [5]:
# `[cd]` matches exactly one character, which must be either "c" or "d".
pattern = 'ab[cd]e'
assert all(re.match(pattern, text) for text in ('abce', 'abde'))
assert not any(re.match(pattern, text) for text in ('abcde', 'abe'))

## Exercise

In [6]:
# Exercise solution: `[cd]?` makes the single "c"-or-"d" character optional,
# so "abe" is accepted while "abcde" (both characters at once) is not.
pattern = 'ab[cd]?e'
assert all(re.match(pattern, text) for text in ('abce', 'abde', 'abe'))
assert re.match(pattern, 'abcde') is None

Note: We didn't cover the rest of this notebook during the course due to lack of time.

# Regexp Syntax: Basics


## Any character: .

In [7]:
# `.` needs exactly one character in its place: not zero, not two.
assert all(re.match('abc.e', text) for text in ('abcde', 'abcfe'))
assert not any(re.match('abc.e', text) for text in ('abce', 'abcdde'))

## Optional: ?

In [8]:
# `d?` accepts zero or one "d"; any other character, or a second "d", breaks
# the match.
assert all(re.match('abcd?e', text) for text in ('abce', 'abcde'))
assert not any(re.match('abcd?e', text) for text in ('abcfe', 'abcdde'))

## Match n times: {n}

In [9]:
# `d{2}` demands exactly two consecutive "d" characters.
assert not any(re.match('abcd{2}e', text) for text in ('abce', 'abcde'))
assert re.match('abcd{2}e', 'abcdde') is not None

## Grouping

In [10]:
# Parentheses turn "cd" into a unit, so `?` makes the whole pair optional.
assert all(re.match('ab(cd)?e', text) for text in ('abe', 'abcde'))
assert not any(re.match('ab(cd)?e', text) for text in ('abfe', 'abcdfe'))

## Alternative: |

In [11]:
# `(c|d)` picks exactly one of the alternatives: "c" or "d", never none or both.
assert all(re.match('ab(c|d)e', text) for text in ('abce', 'abde'))
assert not any(re.match('ab(c|d)e', text) for text in ('abe', 'abcde'))

## Alternative group: []

In [12]:
# `[cd]` is a one-character set: it matches a single "c" or a single "d".
assert all(re.match('ab[cd]e', text) for text in ('abce', 'abde'))
assert not any(re.match('ab[cd]e', text) for text in ('abe', 'abcde'))

## Alternative class of characters: [-]

In [13]:
# `[c-f]` matches exactly one character from the inclusive range "c".."f".
assert     re.match('ab[c-f]e', 'abce')
assert     re.match('ab[c-f]e', 'abde')
assert     re.match('ab[c-f]e', 'abfe')
assert not re.match('ab[c-f]e', 'abe')
assert not re.match('ab[c-f]e', 'abcde')
assert not re.match('ab[c-f]e', 'abcge')
# None of the rejected inputs above puts an out-of-range character in the
# class position, so also probe the characters just outside both ends of the
# range.
assert not re.match('ab[c-f]e', 'abbe')
assert not re.match('ab[c-f]e', 'abge')

## Any character except for: [^]

In [14]:
# `[^cd]` matches exactly one character that is neither "c" nor "d"; it still
# has to consume a character, so "abe" is rejected as well.
assert not any(
    re.match('ab[^cd]e', text) for text in ('abce', 'abde', 'abe', 'abcde')
)
assert re.match('ab[^cd]e', 'abfe') is not None

## Escaping: \

In [15]:
# A backslash strips "[" of its special meaning so it matches literally.
# The pattern is a raw string: in a normal string literal `\[` is an invalid
# escape sequence that newer Python versions warn about.
assert     re.match(r'ab\[d', 'ab[d')
assert not re.match(r'ab\[d', 'abcd')

## Match to the end of string: $

In [16]:
# re.match anchors only at the start of the string, so trailing characters
# after the matched part are allowed.
assert all(re.match('abc', text) for text in ('abc', 'abcd'))

In [17]:
# `$` additionally pins the match to the end of the string (or to just before
# a trailing newline), so extra characters are no longer tolerated.
assert re.match('abc$', 'abc') is not None
assert re.match('abc$', 'abcd') is None

# Exercises: Basic Regexps

For each subexercise, create a pattern (as a string) in the `patternX` variable.
This pattern doesn't have to extract any data from strings.

Then, create a compiled pattern and store it in `regexX`.
Use the `re.VERBOSE` flag for compiled patterns.
Extend the pattern so that you extract data from strings.

## Example

In [18]:
# Worked example: `pattern` only validates, `regex` is the verbose counterpart
# that also extracts the matched character through a named group.
# Both literals are raw strings, the usual convention for regular expressions.
pattern = r'ab.$'
regex = re.compile(r'''
    ab         # match "ab" characters
    (?P<char>  # create "char" group, so that we can extract data
      .        # match any character
    )          # end of group
    $          # match to the end of the string
''', re.VERBOSE)

In [19]:
# Any single character may follow "ab", but there must be exactly one of them.
assert all(
    re.match(pattern, text) for text in ('aba', 'abb', 'abz', 'ab7', 'ab[')
)
assert not any(re.match(pattern, text) for text in ('ab', 'abcc'))

In [20]:
# The named group "char" captures whichever character followed "ab".
assert all(
    regex.match(text).groupdict() == {'char': captured}
    for text, captured in (
        ('aba', 'a'),
        ('abz', 'z'),
        ('ab7', '7'),
        ('ab[', '['),
    )
)

## Simple Pattern

In [21]:
# `pattern1` validates "ab" optionally followed by one "c" or "d"; `regex1`
# is the verbose version that exposes that optional character as a group.
# Both literals are raw strings, the usual convention for regular expressions.
pattern1 = r'ab[cd]?$'
regex1 = re.compile(r'''
    ab                   # match "ab" characters
    (?P<optional_group>  # let's create a group to extract data
      [cd]?              # match a single "c" or "d", or nothing at all
    )                    # end of "optional_group" group
    $                    # match to the end of the string
''', re.VERBOSE)

In [22]:
# "ab" may be followed by at most one character, and only by "c" or "d".
assert all(re.match(pattern1, text) for text in ('ab', 'abc', 'abd'))
assert not any(re.match(pattern1, text) for text in ('abcd', 'abe'))

In [23]:
# When the optional character is absent the group still takes part in the
# match, so it yields an empty string.
assert all(
    regex1.match(text).groupdict() == {'optional_group': captured}
    for text, captured in (('ab', ''), ('abc', 'c'), ('abd', 'd'))
)

## Phone Numbers

In [24]:
# Nine-digit phone number written as three groups of three digits, with an
# optional space or dash between the groups.
pattern2 = r'[0-9]{3}[ -]?[0-9]{3}[ -]?[0-9]{3}$'
# Verbose counterpart that extracts the three digit groups. It carries the
# same `$` anchor as `pattern2`; without it, trailing extra digits would be
# silently accepted. Whitespace inside `[ -]` stays significant even under
# re.VERBOSE because it is inside a character class.
regex2 = re.compile(r'''
    (?P<group1> [0-9]{3})  # first three digits
    [ -]?                  # optional space or dash
    (?P<group2> [0-9]{3})  # second three digits
    [ -]?                  # optional space or dash
    (?P<group3> [0-9]{3})  # last three digits
    $                      # match to the end of the string
''', re.VERBOSE)

In [25]:
# Strings that are valid phone numbers.
assert all(re.match(pattern2, number) for number in (
    '123 456 789',
    '678 543 970',
    '987654321',    # grouping digits is optional
    '123-456-789',  # groups may be separated by a space or a dash (or not at all)
))
# Strings that must be rejected.
assert not any(re.match(pattern2, number) for number in (
    '12 345 678',   # there must be exactly 9 digits
    '12345678',
    '1234567890',
    '12 3456 789',  # grouping matters (3 digits in each group)
))

In [26]:
# The three named groups hold the digit triples, with or without separators.
assert all(
    regex2.match(number).groupdict() == dict(group1=first, group2=second, group3=third)
    for number, first, second, third in (
        ('123 456 789', '123', '456', '789'),
        ('987654321', '987', '654', '321'),
    )
)

## Phone Numbers with Country Prefix

In [27]:
# Nine-digit phone number with an optional country prefix: "+", one or two
# digits, and an optional space or dash. Raw strings keep `\+` intact for the
# regex engine; in a normal string literal it is an invalid escape sequence.
pattern3 = r'(\+[0-9]{1,2}[ -]?)?[0-9]{3}[ -]?[0-9]{3}[ -]?[0-9]{3}$'
# Verbose counterpart that extracts the prefix and the three digit groups.
# It carries the same `$` anchor as `pattern3`; without it, trailing extra
# digits would be silently accepted.
regex3 = re.compile(r'''
    (                      # let's group all the prefix related stuff
      \+                   # plus sign
      (?P<prefix>          # extract prefix number
        [0-9]{1,2}         # prefix is one or two digits
      )
      [ -]?                # optional space or dash
    )?                     # entire prefix is optional

    (?P<group1> [0-9]{3})  # first three digits
    [ -]?                  # optional space or dash
    (?P<group2> [0-9]{3})  # second three digits
    [ -]?                  # optional space or dash
    (?P<group3> [0-9]{3})  # last three digits
    $                      # match to the end of the string
''', re.VERBOSE)

In [28]:
# Valid numbers, with and without a country prefix.
assert all(re.match(pattern3, number) for number in (
    '123 456 789',
    '678 543 970',
    '123456789',
    '123-456-789',
    '+48 123 456 789',
    '+48 123456789',
    '+48123456789',
    '+1 345 111 222',
))
# Numbers that must be rejected.
assert not any(re.match(pattern3, number) for number in (
    ' 345 111 222',
    '12 456 789',        # there must be exactly 9 digits (plus optional country prefix)
    '12345678',
    '1234567890',
    '12 3456 789',       # grouping matters (3 digits in each group)
    '+489 123 456 789',  # one or two digits allowed after plus
    '48 123 456 789',    # if country prefix is present, it must be preceded by plus
    '+123 456 789',      # if plus is present, country prefix is required
))

In [29]:
# Without a country prefix the "prefix" group does not take part in the match
# and is reported as None.
assert regex3.match('123456789').groupdict() == dict(
    prefix=None, group1='123', group2='456', group3='789')
assert regex3.match('+1 345 111 222').groupdict() == dict(
    prefix='1', group1='345', group2='111', group3='222')

# Regexp Syntax: Quantifiers

## Match zero or more: *

In [30]:
# `d*` accepts any number of "d" characters, including none.
assert all(re.match('abcd*e', text) for text in ('abce', 'abcde', 'abcdde'))

## Match one or more: +

In [31]:
# `d+` requires at least one "d" and accepts any number beyond that.
assert re.match('abcd+e', 'abce') is None
assert all(re.match('abcd+e', text) for text in ('abcde', 'abcdde'))

## Optional (match zero or one): ?

In [32]:
# `d?` accepts zero or one "d"; a second "d" is one too many.
assert all(re.match('abcd?e', text) for text in ('abce', 'abcde'))
assert re.match('abcd?e', 'abcdde') is None

## Match n times: {n}

In [33]:
# `d{2}` accepts exactly two "d" characters; zero or one is not enough.
assert not any(re.match('abcd{2}e', text) for text in ('abce', 'abcde'))
assert re.match('abcd{2}e', 'abcdde') is not None

## Match between n and m times: {n,m}

In [34]:
# `d{2,4}` accepts two, three or four "d" characters; both bounds are inclusive.
assert all(
    re.match('abcd{2,4}e', text) for text in ('abcdde', 'abcddde', 'abcdddde')
)
assert not any(re.match('abcd{2,4}e', text) for text in ('abcde', 'abcddddde'))

## Match n times or more: {n,}

In [35]:
# `d{2,}` sets only a lower bound: two or more "d" characters.
assert re.match('abcd{2,}e', 'abcde') is None
assert all(
    re.match('abcd{2,}e', text)
    for text in ('abcdde', 'abcddde', 'abcdddde', 'abcddddde')
)

## Match up to n times: {,n}

In [36]:
# `d{,4}` sets only an upper bound: at most four "d" characters.
assert all(
    re.match('abcd{,4}e', text)
    for text in ('abcde', 'abcdde', 'abcddde', 'abcdddde')
)
assert re.match('abcd{,4}e', 'abcddddde') is None

# Regexp Syntax: Character Classes

## Digit: \d

In [37]:
# `\d` matches a single digit. The pattern is a raw string because `\d` is an
# invalid escape sequence in a normal string literal; the subject '\n' stays a
# normal string so that it really is a newline character.
assert     re.match(r'\d', '0')
assert     re.match(r'\d', '3')
assert not re.match(r'\d', 'a')
assert not re.match(r'\d', '[')
assert not re.match(r'\d', '\n')

## Non-digit: \D 

In [38]:
# `\D` is the complement of `\d`: any single character that is not a digit.
# The pattern is a raw string because `\D` is an invalid escape sequence in a
# normal string literal.
assert not re.match(r'\D', '0')
assert not re.match(r'\D', '3')
assert     re.match(r'\D', 'a')
assert     re.match(r'\D', '[')
assert     re.match(r'\D', '\n')

## Whitespace: \s

In [39]:
# `\s` matches a single whitespace character such as space, tab or newline.
# The pattern is a raw string because `\s` is an invalid escape sequence in a
# normal string literal; the subjects '\t' and '\n' stay normal strings so
# that they really are tab and newline characters.
assert     re.match(r'\s', ' ')
assert     re.match(r'\s', '\t')
assert     re.match(r'\s', '\n')
assert not re.match(r'\s', 'a')
assert not re.match(r'\s', '[')
assert not re.match(r'\s', '3')

## Non-whitespace: \S

In [40]:
# `\S` is the complement of `\s`: any single non-whitespace character.
# The pattern is a raw string because `\S` is an invalid escape sequence in a
# normal string literal.
assert not re.match(r'\S', ' ')
assert not re.match(r'\S', '\t')
assert not re.match(r'\S', '\n')
assert     re.match(r'\S', 'a')
assert     re.match(r'\S', '[')
assert     re.match(r'\S', '3')

## Special characters: \n \t 

In [41]:
# Here Python itself turns '\n' into a newline character, so the pattern is a
# literal newline: it matches a newline and no other whitespace.
assert re.match('\n', '\n') is not None
assert not any(re.match('\n', text) for text in (' ', '\t'))