# Regular Expressions in Python

In [1]:
# import the "re" library to work with regular expressions
import re

## Searching strings for particular words using regex
* Only gives results for first match
* Suitable for checking if strings contain words or patterns
* Not suitable for extracting data from strings

In [2]:
# we can search a string for a particular substring
# re.search scans the whole string and reports only the first place the pattern matches
s = 'This is a sentence.'
result = re.search('is', s)
# if the substring was found in the string we get a match object
# (the first 'is' is the one inside 'This', not the standalone word)
print('is: ', result)

# else we get 'None'
result2 = re.search('two', s)
print('two: ', result2)

is:  <_sre.SRE_Match object; span=(2, 4), match='is'>
two:  None


In [3]:
# ignore case: the lower-case pattern 'is' is allowed to match the capitalised 'Is'
print(re.search('is', 'Is', flags=re.IGNORECASE))
# or, using the short alias for the same flag
print(re.search('is', 'Is', flags=re.I))

<_sre.SRE_Match object; span=(0, 2), match='Is'>
<_sre.SRE_Match object; span=(0, 2), match='Is'>


Notice that the match object contains the span of indices where the first match was found, and tells us what the matched text is.

In [4]:
# we can get the span itself: a (start, end) tuple of indices into the searched string
span = result.span()
print(span)

# and use it to select that substring from the string
# (the end index is exclusive, so it can be used directly as a slice bound)
print(s[span[0] : span[1]])
# alternatively, start() and end() return the two halves of the span separately
print(s[result.start() : result.end()])

(2, 4)
is
is


In [5]:
# the match object contains several other attributes and methods
# but the only one worth mentioning now is `string`

# original string that was passed to re.search (not just the matched part)
print(result.string)

This is a sentence.


#### Using search results in a function

In [6]:
s = 'She sells sea shells down by the sea shore'


def word_check(substring, string):
    """Print a message saying whether `substring` occurs in `string`.

    `substring` is passed to `re.search` as the pattern, so regex
    metacharacters in it are interpreted as such. Nothing is returned.
    """
    found = re.search(substring, string) is not None
    # double-quoted templates, in case a contraction is ever added to the text
    template = "{} has been found!" if found else "Sorry, {} is not in the string."
    print(template.format(substring))

In [7]:
# 'shell' occurs inside 'shells', so the "found" message is printed
word_check('shell', s)

shell has been found!


In [8]:
# 'beach' does not occur anywhere in s, so the "not in the string" message is printed
word_check('beach', s)

Sorry, beach is not in the string.


## Character or sequence matching using character classes

Character classes are used to match character types, ranges, and sets of specific values. Square brackets are used to denote a character set or range. For example:

* `[xyz]` matches on any of `x`, `y` or `z`
* `[^xyz]` negated set matches anything that is not `x`, `y` or `z`
* `[a-z]` matches on any lower case letter in the english alphabet
* `[0-9]` matches on any single digit 0-9
* `[a-zA-Z]` matches any lower or upper case letter
* `[a-z3-9]` matches a-z and 3-9

There are also shorthand character classes

* `\d` for digits
* `\D` not a digit
* `\w` word characters (alphanumeric, including underscore)
* `\W` not a word character
* `\s` whitespace
* `\S` not whitespace
* `.` (dot) matches any single character except line breaks

### Search for numbers (digits)

In [9]:
s = 'This strings contains numbers 123 456 7758'

# return the first digit found using the '\d' token
pattern = r"\d"
print(re.search(pattern, s))

# return first match of one or more consecutive digits
pattern = r"\d+"
print(re.search(pattern, s))

# return first match of one or more digits where digits are in range 4-6
pattern = r"[4-6]+"
print(re.search(pattern, s))

# return first match of one or more digits where digits are in range 7-9
# notice that only 77 is consecutive (5 breaks the pattern)
pattern = r"[7-9]+"
print(re.search(pattern, s))

<_sre.SRE_Match object; span=(30, 31), match='1'>
<_sre.SRE_Match object; span=(30, 33), match='123'>
<_sre.SRE_Match object; span=(34, 37), match='456'>
<_sre.SRE_Match object; span=(38, 40), match='77'>


### Character matching

#### Search for a range

In [10]:
# first run of characters drawn from the set q, r, s, t, u, v
# the '+' indicates one or more (without it only a single character is matched)
# here 'qu' matches; the following 'e' is outside q-v and ends the run
pattern = r"[q-v]+"
print(re.search(pattern, "In a queue"))

# without '+': only the single character 'q' is returned
pattern = r"[q-v]"
print(re.search(pattern, "In a queue"))

<_sre.SRE_Match object; span=(5, 7), match='qu'>
<_sre.SRE_Match object; span=(5, 6), match='q'>


#### Search for letters only (excludes digits)

In [11]:
# match one or more ASCII letters
# note the upper bound of the second range is a capital 'Z': the range 'A-z'
# would also match the punctuation [ \ ] ^ _ ` that sits between 'Z' and 'a'
letters_pattern = r"[a-zA-Z]+"
print(re.search(letters_pattern, "Alphanum3ric"))

# alternatively (search for non-digits)
# \D matches anything that is not a digit, so unlike the letter class above
# it would also accept punctuation and whitespace
pattern = r"(\D+)"
print(re.search(pattern, "Alphanum3ric"))

<_sre.SRE_Match object; span=(0, 8), match='Alphanum'>
<_sre.SRE_Match object; span=(0, 8), match='Alphanum'>


#### Search for non-special characters

In [12]:
pattern = r"\w+"  # will include underscore
# the hyphen is not a word character, so the match stops after 'sp3cial'
print(re.search(pattern, "sp3cial-characters"))
# the underscore is a word character, so the whole string is matched
print(re.search(pattern, "sp3cial_characters"))

<_sre.SRE_Match object; span=(0, 7), match='sp3cial'>
<_sre.SRE_Match object; span=(0, 18), match='sp3cial_characters'>


#### Search for special characters

In [13]:
# \W+ matches the first run of non-word characters, here the '-%' in the middle
pattern = r"\W+"
print(re.search(pattern, "sp3cial-%characters"))

<_sre.SRE_Match object; span=(7, 9), match='-%'>


#### Search for whitespace

In [14]:
# matches the first run of whitespace: the three spaces after 'Here'
re.search(r"\s+", "Here   is a string with whitespace")

<_sre.SRE_Match object; span=(4, 7), match='   '>

#### Search for non-whitespace

In [15]:
# This will find the first word: the two leading spaces are skipped
# and the match ends at the next whitespace character
re.search(r"\S+", "  Here is a string with leading whitespace")

<_sre.SRE_Match object; span=(2, 6), match='Here'>

#### Search for negation of set

In [16]:
# a leading '^' inside square brackets negates the set
pattern = r"[^123]+"
# this will return the first run of characters that are not 1, 2 or 3
re.search(pattern, "12345")

<_sre.SRE_Match object; span=(3, 5), match='45'>

#### Use wild card

In [17]:
s = "quark, quick"
# match anything with exactly one character between 'qu' and 'ck'
# quark will not match (it does not contain 'ck' at all)
pattern = r"qu.ck"
re.search(pattern, s)

<_sre.SRE_Match object; span=(7, 12), match='quick'>

#### Search for a specific date format

In [18]:
# this will return the first instance that matches the pattern of:
#  one or more letters followed by a space and then one or more digits
# the letter class must end at a capital 'Z': the range 'A-z' would also
# accept the punctuation [ \ ] ^ _ ` that sits between 'Z' and 'a' in ASCII
pattern = r"[a-zA-Z]+ \d+"
re.search(pattern, "The date is August 27")

<_sre.SRE_Match object; span=(12, 21), match='August 27'>

In [19]:
# this will return the first instance that matches the pattern:
#  one or more digits + / + one or more digits + / + one or more digits
# 'Aug 27' contains no slashes, so it is skipped and '08/27/2017' is returned
pattern = r"\d+/\d+/\d+"
re.search(pattern, "The date is Aug 27, or 08/27/2017")

<_sre.SRE_Match object; span=(23, 33), match='08/27/2017'>

## Pattern matching using global searches

In [20]:
s = "The dates are 9/25/2017, 10/21/2017, and 12/12/2017"
pattern = r"\d+/\d+/\d+"
# return list of all dates in string matching the pattern
# (unlike re.search, which stops at the first match)
re.findall(pattern, s)

['9/25/2017', '10/21/2017', '12/12/2017']

In [21]:
# we can also use finditer to get the results as an iterator instead of a list
# each item it yields is a match object, which gives us the details for each match
results = re.finditer(pattern, s)
# displaying the iterator only shows its repr; loop over it to see the matches
results

<callable_iterator at 0x7f042c34ec18>

In [22]:
# each item is a match object: print its (start, end) span and the matched text
for d in results:
    print(d.span(), d.group())

(14, 23) 9/25/2017
(25, 35) 10/21/2017
(41, 51) 12/12/2017


In [23]:
# we can get date tuples by using () for groupings around the digit portions
# but don't forget the forward slashes (they stay outside the groups)
pattern = r"(\d+)/(\d+)/(\d+)"
# when the pattern has groups, findall returns one tuple of group values per match
re.findall(pattern, s)

[('9', '25', '2017'), ('10', '21', '2017'), ('12', '12', '2017')]

### Find and replace

In [24]:
s = "The dates are 9/25/2017, 10/21/2017, and 12/12/2017"

# replace slashes with points
pattern = r"/"
# the dot won't count as wild card in the replacement string
# since it only works that way in the pattern
repl = "."
# re.sub returns a new string; s itself is left unchanged
re.sub(pattern, repl, s)

'The dates are 9.25.2017, 10.21.2017, and 12.12.2017'

## Anchors

Anchors are special characters or tokens used to indicate locations in text to match. You can see each of these anchors in use below.

#### Beginning
Anchor: `^`

In [25]:
s1 = "number: 1"
s2 = "this is number 2"
# match on beginning of string: '^' requires 'number' to start at index 0
pattern = r"^number"
print(re.search(pattern, s1))  # matches
print(re.search(pattern, s2))  # doesn't match ('number' is not at the start)

<_sre.SRE_Match object; span=(0, 6), match='number'>
None


#### End
Anchor: `$`

In [26]:
s1 = "count: ten"
s2 = "count: 3"
# match on end of string: '$' requires the digit to be the last character
pattern = r"\d$"
print(re.search(pattern, s1))  # no match (the string ends in a letter)
print(re.search(pattern, s2))  # match

None
<_sre.SRE_Match object; span=(7, 8), match='3'>


#### Word boundary
Anchor: `\b`

In [27]:
s = "The duck called the doctor a quack"
# \b matches the empty position at a word boundary: between a word character (\w)
# and a non-word character, or at the start/end of the string.
# It is not the whitespace itself, and it consumes no characters.
# boundary before the match: finds the 'd' that starts 'duck' and 'doctor'
pattern = r"\bd"
print(re.findall(pattern, s))
# boundary after match: finds the 'd' that ends 'called'
pattern = r"d\b"
print(re.findall(pattern, s))

['d', 'd']
['d']


#### Not word boundary
Anchor: `\B`

In [28]:
s = "Numb3r 15a"
# matches if there is not a boundary before match
# ('1' is excluded because it follows a space, which puts a boundary in front of it)
pattern = r"\B\d"
print(re.findall(pattern, s))
# if not a boundary after match
# (every digit here is followed by another word character, so all three are found)
pattern = r"\d\B"
print(re.findall(pattern, s))

['3', '5']
['3', '1', '5']


## Quantifiers

We have already seen the '+' quantifier, but here is a list of quantifiers

* plus (`+`)
    - used to indicate one or more of previous token
* asterisk (`*`)
    - used to indicate zero or more
* curly braces (`{}`)
    - used to specify number or range
    - {2} indicates exactly 2
    - {2,4} indicates 2 to 4
    - {2,} indicates 2 or more
* question mark (`?`)
    - used for 0 or 1 of the previous token
    - e.g. `r"\d?"` matches 0 or 1 digits

* Logical or `|`
    - used to match token_a or token_b
    - e.g. `(\d{1,2}|400)` matches on one or two digits or the number 400
    - e.g. `(T|N|Fr)ed` would match Ted, Ned, or Fred

In [29]:
s = "num1, num15, num301"

# pattern matches first instance with 3 digits
# ('1' and '15' are too short, so the first hit is '301')
pattern = r"\d{3}"
print(re.search(pattern, s))

# pattern matches first instance with 2 to 5 digits
# ('1' is too short, so the first hit is '15')
pattern = r"\d{2,5}"
print(re.search(pattern, s))

# get all that matches 3 lower case letters immediately followed by 2-3 digits
# ('num1' is left out because it has only one digit)
pattern = r"[a-z]{3}\d{2,3}"
print(re.findall(pattern, s))

<_sre.SRE_Match object; span=(16, 19), match='301'>
<_sre.SRE_Match object; span=(9, 11), match='15'>
['num15', 'num301']


## Escaped Characters

These are simply ways of inserting characters that would otherwise be interpreted as quantifiers or other special (non-literal) parts of the expression. To match such a character literally, precede it with a backslash:

* `\.`
* `\[`
* `\+`
* etc.

## Groups and Lookahead

### Groups

Groups are formed by placing () around certain parts of the expression (pattern). When you run searches on patterns that contain groups, you can refer to these groups in a couple of ways, allowing you to get creative with how you use the results. The methods are

* using the `group()` method
* using backreference (an example with this in a bit)

#### Capturing groups

These are "captured" into the resultant matching groups, meaning they can be retrieved using the `group(i)` method, where i is the group number. Group number 0 is the entire match as one string. Groups 1+ are the individual groups from the match, numbered in the order their opening parentheses appear in the pattern.

In [30]:
s = "Name: Sierra Johnson Foo"
# two capturing groups: the literal label, then two space-separated words
pattern = r"(Name:) (\w+ \w+)"
results = re.search(pattern, s)
# group() accepts several group numbers at once and returns them as a tuple;
# print the whole match (0) and then each captured group on its own line
print(*results.group(0, 1, 2), sep="\n")

Name: Sierra Johnson
Name:
Sierra Johnson


#### Non-capturing groups

This simply means any group labeled as non-capturing in the pattern will not be included in the groups.

In [31]:
s = "Name: Sierra Johnson Foo"
# using '?:' at the beginning of a group makes it non-capturing
# ('Name:' must still be present for the pattern to match; it just gets no group number)
pattern = r"(?:Name:) (\w+ \w+)"
results = re.search(pattern, s)
# group 1 is now just Sierra Johnson instead of 'Name:'
results.group(1)

'Sierra Johnson'

#### Using backreference

In [32]:
# find/replace multiple groups

# replace slashes with points
# and remove the day from each date
s = "The dates are 9/25/2017, 10/21/2017, and 12/12/2017"
pattern = r"(\d+)/(\d+)/(\d+)"

# we reference groups using '\group_num' (this is called backreference)
# replaces orig dates with month.year (group1.group3); group 2 (the day) is dropped
repl = r"\1.\3"
re.sub(pattern, repl, s)

'The dates are 9.2017, 10.2017, and 12.2017'

### Lookahead

Lookahead comes in positive and negative forms, allowing us to match text that either is (positive) or is not (negative) immediately followed by another specific pattern. The lookahead pattern itself is only checked; it is not included in the match.

#### Positive lookahead

In [46]:
s = "number 123"
# match a run of word characters that is immediately followed by space+digits
# the lookahead (?= ...) is only checked; ' 123' is not part of the match
pattern = r"\w+(?= \d+)"
re.search(pattern, s)

<_sre.SRE_Match object; span=(0, 6), match='number'>

#### Negative lookahead

In [47]:
s = "number 123"
# match a run of word characters that is not immediately followed by space+digits
pattern = r"\w+(?! \d+)"
# this will result in 'numbe' because it is the first sequence of one or more
# word characters that is not followed by space+digits: the full word 'number'
# is followed by ' 123', whereas 'numbe' is followed by 'r'
re.search(pattern, s)

<_sre.SRE_Match object; span=(0, 5), match='numbe'>

### Lookbehind

Lookbehind is the flipped version of lookahead, and the lookbehind expression must come first. Lookbehind requires fixed-width expressions in the lookbehind.

#### Positive lookbehind

In [48]:
s = "number 123"
# get first match of digits that follows 'number '
# the lookbehind (?<= ...) is only checked; 'number ' is not part of the match
pattern = r"(?<=number )\d+"
re.search(pattern, s)

<_sre.SRE_Match object; span=(7, 10), match='123'>

In [None]:
# not a fixed-width pattern; won't work
# (\w+ can match any number of characters, so the lookbehind has no fixed width)
s = "number 123"
pattern = r"(?<=\w+ )\d+"
lookbehind_error = None
# the pattern is rejected when it is compiled, so catch the error and display
# it instead of leaving a cell that aborts "Run All"
try:
    re.search(pattern, s)
except re.error as exc:
    lookbehind_error = exc
    print("re.error:", exc)

#### Negative lookbehind (negation)

Format: (?<!expression)expression

## Handling multi-line text

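By default the anchors `^` and `$` only match at the start and end of the whole string. To make them match at the start and end of every line in multi-line text, use the multiline flag.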

In [35]:
# a triple-quoted string literal can span several lines
s = """This is line one,
this is line two,
this is line three
and this is line four."""
# the output contains \n new line chars
s

'This is line one,\nthis is line two,\nthis is line three\nand this is line four.'

In [36]:
# print() renders the \n characters as actual line breaks
print(s)

This is line one,
this is line two,
this is line three
and this is line four.


In [37]:
# use the re.MULTILINE flag to let regex work with metachars (anchors) line start (^) and end ($) 
pattern = r"^this"

# without multiline flag
print(re.findall(pattern, s, re.IGNORECASE))

# with multiline flag
# we can use multiple flags by placing the or symbol (vertical bar) between them 
print(re.findall(pattern, s, re.IGNORECASE | re.MULTILINE))

['This']
['This', 'this', 'this']


## Compiling patterns

This allows us to compile a pattern once and reuse it on multiple strings of text. Without this, the pattern string has to be passed to, and looked up by, the `re` module on every single call.

In [38]:
from numpy.random import randint, seed

# fix the seed so that re-running the notebook reproduces the same dates
seed(42)
# create four strings with random dates
# {0} is the February day (1-28); {1} is shared by September and December,
# so it is capped at 30 to keep the September date valid
# (randint's upper bound is exclusive)
strings = ["The dates are 2/{0}/2017, 9/{1}/2017, and 12/{1}/2017".format(randint(1, 29), randint(1, 31))
           for _ in range(4)]
# compile pattern for multiple uses
pattern = re.compile(r"\d+/\d+/\d+")
for s in strings:
    # now we don't add the pattern arg each time to findall()
    print(pattern.findall(s))

['2/26/2017', '9/2/2017', '12/2/2017']
['2/18/2017', '9/24/2017', '12/24/2017']
['2/19/2017', '9/27/2017', '12/27/2017']
['2/10/2017', '9/5/2017', '12/5/2017']
