# Regular Expressions

Regular expressions are a fundamental tool used (among other things) for extracting information from text.

In [1]:
import re

In [2]:
?re.search

In [3]:
# A successful search returns a match object; its repr is shown below.
re.search(pattern="abc", string="abc123")

<_sre.SRE_Match at 0x7f1a8027b168>

In [5]:
# When the pattern is not found, re.search returns None.
# Compare against None by identity (`is`), not equality.
re.search("acb", "abc123") is None

True

# Character groups

- User-defined, e.g. `[abc]`, which matches any single one of the characters `a`, `b` or `c`
- Pre-defined
     - `\d`    Matches any decimal digit; equivalent to the set [0-9].
     - `\D`    Matches any non-digit character; equivalent to the set [^0-9].
     - `\s`    Matches any whitespace character; equivalent to [ \t\n\r\f\v].
     - `\S`    Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
     - `\w`    Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
             With LOCALE, it will match the set [0-9_] plus characters defined
             as letters for the current locale.
     - `\W`    Matches the complement of \w.
     - `\\`    Matches a literal backslash.

In [11]:
text = "The date is 2017-01-16."
# Raw string (r"...") so the backslashes reach the regex engine untouched
# instead of being treated as (invalid) string escape sequences.
match = re.search(r"\d\d\d\d-\d\d-\d\d", text)
help(match)  # list the methods available on a match object

Help on SRE_Match object:

class SRE_Match(__builtin__.object)
 |  The result of re.match() and re.search().
 |  Match objects always have a boolean value of True.
 |  
 |  Methods defined here:
 |  
 |  __copy__(...)
 |  
 |  __deepcopy__(...)
 |  
 |  end(...)
 |      end([group=0]) -> int.
 |      Return index of the end of the substring matched by group.
 |  
 |  expand(...)
 |      expand(template) -> str.
 |      Return the string obtained by doing backslash substitution
 |      on the string template, as done by the sub() method.
 |  
 |  group(...)
 |      group([group1, ...]) -> str or tuple.
 |      Return subgroup(s) of the match by indices or names.
 |      For 0 returns the entire match.
 |  
 |  groupdict(...)
 |      groupdict([default=None]) -> dict.
 |      Return a dictionary containing all the named subgroups of the match,
 |      keyed by the subgroup name. The default argument is used for groups
 |      that did not participate in the match
 |  
 |  groups(...)
 | 

In [13]:
# Slice the searched string with the boundaries of the whole match (group 0)
# to recover the matched text.
text[match.start(0):match.end(0)]

'2017-01-16'

In [16]:
text = "[...] the dates between 2017-01-16 and 2017-04-30."
# re.search stops at the first occurrence; the pattern is a raw string so
# that "\d" is passed to the regex engine verbatim.
match = re.search(r"\d\d\d\d-\d\d-\d\d", text)

In [22]:
# re.findall returns every non-overlapping match as a list of strings.
# Raw string keeps the "\d" escapes intact for the regex engine.
re.findall(r"\d\d\d\d-\d\d-\d\d", text)

['2017-01-16', '2017-04-30']

In [23]:
?re

# Special characters

    "."      Matches any character except a newline.
    "^"      Matches the start of the string.
    "$"      Matches the end of the string or just before the newline at
             the end of the string.
    "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
             Greedy means that it will match as many repetitions as possible.
    "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
    "?"      Matches 0 or 1 (greedy) of the preceding RE.
    *?,+?,?? Non-greedy versions of the previous three special characters.
    {m,n}    Matches from m to n repetitions of the preceding RE.
    {m,n}?   Non-greedy version of the above.

In [25]:
# Fixed-width pattern: finds nothing here (returns None, so no output)
# because the month and day in the text have only one digit each.
re.search(r'\d\d\d\d-\d\d-\d\d', 'meeting on 2016-1-5, 2:15pm.')

In [26]:
# {m,n} repetition counts let the same date be found even with
# one-digit month and day. Raw string keeps the backslashes literal.
re.search(r'\d{2,4}-\d{1,2}-\d{1,2}', 'meeting on 2016-1-5, 2:15pm.')

<_sre.SRE_Match at 0x7f1a802238b8>

In [29]:
# Pairs of numbers, greedy: ".*" grabs as much as possible, so the single
# match runs from the first digit to the last digit in the string.
re.findall(r'\d.*\d', '6-7, or 8-9')

['6-7, or 8-9']

In [30]:
# Pairs of numbers, non-greedy: ".*?" stops at the earliest following digit,
# so each pair is returned separately.
re.findall(r'\d.*?\d', '6-7, or 8-9')

['6-7', '8-9']

# Capture Groups

In [35]:
log_line = '[2016-12-02 22:01:45] localhost INFO: System rebooted.'
# Group 1 captures the date and group 2 the time; the escaped brackets
# "\[" and "\]" match literally. Raw string so no backslash is reinterpreted.
log_re = re.compile(r'\[(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})\]')

In [36]:
# A compiled pattern exposes search() directly as a method.
match = log_re.search(log_line)

In [40]:
# All captured groups as a tuple, followed by the first group on its own.
(match.groups(), match.group(1))

(('2016-12-02', '22:01:45'), '2016-12-02')

In [41]:
# (start, end) offsets of capture group 1 within the searched string.
match.span(1)

(1, 11)

In [44]:
# Careful, gotcha: group 0 is the entire match (square brackets included),
# not the first capture group; capture groups are numbered from 1.
match.span(0)

(0, 21)

In [45]:
# (start, end) offsets of capture group 2 within the searched string.
match.span(2)

(12, 20)

## Named groups

In [50]:
# (?P<name>...) gives each capture group a name. The pattern is split over
# two adjacent raw-string literals, which Python concatenates; each piece
# needs its own r prefix so every backslash stays literal.
log_re = re.compile(
    r'\[(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2})\]'
    r'.+? (?P<level>[A-Z]+): (?P<message>.+)'
)

In [51]:
# groupdict() maps each named group to the text it captured.
log_re.search(log_line).groupdict()

{'date': '2016-12-02',
 'level': 'INFO',
 'message': 'System rebooted.',
 'time': '22:01:45'}

In [53]:
def process_log(date, level, message, time, **kwargs):
    """Print a one-line summary of a log entry, but only for INFO entries.

    The parameter names mirror the named groups of the log regex, so the
    function can be called as ``process_log(**match.groupdict())``.

    Args:
        date: value interpolated first into the printed line.
        level: the line is printed only when this equals ``'INFO'``.
        message: value interpolated last into the printed line.
        time: value interpolated second into the printed line.
        **kwargs: any further keyword arguments are accepted and ignored.

    Returns:
        None. The summary is written to standard output.
    """
    if level != 'INFO':
        return
    # Single-argument print(...) behaves identically on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print("At {0}, {1}: {2}".format(date, time, message))

In [54]:
# Unpack the named groups straight into process_log's keyword parameters.
process_log(**log_re.search(log_line).groupdict())

At 2016-12-02, 22:01:45: System rebooted.


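## Search vs. Match

`re.search` scans the whole string for the first place where the pattern matches, whereas `re.match` only succeeds if the pattern matches at the very beginning of the string.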

In [56]:
text = 'The time is 12:45pm.'
# Raw string so that "\d" is handed to the regex engine unchanged.
pattern = r'(\d{1,2}:\d{1,2})'
# search finds the time in the middle of the string; match returns None
# because the string does not start with the pattern.
re.search(pattern, text), re.match(pattern, text)

(<_sre.SRE_Match at 0x7f1a8015d120>, None)

In [57]:
# Here the pattern matches from the first character, so match succeeds.
re.match(pattern, "12:12")

<_sre.SRE_Match at 0x7f1a8015d7b0>