# Regular Expressions

Regular expressions are a fundamental tool used (among other things) for extracting information from text.

In [None]:
import re

In [None]:
?re.search

In [None]:
# The literal pattern occurs in the string, so a match object is returned.
re.search("abc", "abc123")

In [None]:
# No match: re.search returns None. Test for None by identity, not equality.
re.search("acb", "abc123") is None

# Character groups

- User defined, e.g. `[abc]` (matches any one of the characters `a`, `b`, or `c`)
- Pre-defined
     - `\d`    Matches any decimal digit; equivalent to the set [0-9].
     - `\D`    Matches any non-digit character; equivalent to the set [^0-9].
     - `\s`    Matches any whitespace character; equivalent to [ \t\n\r\f\v].
     - `\S`    Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
     - `\w`    Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
             With LOCALE, it will match the set [0-9_] plus characters defined
             as letters for the current locale.
     - `\W`    Matches the complement of \w.
     - `\\`    Matches a literal backslash.

In [None]:
text = "The date is 2017-01-16."
# Use a raw string so that \d reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
match = re.search(r"\d\d\d\d-\d\d-\d\d", text)
# Show the methods and attributes available on the match object.
help(match)

In [None]:
# Slice the searched text by the match span to recover the matched substring.
text[match.start():match.end()]

In [None]:
text = "[...] the dates between 2017-01-16 and 2017-04-30."
# re.search stops at the first occurrence, so only the first date is found.
# Raw string: keeps \d from being parsed as a string escape sequence.
match = re.search(r"\d\d\d\d-\d\d-\d\d", text)

In [None]:
# re.findall returns every non-overlapping occurrence as a list of strings.
# Raw string: keeps \d from being parsed as a string escape sequence.
re.findall(r"\d\d\d\d-\d\d-\d\d", text)

In [None]:
?re

# Special characters

    "."      Matches any character except a newline.
    "^"      Matches the start of the string.
    "$"      Matches the end of the string or just before the newline at
             the end of the string.
    "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
             Greedy means that it will match as many repetitions as possible.
    "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
    "?"      Matches 0 or 1 (greedy) of the preceding RE.
    *?,+?,?? Non-greedy versions of the previous three special characters.
    {m,n}    Matches from m to n repetitions of the preceding RE.
    {m,n}?   Non-greedy version of the above.

In [None]:
# Fixed-width pattern: the single-digit month and day do not match, so this is None.
re.search(r'\d\d\d\d-\d\d-\d\d', 'meeting on 2016-1-5, 2:15pm.')

In [None]:
# {m,n} repetition tolerates 2-4 digit years and 1-2 digit months/days.
re.search(r'\d{2,4}-\d{1,2}-\d{1,2}', 'meeting on 2016-1-5, 2:15pm.')

In [None]:
# Greedy: .* consumes as much as possible, so the result is a single match
# running from the first digit to the last one, not two separate pairs.
re.findall(r'\d.*\d', '6-7, or 8-9')

In [None]:
# Non-greedy: .*? stops at the nearest following digit, yielding each pair separately.
re.findall(r'\d.*?\d', '6-7, or 8-9')

# Capture Groups

In [None]:
log_line = '[2016-12-02 22:01:45] localhost INFO: System rebooted.'
# Two capture groups: (1) the date and (2) the time inside the literal brackets.
# Raw string: keeps \[ and \d from being parsed as string escape sequences.
log_re = re.compile(r'\[(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})\]')

In [None]:
# Search the log line using the compiled pattern's own search method.
match = log_re.search(log_line)

In [None]:
# All captured groups as a tuple, followed by the first group on its own.
match.groups(), match.group(1)

In [None]:
# Start and end offsets of capture group 1 within the searched string.
match.start(1), match.end(1)

In [None]:
# Group 0 is the entire match (brackets included), not the first capture group.
match.start(0), match.end(0) # careful, gotcha

In [None]:
# Start and end offsets of capture group 2 within the searched string.
match.start(2), match.end(2)

## Named groups

In [None]:
# Same log pattern, but with named groups (?P<name>...) so that the fields can
# be retrieved by name. Raw strings keep \[ and \d away from string-escape
# processing; adjacent literals are concatenated into a single pattern.
log_re = re.compile(
    r'\[(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2})\]'
    r'.+? (?P<level>[A-Z]+): (?P<message>.+)'
)

In [None]:
# Named groups come back as a dict keyed by group name.
log_re.search(log_line).groupdict()

In [None]:
def process_log(date, level, message, time, **kwargs):
    """Print a one-line summary of a log entry if its level is INFO.

    The parameter names mirror the named groups of the log regex, so the
    result of ``groupdict()`` can be passed directly with ``**``.

    Args:
        date: Date portion of the log entry.
        level: Severity label; only ``'INFO'`` entries are printed.
        message: Free-text message of the log entry.
        time: Time portion of the log entry.
        **kwargs: Any additional fields; accepted and ignored.

    Returns:
        None. Output is written to standard output.
    """
    if level == 'INFO':
        # Parenthesised single-argument print works on both Python 2 and 3.
        print("At {0}, {1}: {2}".format(date, time, message))

In [None]:
# Unpack the named groups straight into process_log's keyword arguments.
process_log(**log_re.search(log_line).groupdict())

## Search vs. Match

In [None]:
text = 'The time is 12:45pm.'
# Raw string: keeps \d from being parsed as a string escape sequence.
pattern = r'(\d{1,2}:\d{1,2})'
# re.search scans the whole string; re.match only tries at position 0, so it
# returns None here because the text starts with "The".
re.search(pattern, text), re.match(pattern, text)

In [None]:
# Here the string begins with the time itself, so re.match succeeds.
re.match(pattern, '12:12')