In [1]:
import re

## This example also shows how to use a character class and how to capture a match
1. the entire match is captured in group 0
2. you capture part of a match by putting it in parentheses
3. you can capture multiple parts of the match
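    - captures are numbered by the position of their opening parenthesis, counting left to right (1 - n)
    - for nested parentheses, the outer group therefore gets the lower number and the inner group the higher one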

In [3]:
# Capture the single letter that follows "Alicia " by wrapping a character
# class in parentheses.
crush = 'Alicia Keys'
match = re.compile("Alicia ([a-zA-Z])").search(crush)
if match is None:
    print("No match")
else:
    # Index 0 is the whole match; index 1 is the first parenthesised group.
    print("Full match:", match[0])
    print("Captured:", match[1])

Full match: Alicia K
Captured: K


## This example shows how to capture multiple matches
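- captures are numbered by the position of their opening parenthesis, counting left to right (1 - n)
- for nested parentheses the outer group comes first: in the pattern below `(y(on)ce)` is group 2 and `(on)` is group 3
- a group that takes no part in the match, like `(Be*)` below, is reported as `None`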

In [8]:
# Three capture groups: (Be*) is optional, (y(on)ce) wraps the inner (on).
crush = 'yonce'
match = re.search(pattern="(Be*)*(y(on)ce)", string=crush)
if match is None:
    print("No match")
else:
    # A group that did not take part in the match is reported as None.
    print("capture 0:", match[0])
    print("capture 1:", match[1])
    print("capture 2:", match[2])
    print("capture 3:", match[3])

capture 0: yonce
capture 1: None
capture 2: yonce
capture 3: on


## This example shows how to use the "any old junk" construct
- the `.` matches any character except a newline
- the `*` means "match the preceding character or character class 0 or more times", i.e., it's optional

In [11]:
# `.*` between the two names may match nothing at all, so the capture
# can legitimately be the empty string.
crush = 'AliciaKeys'
match = re.search(pattern="Alicia(.*)Keys", string=crush)
if not match:
    print("No match")
else:
    print("Full match:", match[0])
    # Joined without separators so the brackets hug the captured text.
    print("".join(["Captured:", "[", match[1], "]"]))

Full match: AliciaKeys
Captured:[]


## A more narrowly defined quantifier: `+`
- `+` means "match the preceding character or character class one or more times"
- we can use standard functions to measure the match length

In [15]:
# `l+` needs at least one "l"; group 1 holds the whole run that was matched.
crush = 'Allllllicia Keys'
match = re.compile("A(l+)icia Keys").search(crush)
if match is None:
    print("No match")
else:
    print("Full match:", match[0])
    # The length of the captured run is the number of repetitions.
    print("There are", len(match[1]), "l's:", match[1])

Full match: Allllllicia Keys
There are 6 l's: llllll


## `*`, `+`, and `{}` are greedy, meaning that they match as much as they possibly can

In [16]:
# Greedy `.*` runs to the LAST </BOLD> in the string, swallowing the
# tags in between.
crush = "<BOLD>Holy moly</BOLD>, it's <BOLD>Alicia Keys</BOLD>"
match = re.compile("<BOLD>(.*)</BOLD>").search(crush)
if match is None:
    print("No match:", match)
else:
    print("".join(["Found a match:", "[", match[1], "]"]))

Found a match:[Holy moly</BOLD>, it's <BOLD>Alicia Keys]


## Making the `*`, `+`, and `{}` quantifiers non-greedy with `?`

In [17]:
# A trailing ? turns *, + and {} into their NON-GREEDY forms, so `.*?`
# stops at the first </BOLD> it can reach.
crush = "<BOLD>Holy moly</BOLD>, it's <BOLD>Alicia Keys</BOLD>"
match = re.search(pattern="<BOLD>(.*?)</BOLD>", string=crush)
if not match:
    print("No match:", match)
else:
    print("".join(["Found a match:", "[", match[1], "]"]))

Found a match:[Holy moly]


## Use parentheses to make precise, multi-character matches and captures
- surround the pattern you want to quantify in parentheses
- the entire unit in parentheses is evaluated by the quantifier 

### In this example, we want to match and capture an entire word, or string, rather than a single character

In [20]:
# The quantifier applies to the whole parenthesised unit, so the word
# itself is repeated; the group keeps only its last repetition.
crush = 'AliciaAliciaAlicia Keys'
match = re.compile("(Alicia)+").search(crush)
if match is None:
    print("No match")
else:
    print("Full match:", match[0])
    print("Captured:", match[1])

Full match: AliciaAliciaAlicia
Captured: Alicia


## Beware the behavior of `*`

### Why does this match?

In [23]:
# `*` allows zero repetitions, so the pattern succeeds with an EMPTY match
# at position 0 even though the subject never contains "Alicia Keys".
crush = 'Beyonce'
match = re.search(pattern="(Alicia Keys)*", string=crush)
if match is None:
    print("No match")
else:
    # The match is the empty string, and the group never took part (None).
    print("Full match:", match[0])
    print("Captured:", match[1])

Full match: 
Captured: None


## Either/or matching if you're not picky

In [33]:
# either, or: `|` tries each alternative from left to right and stops at
# the first one that matches.
crush = 'Alicia'
diva = re.search("Alicia|Beyonce|Mariah", crush)
# Test `diva`, the result of this search, so the cell does not depend on a
# `match` variable left over from a previously executed cell.
if diva:
    print("Found a diva:", diva.group(0))
else:
    print("No match:", diva)

Found a diva: Alicia


## A general, defined quantifier: `{}`

In [38]:
# {3,5} accepts three to five repetitions; being greedy, it takes five of
# the six copies available in the subject.
crush = 'AliciaAliciaAliciaAliciaAliciaAlicia Keys'
match = re.compile("(Alicia){3,5}").search(crush)
if match is None:
    print("No match:", match)
else:
    print("Full match:", match[0])
    print("Captured:", match[1])

Full match: AliciaAliciaAliciaAliciaAlicia
Captured: Alicia


## More match capturing

In [39]:
# The subject has three real TAB characters between the names.
crush = 'AliciaAliciaAlicia\t\t\tKeys'
print('crush:', crush)
# Raw string: `\s` is not a valid Python string escape, so in a normal
# string literal it triggers an invalid-escape warning. With the r prefix
# the backslash is passed to the regex engine untouched.
# Groups: 1 = all the repeated names, 2 = the last "Alicia", 3 = whitespace.
match = re.search(r"((Alicia){2,})(\s+)Keys", crush)
if match:
    print("the third capture:" , "[" + match.group(3) + "]")
    print("length of the third capture is", len(match.group(3)))
else:
    print("No match:", match)

crush: AliciaAliciaAlicia			Keys
the third capture: [			]
length of the third capture is 3


## There are lots of modifiers that can be added to your regular expressions
- `re.I` allows for case-insensitive matching

In [40]:
# re.IGNORECASE is the long name of re.I: the lowercase pattern still finds
# the capitalised name.
crush = 'Alicia Keys'
match = re.search("alicia", crush, flags=re.IGNORECASE)
if match is None:
    print("No match:", match)
else:
    print("Found a match:", match[0])

Found a match: Alicia


## Using variables inside regular expressions

In [50]:
cat1 = 'Peaches'
fact = 'We love ' + cat1
print(fact)

# re.escape() makes any regex metacharacters inside the variable match
# literally (drop it only if the variable is meant to hold a pattern).
# Flags are given to re.compile(): the second positional argument of a
# compiled pattern's search() is the start position, not a flags value.
pat = re.compile(re.escape(cat1), re.I)
match = pat.search(fact)
if match:
    print(match.group(0))

We love Peaches
Peaches


## "escape" special characters with a `\` to match them as literals

## a preceding `r` makes the pattern a *raw string*: Python leaves its backslashes alone, so sequences like `\t` or `\s` reach the regex engine unchanged

In [76]:
# The subject contains a real TAB character between the two names.
crush = 'Alicia\tKeys'

# A bare "t" is only the letter t, which never occurs in the subject.
match = re.compile("t").search(crush)
if match is None:
    print("No match")
else:
    print("Found a match:", match[0])

# r"\t" hands backslash-t to the regex engine, which reads it as a TAB.
match = re.compile(r"\t").search(crush)
if match is None:
    print("No match")
else:
    print("".join(["Found another match:", "[", match[0], "]"]))

No match
1
Found another match:[	]


# THE END