In [3]:
import re

# Simple regex

In [1]:
txt = '배송비 2,500원'

## Getting digits only

In [5]:
# \d+ = one or more consecutive digits
# search() returns only the first match, so it stops at the comma in '2,500'
re.search(r'\d+', txt)

<re.Match object; span=(4, 5), match='2'>

In [6]:
re.search(r'\d+', txt).group(0)

'2'

In [7]:
# group(1) asks for the first parenthesised capture group;
# this pattern has none, so the call raises IndexError
re.search(r'\d+', txt).group(1)

IndexError: no such group

In [8]:
# findall() returns every non-overlapping match as a list of strings
re.findall(r'\d+', txt)

['2', '500']

In [12]:
num_only = re.findall(r'\d+', txt)

In [13]:
''.join(num_only)

'2500'

## Getting the alphanumeric character at the end of the string

In [16]:
# \w = a single word character (Hangul counts, as the output shows)
# $ = matches at end of string
# there is no + after \w, so only the final character is matched
re.search(r'\w$', txt)

<re.Match object; span=(9, 10), match='원'>

In [17]:
re.search(r'\w$', txt).group(0)

'원'

In [18]:
re.search(r'\w$', txt).group(1)

IndexError: no such group

## Getting alphanumeric characters at beginning of string

In [25]:
# ^ = matches at start of string
# \w+ = one or more word characters, so the match stops at the first space
re.search(r'^\w+', txt)

<re.Match object; span=(0, 3), match='배송비'>

In [26]:
re.search(r'^\w+', txt).group(0)

'배송비'

# More difficult regex

In [4]:
txt = '732원/100ml당'

### How to get ml?

One way is to look for the second group of digits first, then look for the word 당, and take whatever lies in between.

In [11]:
re.search(r'\d+', txt)

<re.Match object; span=(0, 3), match='732'>

In [12]:
re.search(r'\d+', txt).group(0)

'732'

In [13]:
re.search(r'\d+', txt).group(1)

IndexError: no such group

But how do we get the second group (i.e. the 100)?

## Introducing groups

In [104]:
# three capture groups: digits, then as few characters as possible, then digits
test_match = re.search(r'(\d+)(.*?)(\d+)', txt)

In [105]:
test_match

<re.Match object; span=(0, 8), match='732원/100'>

In [106]:
# group(0) = the whole matched text, i.e. everything the pattern consumed
# (all three groups together)
test_match.group(0)

'732원/100'

In [107]:
# group(1) = first capture group, i.e. the first (\d+)
test_match.group(1)

'732'

In [118]:
# group(2) = second capture group, i.e. (.*?): as few characters as possible
# . = any character except a newline
# * = zero or more times (greedy by default)
# ? on its own = zero or one time; placed directly after a quantifier it makes that quantifier lazy (non-greedy)
# .*? = lazily match zero or more of any character
# greedy .* first runs to the end of the string and then backtracks until the rest of the pattern matches
# lazy .*? starts with nothing and grows one character at a time until the rest of the pattern matches
test_match.group(2)

'원/'

In [112]:
# group(3) = third capture group, i.e. the second (\d+)
test_match.group(3)

'100'

OK, but how do we get the ml before the 당?
Just keep grouping until you get what you want.

In [207]:
test_match2 = re.search(r'\d+.*\d+(.*)당$', txt)
# only the parts of the pattern inside parentheses become numbered groups;
# here (.*) captures whatever sits between the last digit and the final 당
# $ = matches at end of string

In [208]:
test_match2.group(0)

'732원/100ml당'

In [209]:
test_match2.group(1)

'ml'

In [210]:
# the pattern has only one capture group, so group(2) raises IndexError
test_match2.group(2)

IndexError: no such group

But this is so complicated. 

I have to match all the leading characters first and only then capture what I want.

What if I don't care about all the other characters and just want to get something in the middle?

In [227]:
# Introducing lookahead and lookbehind
# lookbehind = (?<=X): X must appear immediately before the match
# lookahead = (?=X): X must appear immediately after the match
# neither X becomes part of the matched text
test_match3 = re.search(r'(?<=\/)(\d+)(\D+)(?=당)', txt)

In [228]:
test_match3.group(0)

'100ml'

In [229]:
test_match3.group(1) # one or more digits

'100'

In [231]:
test_match3.group(2) # one or more non digits

'ml'

Why split in two groups?

In [232]:
# \d+ has no fixed length, so re refuses to compile it inside a look-behind
test_match4 = re.search(r'(?<=\/\d+)(\D+)(?=당)', txt)

error: look-behind requires fixed-width pattern

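Look-behinds in Python's `re` module must match a fixed number of characters, so variable-length quantifiers such as `+` or `*` (like the `\d+` above) are not allowed inside them.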

![metacharacters](metacharacters.png)

![special characters](special_characters.png)

![sets](sets.png)