In [1]:
import re  # import re library

In [2]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d')  # create a regex object

mo = phoneNumRegex.search('My number is 412-555-4242')  # return a match object

print(mo.group())  # return the actual matched text

# Note : http:// regexpal.com/ : a web-based re testers

412-555-424


# using (), grouping

In [3]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d)')

mo = phoneNumRegex.search('My number is 412-555-4242')  # return a match object

print(mo.group(1))  # print first group

print(mo.group(2))  # print second group

print(mo.group(0))  # print all

print(mo.groups())

a, b = mo.groups()

print(a)

412
555-424
412-555-424
('412', '555-424')
412


In [4]:
# to match parenthesis
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d)')

mo = phoneNumRegex.search('My number is (412) 555-4242') 

print(mo.group(0))

(412) 555-424


# using |, or

In [5]:
# using pipe |
hero = re.compile(r'superman|batman')

mo1 = hero.search('I am superman not batman')

print(mo1.group())

hero1 = re.compile(r'Bat(man|mobile|copter|bat)')

mo2 = hero1.search('I am Batman')

print(mo2.group())

print(mo2.group(1))

superman
Batman
man


# using ?, optional matching

In [6]:
hero = re.compile(r'Bat(wo)?man')

mo = hero.search('The Adventure of Batman')

mo.group()

'Batman'

In [7]:
mo = hero.search('The Adventure of Batwoman')

mo.group()

'Batwoman'

In [8]:
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d') 
mo1 = phoneRegex.search('My number is 415-555-4242') 
mo1.group()

'415-555-4242'

In [9]:
mo1 = phoneRegex.search('My number is 555-4242') 
mo1.group()

'555-4242'

# using *, zero or more

In [10]:
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [11]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [12]:
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

# using +, 1 or more

In [13]:
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batman')
mo1 == None

True

In [14]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [15]:
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

# using {}, Specific repetitions

In [16]:
batRegex = re.compile(r'(Ha){3}')
mo1 = batRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [17]:
mo1 = batRegex.search('HaHa')
mo1 == None

True

In [18]:
batRegex = re.compile(r'(Ha){2,}') # maximum unbounded
mo1 = batRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [19]:
batRegex = re.compile(r'(Ha){2,}')
mo1 = batRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [20]:
batRegex = re.compile(r'(Ha){2,5}') # 2 to 5
mo1 = batRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [21]:
mo1 = batRegex.search('HaHaHaHa')
mo1.group()

'HaHaHaHa'

In [22]:
# Note : Python’s regular expressions are greedy by default, which means that in ambiguous situations they will match the longest string possible.
batRegex = re.compile(r'(Ha){2,5}') # 2 to 5
mo1 = batRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [23]:
batRegex = re.compile(r'(Ha){2,5}?') # non-greedy
mo1 = batRegex.search('HaHaHaHaHa')
mo1.group()

'HaHa'

Note that the question mark can have two meanings in regular expressions: declaring a nongreedy match or flagging an optional group. These meanings are entirely unrelated.

# Findall()
- While `search()` will return a Match object of the first matched text in the searched string, the `findall()` method will return the strings of every match in the searched string. 
- `findall()` will not return a Match object but a list of strings—as long as there are no groups in the regular expression

In [24]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') 
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

If there are groups in the regular expression, then `findall()` will return a list of tuples.

In [25]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') 
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

# Short hand classes
1. \d : numeric digit 0 to 9
2. \D : any char not numeric digit 0 to 9
3. \w : any letter, numeric, _ (matching 'word' characters)
4. \W : not \w
5. \s : any space, tab, or newline character (matching 'space' characters)
6. \S : not \s
7. [0-5] : only match num 0 to 5

In [26]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(
    '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge'
)

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

# using [ ], making own character class

In [27]:
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

- can use hyphen - to define range
- [a-zA-Z0-9] match all lowercase, uppercase & number
- does not nid escape, \ inside []
- using ^ as opening inside [] = negative character class, match all chars not in []

In [28]:
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

# using ^ and \$, match regex at begining and end
- ^ match regex at begining
- \$ match regex at the end

In [29]:
beginsWithHello = re.compile(r'^Hello')
mo = beginsWithHello.search('Hello world!')
mo.group()

'Hello'

In [30]:
beginsWithHello.search('He said hello.') == None

True

In [31]:
endsWithNumber = re.compile(r'\d+$')
mo = endsWithNumber.search('my age is 12')
mo.group()

'12'

# using wildcard 

## using . , match any 1 char except for a new line

In [32]:
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [33]:
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)') 
mo = nameRegex.search('First Name: Al Last Name: Sweigart')

mo.group(1)

'Al'

In [34]:
mo.group(2)

'Sweigart'

In [35]:
mo.group()

'First Name: Al Last Name: Sweigart'

## using .*, match anything & everything

In [36]:
wild = re.compile(r'te.*dd')
mo = wild.search('1231te23123dd12')
mo.group()

'te23123dd'

In [37]:
wild = re.compile(r'te.*dd')
mo = wild.search('1231te23123dd12edte09dd')
mo.group()

'te23123dd12edte09dd'

In [38]:
wild = re.compile(r'te.*?dd') # non-greedy
mo = wild.search('1231te23123dd12edte09dd')
mo.group()

'te23123dd'

In [39]:
noNewlineRegex = re.compile('.*')
noNewlineRegex.search(
    'Serve the public trust.\nProtect the innocent. \nUphold the law.')\
    .group()

'Serve the public trust.'

In [40]:
noNewlineRegex = re.compile('.*', re.DOTALL)  # match newline char also
noNewlineRegex.search(
    'Serve the public trust.\nProtect the innocent. \nUphold the law.')\
    .group()



'Serve the public trust.\nProtect the innocent. \nUphold the law.'

# adding re.IGNORECASE / re.I, ignore case-sensitive

In [41]:
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [42]:
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [43]:
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

# using `sub()`, substituiting strings

In [45]:
regex = re.compile(r'app\w+')
regex.sub('orange', 'I have an apple.')

'I have an orange.'

In [49]:
# using \1, \2 ... 'Enter the text of group 1 2 and so on in the substituion.'
regex = re.compile(r'(\w+){3} app\w+')
regex.sub(r'\1 orange', 'I have an apple.')

'I have an apple.'