# Python re(gex)?<br>
https://regex101.com/r/HSeO0z/2

## re.search<br>
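Similar to the `in` operator for strings, except that the search term is a regular expression pattern instead of a fixed substring.<br>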

`re.search(pattern, string, flags=0)`

In [2]:
import re 

sentence = 'This is a sample string'

print( bool(re.search(r'is', sentence)) )
print( bool(re.search(r'xyz', sentence)) )

True
False


In [3]:
# re.IGNORECASE (or re.I) is a flag to enable case insensitive matching

bool(re.search(r'this', sentence, flags=re.I))


True

In [4]:
words = ['cat', 'attempt', 'tattle']

print( [w for w in words if re.search(r'tt', w)] )

print( all(re.search(r'at', w) for w in words) )

print( any(re.search(r'stat', w) for w in words) )



['attempt', 'tattle']
True
False


## re.sub <br>

Similar to `str.replace`, except that the search term is a regular expression pattern.

`re.sub(pattern, repl, string, count=0, flags=0)`

In [5]:
greeting = 'Have a nice weekend'

# same as: greeting.replace('e', 'E')

re.sub(r'e', 'E', greeting)
    

'HavE a nicE wEEkEnd'

In [6]:
# replace first two occurrences of 'e' with 'E'

re.sub(r'e', 'E', greeting, count=2)


'HavE a nicE weekend'

In [7]:
word = 'cater'

re.sub(r'cat', 'wag', word)


'wager'

In [9]:
# need to explicitly assign the result if 'word' has to be changed

word = re.sub(r'cat', 'wag', word)

## Compiling regular expressions<br>
`re.compile` builds a pattern object that can be stored in a variable and reused.<br>
`re.compile(pattern, flags=0)`

In [13]:
pet = re.compile(r'dog')

# type of the object returned by re.compile
print( type(pet) )

print()
print(pet)


<class 're.Pattern'>

re.compile('dog')


In [14]:
bool(pet.search('They bought a dog'))

True

## Pattern.search(string[, pos[, endpos]])

In [17]:
sentence = 'This is a sample string'
word = re.compile(r'is')

# search for 'is' starting from 5th character of 'sentence' variable

print( bool(word.search(sentence, 4)) )

# search for 'is' starting from 7th character of 'sentence' variable
print( bool(word.search(sentence, 6)) )

# search for 'is' between 3rd and 4th characters
print( bool(word.search(sentence, 2, 4)) )


True
False
True


## bytes<br>
To work with the bytes data type, the RE pattern must be a bytes object as well.

In [19]:
byte_data = b'This is a sample string'
byte_data

b'This is a sample string'

In [20]:
# error message truncated for presentation purposes
re.search(r'is', byte_data)

TypeError: cannot use a string pattern on a bytes-like object

In [21]:
# use rb'..' for constructing bytes pattern
bool(re.search(rb'is', byte_data))

True

## Anchors<br>
The characters with special meaning are known as metacharacters

### String anchors<br>
\A restricts the matching to the start of the string.<br>
\Z restricts the matching to the end of the string.

In [23]:
# \A is placed as a prefix to the search term
bool(re.search(r'\Acat', 'cater'))

True

In [24]:
# \Z is placed as a suffix to the search term
bool(re.search(r'are\Z', 'spare'))

True

In [25]:
words = ['surrender', 'unicorn', 'newer', 'door', 'empty', 'eel', 'pest']
[w for w in words if re.search(r'er\Z', w)]

['surrender', 'newer']

### string concatenation operations by using the anchors

In [27]:
# insert text at the start of a string

re.sub(r'\A', 're', 'live')

'relive'

In [28]:
# appending text
re.sub(r'\Z', 'er', 'cat')

'cater'

In [30]:
# \A always refers to the real start of the string, not to the 'pos' argument

word_pat = re.compile(r'\Aat')

print( bool(word_pat.search('cater', 1)) ) # index 1 is not the start of 'cater'
print()
print( bool(word_pat.search('cater'[1:])) ) # the slice 'ater' is a new string starting with 'at'

False

True


### re.fullmatch<br>

Matches only if the whole string matches the pattern, which is equivalent to combining both the start and end string anchors. Similar to `==` for plain strings.<br>

`re.fullmatch(pattern, string, flags=0)`

In [32]:
word_pat = re.compile(r'\Acat\Z')

bool(word_pat.search('cat'))

True

In [33]:
word_pat = re.compile(r'cat', flags=re.I)

bool(word_pat.fullmatch('Cat'))

True

### Line anchors<br>
The ^ metacharacter matches the start of a line and the $ metacharacter matches the end of a line.

In [34]:
pets = 'cat and dog'

print( bool(re.search(r'^cat', pets)) )

print( bool(re.search(r'dog$', pets)) )


True
True


In [36]:
# $ also matches just before a newline character at the very end of the string

greeting = 'hi there\nhave a nice day\n'

print( bool(re.search(r'day$', greeting)) )
print( bool(re.search(r'day\n$', greeting)) ) # matches \n also

True
True


In [38]:
print( bool(re.search(r'day\Z', greeting)) ) # \Z matches only at the very end, not before the trailing \n

print( bool(re.search(r'day\n\Z', greeting)) )

False
True


#### To indicate that the input string should be treated as multiple lines, you need to enable the re.MULTILINE flag (or re.M short form).

In [40]:
# check if any line in the string starts with 'top'
bool(re.search(r'^top', 'hi hello\ntop spot', flags=re.M))

True

In [41]:
# filter all elements having lines ending with 'are'
elements = ['spare\ntool', 'par\n', 'dare']
[e for e in elements if re.search(r'are$', e, flags=re.M)]

['spare\ntool', 'dare']

In [42]:
# check if any complete line in the string is 'par'
bool(re.search(r'^par$', 'spare\npar\ndare', flags=re.M))

True

#### Just like string anchors, you can use the line anchors by themselves as a pattern.

In [43]:
# note that there is no \n at the end of this input string
ip_lines = 'catapults\nconcatenate\ncat'
print(re.sub(r'^', '* ', ip_lines, flags=re.M))


* catapults
* concatenate
* cat


In [44]:
print(re.sub(r'$', '.', ip_lines, flags=re.M))


catapults.
concatenate.
cat.


### Word anchors<br>
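\b matches a word boundary: the position between a word character and a non-word character (or the start/end of the string).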

In [45]:
words = 'par spar apparent spare part'

# replace 'par' irrespective of where it occurs
re.sub(r'par', 'X', words)

'X sX apXent sXe Xt'

In [47]:
# replace 'par' only at start of word
re.sub(r'\bpar', 'X', words)

'X spar apparent spare Xt'

In [48]:
# replace 'par' only at end of word
re.sub(r'par\b', 'X', words)

'X sX apparent spare part'

In [49]:
# replace 'par' only if it is not part of another word
re.sub(r'\bpar\b', 'X', words)

'X spar apparent spare part'

### The word boundary has an opposite anchor too: \B matches wherever \b doesn't match.

In [50]:
words = 'par spar apparent spare part'

# replace 'par' if it is not start of word
re.sub(r'\Bpar', 'X', words)

'par sX apXent sXe part'

In [51]:
# replace 'par' if it is surrounded by word characters
re.sub(r'\Bpar\B', 'X', words)


'par spar apXent sXe part'

In [52]:

re.sub(r'\b', ':', 'copper')

':copper:'

In [53]:
re.sub(r'\B', ':', 'copper')

'c:o:p:p:e:r'

## Alternation and Grouping<br>
Similar to logical OR, alternation in regular expressions allows you to combine multiple patterns. 

In [54]:
# Alternation

# match either 'cat' or 'dog'
bool(re.search(r'cat|dog', 'I like cats'))

True

In [55]:
# replace either 'cat' at start of string or 'cat' at end of word
re.sub(r'\Acat|cat\b', 'X', 'catapults concatenate cat scat')

'Xapults concatenate X sX'

In [56]:
# replace either 'cat' or 'dog' or 'fox' with 'mammal'
re.sub(r'cat|dog|fox', 'mammal', 'cat dog bee parrot fox')

'mammal mammal bee parrot mammal'

### Grouping

In [57]:
# without grouping
re.sub(r'reform|rest', 'X', 'red reform read arrest')


'red X read arX'

In [58]:
# with grouping
re.sub(r're(form|st)', 'X', 'red reform read arrest')

'red X read arX'

In [59]:
re.sub(r'\bpar(|t)\b', 'X', 'par spare part party')

'X spare X party'

### join method

In [61]:
words = ['cat', 'par']
'|'.join(words)

'cat|par'

In [62]:
# without word boundaries, any matching portion will be replaced
re.sub('|'.join(words), 'X', 'cater cat concatenate par spare')


'Xer X conXenate X sXe'

### re.fullmatch

In [63]:
terms = ['no', 'ten', 'it']
items = ['dip', 'nobody', 'it', 'oh', 'no', 'bitten']

pat = re.compile('|'.join(terms))

# matching only whole elements
[w for w in items if(pat.fullmatch(w))]


['it', 'no']

In [64]:
# matching anywhere
[w for w in items if(pat.search(w))]


['nobody', 'it', 'no', 'bitten']

## Precedence rules

In [65]:
words = 'lion elephant are rope not'

print( re.search(r'on', words) )
print( re.search(r'ant', words) )

<re.Match object; span=(2, 4), match='on'>
<re.Match object; span=(10, 13), match='ant'>


In [67]:
print( re.sub(r'on|ant', 'X', words, count=1) ) # only the first match is replaced

liX elephant are rope not


In [70]:
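# the alternative that matches earliest in the input string gets precedence,
# irrespective of its position in the pattern; count=1 restricts the no. of
# replacements to 1, so 'on' is still the one that gets replaced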
re.sub(r'ant|on', 'X', words, count=1)

'liX elephant are rope not'

#### What happens if alternatives match on same index? The precedence is then left to right in the order of declaration.

In [71]:
mood = 'best years'

print( re.search(r'year', mood) )

print( re.search(r'years', mood) )


<re.Match object; span=(5, 9), match='year'>
<re.Match object; span=(5, 10), match='years'>


In [72]:
# starting index for 'year' and 'years' will always be the same,
# so the order of the alternatives decides which one gets matched

re.sub(r'year|years', 'X', mood, count=1)

'best Xs'

In [73]:
re.sub(r'years|year', 'X', mood, count=1)

'best X'

### without count

In [74]:
words = 'ear xerox at mare part learn eye'

# this is going to be same as: r'ar'
re.sub(r'ar|are|art', 'X', words)

'eX xerox at mXe pXt leXn eye'

In [75]:
# this is going to be same as: r'are|ar'
re.sub(r'are|ar|art', 'X', words)

'eX xerox at mX pXt leXn eye'

In [76]:
# phew, finally this one works as needed
re.sub(r'are|art|ar', 'X', words)


'eX xerox at mX pX leXn eye'

In [79]:
# with sorted

words = ['hand', 'handy', 'handful']

alt = re.compile('|'.join(sorted(words, key=len, reverse=True)))

print( alt.pattern )
print()
print( alt.sub('X', 'hands handful handed handy') )

handful|handy|hand

Xs X Xed X


In [80]:
# without sorting, alternation order will come into play
re.sub('|'.join(words), 'X', 'hands handful handed handy')


'Xs Xful Xed Xy'

## Escaping metacharacters<br>
Escaping with \

In [81]:
# even though ^ is not being used as anchor, it won't be matched literally
bool(re.search(r'b^2', 'a^2 + b^2 - C*3'))


False

In [82]:
# escaping will work
bool(re.search(r'b\^2', 'a^2 + b^2 - C*3'))


True

In [83]:
# match ( or ) literally

re.sub(r'\(|\)', '', '(a*b) + c')


'a*b + c'

In [84]:
# note that here input string is also a raw string
re.sub(r'\\', '/', r'\learn\by\example')


'/learn/by/example'

In [85]:
eqn = 'f*(a^b) - 3*(a^b)'

# straightforward search and replace, no need for RE shenanigans

eqn.replace('(a^b)', 'c')

'f*c - 3*c'

### re.escape

In [87]:
expr = '(a^b)'

# print used here to show results similar to raw string
print(re.escape(expr))


\(a\^b\)


In [88]:
# replace only at end of string
eqn = 'f*(a^b) - 3*(a^b)'
re.sub(re.escape(expr) + r'\Z', 'c', eqn)


'f*(a^b) - 3*c'

In [89]:
terms = ['a_42', '(a^b)', '2|3']
# using 're.escape' and 'join' to construct the pattern
pat1 = re.compile('|'.join(re.escape(s) for s in terms))

# using only 'join' to construct the pattern
pat2 = re.compile('|'.join(terms))

print(pat1.pattern)
print()
print(pat2.pattern)


a_42|\(a\^b\)|2\|3

a_42|(a^b)|2|3


In [91]:
s = 'ba_423 (a^b)c 2|3 a^b'

print( pat1.sub('X', s) )
print()
print( pat2.sub('X', s) )

bX3 Xc X a^b

bXX (a^b)c X|X a^b


### Escape sequences

In [92]:
re.sub(r'\t', ':', 'a\tb\tc')

'a:b:c'

In [93]:
re.sub(r'\n', ' ', '1\n2\n3')

'1 2 3'

In [94]:
re.search(r'\e', 'hello')

error: bad escape \e at position 0

### hexadecimal escape of the format \xNN where NN are exactly two hexadecimal characters.

In [96]:
# \x20 is space character
re.sub(r'\x20', '', 'h e l l o')


'hello'

In [97]:
# \x7c is '|' character
re.sub(r'2\x7c3', '5', '12|30')


'150'

In [98]:
re.sub(r'2|3', '5', '12|30')

'15|50'