# Regex Reference Sheet

In [None]:
import re

# Metacharacters

In [None]:
.    #match any character except newline  .* will go all the way to the end and backtrack
\    #Escape special characters. eg: \. or \\ or \[ to match . or \ or [ outside of character class

#Quantifiers
?   #match 0 or 1 repetitions of RE       .*? for shortest match  
*   #match 0 or more repetitions of RE   .* for longest match   
+   #match 1 or more repetitions of RE
{3}    #match exactly 3 copies of previous RE
{1,4}  #match from 1 to 4 repetitions of RE (both inclusive)
{2,}   #match 2 or more repetitions of RE

#Class
[ ]   #match a set of characters

#Logic
'this|that'    #match either this or that

#Group
(...)    #match RE inside parenthesis and indicate start and end of a group. Access by re.match(..).group(0,1,2)  
\1       #contents of Group 1
\2       #contents of Group 2
(?:...)  #non-capturing group

#Anchors/Boundaries
'^From'   #match start of a string
'day$'   #match end of a string (or just before a trailing newline); with re.MULTILINE also matches before every newline
\b #Word boundary
\B #non-word boundary

## Character Class

In [None]:
#metacharacters are not active inside classes
\d   #Matches any decimal digit; this is equivalent to the class [0-9].
\D   #Matches any non-digit character; this is equivalent to the class [^0-9].
\s   #Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v].
\S   #Matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v].
\w   #Matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_].
\W   #Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_].

[a-cx-z]   #character is a to c or x to z (no separator needed: a comma inside [ ] would be matched literally)
[^a-c]     #not abc (^ must be the first character)
r'\\section'  #raw string keeps both backslashes, so the regex sees \\ and matches one literal \ ; without r you would need '\\\\section'

#class subtraction
(?![QK])\w   #using lookahead to exclude Q,K from character class \w
\w(?<![QK])   #using lookbehind to exclude Q,K from character class \w

#class intersection
(?=[^aeiou])(?=[a-z])  #using lookahead to use intersection of 2 classes
    

# Lookahead/Lookbehind

In [None]:
#multiple lookaheads/lookbehinds can look immediately right or left, but they do not alter the position
#use .* to look further ahead
#can use lookahead to check that it does not contain what we dont want
# .*? for shortest match  
# .* for longest match   

(?=bat)    #positive lookahead assertion
(?!bat)    #negative lookahead assertion

(?<=bat)   #positive lookbehind
(?<!bat)   #negative lookbehind

p = re.compile(r'.*[.](?!bat$|exe$)[^.]*$') #negative lookahead to exclude selected words
p = re.compile(r'.*[.](?=cf$|conf$).*$')    #positive lookahead to ensure match of selected words


# Regex Functions

In [None]:
#A = pattern, B = string (for sub: B = replacement, C = string); placeholders, substitute real values
m1 = re.match(A,B)           #Match pattern A only at the start of string B; returns a Match object or None
m2 = re.fullmatch(A,B)       #Check if the whole string B matches pattern A; returns a Match object or None
m3 = re.search(A,B)          #Find the first instance of pattern A anywhere in string B; returns a Match object or None
m4 = re.findall(A,B)         #Matches all instances of pattern A in string B and returns them in a list (no span)
m5 = re.finditer(A,B)        #Creates iterator of Match objects for all matches of pattern A in string B, including span
m6 = re.split(A,B)           #Split a string B into a list using the delimiter pattern A
m7 = re.sub(A,B,C,count=2)   #Replace pattern A with B in the string C, at most the first 2 occurrences (count as keyword: positional is deprecated since Python 3.13)

In [None]:
p = re.compile('b[a-z]d')                     #use ( ) if want to capture values
p_icase = re.compile(r'php', re.IGNORECASE)   #Perform case-insensitive matching (own name so p above is not overwritten)

m1 = p.match('bed d bad bd')          #Match only at the start of the string; returns a Match object or None
m2 = p.search('b bed d bad bd')       #Find the first match anywhere in the string; returns a Match object or None
m3 = p.findall('b bed d bad bd')      #Matches all instances in the string and returns them in a list (no span)
m4 = p.finditer('b bed d bad bd')     #Creates iterator of Match objects for all the matches, including span
m5 = p.split('txt_we', maxsplit=3)    #Split string into a list on matches of the pattern, at most 3 splits
m6 = p.sub('a','txt we', count=2)     #Replace matches with 'a' in string, at most the first 2 occurrences
m7 = p.subn('a','txt we')             #same as sub() but returns a tuple (new string, no. of replacements)

# Groups 

In [None]:
#Un-named groups
m = re.match("(a(b)c)d", "abcd")   #match once and reuse the Match object for every accessor below
m.group()          #return entire match -> 'abcd'
m.group(1)         #return group 1 -> 'abc'
m.group(1,2)       #return groups 1 and 2 as a tuple -> ('abc', 'b')
m.groups()         #return all groups as a tuple -> ('abc', 'b')
m[1]               #index access via __getitem__, same as m.group(1) -> 'abc'
m.start(1)         #return index of the start of substring matched by group 1 -> 0
m.end(2)           #return index of the end of substring matched by group 2 -> 2
m.span(2)          #return (start, end) of substring matched by group 2 -> (1, 2); this pattern has only 2 groups, so span(3) raises IndexError
m.lastindex        #return integer index of last matched capturing group -> 1 (the outer group closes last)
m.lastgroup        #name of the last matched capturing group -> None here (no named groups)

#Named groups (?P<name> )
m = re.match(r'(?P<first>\w+) (?P<last>\w+)', 'John Doe')  #named groups
m.group('last')         #to access specific named group 
m.groupdict()           #to retrieve named groups as dict  

#Non-capturing groups  (?: )
m = re.match(r'(?:\w+) (?:\w+)', 'John Doe')  

#Backreferencing (?P=name)
m = re.match(r'(\w+) \1', 'John Doe')                     #for un-named group
m = re.match(r'(?P<first>\w+) (?P=first)', 'John Doe')    #for named group
?P<first>   #to name group
?P=first    #to backreference group

#Repeat group multiple times
([A-Z]_)+   #group will be captured multiple times but will only return the last value captured

#to capture all the groups using finditer
s = "bob sue jon richard harry"
#raw string so that \s reaches the regex engine untouched ('\s' in a normal string is an invalid escape sequence)
r = re.compile(r'(?P<name>[a-z]+)\s+(?P<name2>[a-z]+)')
#each match consumes a pair of words, so the unpaired last word is not returned
[m.groupdict() for m in r.finditer(s)]   #-> [{'name': 'bob', 'name2': 'sue'}, {'name': 'jon', 'name2': 'richard'}]

# Applications

In [None]:
r"^\s*$"       #find blank lines

r"\bclass\b"   #match exact word where boundary is whitespace or non-alphanumeric character

r'^(.*\.)[^.]+$'   #replace file extension by using r'\1rar' in re.sub

r'^([^-]*)-(.*)'   #remove dash in string by using r'\1\2' in re.sub

r'(?=\b\w{7}\b)\w*?hay\w*'  #7 letter word containing 'hay'

r'^(?=.*?\bbubble\b).*?\bgum\b.*' #check that string contains 'bubble' and 'gum'

r'\bb[ou]y\b'      #check string contains 'boy' or 'buy'

r'\b(\w+)\b\s+\1\b'  #find repeated words
r'\b(?P<word>\w+)\s+(?P=word)\b'  #find repeated words

r'^(?!.*boy).*'    #string does not contain 'boy'

r'^(?!.*gum)(?!.*bath).*?bubble.*'   #contains bubble but not gum nor bath

r'\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,6}\b'  #check email address

r'(?!boo)[a-z]{3}'    #check that next 3 letters are not 'boo'

(?<=[a-z])(?=[A-Z])  #finding location where case changes and use sub to add space

(?=(\w+))            #find all substring of a string

(?<=_(?=\d{2}_))\d+  #find digits in _12_

\d+(?=_(?!_))        #find digit followed by one underscore only

(?<=(?<!_)_)\d+      #find digit preceded by one underscore only

(?=[^a-z]*[a-z])         #contain lowercase
(?=(?:[^A-Z]*[A-Z]){3})  #contain 3 uppercase
(?=\D*\d)                #contain digits

r'(?=[^a-z]*[a-z])(?=(?:[^A-Z]*[A-Z]){3})(?=\D*\d)\w{6,10}'  #multiple lookaheads for password

r'\(([^()]*)\)'   #capture text within parenthesis

r'\[([^]]+)]'     #capture text within square bracket and use r'<\1>' in sub to replace with < >

r'^(?:1(?:[5-9]|\d\d+)|[2-9]\d+)$'   #check if number is 15 or above (no leading zeros)

r'^(?:peas|onions|carrots)(?:,(?:peas|onions|carrots))*+$'  #check a list is made of certain items (possessive *+ needs Python 3.11+; use plain * on older versions)

r'^(?!0*(?:10*){10}1)[01]+$'  #check that not more than ten 1s 

r'<+(\d+)>*|<*(\d+)>+'         #check for numbers with missing tags and use r'<\1\2>' in sub to replace

r'(?<!the )(?!the )\b\w+\b'    #exclude words preceded by the and exclude the as well

r'^[\s]*(.*?)[\s]*$'     #to find leading and trailing spaces 

r'<([a-z]+)([^<]+)*(?:>(.*)<\/\1>|\s+\/>)'    #to find html tags

r'\B#(?:[a-fA-F0-9]{6}|[a-fA-F0-9]{3})\b'     #to find valid hexadecimal value

r"\b[\w.!#$%&'*+\/=?^`{|}~-]+@[\w-]+(?:\.[\w-]+)*\b"  #to find valid email

r'^[a-z0-9_-]{3,16}$'    #check username: Minimum length of 3, maximum length of 16, composed of lowercase letters, numbers, underscores or dashes (no /.../ delimiters in Python)

#check password: Minimum length of 6, at least one uppercase letter, at least one lowercase letter, at least one number, at least one special character
r'(?=^.{6,}$)((?=.*\w)(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[|!"$%&\/\(\)\?\^\'\\\+\-\*]))^.*'  

r'^(?=([0-9]*[a-zA-Z]){2,})([a-zA-Z0-9]{8,32})$'  #at least 2 letters (uppercase or lowercase) at any index, minimum length of 8, maximum length of 32

r'^(((https?|ftp):\/\/)?([\w\-\.])+(\.)([\w]){2,4}([\w\/+=%&_\.~?\-]*))*$' #to get website name




# Using RE in Function

In [7]:
def hexrepl(match):
    """Turn the decimal text of a regex match into its hexadecimal string.

    ``match`` is a match object whose whole matched text is a base-10
    integer; the result is that integer formatted by ``hex()``.
    """
    decimal_text = match.group()
    return hex(int(decimal_text))

# Every run of decimal digits is handed to hexrepl, whose return value replaces it.
p = re.compile(r'\d+')
print(p.sub(repl=hexrepl, string='Call 65490 for printing, 49152 for user code.'))


Call 0xffd2 for printing, 0xc000 for user code.
