<a href="https://docs.python.org/3/library/re.html#module-re"> Python Standard Library re</a>

<a href="https://regex101.com/"> Test regular expression here </a>

<a href="https://github.com/ziishaned/learn-regex"> learn-regex github </a>

# Checking a Pair

In [0]:
def displaymatch(match):
  if not match:
    return None
  return '<Match: %r, groups=%r>' % (match.group(), match.groups())

In [0]:
import re
valid = re.compile(r'^[a2-9tjqk]{5}')

In [4]:
displaymatch(valid.match('akt5q'))

"<Match: 'akt5q', groups=()>"

In [0]:
pair = re.compile(r'.*(.).*\1')

In [6]:
displaymatch(pair.match('717ak'))

"<Match: '717', groups=('7',)>"

In [7]:
displaymatch(pair.match('354aa'))

"<Match: '354aa', groups=('a',)>"

# Making a Phone Book

In [0]:
text = """Ross McFluff 834.345.1254 155 Elm Street

Ronald Heathmore: 892.345.3428 436 Finley Avenue
Frank Burger: 925.541.7625 662 South Dogwood Way

Heather Albrecht: 548.326.4584 919 Park Place"""

In [0]:
import re
entries = re.split(r'\n+', text)

In [3]:
entries

['Ross McFluff 834.345.1254 155 Elm Street',
 'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
 'Frank Burger: 925.541.7625 662 South Dogwood Way',
 'Heather Albrecht: 548.326.4584 919 Park Place']

In [4]:
[re.split(r':? ', entry, maxsplit=3) for entry in entries]

[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]

# Text Munging

In [0]:
def repl(m):
  import random
  innerWord = list(m.group(2))
  print(innerWord)
  random.shuffle(innerWord)
  print(innerWord)
  return m.group(1) + ''.join(innerWord) + m.group(3)

In [11]:
text = "Professor Abdolmalek, please report your absense promptly."
re.sub('(\w)(\w+)(\w)', repl, text)

['r', 'o', 'f', 'e', 's', 's', 'o']
['o', 'f', 'o', 'e', 'r', 's', 's']
['b', 'd', 'o', 'l', 'm', 'a', 'l', 'e']
['l', 'm', 'b', 'o', 'e', 'a', 'l', 'd']
['l', 'e', 'a', 's']
['e', 'l', 'a', 's']
['e', 'p', 'o', 'r']
['r', 'e', 'p', 'o']
['o', 'u']
['o', 'u']
['b', 's', 'e', 'n', 's']
['s', 'e', 'n', 's', 'b']
['r', 'o', 'm', 'p', 't', 'l']
['m', 'l', 'r', 'p', 't', 'o']


'Pofoerssr Almboealdk, pelase rrepot your asensbe pmlrptoy.'

In [0]:
import random
random.shuffle?

In [12]:
text = "He was carefully disguised but captured quickly by police."
re.findall(r'\w+ly', text)

['carefully', 'quickly']

In [14]:
for m in re.finditer(r'\w+ly', text):
  print('%02d-%02d: %s' % (m.start(), m.end(), m.group(0)))

07-16: carefully
40-47: quickly


In [15]:
re.match(r"\W(.)\1\W", " ff ")

<_sre.SRE_Match object; span=(0, 4), match=' ff '>

In [0]:
re.match('\W(.)\1\W',' ff ')

In [24]:
re.match('\\W(.)\\1\\W',' ff ')

<_sre.SRE_Match object; span=(0, 4), match=' ff '>

In [0]:
import collections
import re

Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])

# Writing a Tokenizer

In [42]:
import collections
import re

Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])

def tokenize(code):
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',   r'\d+(\.\d*)?'),  # Integer or decimal number
        ('ASSIGN',   r':='),           # Assignment operator
        ('END',      r';'),            # Statement terminator
        ('ID',       r'[A-Za-z]+'),    # Identifiers
        ('OP',       r'[+\-*\/]'),      # Arithmetic operators
        ('NEWLINE',  r'\n'),           # Line endings
        ('SKIP',     r'[ \t]+'),       # Skip over spaces and tabs
        ('MISMATCH', r'.'),            # Any other character
    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    line_num = 1
    line_start = 0
    for mo in re.finditer(tok_regex, code):
        kind = mo.lastgroup
        value = mo.group()
        column = mo.start() - line_start
        if kind == 'NUMBER':
            value = float(value) if '.' in value else int(value)
        elif kind == 'ID' and value in keywords:
            kind = value
        elif kind == 'NEWLINE':
            line_start = mo.end()
            line_num += 1
            continue
        elif kind == 'SKIP':
            continue
        elif kind == 'MISMATCH':
            raise RuntimeError(f'{value!r} unexpected on line {line_num}')
        yield Token(kind, value, line_num, column)

statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for token in tokenize(statements):
    print(token)

Token(type='IF', value='IF', line=2, column=4)
Token(type='ID', value='quantity', line=2, column=7)
Token(type='THEN', value='THEN', line=2, column=16)
Token(type='ID', value='total', line=3, column=8)
Token(type='ASSIGN', value=':=', line=3, column=14)
Token(type='ID', value='total', line=3, column=17)
Token(type='OP', value='+', line=3, column=23)
Token(type='ID', value='price', line=3, column=25)
Token(type='OP', value='*', line=3, column=31)
Token(type='ID', value='quantity', line=3, column=33)
Token(type='END', value=';', line=3, column=41)
Token(type='ID', value='tax', line=4, column=8)
Token(type='ASSIGN', value=':=', line=4, column=12)
Token(type='ID', value='price', line=4, column=15)
Token(type='OP', value='*', line=4, column=21)
Token(type='NUMBER', value=0.05, line=4, column=23)
Token(type='END', value=';', line=4, column=27)
Token(type='ENDIF', value='ENDIF', line=5, column=4)
Token(type='END', value=';', line=5, column=9)


In [43]:
print(tokenRegex)

(?P<NUMBER>\d+(\.\d*)?)|(?P<ASSIGN>:=)|(?P<END>;)|(?P<ID>[A-Za-z]+)|(?P<OP>+\-*\/)|(?P<NEWLINE>\n)|(?P<SKIP>[ \t]+)|(?P<MISMATCH>.)


In [0]:
tokenRegex = r'(?P<NUMBER>\d+(\.\d*)?)|(?P<ASSIGN>:=)|(?P<END>;)|(?P<ID>[A-Za-z]+)|(?P<OP>\+\-\*\/)|(?P<NEWLINE>\n)|(?P<SKIP>[ \t]+)|(?P<MISMATCH>.)'

In [0]:
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

In [47]:
re.finditer(tokenRegex, statements)

<callable_iterator at 0x7fdca68f2fd0>

In [48]:
for mo in re.finditer(tokenRegex, statements):
  print(mo.lastgroup)
  print(mo.group())
  print(mo.start())
  print(mo.end())
  

NEWLINE


0
1
SKIP
    
1
5
ID
IF
5
7
SKIP
 
7
8
ID
quantity
8
16
SKIP
 
16
17
ID
THEN
17
21
NEWLINE


21
22
SKIP
        
22
30
ID
total
30
35
SKIP
 
35
36
ASSIGN
:=
36
38
SKIP
 
38
39
ID
total
39
44
SKIP
 
44
45
MISMATCH
+
45
46
SKIP
 
46
47
ID
price
47
52
SKIP
 
52
53
MISMATCH
*
53
54
SKIP
 
54
55
ID
quantity
55
63
END
;
63
64
NEWLINE


64
65
SKIP
        
65
73
ID
tax
73
76
SKIP
 
76
77
ASSIGN
:=
77
79
SKIP
 
79
80
ID
price
80
85
SKIP
 
85
86
MISMATCH
*
86
87
SKIP
 
87
88
NUMBER
0.05
88
92
END
;
92
93
NEWLINE


93
94
SKIP
    
94
98
ID
ENDIF
98
103
END
;
103
104
NEWLINE


104
105
