In [1]:
import re
from pytools.regex import PatternSequence

In [14]:
from IPython.core.interactiveshell import InteractiveShell
# "all": display the value of every top-level expression in a cell, not only the last one
# (the cells below rely on this to show several results each)
InteractiveShell.ast_node_interactivity = "all"

In [9]:
s = 'This is a string '  # note the trailing space
s

s.strip()  # leading/trailing whitespace removed
s.replace('string', 'number')  # returns a new string; s itself is not modified
s.split()  # splits on whitespace by default; also accepts an explicit delimiter

'This is a string '

'This is a string'

'This is a number '

['This', 'is', 'a', 'string']

In [14]:
# str.format fills each {} placeholder; non-string arguments are converted to text
s = 'This is a {}'
s.format('string')
s.format(12)

# f-string: the expression inside {} is evaluated in place
x = 'string'
f'This is a {x}'

# several positional placeholders are filled in argument order
s = 'This {} a {}'
s.format('is', 'word')

# named placeholders are filled from keyword arguments, in any order
s = 'This {x} a {y}'
s.format(y='word', x='is')

'This is a string'

'This is a 12'

'This is a string'

'This is a word'

'This is a word'

In [17]:
s = 'this is a string'
s.capitalize()  # first character upper-cased, the rest lower-cased

'this is a string, this string is wide'.count('string')  # number of non-overlapping occurrences

'This is a string'

2

In [19]:
# str -> bytes and back; s is the string assigned in an earlier cell
bs = s.encode('utf-8')
bs

bs.decode('utf-8')

b'this is a string'

'this is a string'

In [22]:
'this is a string'.find('str')  # index of the first occurrence (-1 if the substring is absent)

10

In [30]:
'abc'.isalpha()  # alphabetic chars
'5abs'.isalpha()  # False: '5' is not alphabetic
'5abs'.isalnum()  # alphanumeric
'2322'.isdigit()  # all digits


True

False

True

True

False

False

In [32]:
' : '.join(['this', 'is', 'a', 'string'])  # the string join() is called on is used as the separator

'this : is : a : string'

In [33]:
'this is a string \n it is wide'.splitlines()  # split at line breaks; the breaks themselves are dropped, other whitespace is kept

['this is a string ', ' it is wide']

# Regular expressions

explore: 
* re.sub
* re.match
* re.search
* re.findall


## PatternSequence tests

In [2]:
# Lookup from currency symbol to ISO code, plus the symbols and codes as
# separate lists (both in the dict's insertion order)
currency_symbol_to_iso3 = {
    '£': 'GBP',
    '$': 'USD',
    '€': 'EUR',
}
currency_symbols = [*currency_symbol_to_iso3]
currencies = [currency_symbol_to_iso3[symbol] for symbol in currency_symbols]

In [3]:
# PatternSequence comes from the project-local pytools.regex module; each add() call below
# passes a pattern string and a callable that turns the captured groups into a dict.
ps = PatternSequence()

# "<amount> <ISO code>", e.g. "12.5 USD".
# Raw f-strings (fr'...') keep \d, \s and \. as written; in a non-raw literal they are
# invalid string escapes that Python only tolerates with a warning.
ps.add(fr'([-+]?\d*\.\d+|\d+)\s*({"|".join(currencies)})',
       lambda g: {'amount': float(g[0]), 'currency': g[1]})

# "<symbol><amount>", e.g. "$-13.342".
# Only the symbols are escaped (re.escape), instead of rewriting every '$' in the finished pattern.
ps.add(fr'({"|".join(map(re.escape, currency_symbols))})\s*([-+]?\d*\.\d+|\d+)',
       lambda g: {'amount': float(g[1]),
                  'currency': currency_symbol_to_iso3[g[0]]})

In [4]:
ps.get_patterns()  # shows the pattern strings that were added to ps

['([-+]?\\d*\\.\\d+|\\d+)\\s*(GBP|USD|EUR)',
 '(£|\\$|€)\\s*([-+]?\\d*\\.\\d+|\\d+)']

In [5]:
ps.match('$-13.342')  # symbol-first input; the result is the dict built by the callable given to add()

{'amount': -13.342, 'currency': 'USD'}

## regex

In [28]:
# most characters match themselves
re.match('word', 'word')

re.match('word', 'this is a word') is None  # re.match only tries at the start of the string; the pattern would have to allow for the text before 'word'

re.match('word', 'word is good')  # fine: the pattern is found at the start of the string

# metacharacters:  . ^ $ * + ? { } [ ] \ | ( )
re.match('$', '$') is None  # a bare $ is the end-of-string anchor, not a literal dollar sign
re.match(r'\$', '$')  # backslash-escape a metacharacter to match it literally (raw string, so the backslash reaches re)

<re.Match object; span=(0, 4), match='word'>

True

True

<re.Match object; span=(0, 1), match='$'>

<re.Match object; span=(0, 4), match='word'>

### classes ```[..]```

In [29]:
# [...] - defines a class of characters; it matches exactly one character that is either listed
# or falls in a range written with -, like 0-9
re.match('[1-5]', '4')
re.match('[a-e]', 'b')
re.match('[abxy]', 'y')

# [a-z] matches any lowercase ASCII letter

<re.Match object; span=(0, 1), match='4'>

<re.Match object; span=(0, 1), match='b'>

<re.Match object; span=(0, 1), match='y'>

In [33]:
re.match('[$£]', '$')  # most metachars lose their special meaning inside a class; can use [$] instead of \$ to match $

re.match('[^abc]', 'a') is None  # ^ at the start of the class negates it: match any char except the listed ones
re.match('[^abc]', 'xyz') 

<re.Match object; span=(0, 1), match='$'>

True

<re.Match object; span=(0, 1), match='x'>

### backslash ```\```

In [36]:
# \w - any "word" character; for ASCII text the same as [a-zA-Z0-9_]
# (with str patterns it also matches non-ASCII letters and digits unless re.ASCII is set)
# raw strings are used so that the backslash reaches re instead of being an invalid string escape
re.match(r'\w', 'ab x')
re.match(r'\w', '&ab x') is None

# \W is for any non-word character, i.e. [^a-zA-Z0-9_] for ASCII text
# by convention, the capital letter usually means the negation of the lowercase class

<re.Match object; span=(0, 1), match='a'>

True

* \d : 0-9
* \s : any whitespace
* . : anything except newline

* class can be a union of chars, ranges and escapes: like ```[0-6\s,.]```

In [39]:
# * : repeat zero, one or many times; it applies to the char right before the *
re.match('ca*t', 'ct')
re.match('ca*t', 'cat')
re.match('ca*t', 'caaaatt, other things')  # the match stops after the first 't'; the rest of the string is ignored

<re.Match object; span=(0, 2), match='ct'>

<re.Match object; span=(0, 3), match='cat'>

<re.Match object; span=(0, 6), match='caaaat'>

In [41]:
re.match('[0-9]*', '181857198198273')  # classes can be repeated too
# * is greedy: it takes as many repetitions as possible while the whole pattern can still match;
# if the next step of the match fails, it backs off and retries with fewer repetitions

<re.Match object; span=(0, 15), match='181857198198273'>

* ```+``` : same as ```*``` but requires at least one occurrence
* ```?``` : match once or zero times
* ```{m,n}``` : at least m, at most n repetitions (so ```?``` is ```{0,1}```, ```*``` is ```{0,}```, ```+``` is ```{1,}```)

In [44]:
# A pattern string can be compiled once into a regexp object;
# flags such as case-insensitivity are given at compile time
p = re.compile('ab*')
p_no_case = re.compile('ab*', flags=re.I)  # re.I is the short name of re.IGNORECASE

# the compiled object offers .match(..) as a method
p.match('abb')
p.match('AbB') is None
p_no_case.match('AbB')

<re.Match object; span=(0, 3), match='abb'>

True

<re.Match object; span=(0, 3), match='AbB'>

Methods of a compiled regexp (we used only .match(..) so far)

|method| operation|
|:-----|:-------|
|```match()```|  Determine if the RE matches at the beginning of the string.|
|```search()```|  Scan through a string, looking for any location where this RE matches.|
|```findall()```|  Find all substrings where the RE matches, and returns them as a list.|
|```finditer()```|  Find all substrings where the RE matches, and returns them as an iterator.|

In [45]:
re.match('[0-3]', 'abc2') is None  # match() only tries at the start of the string
re.search('[0-3]', 'abc2')  # search() scans forward to the first location that matches

True

<re.Match object; span=(3, 4), match='2'>

In [50]:
# a match object records where the match was found and what text it covers
m = re.search('[0-4]', 'abc 3 xyz')
m.start()  # index of the first matched character
m.end()  # index just past the last matched character
m.span()  # (start, end) as a tuple
m.group()  # returns the substring that was actually matched

4

5

(4, 5)

'3'

In [57]:
re.findall(r'\d*\$', '12$ and 43$ plus 123$')  # returns a list of the matched substrings
# use a raw string (r'...') for the pattern so that backslashes reach re unchanged

match_iter = re.finditer(r'\d*\$', '12$ and 43$ plus 123$')  # returns an iterator over match objects, one per match
match_iter 
list(match_iter)  # consuming the iterator yields the match objects (with their spans)

['12$', '43$', '123$']

<callable_iterator at 0x7f4338d1ce20>

[<re.Match object; span=(0, 3), match='12$'>,
 <re.Match object; span=(8, 11), match='43$'>,
 <re.Match object; span=(17, 21), match='123$'>]

* the re module offers the same operations as module-level functions: re.match, re.search etc.; to be used as ```re.match(pattern_str, input_str)```.
it caches compiled regexps, so calling re.compile(..) is not required (unless we want to save a pattern and reuse it many times in the code - this also shortens the notation)

* re.compile(..) is also useful when a flag should be applied during regexp compilation

* flags:
    - IGNORECASE : match irrespectively of the case 'a' will match 'A', etc
    - MULTILINE : ^ and $ will match start/end of each line, not only start/end of the string 
    - DOTALL : make '.' include '\n' and thus match everything
    - and others

### zero-width assertions

In [68]:
# | - alternation: match either the left or the right alternative
re.findall('[0-9]|[a-z]', '5 £$% v')

# ^ matches at the start of the string
re.findall('^[a-z]', 'abc')

# $ matches at the end of the string
re.findall('[a-z]$', 'abc')

re.match('^[a-z]$', 'abc') is None  # exactly one letter between start and end is required
re.match('^[a-z]*$', 'abc')  # any number of letters between start and end

re.findall('[a-z]', 'abc')
re.findall('[a-z]*', 'abc')  # does not list every possible repetition count: only the longest run and the empty match left at the end

['5', 'v']

['a']

['c']

True

<re.Match object; span=(0, 3), match='abc'>

['a', 'b', 'c']

['abc', '']

\A and \Z match the start/end of the string (even in MULTILINE mode, unlike ^ and $)

In [76]:
# \b matches at the start or end of a word (a word being a sequence of alphanumeric chars)
re.search(r'\bword\b', 'this is a word')  # MUST USE r'' (raw string): in a plain Python string \b is the backspace character
re.search(r'\bword\b', 'this is a buzzword') is None  # 'word' inside 'buzzword' has no word boundary in front of it

# \B : the opposite - only matches inside words (not at the edges)

<re.Match object; span=(10, 14), match='word'>

True

###  groups ```(...)```

In [88]:
# each group in the pattern is written inside (..)
# repetition qualifiers (*, +, ?, {m,n}) can be applied to a whole group
re.match('(12)+', '1212')

# if the pattern contains multiple groups, their values and positions can be accessed by passing
# an index starting from 1 to .group(..), .span(..)
# .groups() returns a tuple of the matched group values (groups numbered from left to right)
m = re.match('(12)*(34)*', '123434')
m.groups()  # a repeated group reports only its last repetition: (34)* matched twice, '34' appears once

m.span(1), m.span(2)
m.group(1), m.group(2)
m.group(1, 2)  # several indices at once -> tuple of values

<re.Match object; span=(0, 4), match='1212'>

('12', '34')

((0, 2), (4, 6))

('12', '34')

('12', '34')

In [89]:
# nested groups: numbered by the position of their opening parenthesis, so the outer group comes first
m = re.match(r'(a(b)c)d', 'abcd')
m.groups()

('abc', 'b')

In [93]:
# group backreferences : \1 must match exactly the same text that group 1 matched (likewise \2, \3 etc.)
re.match(r'(ab|xy)\s+\1', 'ab ab')
re.match(r'(ab|xy)\s+\1', 'ab xy') is None  # 'xy' fits the group's pattern, but it is not the text the group captured

<re.Match object; span=(0, 5), match='ab ab'>

True

### Extensions
* specified by ```(?...)``` (here ? is not a repetition operator: right after ( there is nothing to repeat, so it is not ambiguous)
* the ? is followed by a symbol that determines the extension; a ```(?P...)``` prefix instead of a plain ```(?...)``` marks a Python-specific extension

In [94]:
# non-capturing groups (?:...) are skipped in .groups(); they only serve to structure the pattern
m = re.match("(?:[abc])+", "abc")
m.groups()

# useful when modifying an existing pattern for which other code already relies on the order of the captured groups

()

In [98]:
# named group : (?P<name>...)
# useful if the groups are processed later: makes the code more robust and clear, no magic numbers; compare
m = re.match(r'(?P<first>\w+) (?P<last>\w+)', 'Jane Doe')
m.groupdict()  # named groups as a {name: value} dict
m.groups()  # named groups are still available by position

m = re.match(r'(\w+) (\w+)', 'Jane Doe')
m.groups()

# the named version is more robust, e.g. to a change like this (the position of 'last' shifts, its name does not)
m = re.match(r'(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)', 'Jane Diana Doe')
m.groupdict()

{'first': 'Jane', 'last': 'Doe'}

('Jane', 'Doe')

('Jane', 'Doe')

{'first': 'Jane', 'middle': 'Diana', 'last': 'Doe'}

In [104]:
# backreference by name : (?P=name) must match the same text that the named group matched
re.match(r'(?P<first>\w+) (?P<last>\w+) (?P=first)', 'Jane Doe Jane')

<re.Match object; span=(0, 13), match='Jane Doe Jane'>

In [108]:
# lookahead : (?=...) - check that the pattern matches here (fail if not), then continue matching the
# main pattern from the same position (the assertion itself does not show up as a group)
# (?!...) is the negative version: fail if the pattern matches here

# e.g. match filename main part and extension, but extension cannot be .txt or .md
filename_pat = re.compile('(.*)[.]((?!txt$|md$)[^.]*)$')
m = filename_pat.match('name.csv')
m.groups()
filename_pat.match('name.md') is None  # rejected by the negative lookahead

('name', 'csv')

True

### modifying strings with re

In [111]:
# re.split() : split the string, using whatever matches the pattern as the delimiter
re.split(r'\W+', 'This , is a$string')

re.split(r'(\W+)', 'This , is a$string')  # delimiter pattern inside (..) - the delimiters found are returned as well

['This', 'is', 'a', 'string']

['This', ' , ', 'is', ' ', 'a', '$', 'string']

In [122]:
# re.sub() : replace whatever matches the pattern by another string
s = 'abxyz, other word, xyz'
replacement = 'repl_string'
re.sub(r'(ab)*xyz', replacement, s)  # (ab)* is optional, so both 'abxyz' and the bare 'xyz' are replaced
re.subn(r'(ab)*xyz', replacement, s)  # also returns the number of replacements

'repl_string, other word, repl_string'

('repl_string, other word, repl_string', 2)

In [124]:
# The pattern is laid out with spaces for readability, so it must be compiled with re.VERBOSE:
# without the flag the spaces are literal characters and nothing in the input matches.
p = re.compile('section{ ( [^}]* ) }', re.VERBOSE)
# here attention (!): [^}]* is used rather than .* - a greedy .* would run past the first } up to the last one
p.sub(r'subsection{\1}','section{First} section{second}')

'subsection{First} subsection{second}'

In [125]:
# removing a list of literal strings: each entry is passed through re.escape so that an entry
# containing regex metacharacters (e.g. '$' or '.') is still matched literally;
# no capturing group is needed because the matched text is not used in the replacement
str_to_remove = ['USD', 'EUR', 'GBP']
re.sub('|'.join(map(re.escape, str_to_remove)), '', 'Currencies like USD, RUB, EUR, JPY and GBP')

'Currencies like , RUB, , JPY and '

In [127]:
# substitute whole words only
re.sub(r'\bword\b', 'phrase', 'this is a word') 
re.sub(r'\bword\b', 'phrase', 'this is a buzzword')  # left unchanged, unlike the string method .replace()

'this is a phrase'

'this is a buzzword'

# Context-free grammars. Lark

### EBNF grammar
* Tree parsing algorithms 
* DSL - Domain Specific Language
(put info)

On PyPI, lark was originally published under the name lark-parser (releases from 1.0 on are published as lark)

In [1]:
from lark import Lark

In [15]:
# simple grammar (a grammar is specified as a string, usually multiline, or in a separate text file)

grammar = r"""
start: WORD "," WORD "!"
%import common.WORD
%ignore " "
"""

In [16]:
# build a parser from the grammar string
parser = Lark(grammar)

parser.parse('Hello,   World!')  # returns a Tree instance; only the two WORD tokens appear in it

Tree(start, [Token(WORD, 'Hello'), Token(WORD, 'World')])

In [38]:
# grammar for arithmetic operations
# start='sum' makes the rule 'sum' the entry point instead of the default 'start'

parser = Lark('''?sum: product
                     | sum "+" product   -> add
                     | sum "-" product   -> sub
 
                 ?product: item
                     | product "*" item  -> mul
                     | product "/" item  -> div
 
                 ?item: NUMBER           -> number
                      | "-" item         -> neg
                      | "(" sum ")"
 
                 %import common.NUMBER
                 %import common.WS
                 %ignore WS
         ''', start='sum')

In [40]:
parser.parse('(3 + 4) * (1-(4+2)/4)')  # tree nodes carry the -> aliases from the grammar (mul, add, sub, div, number)

Tree(mul, [Tree(add, [Tree(number, [Token(NUMBER, '3')]), Tree(number, [Token(NUMBER, '4')])]), Tree(sub, [Tree(number, [Token(NUMBER, '1')]), Tree(div, [Tree(add, [Tree(number, [Token(NUMBER, '4')]), Tree(number, [Token(NUMBER, '2')])]), Tree(number, [Token(NUMBER, '4')])])])])

Grammar:
* terminals - define the language alphabet
* rules - define structure
* terminal may be a string, a regular expression, or a concatenation of these and other terminals

* in re ? means optional (0 or 1 repetition)
* in EBNF optionals are supported too (and auto-expanded) ```a b? c    ->    (a c | a b c)```
* b* likewise means repetition (zero or more times)

and so on



* each definition or directive is in its own line in the grammar
* comments : //
* definition is a named rule or terminal
```
rule: <EBNF EXPRESSION>
      | etc.

TERM: <EBNF EXPRESSION>   // Rules aren't allowed
```
* rule names are always in lowercase, terminal names - in uppercase
* parsing starts with the rule 'start'
* terminal syntax (support strings, regexps inside /../, grammar operators like |,?,*,+)
```
<NAME> [. <priority>] : <literals-and-or-terminals>
```

* examples of terminals
```
IF: "if"
INTEGER : /[0-9]+/
INTEGER2 : ("0".."9")+          //# Same as INTEGER
DECIMAL.2: INTEGER? "." INTEGER  //# Will be matched before INTEGER
WHITESPACE: (" " | /\t/ )+
SQL_SELECT: "select"i
```

rules
* syntax
```
<name> : <items-to-match>  [-> <alias> ]
       | ...
```
* extended to multiple lines by the | 'or' operator
* an item is a terminal, a rule, (item item item) - a group, an item with a quantifier like item*, item?, or [item item] - an optional group
* 

In [None]:
# further explore

# how to use Tree
# Transformer