## 2.1. Разрезание строк различными разделителями

In [71]:
line = 'asdf fjdk; afed, fjek,asdf, foo '

import re

re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo', '']

In [3]:
# Захватывающие скобки

fields = re.split(r'(;|,|\s)\s*', line)
fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [9]:
values = fields[::2]
delimiters = fields[1::2] + ['']
print(values)
print(delimiters)

''.join(v+d for v,d in zip(values, delimiters))

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
[' ', ';', ',', ',', ',', '']


'asdf fjdk;afed,fjek,asdf,foo'

In [10]:
# использование незахватывающей группы

re.split(r'(?:,|;|\s)\s*', line)


['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [81]:
line_1 = 'flflf , fllflff:,, lflfllf,   flflflf , flkd;gk '

print(re.split(r'\s?(?:,|;|:|\s+)+\s*', line_1))
re.split(r'\s+', line_1)

['flflf', 'fllflff', 'lflfllf', 'flflflf', 'flkd', 'gk', '']


['flflf', ',', 'fllflff:,,', 'lflfllf,', 'flflflf', ',', 'flkd;gk', '']

## 2.2. Поиск текста в начале и в конце строки

In [84]:
filename = 'spam.txt'
print(filename.endswith('txt'))
print(filename.startswith('fuile:'))

url = 'http://www.python.org'
url.startswith('http:')

True
False


True

In [89]:
import os

filenames = os.listdir('.')
print(filenames)
[name for name in filenames if name.endswith(('.c', '.h'))]

print(any(name.endswith('.py') for name in filenames))

['Python Cookbook Часть первая.ipynb', 'floats.bin', 'dummy', 'Глава 4 Unicode-текст и байты.ipynb', 'cafe.txt', 'Python Cookbook Глава 2 Строки и текст.ipynb', 'Пользовательские атрибуты в Python.ipynb', 'Untitled.ipynb', '.ipynb_checkpoints']
False


In [92]:
from urllib.request import urlopen

def read_data(name):
    if name.startswith(('http:', 'https:', 'ftp:')):
        return urlopen(name).read()
    else:
        with open(name) as f:
            return f.read()

In [94]:
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
url.startswith(tuple(choices))

True

In [97]:
# использование срезов

filename = 'spam.txt'
filename[-4:] == '.txt'
url = 'http://www.python.org'
url[:5] == 'http:' or url[:6] == 'https:' or url[:4] == 'ftp:'

True

In [98]:
# использование регулярных выражений

import re
url = 'http://www.python.org'
re.match('http:|https:|ftp:', url)

<re.Match object; span=(0, 5), match='http:'>

In [115]:
from pathlib import Path

if not any(str(name).endswith(('.c', '.h')) for name in Path.cwd().iterdir()):
    print('Yes')
    


# str(Path.cwd())

Yes


## 2.3. Поиск строк с использованием масок оболочки (shell)

In [119]:
from fnmatch import fnmatch, fnmatchcase

print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[name for name in names if fnmatch(name, 'Dat*.csv')]

True
True
True


['Dat1.csv', 'Dat2.csv']

In [120]:
fnmatch('foo.txt', '*.TXT')

False

In [126]:
addresses = [
 '5412 N CLARK ST',
 '1060 W ADDISON ST',
 '1039 W GRANVILLE AVE',
 '2122 N CLARK ST',
 '4802 N BROADWAY',
]

from fnmatch import fnmatchcase
print([addr for addr in addresses if fnmatchcase(addr, '* ST')])
print([addr for addr in addresses if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')])

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
['5412 N CLARK ST']


## 2.4. Поиск совпадений и поиск текстовых паттернов

In [130]:
text = 'yeah, but no, but yeah, but no, but yeah'

text == 'yeah'

text.startswith('yeah')
text.endswith('no')

text.find('no')

10

In [132]:
# использование регулярных выражений

text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

import re

if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')
    
if re.match(r'\w+\s?\d+', text2):
    print('yes')
else:
    print('no')

yes
yes


In [135]:
datepat = re.compile(r'\d+/\d+/\d+')

if datepat.match(text1):
    print('yes')
else:
    print('no')
    
if datepat.match(text2):
    print('yes')
else:
    print('no')

yes
no


In [141]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

datepat.findall(text)

datepat = re.compile(r'(\d+)/(\d+)/(\d+)')


m = datepat.match('11/27/2012')
print(m.group(0))
print(m.group(1))
print(m.group(2))
print(m.group(3))
print(m.groups())

month, dat, year = m.groups()

print(datepat.findall(text))

for month, day, year in datepat.findall(text):
    print('{}-{}-{}'.format(year, month, day))

11/27/2012
11
27
2012
('11', '27', '2012')
[('11', '27', '2012'), ('3', '13', '2013')]
2012-11-27
2013-3-13


In [142]:
# находим все вхождения итеративно

for m in datepat.finditer(text):
    print(m.groups())

('11', '27', '2012')
('3', '13', '2013')


In [149]:
# для точного совпадения добавить \$

datepat = re.compile(r'(\d+)/(\d+)/(\d+)\$')

# print(datepat.match('11/27/2012abcdef'))
datepat.match('11/27/2012')

datepat.match('11/27/2012')

In [152]:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)\$')
datepat.match('11/27/2012abcdef')
datepat.match('11/27/2012')

re.findall(r'(\d+)/(\d+)/(\d+)', text)

[('11', '27', '2012'), ('3', '13', '2013')]

## 2.5. Поиск и замена текста

In [153]:
text = 'yeah, but no, but yeah, but no, but yeah'

text.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [154]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

import re

re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [155]:
# компилирование шаблона

import re

datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [158]:
# подстановочная функция обратного вызова

from calendar import month_abbr

def change_date(m):
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))

datepat.sub(change_date, text)

newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext, n, sep='\n')

Today is 2012-11-27. PyCon starts 2013-3-13.
2


## 2.6. Поиск и замена текста без учета регистра

In [160]:
# флаг re.IGNORCASE

text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall('python', text, flags=re.IGNORECASE))
re.sub('python', 'snake', text, flags=re.IGNORECASE)

['PYTHON', 'python', 'Python']


'UPPER snake, lower snake, Mixed snake'

In [161]:
# функция поддержки

def matchcase(word):
    def replace(m):
        text = m.group()
        print(text)
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace


# пример использования вышеуказанной функции 

re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

PYTHON
python
Python


'UPPER SNAKE, lower snake, Mixed Snake'

## 2.7. Определение регулярных выражений для поиска
## кратчайшего совпадения

In [169]:
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer days "no".'
print(str_pat.findall(text1))

text2 = 'Computer dats "no". Phone says "yes".'
print(str_pat.findall(text2))

str_pat = re.compile(r'\"(.*?)\"')
print(str_pat.findall(text2))



['no']
['no". Phone says "yes']
['no', 'yes']


## 2.8. Написание регулярного выражения
## для многострочных шаблонов

In [214]:
comment = re.compile(r'/\*((.|/n)*?)\*/')

text1 = '/* this is a comment */'
text2 = '''/* this is a
... multiline comment */
... 
*/ ghjgjg */               '''

comment.findall(text1)
comment.findall(text2)

commet = re.compile(r'/\*((.|\n)*?)\*/')

commet.findall(text2)

comm = commet = re.compile(r'/\*((?:.|\n)*?)\*/')
comm.findall(text2)


[' this is a\n... multiline comment ']

In [215]:
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\n... multiline comment ']

## 2.9. Нормализация текста в Unicode
## к стандартному представлению

In [223]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

print(s1, s2, sep='\n')

print(s1 == s2)
print(len(s1), len(s2), sep='\n')

import unicodedata
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)
print(ascii(t1))

t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)
print(t3 == t4)
print(ascii(t3))

Spicy Jalapeño
Spicy Jalapeño
False
14
15
True
'Spicy Jalape\xf1o'
True
'Spicy Jalapen\u0303o'


In [228]:
s = '\ufb01'
print(s)
print(unicodedata.normalize('NFD', s))

print(unicodedata.normalize('NFKD', s))
print(unicodedata.normalize('NFKC', s))

ﬁ
ﬁ
fi
fi


In [229]:
# удаление диакритических знаков

t1 = unicodedata.normalize('NFD', s1)
''.join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeno'

## 2.10. Использование символов Unicode в регулярных
## выражениях

In [232]:
import re

num = re.compile('\d+')
num.match('123')
num.match('\u0661\u0662\u0663')

<re.Match object; span=(0, 3), match='١٢٣'>

In [237]:
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
pat.match(s)

pat.match(s.upper())
s = s.upper()
pat.match(s)

## 2.11. Срезание нежелательных символов из строк

In [241]:
# срезание пробелов

s = ' hello world \n'
print(s.strip())
print(s.lstrip())
print(s.rstrip())

# срезание символов

t = '-----hello====='
print(t.lstrip('-'))
print(t.rstrip('='))
print(t.strip('-='))

hello world
hello world 

 hello world
hello=====
-----hello
hello


In [249]:
s = ' hello                      world \n'
s = s.strip()
print(s)

s1 = s.replace('  ', '')
re.sub('\s+', ' ', s)

hello                      world


'hello world'

In [250]:
with open(filename) as f:
    lines = (lines.strip() for line in f)
    for line in lines:
        ...

FileNotFoundError: [Errno 2] No such file or directory: 'spam.txt'

## 2.12. Чистка строк

In [252]:
s = 'pýtĥöñ\fis\tawesome\r\n'

In [253]:
remap = {
    ord('\t') : ' ',
    ord('\f') : ' ',
    ord('\r') : None,
}

a = s.translate(remap)
a

'pýtĥöñ is awesome\n'

In [256]:
# удаление комбинирующихся символов

import unicodedata
import sys

cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) 
                         if unicodedata.combining(chr(c)))
# print(cmb_chrs)
b = unicodedata.normalize('NFD', a)
print(b)
b.translate(cmb_chrs)

pýtĥöñ is awesome



'python is awesome\n'

In [273]:
# таблица перевода, которая отображает все десятичные Unicode
# на их эквиваленты ASCII

digitmap = { c: ord('0') + unicodedata.digit(chr(c)) 
           for c in range(sys.maxunicode)
           if unicodedata.category(chr(c)) == 'Nd' }

print(len(digitmap))

x = '\u0661\u0662\u0663'
print(x.translate(digitmap))

x1 = '\u0661'

x1.translate(digitmap)


650
123


'1'

In [280]:
# ASCII декодирование убирает все комбинирующиеся символы

b = unicodedata.normalize('NFD', a)
b.encode('ascii', 'ignore').decode('ascii')

b.encode('ascii', 'ignore').decode('ascii')

'python is awesome\n'

In [281]:
# str.replace() часто оказывается самым быстрым способом

def clean_spaces(s):
    s = s.replace('\r', '')
    s = s.replace('\t', ' ')
    s = s.replace('f', ' ')
    return s

## 2.13. Выравнивание текстовых строк

In [288]:
# методы ljust(), rjust(), center()

text = 'Hello World'
text.ljust(20)

text.rjust(20)
text.center(20)

text.rjust(20,'=')
text.ljust(20,'*')
text.center(20,'*')

'****Hello World*****'

In [296]:
format(text, '>20')
format(text, '<20')
format(text, '^20')

format(text, '=>20s')
format(text, '=<20s')
format(text, '*^20s')

'****Hello World*****'

In [297]:
'{:>10s} {:>10s}'.format('Hello', 'World')

'     Hello      World'

In [301]:
# format работает не только со строками, но и любыми форматами

x = 1.2345
format(x, '>10')
format(x, '^10.2f')

'   1.23   '

In [305]:
'%-20s' %text
'%20s' %text

'         Hello World'

## 2.14. Объединение и конкатенация строк

In [308]:
parts = ['Is', 'Chicago', 'Not', 'Chicago?']

' '.join(parts)
','.join(parts)
''.join(parts)

'IsChicagoNotChicago?'

In [309]:
a = 'Is Chicago'
b = 'Not Chicago?'
a + ' ' + b

'Is Chicago Not Chicago?'

In [311]:
print('{} {}'.format(a, b))
print(a + ' ' + b)

Is Chicago Not Chicago?
Is Chicago Not Chicago?


In [315]:
a = 'Hello' ' World'
a

'Hello World'

In [316]:
data = ['ACME', 50, 91.1]
','.join(str(d) for d in data)

'ACME,50,91.1'

In [321]:
def sample():
    yield 'Is'
    yield 'Chicago'
    yield 'Not'
    yield 'Chicago?'
    
text = ''.join(sample())
text

for part in sample():
    f.write(part)

NameError: name 'f' is not defined

In [323]:
def combine(source, maxsize):
    parts = []
    size = 0
    for part in source:
        parts.append(part)
        size += len(part)
        if size > maxsize:
            yield ''.join(parts)
            parts = []
            size = 0
    yield ''.join(parts)
    
    
for part in combine(sample(), 3):
    print(part)

IsChicago
NotChicago?



In [324]:
def combine(source, maxsize):
    parts = []
    size = 0
    for part in source:
        parts.append(part)
        size += len(part)
        if size > maxsize:
            yield ''.join(parts)
            perts = []
            size = 0
    yield ''.join(parts)

## 2.15. Интерполяция переменных в строках

In [325]:
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)

'Guido has 37 messages.'

In [326]:
name = 'Guido'
n = 37
s.format_map(vars())

'Guido has 37 messages.'

In [328]:
# vars() такэе рабоает с экземплярами

class Info:
    def __init__(self, name, n):
        self.name = name
        self.n = n
        
a = Info('Guido', 37)
s.format_map(vars(a))

'Guido has 37 messages.'

In [329]:
s.format(name='Guido')

KeyError: 'n'

In [334]:
class safesub(dict):
    def __missing__(self, key):
        return '{' + key + '}'
    
s.format_map(safesub(vars()))

'Guido has {n} messages.'

In [344]:
import sys

def sub(text):
    return text.format_map(safesub(sys._getframe(1).f_locals))

name = 'Guido'
n = 37
print(sub('Hello {name}'))
print(sub('You have {n} messages.'))
print(sub('Your favorite color is {color}'))
    


Hello Guido
Hello Guido
You have 37 messages.
You have 37 messages.
Your favorite color is {color}
Your favorite color is {color}


In [351]:
name = 'Guido'
n = 37
'%(name) has %(n) messages.' % vars()

ValueError: unsupported format character 'm' (0x6d) at index 17

In [352]:
import string
s = string.Template('$name has $n messages.')
s.substitute(vars())

'Guido has 37 messages.'

## 2.16. Разбивка текста на фиксированное количество
## колонок

In [353]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

In [357]:
import textwrap

print(textwrap.fill(s, 70))
print(textwrap.fill(s, 40))
print(textwrap.fill(s, 40, initial_indent=' '))
print(textwrap.fill(s, 40, subsequent_indent=' '))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
 Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
Look into my eyes, look into my eyes,
 the eyes, the eyes, the eyes, not
 around the eyes, don't look around the
 eyes, look into my eyes, you're under.


In [359]:
# получить размер терминала

import os
os.get_terminal_size()

OSError: [Errno 25] Inappropriate ioctl for device

## 2.17. Работа с HTML- и XML-сущностями в тексте

In [363]:
s = 'Elements are written as "<tag>text</tag>"'
import html
print(s)
print(html.escape(s))

print(html.escape(s, quote=False))

Elements are written as "<tag>text</tag>"
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;"


In [364]:
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

b'Spicy Jalape&#241;o'

In [382]:
s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser
p = HTMLParser()
p.unescape(s)

In [386]:
t = 'The prompt is &gt;&gt;&gt;'

from xml.sax.saxutils import unescape
unescape(t)




'The prompt is >>>'

In [384]:
!pip install xml

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement xml (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for xml[0m[31m
[0m

## 2.18. Токенизация текста

In [420]:
text = 'foo = 23 + 42 * 10'

import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

scanner = master_pat.scanner('foo = 42')

for m in iter(scanner.match, None):
    print(m.lastgroup, m.group())

NAME foo
WS  
EQ =
WS  
NUM 42


In [None]:
import re

NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

In [423]:
# упакуем в генератор

from collections import namedtuple

Token = namedtuple('Token', ['type', 'value'])

def generate_tokens(pat, text):
    scanner = pat.scanner(text)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
        
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)
    
tokens = (tok for tok in generate_tokens(master_pat, text)
         if tok.type != 'WS')
for tok in tokens:
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [430]:
PRINT = r'(?P<PRINT>\bprint\b)'
NAME = r'(?P<NAME>[a-zA-Z][a-zA-Z_0-9]*)'

master_pat = re.compile('|'.join([PRINT, NAME]))

text2 = 'printe'

for tok in generate_tokens(master_pat, text2):
    print(tok)

Token(type='NAME', value='printe')


## 2.19. Написание простого парсера на основе метода
## рекурсивного спуска

In [445]:
import re
import collections

NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                 DIVIDE, LPAREN, RPAREN, WS]))

Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(master_pat, text):
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok
            
# Парсер

class ExpressionEvaluator:
    '''
 Реализация парсера на базе рекурсивного спуска.
 Каждый метод реализует одно правило грамматики.
 Использует метод ._accept(), чтобы протестировать
 и принять текущий токен предварительного просмотра.
 Использует метод ._expect() для точного совпадения
 и отбрасывания следующего токена на входе
 (или возбуждает SyntaxError, если он не совпадает).
 '''
    
    def parse(self, text):
        self.tokens = generate_tokens(master_pat, text)
        
        self.tok = None
        self.nexttok = None
        self._advance()
        return self.expr()
    
    def _advance(self):
        'Продвинуться на один токен вперед'
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
        
    def _accept(self, toktype):
        'Проверить и потребить следующий токен, если он совпадает с toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False
            
    def _expect(self, toktype):
        'Потребить следующий токен, если он совпадает с toktype, или возбудить SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected' + toktype)
            
    # Правила грамматики
    
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval
        
    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
                
        return termval
        
    def factor(self):
        "factor ::= NUM | ( expr )"
        
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
        else:
            raise SyntaxError('Expexted NUMBER or LPAREN')

In [446]:
e = ExpressionEvaluator()
print(e.parse('2'))
print(e.parse('2 + 3'))

2
5


In [459]:
import re
import collections

NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                 DIVIDE, LPAREN, RPAREN, WS]))

Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(master_pat, text):
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok



class ExpressionEvaluator:
    
    def parse(self, text):
        self.tokens = generate_tokens(master_pat, text)
        
        self.tok = None
        self.nexttok = None
        self._advance()
        print(self.nexttok.type)
        return self.expr()
        
    def _advance(self):
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
        
    def _accept(self, toktype):
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False
        
    def _except(self, toktype):
        if not self._accept(toktype):
            raise SyntaxError('Expected' + toktype)
            
    def expr(self):
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.nexttok.type
            right = self.factor()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval
        
        
    def term(self):
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.nexttok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval
        
        
    def factor(self):
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            expr = self.expr()
            self._except('RPAREN')
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')
        

In [462]:
m = ExpressionEvaluator()
print(m.parse('2'))
m.parse('3+5-2')
m.parse('2 + (3 + * 4)')

NUM
2
NUM
NUM


SyntaxError: Expected NUMBER or LPAREN (<string>)

In [464]:
# Простое дерево парсинга

class ExpressionTreeBuilder(ExpressionEvaluator):
    def expr(self):
        "expression ::= term { ('+'|'-') term}"
        
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.factor()
            if op == 'PLUS':
                exprval = ('+', exprval, right)
            elif op == 'MINUS':
                exprval = ('-', exprval, right)
        return exprval
    
    def term(self):
        "term ::= factor { ('*'|'/') factor }"
        
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval = ('*', termval, right)
            elif op == 'DIVIDE':
                termval = ('/', termval, right)
        return termval
    
    def factor(self):
        "factor ::= NUM | ( expr )"
        
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            expr = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')
            
e = ExpressionTreeBuilder()
e.parse('2+3')
e.parse('2 + 3 * 4')

NUM


('+', 2, 3)

In [471]:
class ExpressionTreeBuilder_1:
    
    def parse(self, text):
        self.tokens = generate_tokens(master_pat, text)
        
        self.tok = None
        self.nexttok = None
        self._advance()
        return self.expr()
    
    def _advance(self):
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
    
    def _accept(self, toktype):
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False
        
    def _except(self, toktype):
        if not self._accept(toktype):
            raise SyntaxError('Expected' + toktype)
            
    def expr(self):
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.factor()
            if op == 'PLUS':
                exprval = ('+', exprval, right)
            elif op == 'MINUS':
                exprval = ('-', exprval, right)
        return exprval
        
    def term(self):
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval = ('*', termval, right)
            elif op == 'DIVIDE':
                termval = ('/', termval, right)
        return termval
    
    def factor(self):
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._except('RPAREN')
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')
            
e = ExpressionTreeBuilder_1()
e.parse('2 + 3')

('+', 2, 3)

In [473]:
# Использование инструментов PyParsing или PLY
from ply.lex import lex
from ply.yacc import yacc

# список токенов
tokens = [ 'NUM', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'LPAREN', 'RPAREN' ]

# игнорируемые токены
t_ignore = ' \t\n'

# Определения токенов (в форме регулярных выражений)

t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMEES = r'\*'
t_DIVIDE = r'/'
t_LPAREN = r'('
t_RPAREN = r')'

# Функции обработки токенов
def t_NUM(t):
    r'\d+'
    t.value = int(t.value)
    return t

# Обработчик ошибок
def t_error(t):
    print('Bad character: {!r}'.format(t.value[0]))
    t.skip(1)
    
# Создание лексера
lexer = lex()

# Правила грамматики и функции-обработчики
def p_expr(p):
    '''
    expr : expr PLUS term
    | expr MINUS term
    '''
    if p[2] == '+':
        p[0] = p[1] + p[3]
    if p[2] == '-':
        p[0] = p[1] - p[3]
    
def p_expr_term(p):
    '''
    expr : term
    '''
    p[0] = p[1]
    
def p_term(p):
    '''
    term : term TIMES factor
    | term DIVIDE factor
    '''
    if p[2] == '*':
        p[0] = p[1] * p[3]
    elif p[2] == '/':
        p[0] = p[1] / p[3]
        
def p_term_factor(p):
    '''
    term : factor
    '''
    p[0] = p[1]
    
def p_factor(p):
    '''
    factor : NUM
    '''
    p[0] = p[1]
    
def p_factor_group(p):
    '''
    factor : LPAREN or RPAREN
    '''
    p[0] = p[2]
    
def p_error(p):
    print('Syntax error')
    
parser = yacc()

parser.parse('2')


ERROR: Rule 't_TIMEES' defined for an unspecified token TIMEES
ERROR: Invalid regular expression for rule 't_LPAREN'. missing ), unterminated subpattern at position 0
ERROR: Invalid regular expression for rule 't_RPAREN'. unbalanced parenthesis at position 14


TypeError: <module '__main__'> is a built-in module

$$x^{2x}$$



## 2.20. Выполнение текстовых операций
## над байтовыми строками

In [479]:
data = b'Hello World'
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.split())
print(data.replace(b'Hello', b'Hello Cruel'))

b'Hello'
True
[b'Hello', b'World']
b'Hello Cruel World'


In [481]:
# Операции над байтовыми массивами

data = bytearray(b'Hello World')
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.split())
print(data.replace(b'Hello', b'Hello Cruel'))

bytearray(b'Hello')
True
[bytearray(b'Hello'), bytearray(b'World')]
bytearray(b'Hello Cruel World')


In [483]:
data = b'FOO:BAR,SPAM'
import re
re.split(b'[:,]', data)


[b'FOO', b'BAR', b'SPAM']

In [484]:
# При индексировании байтовых строк получаем целые числа
# а не символы

a = 'Hello World'
print(a[0])
b = b'Hello World'
print(b[0])

H
72


In [486]:
s = b'Hello World'
print(s)
print(s.decode('ascii'))

b'Hello World'
Hello World


In [487]:
# операции форматирования недоступны для байтоых строк

b'{} {} {}'.format(b'ACME', 100, 490.1)

AttributeError: 'bytes' object has no attribute 'format'

In [491]:
'{:s} {:d} {:.2f}'.format('ACME', 100, 490.1).encode('ascii')

b'ACME 100 490.10'

In [502]:
with open('jalape\xf1o.txt', 'w') as f:
    f.write('spicy')
    
from pathlib import Path

[bytes(s) for s in Path('.').iterdir()]

[b'Python Cookbook \xd0\xa7\xd0\xb0\xd1\x81\xd1\x82\xd1\x8c \xd0\xbf\xd0\xb5\xd1\x80\xd0\xb2\xd0\xb0\xd1\x8f.ipynb',
 b'floats.bin',
 b'dummy',
 b'\xd0\x93\xd0\xbb\xd0\xb0\xd0\xb2\xd0\xb0 4 Unicode-\xd1\x82\xd0\xb5\xd0\xba\xd1\x81\xd1\x82 \xd0\xb8 \xd0\xb1\xd0\xb0\xd0\xb9\xd1\x82\xd1\x8b.ipynb',
 b'cafe.txt',
 b'Python Cookbook \xd0\x93\xd0\xbb\xd0\xb0\xd0\xb2\xd0\xb0 2 \xd0\xa1\xd1\x82\xd1\x80\xd0\xbe\xd0\xba\xd0\xb8 \xd0\xb8 \xd1\x82\xd0\xb5\xd0\xba\xd1\x81\xd1\x82.ipynb',
 b'jalape\xc3\xb1o.txt',
 b'\xd0\xa8\xd0\xbf\xd0\xb0\xd1\x80\xd0\xb3\xd0\xb0\xd0\xbb\xd0\xba\xd0\xb0 \xd0\xbf\xd0\xbe \xd1\x80\xd0\xb5\xd0\xb3\xd1\x83\xd0\xbb\xd1\x8f\xd1\x80\xd0\xbd\xd1\x8b\xd0\xbc \xd0\xb2\xd1\x8b\xd1\x80\xd0\xb0\xd0\xb6\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f.ipynb',
 b'\xd0\xa7\xd1\x82\xd0\xbe-\xd1\x82\xd0\xbe \xd0\xbf\xd1\x80\xd0\xbe \xd1\x8e\xd0\xbf\xd0\xb8\xd1\x82\xd0\xb5\xd1\x80',
 b'\xd0\x9f\xd0\xbe\xd0\xbb\xd1\x8c\xd0\xb7\xd0\xbe\xd0\xb2\xd0\xb0\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd0\xba\xd0