# Język Python - Wykład 5.

## Wyrażenia regularne

Some people, when confronted with a problem, think "I
know, I'll use regular expressions." Now they have two
problems.
-- Jamie Zawinski, '<alt.religion.emacs>' (08/12/1997)


In [1]:
import re

## Metaznaki

## Użycie

In [4]:
p = re.compile('ab|bc')
print(p)

re.compile('ab|bc')


In [5]:
print(p.match('bc'))

<_sre.SRE_Match object; span=(0, 2), match='bc'>


In [6]:
print(p.match('ba'))

None


In [7]:
print(re.search('b', 'abc'))

<_sre.SRE_Match object; span=(1, 2), match='b'>


In [8]:
print(re.match('b', 'abc'))

None


In [9]:
print(re.match('a', 'abc'))

<_sre.SRE_Match object; span=(0, 1), match='a'>


In [16]:
print(re.split(r'\W+', 'Words, words, words.'))

['Words', 'words', 'words', '']


In [17]:
print(re.split(r'(\W+)', 'Words, words, words.'))

['Words', ', ', 'words', ', ', 'words', '.', '']


In [14]:
text = 'ddabababababcc'
p = re.compile('(ab)+')
span = p.search(text).span()
print(span)
print(text[span[0]:span[1]])

(2, 12)
ababababab


In [15]:
p = re.compile('a/{,3}b')
print(p.match('ab'))
print(p.match('a/b'))
print(p.match('a//b'))
print(p.match('a///b'))
print(p.match('a////b'))

<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 3), match='a/b'>
<_sre.SRE_Match object; span=(0, 4), match='a//b'>
<_sre.SRE_Match object; span=(0, 5), match='a///b'>
None


In [19]:
print(re.search(r'\bfoo\b', 'bar foo baz').group())

foo


In [20]:
print(re.findall(r'[0-9]', 'a1b2c3d4e5'))

['1', '2', '3', '4', '5']


In [21]:
for m in re.finditer(r'[0-9]','a1b2c3d4e5'):
    print(m.group(0), m.span())

1 (1, 2)
2 (3, 4)
3 (5, 6)
4 (7, 8)
5 (9, 10)


## Notacja „raw string”

In [22]:
re.match(r"\W(.)\1\W", " ff ").group()

' ff '

In [23]:
re.match("\\W(.)\\1\\W", " ff ").group()

' ff '

## Niezachłanne dopasowania

In [24]:
s = '<html><head><title>Title</title>'

In [26]:
print(re.match('<.*>', s).group())

<html><head><title>Title</title>


In [27]:
print(re.match('<.*?>', s).group())

<html>


## Grupowanie

In [28]:
p = re.compile('(a(b)c)d')
m = p.match('abcd')

In [29]:
m.group(0)

'abcd'

In [30]:
m.group(1)

'abc'

In [31]:
m.group(2)

'b'

In [32]:
#Match dates formatted like MM/DD/YYYY, MM-DD-YY,...
import re

date = '01/12/2013'

regex = re.compile(r'^(?P<day>\d\d)[-/](?P<month>\d\d)[-/](?P<year>\d\d(?:\d\d)?)$')

match = regex.match(date)

In [33]:
print(match.group(0), match.group(1), match.group(2), match.group(3))

01/12/2013 01 12 2013


In [34]:
print(match.group('day'), match.group('month'), match.group('year'))

01 12 2013


In [35]:
print(match.groups())

('01', '12', '2013')


In [36]:
print(regex.groupindex)

{'day': 1, 'year': 3, 'month': 2}


In [37]:
re.match(r'(ala) \1', 'ala ala').group(0)

'ala ala'

## Extension Notation (?...)

Nazwana grupa (?P<name>...)

Powielenie (?P=)

Nieprzechwytująca grupa (?:foo)

Positive Lookahead (?=...)

Negative Lookahead (?!...)

Positive look behind (?<=regex)

Negative look behind (?<!regex)

Warunkowe wyrażenie (?(id/name)yes-pattern|no-pattern)

### Powielenie (?P=<name>)

In [38]:
p = re.compile(r'(?P<word>\b\w+)\s+(?P=word)')

In [39]:
p.match('ala ala').group()

'ala ala'

In [40]:
p = re.compile(r'(?P<word>\b\w+)\s+\1')

In [41]:
p.match('ala ala').group()

'ala ala'

### Nieprzechwytująca grupa (?:foo)

In [43]:
re.match(r'(\w+@\w+(?:\.\w+)+)','adres@uj.edu.pl').groups()

('adres@uj.edu.pl',)

### Positive Lookahead (?=...)

In [45]:
re.match(r'(\w+(?=@\w+(?:.\w+)+))','adres@uj.edu.pl').group()

'adres'

In [46]:
re.match(r'(\w+(?=@\w+(?:.\w+)+)@)','adres@uj.edu.pl').group()

'adres@'

### Negative Lookahead (?!...)

In [47]:
s='plik.py'
re.match(r'(.*)[.](?!bat$|exe$).*$',s).group(1)

'plik'

In [49]:
s='plik.bat'
re.match(r'(.*)[.](?!bat$|exe$).*$',s).group(1)

AttributeError: 'NoneType' object has no attribute 'group'

### Positive Lookbehind

In [51]:
s='From: adres@uj.edu.pl'
re.search(r'(?<=From: )(.*)',s).groups()

('adres@uj.edu.pl',)

In [52]:
s='From: adres@uj.edu.pl'
re.search(r'(?<=F*)(.*)',s).groups()

error: look-behind requires fixed-width pattern

### Negative Lookbehind 

In [54]:
s='From: adres@uj.edu.pl'
re.search(r'(?<!^Subject)(?<=: )(.*)',s).group()

'adres@uj.edu.pl'

## Flagi

### re.VERBOSE

In [55]:
pat = re.compile(r"\s*(?P<header>[^:]+)\s*:(?P<value>.*?)\s*$")

In [56]:
pat.match('From: adres@uj.edu.pl').groups()

('From', ' adres@uj.edu.pl')

In [57]:
pat = re.compile(r"""
    \s*               # Skip leading whitespace
    (?P<header>[^:]+) # Header name
    \s* :             # Whitespace, and a colon
    (?P<value>.*?)    # The header's value -- *? used to
                      # lose the following trailing whitespace
    \s*$
""", re.VERBOSE)


In [58]:
pat.match('From: adres@uj.edu.pl').groups()

('From', ' adres@uj.edu.pl')

### re.IGNORECASE

In [59]:
re.match(r'(?i)ala','Ala').group()

'Ala'

### re.MULTILINE re.DOTALL

In [60]:
s="""Ala ma
kota"""

print(re.match(r'^A.*ta$', s))
print(re.match(r'(?ms)^A.*ta$', s))

None
<_sre.SRE_Match object; span=(0, 11), match='Ala ma\nkota'>


In [61]:
p = re.compile('.*', re.DEBUG)


p = re.compile('(<)?(\w+@\w+(?:\.\w+)+)(?(1)>)', re.DEBUG)



max_repeat 0 2147483647
  any None
max_repeat 0 1
  subpattern 1
    literal 60
subpattern 2
  max_repeat 1 2147483647
    in
      category category_word
  literal 64
  max_repeat 1 2147483647
    in
      category category_word
  max_repeat 1 2147483647
    subpattern None
      literal 46
      max_repeat 1 2147483647
        in
          category category_word
subpattern None
  groupref_exists 1
    literal 62


## Podstawienie

In [63]:
#urlify - turn URLs into HTML links
import re

text = 'Check the web site, http://www.oreilly.com/catalog/regexppr.'

pattern = r'''
    \b                                    # start at word boundary
    (                                     # capture to \1
    (https?|telnet|gopher|file|wais|ftp) :# resource and colon
    [\w/#~:.?+=&%@!\-] +?                 # one or more valid chars
                                          # take little as possible
    )
    (?=                                   # lookahead
    [.:?\-] *                             # for possible punc
    (?: [^\w/#~:.?+=&%@!\-]               # invalid character
    | $ )                                 # or end of string
    )'''

regex = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
result = regex.sub(r'<a href="\1">\1</a>', text)

print(result)

Check the web site, <a href="http://www.oreilly.com/catalog/regexppr">http://www.oreilly.com/catalog/regexppr</a>.


### Podstawienie funkcją

In [64]:
import random
def repl(m):
    inner_word = list(m.group())
    random.shuffle(inner_word)
    return "".join(inner_word)


In [65]:
text = "Professor Abdolmalek, please report your absences promptly."
re.sub("(?:\w)(\w+)(?:\w)", repl, text)

'Porsorfse Aamlkdbole, lesaep erotpr uyor ebasscne pryplmot.'

In [66]:
indent = lambda s: min(map(len,re.findall('^ *(?=\S)', s, re.MULTILINE)))
flush_left = lambda s: re.sub('^ {%d}' % indent(s), '', s, flags=re.MULTILINE)

In [68]:
s="""    
    Bardzo
       roznie
     wciety
    tekst 
"""
print(s)
print()
print(flush_left(s))

    
    Bardzo
       roznie
     wciety
    tekst 



Bardzo
   roznie
 wciety
tekst 

