# Simple string search and modify
- methods on string class
- useful but limited functionality

In [10]:
# find substring

s = "foozapbar"
s.index('zap')

3

In [11]:
# substring present

'zap' in s

True

In [12]:
s.startswith('foo')

True

In [13]:
# handy for checking file name types

s.endswith('bar')

True

In [14]:
s.replace('zap', 'MAP')

'fooMAPbar'

In [15]:
s.split('a')

['fooz', 'pb', 'r']

In [16]:
s.split('a')

['fooz', 'pb', 'r']

In [17]:
# handy predicates

x = 'aA3 '

# one row per character: the character, then the result of each predicate
for c in x:
    flags = (c.isalpha(), c.isdigit(), c.isalnum(), c.islower(), c.isupper())
    print(c, *flags)

a True False True True False
A True False True False True
3 False True True False False
  False False False False False


In [18]:
dir(s)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isascii',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',


# Regular Expressions
- very powerful, widely used
- syntax a tad cryptic at first glance
- Python has a fairly standard implementation, similar to what other languages provide
- module is 're'
- [standard doc](https://docs.python.org/3/library/re.html)
- [more readable doc](https://docs.python.org/3/howto/regex.html#regex-howto)
- [online development](https://regex101.com)

In [19]:
import re

s = 'zxcvx97848438455ysdfx234yzX333Ycv555234'

In [20]:
match = re.search("5+", s)
match.group()

'55'

In [21]:
re.findall('5+', s)

['55', '555']

# simple patterns
- '.' matches any char except newline
- '*' matches zero or more of the preceding pattern
- '+' matches one or more of the preceding pattern
- [...] define a character set
    - [abz] - match 'a', 'b', or 'z'
    - [a-m] - matches first half of lower case letters
    - [0-9] or \d - matches digits
    - \s - matches whitespace char
    - leading '^' inverts the character set
        - [^0-9] - matches any char except digits

In [22]:
# this pattern will find substrings that
# have one or more digits between 'x' and 'y'

# or 'x[0-9]+y'

# raw string: the backslash in '\d' is passed to the regex engine as-is
# instead of being treated as an (invalid) Python string escape
pat = r'x\d+y'

In [23]:
s

'zxcvx97848438455ysdfx234yzX333Ycv555234'

In [24]:
# find all substrings that match the pattern
# note match is case sensitive

re.findall(pat, s)

['x97848438455y', 'x234y']

In [25]:
# case insensitive search

re.findall(pat, s, re.IGNORECASE)

['x97848438455y', 'x234y', 'X333Y']

In [26]:
s

'zxcvx97848438455ysdfx234yzX333Ycv555234'

In [27]:
# find substrings built out of the chars 3,4,8

re.findall('[348]+',s)

['8484384', '34', '333', '34']

In [28]:
s, pat

('zxcvx97848438455ysdfx234yzX333Ycv555234', 'x\\d+y')

In [29]:
# split on the regular pattern

re.split(pat, s)

['zxcv', 'sdf', 'zX333Ycv555234']

In [30]:
s

'zxcvx97848438455ysdfx234yzX333Ycv555234'

In [31]:
# some re operations return a 'match object'
# if the match succeeds 
# if there is no match, None is returned...

re.search('aaaaa', s), re.search('555', s)

(None, <re.Match object; span=(33, 36), match='555'>)

In [32]:
# ... easy to test with 'if' (either type)

5 if re.search('aaaaa', s) else -1, 3 if re.search('555', s) else -1

(-1, 3)

# substring replacement
- makes an otherwise difficult task easy

In [33]:
# replace the pattern with a string

re.sub(pat, 'FOOBAR', s)

'zxcvFOOBARsdfFOOBARzX333Ycv555234'

In [34]:
# only replace the first occurrence of the pattern

re.sub(pat, 'FOOBAR', s, count=1)

'zxcvFOOBARsdfx234yzX333Ycv555234'

In [35]:
# reverse a word in a string

def revw(s, w):
    """Return s with every occurrence of the literal word w reversed.

    w is matched as plain text, not as a regular expression, so words
    containing characters such as '.', '+' or '(' are handled correctly.
    If w does not occur in s, s is returned unchanged.
    """
    rw = w[::-1]
    # re.escape makes regex metacharacters in w match literally; the callable
    # replacement inserts rw verbatim, so backslashes in it are not
    # interpreted as escapes or group references
    return re.sub(re.escape(w), lambda m: rw, s)
    
revw('reverse a word, any word', 'word'),revw('reverse a word, any word', 'people')

('reverse a drow, any drow', 'reverse a word, any word')

# re groups
- can group patterns with '()'
- can fish out what each group matched
- try [online](https://regex101.com)

In [36]:
s

'zxcvx97848438455ysdfx234yzX333Ycv555234'

In [37]:
#  define two groups
pat = '(y[sz]).*(y[sz])'
m = re.search(pat, s)
m

<re.Match object; span=(16, 26), match='ysdfx234yz'>

In [38]:
# group 0 shows everything that matched
# the other groups show what matched inside '()'

m.group(0), m.group(1), m.group(2)

('ysdfx234yz', 'ys', 'yz')

In [39]:
# could put the middle part of the match in a group as well

m = re.search('(y[sz])(.*)(y[sz])', s)
m.group(0), m.group(1), m.group(2), m.group(3)

('ysdfx234yz', 'ys', 'dfx234', 'yz')

# Example - decrypt

In [40]:
e = '{SVIu6Python-)dKct@\\JK)2is:y:=;;~6reallyMZ-&Bk`*6great!NB!|Krj##'

In [41]:
words = re.findall('[0-9][^0-9]+', e)
words

['6Python-)dKct@\\JK)', '2is:y:=;;~', '6reallyMZ-&Bk`*', '6great!NB!|Krj##']

In [42]:
# each chunk starts with a single digit giving the length of the hidden word
for word in words:
    ln = int(word[0])
    decode = word[1:1 + ln]
    print(decode)

Python
is
really
great!


# Example - pull out properties
- complex pattern
- '\d' matches a digit
- '\d{4}' matches 4 digits

In [43]:
s='''
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171B  
<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12M
'''
# split into lines; the [1:-1] slice drops the empty strings produced by
# the newline right after the opening quotes and the one before the closing quotes
urls = s.split('\n')[1:-1]
urls

['<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171B  ',
 '<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  ',
 '<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12M']

In [44]:
# [BKMG] - file length will have a bytes/kilo/mega/giga suffix
# groups, in order: src, href, year, month, day, hour, minute, size, size suffix
# raw string so '\d' and '\s' reach the regex engine unchanged
pat = r'.+src="(.+)" .+href="(.+)".+(\d{4})-(\d{2})-(\d{2}).+(\d{2}):(\d{2})\s+(\d+)([BKMG])'

for u in urls:
    m = re.match(pat, u)
    # re.match returns None when a line does not fit the pattern;
    # report it instead of failing on m.groups()
    if m is None:
        print('no match:', u)
        continue
    print(m.groups())

('/icons/unknown.gif', 'Problems_chap2.nb', '2009', '04', '22', '15', '16', '171', 'B')
('/icons/layout.gif', 'Problems_chap2.pdf', '2009', '10', '12', '13', '15', '252', 'K')
('/icons/unknown.gif', 'Style07.nb', '2009', '04', '22', '15', '16', '12', 'M')
