# Chapter 2: Strings and text

## 2.1. Splitting strings on any of multiple delimeters: re.split()

You want to split the stirng based on delimiters and spacing but they are not consistent.

In [47]:
line = 'dwfwrg fcwoirjfw; fwigjfw, wfwg,gfwgw,     www'

# not good:
print(line.split())

import re
# \s* is for al the spaces between the delimiter and the next char
print(re.split(r'[;,\s]\s*', line))

# Using a capture group (paranthesys instead of square brackets)
# then it prints out also the matched text
re.split(r'(;|,|\s])\s*', line)

['dwfwrg', 'fcwoirjfw;', 'fwigjfw,', 'wfwg,gfwgw,', 'www']
['dwfwrg', 'fcwoirjfw', 'fwigjfw', 'wfwg', 'gfwgw', 'www']


['dwfwrg fcwoirjfw', ';', 'fwigjfw', ',', 'wfwg', ',', 'gfwgw', ',', 'www']

In [48]:
fields = re.split(r'(;|,|\s)\s*', line)
# starts at element 0 and skips one element and picks the subsequent, and so on
values = fields[::2]
# starts at element position in 1 and skips the subsequent element
delimeters = fields[1::2] + ['']
print(values)
print(delimeters)

# you can reform the line
print(''.join(v+d for v, d in zip(values, delimeters)))

# if you need to use a capture group but you do not want
# to save the delimiters, theN´n use ?::
re.split(r'(?:;|,|\s)\s*', line)

['dwfwrg', 'fcwoirjfw', 'fwigjfw', 'wfwg', 'gfwgw', 'www']
[' ', ';', ',', ',', ',', '']
dwfwrg fcwoirjfw;fwigjfw,wfwg,gfwgw,www


['dwfwrg', 'fcwoirjfw', 'fwigjfw', 'wfwg', 'gfwgw', 'www']

## 2.2. Matching text at the start or end of a string: .endswith() .atartswith()

You need to check the start or the end of a string for text patterns.

In [57]:
import os

filename = 'spam.txt'
print(filename.endswith(('.txt', '.md')))
print(filename.startswith('file:'))
url = 'http://www.hallooo.org'
print(url.startswith('http:'))

filenames = os.listdir('.')
print(filenames)

print(any(name.endswith('.md') for name in filenames))

# MAKE SURE THE CHOICES ARE A TUPLE!!!

[name for name in filenames if name.endswith(('.md', '.h'))]


True
False
True
['Chapter_2.ipynb', 'README.md', '.ipynb_checkpoints']
True


['README.md']

## 2.3. Matching strings using shell wildcard patterns: fnmatch

Match text with commonly used patterns in unix shell.

In [79]:
from fnmatch import fnmatch, fnmatchcase
# Mac is case sensitive, unline windows os
# if this matters, then use fnmatchcase, which makes
# the command case sensitive
print(
    fnmatch('foo.txt', '*.txt'),
    fnmatch('foo.txt', '*.TXT'),
    fnmatch('foo.txt', '?oo.txt'),
    fnmatch('Dat4544554956544584.csv', 'Dat[0-9]*')
)

names = ['Dat1.csv', 'config.ini']
[name for name in names if fnmatch(name, 'Dat*.csv')]

# fnmatch can also be used for non-file names
addresses = [
    '7424 N CLARK ST',
    '2922 W HALOO AVE'
]

print([addr for addr in addresses if fnmatchcase(addr, '*ST')])
print([addr for addr in addresses if fnmatchcase(addr, '74[1-9][1-9] *CLARK*')])

# For code that explicitly works with filenames, use the glob module of 5.13.

True False True True
['7424 N CLARK ST']
['7424 N CLARK ST']


## 2.4. Splitting strings on any of multiple delimeters: re.compile, .match(), .findall(), .finditer()

You want to search or match text for a specific pattern.

In [107]:
# You can use exact match with ==, startswith, endswith, and 
text = 'yes no no no yes no yes yes'
# Yields the position of the first character of the 
# first occurance
print(text.find('no'))

import re

date = '11/27/2021'
# \d+ means match one or more digits
# The "$" at the end is to speciy an exact match
pattern = re.compile(r'\d+/\d+/\d+$')

# This always starts checking from the beginning of
# the string
print('yes') if pattern.match(date) else print('no')

date_2 = 'lipsum 11/27/2021 lipsum 11/27/2021'

# this finds the pattern wherever it is in the tex
pattern.findall(date_2)

# it is notmal to use capture groups. This simplifies
# later processing because you can selectively
# pick the groups later on

pattern = re.compile(r'(\d+)/(\d+)/(\d+)')
m = pattern.match(date)
print(
    m.group(0),
    m.group(1),
    m.group(3)
)
print(m.groups())

m = pattern.findall(date_2)
print(m,
m[0][0])

# USE finditer to find them iteratively
for m in pattern.finditer(date_2):
    print(m.groups())

4
yes
11/27/2021 11 2021
('11', '27', '2021')
[('11', '27', '2021'), ('11', '27', '2021')] 11
('11', '27', '2021')
('11', '27', '2021')


## 2.5. Searching and replacing text

In [113]:
# for simple literal patterns, use:
text = 'yes no no no yes no yes yes'
text.replace('yes', 'yep')

# for more complicated paterns
import re

text =  'lipsum 11/27/2021 lipsum 11/27/2021'
# very useful to use groups
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

# also
pattern = re.compile(r'(\d+)/(\d+)/(\d+)')
pattern.sub(r'\3-\1-\2', text)

'lipsum 2021-11-27 lipsum 2021-11-27'

In [118]:
# For more complicated substitutions, you can use
#  a callback function instead

from calendar import month_abbr

# the input is a matcg or find object
def change_date(m):
    
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))

newtext = pattern.sub(change_date, text)
pattern.subn(change_date, text)

('lipsum 27 Nov 2021 lipsum 27 Nov 2021', 2)

## 2.6. Searching and replacing case-insensitive text

## 2.7. Specifying a regular expression for the shortest match

## 2.8. Writing a regular expression for multiline patterns

## 2.9. Normalizing unicode text to a standard representation

## 2.10. Working with unicode characters in regular expressions

## 2.11. Stripping unwanted characters from strings

## 2.12. Sanitizing and cleaning up text

## 2.13. Aligning text strings

## 2.14. Combining and concatenating strings

## 2.15. Interpolating variables in strings

## 2.16. Reformatting text to a fixed number of columns

## 2.17. Handling HTML and XML entities in text

## 2.18. Tokenizing text

## 2.19. Writing a simple recursive descent parser

## 2.20. Performing text operations on byte strings