In [31]:
#2.1 Splitting Strings on Any Multiple Delimiters

import re

line = 'Jason! Kevin,   Terry# Jen'
re.split(r'[#,!\s]\s*', line)

['Jason', 'Kevin', 'Terry', 'Jen']

In [32]:
#2.2 Matching Text at the Start or End of a String

filename = 'passwd_list.txt'
filename.startswith('passw')

True

In [33]:
import os

folder1 = os.listdir('.')
folder1

['.ipynb_checkpoints', 'Chapter_2_Strings&Text.ipynb']

In [38]:
[name for name in folder1 if name.endswith(('.ipynb'))]

['Chapter_2_Strings&Text.ipynb']

In [47]:
#2.3 Matching Strings Using Shell Wildcard Patterns

from fnmatch import fnmatch, fnmatchcase

etc_files = ['passwd.txt', 'passwd_list.txt', 'shadow.txt', 'shadow1.txt']
[name for name in etc_files if fnmatch(name, 'passwd*.txt')]

['passwd.txt', 'passwd_list.txt']

In [50]:
[name for name in etc_files if fnmatch(name,'shadow*[0-9].txt')]

['shadow1.txt']

In [56]:
#2.4 Matching and Searching for Text Patterns

message = 'this is a private pin mumber, 123456, do not disclose confidential information'

if re.search(r'\d', message):
    print('yes')
else:
    print('no')

yes


In [57]:
pin = re.findall(r'\d', message)
pin

['1', '2', '3', '4', '5', '6']

In [112]:
#2.5 Searching and Replacing Text

new_message = message.replace('123456', '654321')
print(new_message)

this is a private pin mumber, 654321, do not disclose confidential information


In [113]:
#2.6 Searching and Replacing Case-Insenstitive Text
text_1 = 'ALL CAPS, no caps, Some Caps'
re.findall('caps', text_1, flags=re.IGNORECASE)

['CAPS', 'caps', 'Caps']

In [114]:
text_2 = re.sub('caps', 'null', text_1, flags=re.IGNORECASE)
text_2

'ALL null, no null, Some null'

In [121]:
def matchcase(word):
                     def replace(m):
                         text_1 = m.group()
                     if text_1.isupper():
                         return word.upper()
                     elif text_1.islower():
                         return word.lower()
                     elif text_1[0].isupper():
                         return word.capitalize()
                     else:
                         return word
                         return replace

new_text_1 = re.sub('caps', matchcase('upper case'), text_1, flags=re.IGNORECASE)
new_text_1

'ALL Upper case, no Upper case, Some Upper case'

In [133]:
#2.7 Specifying a Regular Expression for the Shortest Match

credentials = 'your username is "GDawg" and PIN is "8008135"'
str_pattern = re.compile(r'\"(.*?)\"')
str_pattern.findall(credentials)

['GDawg', '8008135']

In [152]:
#2.8 Writing a Regular Expression for Multiline Patterns

user1 = re.compile((r'/\*((?:.|\n)*?)\*/'))
credential_1 = '''/* username: Tflow
                     password: nyancat */'''
               
user1.findall(credential_1)

[' username: Tflow\n                     password: nyancat ']

In [248]:
#2.9 Normalizing Unicode Text to a Standard Representation

user23 = 'Ivan Perisi\u010d'
user24 = 'Ivan Perisic\u030c'
print(user23)
print(user24)

Ivan Perisič
Ivan Perisič


In [163]:
import unicodedata

t1 = unicodedata.normalize('NFC', user23)
t2 = unicodedata.normalize('NFC', user24)
t1 == t2

True

In [195]:
#2.10 Working with Unicode Characters in Regular Expressions

num = re.compile('\d+')
num.match('0678')

<re.Match object; span=(0, 4), match='0678'>

In [206]:
num.match('\u0667\u0668')

<re.Match object; span=(0, 2), match='٧٨'>

In [249]:
pat = re.compile('Ivan Perisi\u010d', re.IGNORECASE)
user23 = 'Ivan Perisič'
pat.match(user23)

<re.Match object; span=(0, 12), match='Ivan Perisič'>

In [207]:
#2.11 Stripping Unwanted Characters from Strings

text_str = "The hacker's name is Michael"
text_str.strip('Michael')

"The hacker's name is "

In [215]:
#2.12 Sanitizing and Cleaning Up Text

messy_str = 'Hi my name$ is #Michael'
remap = {
    ord('$'): '',
    ord('#'): None
}
clean_str = messy_str.translate(remap)
clean_str

'Hi my name is Michael'

In [220]:
#2.13 Aligning Text Strings

billboard = "Don't miss out on the Crypto Boom!!!"
new_billboard = format(billboard, '$^50')
print(new_billboard)


$$$$$$$Don't miss out on the Crypto Boom!!!$$$$$$$


In [221]:
#2.14 Combining and Concatenating Strings

words = ['Is',
         'Cambridge',
         'part',
         'of',
         'Boston?'
        ]
' '.join(words)


'Is Cambridge part of Boston?'

In [224]:
#2.15 Interpolating Variables in Strings

login_info = 'Please Login with {username}, your password is {password}'
login_info.format(username='<Kayla>', password='<lulzsec>')

'Please Login with <Kayla>, your password is <lulzsec>'

In [225]:
class Info:
    def __init__(hacker, username, password):
        hacker.username = username
        hacker.password = password
        
creds = Info('<Kayla>', '<lulzsec>')
login_info.format_map(vars(creds))
        

'Please Login with <Kayla>, your password is <lulzsec>'

In [234]:
#2.16 Reformating Text to a Fixed Number of Columns

lyrics = "Where it began, I can't begin to knowing, But then I know it's growing strong, Was in the spring, And spring became the summer, Who'd have believed you'd come along , Hands, touching hands, Reaching out, touching me, touching you\Sweet Caroline, Good times never seemed so good, I've been inclined, To believe they never would"

import textwrap
print(textwrap.fill(lyrics,25))

Where it began, I can't
begin to knowing, But
then I know it's growing
strong, Was in the
spring, And spring became
the summer, Who'd have
believed you'd come along
, Hands, touching hands,
Reaching out, touching
me, touching you\Sweet
Caroline, Good times
never seemed so good,
I've been inclined, To
believe they never would


In [235]:
#2.17 Handling HTML and XML Entities in Text

import html

warning_msg = 'You are not on the sudoers list, this incident will be reported "<tag>text</text>".'
print(html.escape(warning_msg))

You are not on the sudoers list, this incident will be reported &quot;&lt;tag&gt;text&lt;/text&gt;&quot;.


In [236]:
print(html.escape(warning_msg, quote=False))

You are not on the sudoers list, this incident will be reported "&lt;tag&gt;text&lt;/text&gt;".


In [250]:
user24.encode('ascii', errors='xmlcharrefreplace')

b'Ivan Perisic&#780;'

In [251]:
from html.parser import HTMLParser

user25 = 'Ivan Perisic&#780;'
parser = html
user25 = parser.unescape(user25)
print(user25)

Ivan Perisič


In [252]:
user25 == user24

True

In [261]:
#2.18 Tokenizing Text

text_a = 'baz = 12 - 6 / 2'

tokens = [
    ('NAME', 'baz'),
    ('EQ', '='),
    ('NUM', '12'),
    ('MINUS', '-'),
    ('NUM', '6'),
    ('DIVIDE', '/'),
    ('NUM', '2')
]

NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
MINUS = r'(?P<MINUS>\-)'
DIVIDE = r'(?P<DIVIDE>\/)'
EQ = r'(?P<EQ>\=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, MINUS, DIVIDE, EQ, WS]))

scanner = master_pat.scanner('baz = 12')
scanner.match()

<re.Match object; span=(0, 3), match='baz'>

In [262]:
_.lastgroup,_.group()

('NAME', 'baz')

In [263]:
scanner.match()

<re.Match object; span=(3, 4), match=' '>

In [264]:
_.lastgroup,_.group()

('WS', ' ')

In [265]:
scanner.match()

<re.Match object; span=(4, 5), match='='>

In [266]:
_.lastgroup,_.group()

('EQ', '=')

In [267]:
scanner.match()

<re.Match object; span=(5, 6), match=' '>

In [268]:
_.lastgroup,_.group()

('WS', ' ')

In [269]:
scanner.match()

<re.Match object; span=(6, 8), match='12'>

In [270]:
_.lastgroup,_.group()

('NUM', '12')

In [271]:
from collections import namedtuple

Token = namedtuple('Token', ['type', 'value'])

def generate_tokens(pat, text_a):
    scanner = pat.scanner(text_a)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
        
for tok in generate_tokens(master_pat, 'baz = 12'):
    print(tok)

Token(type='NAME', value='baz')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='12')


In [272]:
tokens = (tok for tok in generate_tokens(master_pat, text_a)
          if tok.type != 'WS')
for tok in tokens:
    print(tok)

Token(type='NAME', value='baz')
Token(type='EQ', value='=')
Token(type='NUM', value='12')
Token(type='MINUS', value='-')
Token(type='NUM', value='6')
Token(type='DIVIDE', value='/')
Token(type='NUM', value='2')


In [277]:
#2.19 Writing a Simple Recursive Descent Parser

import collections

def generate_tokens(text_a):
    scanner = master_pat.scanner(text_a)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok

class ExpressionEvaluator:
    
    def parse(self,text_a):
        self.tokens = generate_tokens(text_a)
        self.tok = None 
        self.nexttok = None 
        self._advance() 
        return self.expr()
    
    def _advance(self):
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
        
    def _accept(self,toktype):
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False
        
def _expect(self,toktype):
    if not self._accept(toktype):
        raise SyntaxError('Expected ' + toktype)
        
def expr(self):
    "expression ::= term { ('+'|'-') term }*"
    
    exprval = self.term()
    while self._accept('PLUS') or self._accept('MINUS'):
        op = self.tok.type
        right = self.term()
        if op == 'PLUS':
            exprval += right
        elif op == 'MINUS':
            exprval -= right
    return exprval

def term(self):
    "term ::= factor { ('*'|'/') factor }*"
 
    termval = self.factor()
    while self._accept('TIMES') or self._accept('DIVIDE'):
        op = self.tok.type
        right = self.factor()
        if op == 'TIMES':
            termval *= right
        elif op == 'DIVIDE':
            termval /= right
    return termval

def factor(self):
    "factor ::= NUM | ( expr )"
    
    if self._accept('NUM'):
        return int(self.tok.value)
    elif self._accept('LPAREN'):
        exprval = self.expr()
        self._expect('RPAREN')
        return exprval
    else:
        raise SyntaxError('Expected NUMBER or LPAREN')

In [279]:
e = ExpressionEvaluator()
e.parse('12 - 6 / 2')

AttributeError: 'ExpressionEvaluator' object has no attribute 'expr'

In [281]:
#2.20 Performing Text Operations on Byte Strings

my_name = b'Gavin Poon'
my_name.split()

[b'Gavin', b'Poon']

In [282]:
my_name.replace(b'Gavin', b'Call me Mr.')

b'Call me Mr. Poon'