## **regex practice**

In [1]:
import re

In [2]:
def show_all_matches(regexes, subject, re_length=6, max_matches=8):
    '''
    Print a small table showing what re.findall returns for each pattern.

    Inputs:
        regexes: iterable of regular expression strings to try.
        subject: the string every pattern is searched in.
        re_length: minimum width of the pattern column. The column is widened
            automatically when a pattern (or the 'regexp' header) is longer,
            so the table stays aligned.
        max_matches: longest list of matches shown before the remainder is
            replaced by '...'. Pass None to always show every match.

    Outputs: None. The table is printed to stdout.
    '''
    # Materialise once so the patterns can be measured and then iterated.
    regexes = list(regexes)
    header = 'regexp'
    # The column may never be narrower than its header or its longest pattern.
    width = max([re_length, len(header)] + [len(regexp) for regexp in regexes])
    padding = ' ' * (width - len(header))
    # The row format does not depend on the pattern, so build it only once.
    row_fmt = ' {:<%d} | {!r}' % width

    print('Sentence:')
    print()
    print('    {}'.format(subject))
    print()
    print(' regexp{} | matches'.format(padding))
    print(' ------{} | -------'.format(padding))
    for regexp in regexes:
        matches = re.findall(regexp, subject)
        if max_matches is not None and len(matches) > max_matches:
            matches = matches[:max_matches] + ['...']
        print(row_fmt.format(regexp, matches))

In [3]:
# Sample text searched by the following cells.
sentence = 'Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.'

# Literal patterns: plain letters and digits match themselves, and matching is
# case-sensitive (compare r'm' with r'M'). findall returns every
# non-overlapping occurrence.
show_all_matches([
    r'a',
    r'm',
    r'M',
    r'Mary',
    r'little',
    r'1',
    r'10',
    r'22',
], sentence)


Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 a      | ['a', 'a', 'a', 'a', 'a']
 m      | ['m', 'm']
 M      | ['M']
 Mary   | ['Mary']
 little | ['little', 'little']
 1      | ['1', '1', '1']
 10     | ['10']
 22     | ['22']


In [4]:
# Shorthand character classes and the dot metacharacter.
res = [
    r'\w',  # a word character: letter, digit or underscore
    r'\d',  # a digit
    r'\s',  # a whitespace character
    r'.',  # any single character except a newline
    r'\.', # a literal period
]
show_all_matches(res, sentence)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \w     | ['M', 'a', 'r', 'y', 'h', 'a', 'd', 'a', '...']
 \d     | ['1', '1', '0', '5', '6', '1', '2', '2', '...']
 \s     | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '...']
 .      | ['M', 'a', 'r', 'y', ' ', 'h', 'a', 'd', '...']
 \.     | ['.', '.', '.']


#### These can be combined together

In [5]:
# First pattern: a '1', three word characters, then one non-word character.
# Second pattern: two digits in a row (matches do not overlap).
show_all_matches([r'1\w\w\w\W', r'\d\d'], sentence, re_length=9)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp    | matches
 ------    | -------
 1\w\w\w\W | ['1056,']
 \d\d      | ['10', '56', '12', '22']


#### Repeating

In [6]:
# '+' means one or more of the preceding item, so each run of digits is one match.
show_all_matches([
    r'\d+'
], sentence)

print('\n---\n')

# Counted repetition: {2,} is two or more, {2} is exactly two, and {3,4} is
# three to four. Quantifiers are greedy, so {3,4} takes all four trailing a's.
show_all_matches([
    r'a{2,}',
    r'a{2}',
    r'a{3,4}'
], 'aabbaaaa')

Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \d+    | ['1', '1056', '12', '22']

---

Sentence:

    aabbaaaa

 regexp | matches
 ------ | -------
 a{2,}  | ['aa', 'aaaa']
 a{2}   | ['aa', 'aa', 'aa']
 a{3,4} | ['aaaa']


#### Any of or None of

In [7]:
# Square brackets match any one of the listed characters; a leading ^ negates the set.
show_all_matches([
    r'[lt]',
    r'[lt]+',
    r'[^aeiou\s\.]', # anything that's not a lowercase vowel, whitespace or a period (consonants, but also digits and commas)
    r'[a-d]'
], sentence, re_length=12)


Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp       | matches
 ------       | -------
 [lt]         | ['l', 't', 't', 'l', 'l', 'l', 't', 't', '...']
 [lt]+        | ['l', 'ttl', 'l', 'l', 'ttl', 'l', 't', 't', '...']
 [^aeiou\s\.] | ['M', 'r', 'y', 'h', 'd', 'l', 't', 't', '...']
 [a-d]        | ['a', 'a', 'd', 'a', 'a', 'b', 'a', 'b']


In [8]:
# show_all_matches shortens long result lists, so call findall directly to
# see every match for this pattern; print() keeps the list on one compact
# line.
print(re.findall(r'[^aeiou\s\.]', sentence))

['M', 'r', 'y', 'h', 'd', 'l', 't', 't', 'l', 'l', 'm', 'b', '1', 'l', 't', 't', 'l', 'l', 'm', 'b', 'N', 't', '1', '0', '5', '6', ',', 'n', 't', '1', '2', ',', 'n', 't', '2', '2', ',', 'j', 's', 't', 'n']


#### Anchors

In [9]:
# Anchors match positions rather than characters: \b is a word boundary,
# ^ is the start of the string and $ is the end.
show_all_matches([
    r'\bo\w+', # any word of two or more characters that starts with an 'o'
    r'^\s', # a whitespace character at the start of the string
    r'^M', # 'M' at the start of the string
    r'^h', # 'h' at the start of the string
    r'\bh\w+', # word of two or more characters that starts with 'h'
    r'\.$' # a period at the end of the string
], sentence)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 1056, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \bo\w+ | ['one']
 ^\s    | []
 ^M     | ['M']
 ^h     | []
 \bh\w+ | ['had']
 \.$    | ['.']


#### Other Common re Functions

`match()` matches only at the start of the string  
`search()` finds the first occurrence of the regular expression anywhere in the string  
`sub()` makes substitutions using a regular expression  
`compile()` prepares a regular expression ahead of time so that it can be reused  

#### Capture Groups

In [10]:
# New sample text containing a URL and an IP address; it replaces the earlier
# value of `sentence`. strip() removes the newlines added by the triple quotes.
sentence = '''
You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).
'''.strip()

In [11]:
# One or more digits followed by exactly three '.<digits>' groups.
ip_re = r'\d+(\.\d+){3}'

match = re.search(ip_re, sentence)
# Index 0 of a match object is the entire matched text.
match[0]

'123.123.123.123'

In [12]:
# Because ip_re contains a capture group, findall returns the text of that
# group (its last repetition) rather than the whole match.
match_all = re.findall(ip_re, sentence)
match_all

['.123']

In [13]:
# simplified for demonstration, a real regex to parse urls would be much more
# complex
url_re = r'(https?)://(\w+)\.(\w+)'

# groups() returns the three captured pieces in order, ready to unpack.
protocol, domain, tld = re.search(url_re, sentence).groups()

print(f'''
protocol: {protocol}
domain:   {domain}
tld:      {tld}
''')



protocol: https
domain:   codeup
tld:      com



#### Non-capturing (aka 'Shy') Groups
- a `?:` at the beginning of a group makes it non-capturing
- also note that any group can be named (aliased) by using `?P<name>`

In [14]:
# protocol and tld are named capture groups; the domain sits in a
# non-capturing (?:...) group, so it is matched but left out of the results.
url_re = r'(?P<protocol>https?)://(?:\w+)\.(?P<tld>\w+)'

match = re.search(url_re, sentence)

print(f'''
groups: {match.groups()}
referencing a group by name: {match.group('tld')}
group dictionary: {match.groupdict()}
''')



groups: ('https', 'com')
referencing a group by name: com
group dictionary: {'protocol': 'https', 'tld': 'com'}



In [15]:
# With several capture groups, findall returns one tuple per match holding
# only the capturing groups; the non-capturing domain group is omitted.
match_all = re.findall(url_re, sentence)
match_all

[('https', 'com')]

#### Substitution

In [16]:
# remove anything that's not a digit (\D matches any non-digit character)
re.sub(r'\D', '', 'abc 123')

'123'

In [17]:
# remove anything that's not a lowercase letter (uppercase letters would be
# removed too, since the class only covers a-z)
re.sub(r'[^a-z]', '', 'abc 123')

'abc'

In [18]:
# match three characters at a time, capture the middle one, and replace each
# matched run of three with just that captured character (\1)
re.sub(r'.(.).', r'\1', 'abc')

'b'

In [19]:
# capture three characters separately and write them back in reverse order
# using the backreferences \3, \2 and \1
re.sub(r'(.)(.)(.)', r'\3\2\1', 'abc')


'cba'

In [20]:
# replace the last two characters with a single 'X' ($ anchors the match to
# the end of the string)
re.sub(r'.{2}$', r'X', 'abc')

'aX'

#### Regex Flags

In [26]:
# Sample strings (currency-style amounts plus a non-numeric one) for trying
# out candidate patterns.
inputs = ['$1,120.98', '1,0', 'ab', '1.50', '3']

In [35]:
def show_all_matches_2(regex_list, inputs, re_length=6, inp_length=7):
    '''
    Print a table showing what re.findall returns for every pattern/input pair.

    Inputs:
        regex_list: iterable of regular expression strings to try.
        inputs: iterable of strings; every pattern is run against each one.
        re_length: minimum width of the pattern column.
        inp_length: minimum width of the input column.
        Both columns are widened automatically when a pattern or an input is
        longer than the requested width, so the table stays aligned.

    Outputs: None. The table is printed to stdout, one row per pattern/input
    pair. Lists of more than 8 matches are cut to 8 followed by '...'.
    '''
    # Materialise both iterables: they are measured first and `subjects` is
    # walked once per pattern, which a one-shot iterator could not support.
    patterns = list(regex_list)
    subjects = list(inputs)

    # A column may never be narrower than its header or its longest entry.
    re_width = max([re_length, 6] + [len(pattern) for pattern in patterns])
    inp_width = max([inp_length, 7] + [len(subject) for subject in subjects])
    re_pad = ' ' * (re_width - 6)
    inp_pad = ' ' * (inp_width - 7)
    row_fmt = ' {:<%d} | {:<%d} | {!r}' % (re_width, inp_width)

    print('Inputs:')
    print()
    print('    {}'.format(inputs))
    print()
    print(' regexp{} |  input{}  | matches'.format(re_pad, inp_pad))
    print(' ------{} | -------{} | -------'.format(re_pad, inp_pad))
    for pattern in patterns:
        # `subject` rather than `input`, so the builtin input() is not shadowed.
        for subject in subjects:
            matches = re.findall(pattern, subject)
            if len(matches) > 8:
                matches = matches[:8] + ['...']
            print(row_fmt.format(pattern, subject, matches))

In [47]:
# Candidate patterns to compare:
#   1. digits, an optional run of ',' or '.', then an optional second run of digits
#   2. digits followed by a greedy '.*', which consumes the rest of the string,
#      so the trailing (\d{,2}) group always captures ''
#   3. plain runs of digits
regexes_2 = [r'(\d+)[,.]*(\d+)*', r'(\d+).*(\d{,2})', r'\d+']

In [48]:
# Run every candidate pattern against every sample input.
show_all_matches_2(regexes_2, inputs)

Inputs:

    ['$1,120.98', '1,0', 'ab', '1.50', '3']

 regexp |  input  | matches
 ------ | ------- | -------
 (\d+)[,.]*(\d+)* | $1,120.98 | [('1', '120'), ('98', '')]
 (\d+)[,.]*(\d+)* | 1,0     | [('1', '0')]
 (\d+)[,.]*(\d+)* | ab      | []
 (\d+)[,.]*(\d+)* | 1.50    | [('1', '50')]
 (\d+)[,.]*(\d+)* | 3       | [('3', '')]
 (\d+).*(\d{,2}) | $1,120.98 | [('1', '')]
 (\d+).*(\d{,2}) | 1,0     | [('1', '')]
 (\d+).*(\d{,2}) | ab      | []
 (\d+).*(\d{,2}) | 1.50    | [('1', '')]
 (\d+).*(\d{,2}) | 3       | [('3', '')]
 \d+    | $1,120.98 | ['1', '120', '98']
 \d+    | 1,0     | ['1', '0']
 \d+    | ab      | []
 \d+    | 1.50    | ['1', '50']
 \d+    | 3       | ['3']


In [46]:
# Check the first candidate pattern directly against the most complex sample.
re.findall(r'(\d+)[,.]*(\d+)*', '$1,120.98')

[('1', '120'), ('98', '')]

In [125]:

def wage_input():
    '''
    Function takes in hourly wage input from a user, validates it and returns
    the validated wage as a float rounded to two digits. The user is asked
    again until the entry can be read as a number.

    Accepted input: digits with optional ',' or '.' separators, optionally
    preceded by one symbol such as '$'. Commas are treated as periods and only
    the last period is kept as the decimal point, so '$1,120.98' is read as
    1120.98. Note that a leading '-' counts as that one symbol and is dropped.

    Inputs: User provided number.

    Outputs: Validated user input returned as a float rounded to 2 decimals.
    '''
    while True:
        # strip() so that stray spaces around the number do not cause a rejection.
        user_input = input('Enter hourly wage in decimal format:').strip().replace(',', '.')
        # Drop one leading symbol (e.g. '$') that sits directly before a digit.
        # A leading '.' is kept: it is the decimal point of entries like '.5'.
        user_input = re.sub(r'^[^0-9.](?=\d)', '', user_input)
        if re.search(r'[^0-9.]', user_input):
            print('Please use only numbers and a decimal point')
            continue
        # Remove every period that is followed by another one, which leaves
        # only the last period to act as the decimal point.
        number_text = re.sub(r'\.(?=.*\.)', '', user_input)
        try:
            return round(float(number_text), 2)
        except ValueError:
            # Reached for entries without any digit, such as '' or '.'.
            print('Please use only numbers and a decimal point')

In [126]:
def confirm_wage_input():
    '''
    Ask the user for an hourly wage via wage_input() and have them confirm it.

    The wage is echoed back and the user answers y or n (case and surrounding
    whitespace are ignored). On y the wage is returned, on n wage_input() is
    run again, and any other answer repeats the question.

    Inputs: None.

    Outputs: Confirmed wage amount as a float
    '''
    wage = wage_input()
    confirmed = False

    while not confirmed:
        print(f'You input a wage of ${wage:.2f}/hour.')
        answer = input('Is this correct? (y or n)').lower().strip()
        if answer == 'n':
            # Rejected: ask for a fresh wage and confirm again.
            wage = wage_input()
        elif answer == 'y':
            confirmed = True
        else:
            print('Please type either y for yes or n for no.')

    return wage
    

In [123]:
# Interactive demo: prompts for a wage and then for a y/n confirmation, so
# this cell waits for keyboard input when run.
confirm_wage_input()

Enter hourly wage in decimal format: 4.35


You input a wage of $4.35/hour.


Is this correct? (y or n) n
Enter hourly wage in decimal format: 4.23


You input a wage of $4.23/hour.


Is this correct? (y or n) b


Please type either y for yes or n for no.
You input a wage of $4.23/hour.


Is this correct? (y or n) 5.67


Please type either y for yes or n for no.
You input a wage of $4.23/hour.


Is this correct? (y or n) y


4.23