# Chapter 7 - RegEx

In [47]:
# The full regex API is available from the .NET Regex object: build an empty
# pattern and pipe it to Get-Member to list that type's members.
# NOTE(review): `^|` is presumably a shell escape so the pipe reaches PowerShell
# instead of being consumed by the calling shell -- confirm.
!powershell [regex]::new('') ^| Get-Member



   TypeName: System.Text.RegularExpressions.Regex

Name                MemberType Definition                                                                              
----                ---------- ----------                                                                              
Equals              Method     bool Equals(System.Object obj)                                                          
GetGroupNames       Method     string[] GetGroupNames()                                                                
GetGroupNumbers     Method     int[] GetGroupNumbers()                                                                 
GetHashCode         Method     int GetHashCode()                                                                       
GetObjectData       Method     void ISerializable.GetObjectData(System.Runtime.Serialization.SerializationInfo info,...
GetType             Method     type GetType()                                                              

In [24]:
import re

In [25]:
# Compile a NNN-NNN-NNNN pattern, find its first occurrence in a sentence,
# and report the matched text.
phone_re = re.compile(r'\d{3}-\d{3}-\d{4}')
result = phone_re.search('my number is 212-123-4567')
print('Found phone number', result.group())

Found phone number 212-123-4567


In [56]:
# PowerShell built-in: the -match operator tests $Msg against the pattern and,
# on success, the whole match is read back from $Matches[0].
# NOTE(review): inside a double-quoted PowerShell string, "$Matches[0]" would
# normally need $($Matches[0]) to index before expanding. It presumably works
# here because the quotes are stripped before PowerShell parses the command
# line -- confirm before copying this into a .ps1 script.
!powershell \
$Msg, $PhoneRe = 'my number is 212-123-4567', '\d{3}-\d{3}-\d{4}' ;\
If ($Msg -match $PhoneRe) {Write-Host "Found phone number $Matches[0]"}

Found phone number 212-123-4567


In [27]:
# PowerShell via .NET: build a [regex] object, call Match() on the sentence,
# and read the matched text from the result's Value property.
# NOTE(review): inside a double-quoted PowerShell string, "$Result.Value" would
# normally need $($Result.Value) to read the property before expanding. It
# presumably works here because the quotes are stripped before PowerShell
# parses the command line -- confirm before copying this into a .ps1 script.
!powershell \
$PhoneRe = [regex]::new('\d{3}-\d{3}-\d{4}') ;\
$Result = $PhoneRe.Match('my number is 212-123-4567') ;\
Write-Host "Found phone number $Result.Value"

Found phone number 212-123-4567


### Grouping with parentheses

In [28]:
# Two capture groups: the area code, then the remaining seven digits.
phone_re = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
match = phone_re.search('my number is 212-123-4567')
print(match)

# Groups 1 and 2 are the captures; group 0 is the entire match.
print(*(match.group(index) for index in (1, 2, 0)), sep='\n')
print(match.group())  # no argument behaves like group(0)

# groups() returns just the captures, ready for tuple unpacking.
area_code, number = match.groups()
print(f'Area: {area_code} Number: {number}')

<re.Match object; span=(13, 25), match='212-123-4567'>
212
123-4567
212-123-4567
212-123-4567
Area: 212 Number: 123-4567


In [76]:
# PowerShell built-in: -Match emits True/False and fills $Matches, where key 0
# is the whole match and keys 1 and 2 are the two capture groups.
# $Matches[0..2] pulls those three entries out in order so they can be
# unpacked with a multiple assignment.
!powershell \
$PhoneRe = '(\d\d\d)-(\d\d\d-\d\d\d\d)' ;\
'my number is 212-123-4567' -Match $PhoneRe ;\
$Matches[1] ;\
$Matches[2] ;\
$Matches[0] ;\
$Matches ;\
$Full, $Area, $Number = $Matches[0..2] ;\
Write-Host "Area: $Area Number: $Number"

True
212
123-4567
212-123-4567

Name                           Value                                                                                   
----                           -----                                                                                   
2                              123-4567                                                                                
1                              212                                                                                     
0                              212-123-4567                                                                            
Area: 212 Number: 123-4567




In [29]:
# PowerShell via .NET: Match() returns a Match object whose Groups collection
# holds the whole match at index 0 and the two captures at indexes 1 and 2.
# $Result.Value is the same text as $Result.Groups[0].Value.
# $Result.Groups.Value lists every group's Value in order, which is what the
# multiple assignment unpacks.
!powershell \
$PhoneRe = [regex]::new('(\d\d\d)-(\d\d\d-\d\d\d\d)') ;\
$Result = $PhoneRe.Match('my number is 212-123-4567') ;\
$Result ;\
$Result.Groups[1].Value ;\
$Result.Groups[2].Value ;\
$Result.Groups[0].Value ;\
$Result.Value ;\
$Full, $Area, $Number = $Result.Groups.Value ;\
Write-Host "Area: $Area Number: $Number"



Groups   : {0, 1, 2}
Success  : True
Name     : 0
Captures : {0}
Index    : 13
Length   : 12
Value    : 212-123-4567

212
123-4567
212-123-4567
212-123-4567
Area: 212 Number: 123-4567




### Matching multiple options with pipe

In [30]:
# Alternation with `|`: search() reports whichever alternative occurs first
# in the text being scanned.
hero_re = re.compile(r'Batman|Tina Fey')
for sentence in ('Batman and Tina Fey', 'Batwoman and Tina Fey'):
    print(hero_re.search(sentence))

<re.Match object; span=(0, 6), match='Batman'>
<re.Match object; span=(13, 21), match='Tina Fey'>


In [79]:
# .NET alternation: the pattern is meant to be 'Batman|Tina Fey'. The recorded
# results (first 'Batman', then 'Tina Fey') show that '^|' reaches the regex as
# a plain '|'.
# NOTE(review): the caret is presumably an escape for the calling shell, not
# part of the pattern -- confirm.
!powershell \
$Hero_re = [regex]::new('Batman^|Tina Fey') ;\
$Hero_re.match('Batman and Tina Fey') ;\
$Hero_re.match('Batwoman and Tina Fey')



Groups   : {0}
Success  : True
Name     : 0
Captures : {0}
Index    : 0
Length   : 6
Value    : Batman

Groups   : {0}
Success  : True
Name     : 0
Captures : {0}
Index    : 13
Length   : 8
Value    : Tina Fey





### Find all

In [32]:
# findall() returns every non-overlapping match as a list of strings.
ssn = re.compile(r'\d\d\d-\d\d-\d\d\d\d')
ssn.findall("My social is 123-54-3592. Mom's is 549-23-1236")

['123-54-3592', '549-23-1236']

In [89]:
# PowerShell cmdlet: Select-String with -AllMatches collects every occurrence
# of the pattern in $Phrase; .Matches.Value lists the matched strings, which
# Write-Host prints on one line.
!powershell \
$Ssn = '\d\d\d-\d\d-\d\d\d\d' ;\
$Phrase = 'My social is 123-54-3592. Mom''s is 549-23-1236' ;\
Write-Host (Select-String -Inputobject $Phrase -Pattern $Ssn -AllMatches).Matches.Value

123-54-3592 549-23-1236


In [85]:
# .NET: Matches() (plural) returns every match of the pattern in the input;
# .Value lists the matched strings, which Write-Host prints on one line.
!powershell \
$Ssn = [regex]::new('\d\d\d-\d\d-\d\d\d\d') ;\
Write-Host $Ssn.Matches('My social is 123-54-3592. Mom''s is 549-23-1236').Value

123-54-3592 549-23-1236


### Using brackets for multiple character options

In [95]:
# Character class [abcde] matches any single one of the listed letters.
# The same search is run three ways -- Python re.findall, PowerShell
# Select-String -AllMatches, and the .NET Regex.Matches method -- and each
# one finds the same four letters.
print('---PYTHON---')
print(re.findall(r'[abcde]', 'hi there, how are you?'))
!powershell '---POWERSHELL---' ;\
Write-Host (Select-String -Inputobject 'hi there, how are you?' -Pattern '[abcde]' -AllMatches).Matches.Value ;\
'---.NET---';\
Write-Host ([regex]::new('[abcde]').Matches('hi there, how are you?').Value)

---PYTHON---
['e', 'e', 'a', 'e']
---POWERSHELL---
e e a e
---.NET---
e e a e


### re.IGNORECASE, re.DOTALL, and re.VERBOSE

In [103]:
# Ignore case.
# Python needs the re.IGNORECASE flag. PowerShell's -Match already ignores
# case (it returns True for 'hello' vs 'Hello' with no flag), so it is
# case-sensitive matching that has to be requested explicitly there.
# The .NET Regex object is given the inline (?i:...) option to get the same
# case-insensitive behaviour.
print('---PYTHON---')
print(re.match(r'hello', 'Hello world!', re.IGNORECASE))
!powershell '---POWERSHELL---' ;\
'Hello world!' -Match 'hello';\
$Matches ;\
'---.NET---' ;\
[regex]::new('(?i:hello)').Match('Hello world!')

---PYTHON---
<re.Match object; span=(0, 5), match='Hello'>
---POWERSHELL---
True

Name                           Value                                                                                   
----                           -----                                                                                   
0                              Hello                                                                                   
---.NET---

Groups   : {0}
Success  : True
Name     : 0
Captures : {0}
Index    : 0
Length   : 5
Value    : Hello





In [107]:
# Ignore whitespace inside the pattern: re.VERBOSE in Python, the inline
# (?x:...) option for PowerShell's -Match and for the .NET Regex object.
# Each pattern contains a run of spaces that is skipped, so all three still
# match '123-45-6788' in full.
print('---PYTHON---')
print(re.match(r'\d\d\d-\d\d     -\d\d\d\d', '123-45-6788', re.VERBOSE))
!powershell '---POWERSHELL---';\
'123-45-6788' -Match '(?x:\d\d\d-\d\d-     \d\d\d\d)' ;\
$Matches ;\
'---.NET---' ;\
[regex]::new('(?x:\d\d\d-\d\d-     \d\d\d\d)').Match('123-45-6788')

---PYTHON---
<re.Match object; span=(0, 11), match='123-45-6788'>
---POWERSHELL---
True

Name                           Value                                                                                   
----                           -----                                                                                   
0                              123-45-6788                                                                             
---.NET---

Groups   : {0}
Success  : True
Name     : 0
Captures : {0}
Index    : 0
Length   : 11
Value    : 123-45-6788





In [37]:
# Create the three-line fixture file read by the DOTALL examples.
with open('regex_data.txt', 'w') as data_file:
    data_file.write('LINE 1\nLINE 2\nLINE 3')

In [110]:
# DOTALL: by default `.` stops at a newline, so '.*' only returns the first
# line; with re.DOTALL Python's '.*' spans all three lines.
# The PowerShell/.NET halves have no flag here, so the second pattern uses the
# workaround '(.|\n)*' (written '(.^|\n)*' on the command line). It does reach
# the last line, but it adds a capture group -- hence the extra '3' in the
# recorded $Matches.Values output and Groups {0, 1} in the .NET result.
# TODO(review): .NET documents an inline Singleline option, (?s), that makes
# `.` match newlines; try '(?s).*' as the direct DOTALL equivalent and confirm.
# NOTE(review): the recorded .NET lengths (7 and 22) exceed Python's spans
# (6 and 20), presumably because the raw file content has CRLF line endings
# -- confirm.
with open('regex_data.txt') as f:
    data = f.read()
    print('---PYTHON---')
    print(re.search('.*', data))
    print(re.search('.*', data, re.DOTALL))
!powershell '---POWERSHELL---';\
(Get-Content regex_data.txt -Raw) -Match '.*' ;\
$Matches.Values ;\
(Get-Content regex_data.txt -Raw) -Match '(.^|\n)*' ;\
$Matches.Values ;\
'---.NET---' ;\
([regex]::new('.*').Match((Get-Content regex_data.txt -Raw))) ;\
([regex]::new('(.^|\n)*').Match((Get-Content regex_data.txt -Raw)))

---PYTHON---
<re.Match object; span=(0, 6), match='LINE 1'>
<re.Match object; span=(0, 20), match='LINE 1\nLINE 2\nLINE 3'>
---POWERSHELL---
True
LINE 1

True
3
LINE 1
LINE 2
LINE 3
---.NET---


Groups   : {0}
Success  : True
Name     : 0
Captures : {0}
Index    : 0
Length   : 7
Value    : LINE 1

Groups   : {0, 1}
Success  : True
Name     : 0
Captures : {0}
Index    : 0
Length   : 22
Value    : LINE 1
           LINE 2
           LINE 3





### Substituting Strings with the sub() Method

#### Python

In [116]:
# Replace every "Agent <name>" occurrence with a fixed placeholder.
names_re = re.compile(r'Agent \w+')
names_re.sub(
    r'CENSORED',
    'Agent Alice told Agent Bob about Agent Carol getting wasted on the job.',
)

'CENSORED told CENSORED about CENSORED getting wasted on the job.'

In [115]:
# Keep only the first letter of each agent's name: group 1 captures it and
# \1 in the replacement puts it back ahead of the asterisks.
names_re = re.compile(r'Agent (\w)\w+')
names_re.sub(
    r'Agent \1******',
    'Agent Alice told Agent Bob about Agent Carol getting wasted on the job.',
)

'Agent A****** told Agent B****** about Agent C****** getting wasted on the job.'

#### Powershell

In [122]:
# -Replace operator: every 'Agent <name>' in the sentence is swapped for the
# fixed string 'CENSORED'.
!powershell \
'Agent Alice told Agent Bob about Agent Carol getting wasted on the job.' -Replace \
'Agent \w+', 'CENSORED'

CENSORED told CENSORED about CENSORED getting wasted on the job.


In [123]:
# -Replace with a capture group: (\w) captures the first letter of the name
# and $1 in the replacement string puts it back ahead of the asterisks.
!powershell \
'Agent Alice told Agent Bob about Agent Carol getting wasted on the job.' -Replace \
'Agent (\w)\w+', 'Agent $1******'

Agent A****** told Agent B****** about Agent C****** getting wasted on the job.


#### .Net in Powershell

In [121]:
# .NET Regex.Replace(): build a [Regex] for 'Agent <name>' and replace every
# occurrence in the sentence with the fixed string 'CENSORED'.
!powershell \
$NamesRe = [Regex]::New('Agent \w+') ;\
$NamesRe.Replace('Agent Alice told Agent Bob about Agent Carol getting wasted on the job.', \
                 'CENSORED')

CENSORED told CENSORED about CENSORED getting wasted on the job.


In [119]:
# .NET Regex.Replace() with a capture group: (\w) captures the first letter of
# the name and $1 in the replacement string puts it back ahead of the asterisks.
!powershell \
$NamesRe = [Regex]::new('Agent (\w)\w+') ;\
$NamesRe.Replace('Agent Alice told Agent Bob about Agent Carol getting wasted on the job.', \
                 'Agent $1******')

Agent A****** told Agent B****** about Agent C****** getting wasted on the job.
