# match(), search(), findall(), finditer()


In [None]:
import re

# Sample text used by the search examples below.
test_str = "123abc7263gshbfcabc234894"

In [None]:
### 1st way: compile the pattern once, then call finditer() on it.
### Every occurrence is reported as its own match object.
pattern = re.compile(r"abc")
matches = pattern.finditer(test_str)

for found in matches:
    print(found)

<_sre.SRE_Match object; span=(3, 6), match='abc'>
<_sre.SRE_Match object; span=(16, 19), match='abc'>


In [None]:
 

### 2nd way to search pattern: call the module-level re.finditer() directly,
### without compiling the pattern first.
### The result is an iterator, so printing it shows the iterator object
### itself rather than the individual matches.
matches=re.finditer(r"abc", test_str)

print(matches)


<callable_iterator object at 0x7f7a1afd72e8>


In [None]:

### 3rd way: findall() gathers every match into a list of plain strings
### (no match objects involved).
pattern = re.compile(r"abc")
matches = pattern.findall(test_str)
print(matches)


['abc', 'abc']


In [None]:

### 4th way: match() checks for the pattern at the beginning of the string.
### When the pattern is present there, a match object is returned.
pattern = re.compile(r"123")
matches = pattern.match(test_str)
print(matches)


<_sre.SRE_Match object; span=(0, 3), match='123'>


In [None]:
### 5th way: search() scans the string and reports the first occurrence of
### the pattern, which does not have to be at the start of the string
### (unlike match(), which only checks the beginning).
### Returns a match object.
pattern = re.compile(r"123")
matches = pattern.search(test_str)
print(matches)

<_sre.SRE_Match object; span=(0, 3), match='123'>


# group, start, end, span

In [None]:
import re

# Sample text used by the group/start/end/span example below.
test_str = "123abc7263gshbfcabc234894"

In [None]:
# For every match, show the matched text, its (start, end) span and the two
# offsets on their own, joined by " - ".
pattern = re.compile(r"abc")
matches = pattern.finditer(test_str)

for hit in matches:
    fields = (hit.group(), hit.span(), hit.start(), hit.end())
    print(*fields, sep=" - ")

abc - (3, 6) - 3 - 6
abc - (16, 19) - 16 - 19


# Meta characters

Metacharacters are characters with a special meaning:
All meta characters: . ^ $ * + ? { } [ ] \ | ( )
Metacharacters need to be escaped (with \) if we actually want to search for the literal character.

    . Any character (except newline character) "he..o"
    ^ Starts with "^hello"
    $ Ends with "world$"
    * Zero or more occurrences "aix*"
    + One or more occurrences "aix+"
    { } Exactly the specified number of occurrences "al{2}"
    [] A set of characters "[a-m]"
    \ Signals a special sequence (can also be used to escape special characters) "\d"
    | Either or "falls|stays"
    ( ) Capture and group

In [None]:
import re

# Sample text used by the metacharacter examples below.
test_str = "123abc7263gshbfcabc234894"

In [None]:
# "." stands for any single character (except a newline), so match() only
# needs one character at the start of the string.
pattern = re.compile(r".")
matches = pattern.match(test_str)
print(matches)

# More Metacharacters / Special Sequences

A special sequence is a \ followed by one of the characters in the list below, and has a special meaning:

    \d :Matches any decimal digit; this is equivalent to the class [0-9].
    \D : Matches any non-digit character; this is equivalent to the class [^0-9].
    \s : Matches any whitespace character;
    \S : Matches any non-whitespace character;
    \w : Matches any alphanumeric (word) character; this is equivalent to the class [a-zA-Z0-9_].
    \W : Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_].
    \b Returns a match where the specified characters are at the beginning or at the end of a word r"\bain" r"ain\b"
    \B Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word r"\Bain" r"ain\B"
    \A Returns a match if the specified characters are at the beginning of the string "\AThe"
    \Z Returns a match if the specified characters are at the end of the string "Spain\Z"

In [None]:
# \w matches a single word character; findall() returns each one as its own
# string, and they are printed one per line.
pattern = re.compile(r"\w")
matches = pattern.findall(test_str)
for word_char in matches:
    print(word_char)

# grouping ( )

In [None]:
emails = """
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
# The patterns are raw strings so that "\." reaches the regex engine as an
# escaped (literal) dot, and the local part uses 0-9 so the digit 0 is
# accepted as well. Each assignment replaces the previous one; only the last
# pattern is actually used for the search below.

# Variant 1: any alphabetic top-level domain, no capture groups.
pattern = re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z-]+\.[a-zA-Z]+')
# Variant 2: restricts the top-level domain to "com" or "de".
pattern = re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z-]+\.(com|de)')
# Variant 3: three capture groups -- local part, domain name, top-level domain.
pattern = re.compile(r'([a-zA-Z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)')
matches = pattern.finditer(emails)
for match in matches:
    print(match)
    print(match.group(0))  # group 0 is the whole matched address
    print(match.group(1))  # local part (before the @)
    print(match.group(2))  # domain name
    print(match.group(3))  # top-level domain

<_sre.SRE_Match object; span=(1, 25), match='pythonengineer@gmail.com'>
pythonengineer@gmail.com
pythonengineer
gmail
com
<_sre.SRE_Match object; span=(26, 48), match='Python-engineer@gmx.de'>
Python-engineer@gmx.de
Python-engineer
gmx
de
<_sre.SRE_Match object; span=(49, 81), match='python-engineer123@my-domain.org'>
python-engineer123@my-domain.org
python-engineer123
my-domain
org


# Modifying strings

    split(): Split the string into a list, splitting it wherever the RE matches
    sub(): Find all substrings where the RE matches, and replace them with a different string

In [None]:
# split(): cut the string apart at every place the pattern matches.
my_string = 'abc123ABCDEF123abc'
pattern = re.compile(r'123')
matches = re.split(pattern, my_string)
print(matches)

# sub(): replace every place the pattern matches with a different string.
my_string = "hello world, you are the best world"
pattern = re.compile(r'world')
subbed_string = re.sub(pattern, r'planet', my_string)
print(subbed_string)

['abc', 'ABCDEF', 'abc']
hello planet, you are the best planet


# Compilation Flags

    ASCII, A : Makes several escapes like \w, \b, \s and \d match only on ASCII characters with the respective property.
    DOTALL, S : Make . match any character, including newlines.
    IGNORECASE, I : Do case-insensitive matches.
    LOCALE, L : Do a locale-aware match.
    MULTILINE, M : Multi-line matching, affecting ^ and $.
    VERBOSE, X (for ‘extended’) : Enable verbose REs, which can be organized more cleanly and understandably.

In [None]:
# IGNORECASE (short name: I) -- without it 'world' would not match 'World'.
my_string = "Hello World"
pattern = re.compile(r'world', re.I)
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

# MULTILINE (short name: M) -- affects '^', so the pattern below is tried at
# the start of each line; without the flag there is no match here.
my_string = '''
hello
cool
Hello
'''
pattern = re.compile(r'^[a-z]', re.M)
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(6, 11), match='World'>
<_sre.SRE_Match object; span=(1, 2), match='h'>
<_sre.SRE_Match object; span=(7, 8), match='c'>


# Sets

A set is a set of characters inside a pair of square brackets [] with a special meaning. Append multiple conditions back-to-back, e.g. [aA-Z].
A ^ (caret) inside a set negates the expression.
A - (dash) in a set specifies a range if it is in between, otherwise the dash itself.

Examples:
- [arn] Returns a match where one of the specified characters (a, r, or n) are present
- [a-n] Returns a match for any lower case character, alphabetically between a and n
- [^arn] Returns a match for any character EXCEPT a, r, and n
- [0123] Returns a match where any of the specified digits (0, 1, 2, or 3) are present
- [0-9] Returns a match for any digit between 0 and 9
- [0-5][0-9] Returns a match for any two-digit number from 00 to 59
- [a-zA-Z] Returns a match for any character alphabetically between a and z, lower case OR upper case

# Quantifier

    * : 0 or more
    + : 1 or more
    ? : 0 or 1, used when a character can be optional
    {4} : exact number
    {4,6} : range numbers (min, max)