# RegEx

In [1]:
import re

In [2]:
# Compile the literal once, then test it against the very beginning of the text:
# match() only succeeds when the pattern matches starting at position 0.
pattern = re.compile("hello")
re.match(pattern, "hello world")

<re.Match object; span=(0, 5), match='hello'>

## Character classes

## Character set range

In [3]:
character_set_txt = """
The first season of Indian Premiere League (IPL) was played in 2008. 
The second season was played in 2009 in South Africa. 
Last season was played in 2018 and won by Chennai Super Kings (CSK).
CSK won the title in 2010 and 2011 as well.
Mumbai Indians (MI) has also won the title 3 times in 2013, 2015 and 2017.
"""

# A year is taken to be four digits whose first digit is non-zero.
# Long form using explicit ranges: [1-9][0-9][0-9][0-9]
# \d is the digit shorthand class (equivalent to [0-9] for this ASCII text).
# The pattern is a raw string so the backslashes reach the regex engine as written;
# a plain "\d" is an invalid Python string escape and triggers a warning.
character_set_pattern = re.compile(r"[1-9]\d\d\d")
character_set_pattern.findall(character_set_txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

## Character set negation

In [None]:
# A leading ^ inside [...] negates the set: [^a-z] would mean "any character that
# is not in the range a to z". The set negated here is the lowercase vowels.
character_set_pattern_negation = re.compile(
    "[^aeiou]"
)
# Every single character outside the negated set is returned as its own match.
re.findall(character_set_pattern_negation, character_set_txt)

In [5]:
# Find all special symbols: characters that are neither word characters
# (\w: letters, digits, underscore) nor whitespace (\s).
# The pattern is a raw string so \w and \s reach the regex engine as written;
# in a plain string literal they are invalid Python escapes and trigger a warning.
special_symbol = re.compile(r"[^\w\s]")
# Collect each special symbol in the order it appears in the text.
re.findall(special_symbol, character_set_txt)

['(', ')', '.', '.', '(', ')', '.', '.', '(', ')', ',', '.']

## Alternation

In [6]:
alteration_txt = "\nthe most common conjuctions are and, or and but.\n"

# Find every occurrence of "and", "or" or "the".
# The | operator separates the alternatives; any one of them may match.
alteration_pattern = re.compile("and|or|the")
re.findall(alteration_pattern, alteration_txt)

['the', 'and', 'or', 'and']

## Boundary Matchers

Consider a scenario where we want to find all occurrences of `and`, `or` and `the` in the given text.

In [7]:
boundary_matchers_txt = """
Lorem Ipsum is simply dummy text of the printing and typesetting industry. 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book. 
It has survived not only five centuries, but also the leap into electronic typesetting, 
remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, 
and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""

# Plain alternation with no boundaries: the three alternatives are matched
# anywhere in the text, including inside longer words.
boundary_matchers_pattern = re.compile("and|or|the")
re.findall(boundary_matchers_pattern, boundary_matchers_txt)

['or',
 'the',
 'and',
 'or',
 'the',
 'and',
 'the',
 'and',
 'the',
 'the',
 'the',
 'or',
 'and',
 'or',
 'or']

In the above `boundary_matchers_txt`, `Lorem` contains `or` and `standard` contains `and`. These substrings are also counted as matches, whereas we want to find only the standalone words `and`, `or` and `the`.

In [8]:
# Fix: wrap the alternation in \b, the zero-width word-boundary matcher.
# \b matches between a word character and a non-word character (space, comma,
# full stop, exclamation mark, etc.) and at the start or end of the text.
boundary_matchers_pattern = re.compile(
    r"\b(and|or|the)\b"
)
# The pattern has one capturing group, so findall returns the text of that group.
re.findall(boundary_matchers_pattern, boundary_matchers_txt)

['the', 'and', 'the', 'the', 'and', 'the', 'the', 'the', 'and']

#### Example

Consider a scenario where we want to find all the lines in the given text which **start** with the pattern `Name:`.

In [12]:
boundary_matchers_txt = """
Name:
Age: 0
Roll No.: 15
Grade: S

Name: Ravi
Age: -1
Roll No.: 123 Name: ABC
Grade: K

Name: Ram
Age: N/A
Roll No.: 1
Grade: G
"""

# With the MULTILINE flag, ^ matches at the start of every line rather than only
# at the start of the whole text; .* then takes the remainder of that line.
pattern = re.compile("^Name:.*", re.M)
re.findall(pattern, boundary_matchers_txt)

['Name:', 'Name: Ravi', 'Name: Ram']

#### Example

Find all the sentences (one per line) which do not end with a full stop (`.`) in the given text.

In [13]:
txt = """
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s!
It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages
More recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."""

In [15]:
pattern = re.compile("^.*[^\.]$", flags=re.M)
# List every line of the text that the pattern accepts.
re.findall(pattern, txt)

["Lorem Ipsum has been the industry's standard dummy text ever since the 1500s!",
 'It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages']

## Splitting

#### Example

Split a string to get the individual lines in it.

In [16]:
txt = """Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated."""

In [17]:
# Separator pattern: a single newline character, i.e. the break between two lines.
pattern = re.compile("\n")

In [19]:
# Cut the text at every place the separator pattern matches.
re.split(pattern, txt)

['Beautiful is better than ugly.',
 'Explicit is better than implicit.',
 'Simple is better than complex.',
 'Complex is better than complicated.']

#### Example

Get all the words in a given text

In [21]:
# Separator pattern: any single non-word character (\W is the complement of \w).
# Each separator character splits on its own, so two adjacent separators
# (e.g. a full stop followed by a newline) leave an empty string between them.
# Raw string so \W reaches the regex engine as written; in a plain string literal
# it is an invalid Python escape and triggers a warning.
pattern = re.compile(r"\W")

In [22]:
# Cut the text at every place the separator pattern matches.
re.split(pattern, txt)

['Beautiful',
 'is',
 'better',
 'than',
 'ugly',
 '',
 'Explicit',
 'is',
 'better',
 'than',
 'implicit',
 '',
 'Simple',
 'is',
 'better',
 'than',
 'complex',
 '',
 'Complex',
 'is',
 'better',
 'than',
 'complicated',
 '']