# Expresiones Regulares en Python

In [1]:
import re

## Match: check whether a string conforms to a pattern (`re.search` finds the first match anywhere in the string)

In [2]:
# Sample sentence scanned by the first search example.
txt = 'Regular Expressions in Python'

In [3]:
# The r prefix marks the pattern as a raw string literal.
# search() scans txt and returns a Match for the first occurrence, or None.
match = re.compile(r'Python').search(txt)
print(match)

<re.Match object; span=(23, 29), match='Python'>


In [4]:
# A failed search leaves `match` as None, so handle that outcome first.
if match is None:
    print("No match found")
else:
    # group() with no argument returns the full matched text.
    print("found", match.group())

found Python


## Compile: create a regex object

In [5]:
# Compile once to get a reusable pattern object: three digits, a hyphen,
# three digits, a hyphen, four digits.
regex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
message = "my number is 610-742-8645"

# search() returns the first match; index 0 is the whole matched text.
res = regex.search(message)
print(res[0])

610-742-8645


## Findall: find every occurrence of a pattern in a string and return all the matches

In [6]:
txt = "Expresiones regulares en Python"
# findall() returns every non-overlapping occurrence of the pattern as a list.
# Matching is case-sensitive, so the leading capital "E" is not included.
result = re.compile('e').findall(txt)
print(result)

['e', 'e', 'e', 'e', 'e']


In [7]:
# A multi-character pattern has to match as a whole, in order.
result2 = re.compile('en').findall(txt)
print(result2)

['en']


In [8]:
# A character class matches any single one of the characters listed in it.
result3 = re.compile('[seo]').findall(txt)
print(result3)

['e', 's', 'o', 'e', 's', 'e', 'e', 's', 'e', 'o']


In [9]:
# Same phone-number pattern as before, now applied to a text with two numbers.
regex1 = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
message1 = "my number is 610-742-8645 and also 563-852-9642"

# With no capture groups in the pattern, findall() yields the full matches.
res1 = regex1.findall(message1)
print(res1)

['610-742-8645', '563-852-9642']


## Group: get the matched string or one of its captured subgroups

In [10]:
# Parentheses create capture groups: group 1 is the first three digits,
# group 2 is the remaining "ddd-dddd" part of the number.
regex2 = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
# `message` is the single-number sample string defined in an earlier cell.
match1 = regex2.search(message)
print(match1)

<re.Match object; span=(13, 25), match='610-742-8645'>


In [11]:
# With no argument, group() returns the entire matched text.
match1.group()

'610-742-8645'

In [12]:
# group(1) returns the text captured by the first parenthesized group.
match1.group(1)

'610'

In [13]:
# group(2) returns the text captured by the second parenthesized group.
match1.group(2)

'742-8645'

## Escaping Special Characters

In [14]:
# NOTE(review): the name `str` shadows the built-in type for the rest of the
# session; it is kept because the next cell reads it.
str = 'Sentences have dots. How do we escape them?'
# An unescaped "." is a wildcard, so every character of the sentence matches.
lst = re.compile('.').findall(str)
print(lst)

['S', 'e', 'n', 't', 'e', 'n', 'c', 'e', 's', ' ', 'h', 'a', 'v', 'e', ' ', 'd', 'o', 't', 's', '.', ' ', 'H', 'o', 'w', ' ', 'd', 'o', ' ', 'w', 'e', ' ', 'e', 's', 'c', 'a', 'p', 'e', ' ', 't', 'h', 'e', 'm', '?']


In [15]:
# Escape the dot so that it matches a literal "." only. The raw-string prefix
# keeps the backslash intact; without it, "\." is an invalid string escape
# sequence, which Python reports with a warning.
lst1 = re.findall(r'\.', str)
print(lst1)

['.']


## Pipe Character (|): match one of several possible alternatives

In [16]:
# "Py" followed by exactly one of the alternatives listed inside the group.
exp = re.compile(r"Py(thon|mysql|charm)")

In [17]:
# The first alternative ("thon") completes the match here.
res = exp.search("Python is a great language for programming")
res.group(0)

'Python'

In [18]:
# The second alternative ("mysql") completes the match here.
res = exp.search("Pymysql is a tool for connecting mysql db")
res.group(0)

'Pymysql'

In [19]:
# The third alternative ("charm") completes the match here.
res = exp.search("A great developer tool is Pycharm")
res.group(0)

'Pycharm'

## ? Character: match zero or one time

In [20]:
# The "?" makes the "(ho)" group optional: it may appear once or not at all.
expr = re.compile(r"Pyt(ho)?n")

In [21]:
# The optional group is present in this text.
match = expr.search("Python a great language")
match.group(0)

'Python'

In [22]:
# The optional group is absent in this text, and the pattern still matches.
match = expr.search("Pytn! a great language")
match.group(0)

'Pytn'

## * Character: match zero or more times

In [23]:
# "*" lets the "(n)" group repeat any number of times, including zero,
# so both trailing "n" characters are consumed.
expr = re.compile(r'Pytho(n)*')
match = expr.search("Welcome to the world of Pythonn")
match.group(0)

'Pythonn'

## + Character: match one or more times

In [24]:
# "+" requires the "(o)" group at least once and then takes as many as follow.
expr = re.compile(r"Pyth(o)+")
match = expr.search("Welcome to the world of Pythoon")
match.group(0)

'Pythoo'

## {} : specific number of times

In [25]:
# {3} demands exactly three consecutive repetitions of the "(Ho)" group.
regex = re.compile(r"(Ho){3}")
match = regex.search("Santa says HoHoHo")
match.group(0)

'HoHoHo'

## Match longest possible string

In [26]:
# {3,8} accepts between three and eight digits. Quantifiers are greedy by
# default, so the match takes the maximum of eight.
digit = re.compile(r"(\d){3,8}")
match = digit.search("12345678910")
match.group(0)

'12345678'

## Match shortest possible string (add ? symbol)

In [27]:
# A "?" after the quantifier makes it non-greedy: the match stops as soon as
# the minimum of four digits has been consumed.
digit = re.compile(r"(\d){4,8}?")
match = digit.search('123456789')
match.group(0)

'1234'

## Match the exact number of characters

In [28]:
string = 'The date is 22/10/2019'
# Day and month take one or two digits; the year takes exactly four.
# "/" has no special meaning in a regex, so it needs no backslash. Writing
# "\/" in a non-raw literal is an invalid string escape that Python warns about.
lst = re.findall(r'[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}', string)
print(lst)

['22/10/2019']


## Our own regular expression: two consecutive vowels in a string

In [29]:
string = "Welcome to the world of Ai"
# {2} requires two vowels in a row; both cases are listed in the class.
regex = re.compile(r"[aeiouAEIOU]{2}")
regex.findall(string)

['Ai']

## Negated Character Class: [^...]

In [37]:
string = "The language is PythOn"
# A leading ^ inside the brackets negates the class: match any single
# character that is not a vowel (spaces included).
regex = re.compile(r"[^aeiouAEIOU]")
regex.findall(string)

['T', 'h', ' ', 'l', 'n', 'g', 'g', ' ', 's', ' ', 'P', 'y', 't', 'h', 'n']

## Match at the start of a string: ^

In [41]:
string = "The language is PythOn"
# Outside brackets, ^ anchors the match to the beginning of the string.
r = re.compile(r"^The")
r.findall(string)

['The']

## Case Insensitive

In [33]:
string = "The COSMOS is Infinite"
# re.IGNORECASE is the long spelling of re.I: the lowercase vowel class then
# matches uppercase vowels as well.
regex = re.compile(r"[aeiou]", flags=re.IGNORECASE)
regex.findall(string)

['e', 'O', 'O', 'i', 'I', 'i', 'i', 'e']

## Character Classes: \w, \d, \s

In [34]:
# \d: any decimal digit; for ASCII text this is [0-9]
# \w: any single word character; for ASCII text this is [a-zA-Z0-9_]
# \s: any whitespace character (space, newline, tab, ...)

address = "568 Los Jardines 1145 Altos Dpto.501"
# A run of digits followed by two words, each preceded by one whitespace
# character. The name reflects that this is a compiled pattern, not a match.
address_pattern = re.compile(r'\d+\s\w+\s\w+')
address_pattern.findall(address)

['568 Los Jardines', '1145 Altos Dpto']

## Reference:

https://docs.python.org/3/library/re.html