In [1]:
import re

In [2]:
''' Some useful patterns with their matching results '''

def match_string(text, pattern):
    """Return the first match of ``pattern`` found anywhere in ``text``.

    ``re.search`` scans the whole string instead of only its beginning.
    The span of the returned match object holds the start and end indexes
    of the match; ``None`` is returned when the pattern is not found.
    """
    first_hit = re.search(pattern, text)
    return first_hit

# Raw strings keep backslash classes such as \d and \D intact and avoid the
# invalid-escape-sequence warning that plain string literals trigger.
patterns = [
    r'[A-Z]', r'[A-Z]+', r'[a-z]', r'[a-z]+',
    # `*` and `?` accept zero repetitions, so these two succeed with an empty match at index 0
    r'[a-z]*', r'[a-z]?',
    r'[A-Za-z]+', r'[A-Z]$', r'[A-Z][a-z]', r'[A-Z]+[a-z]+',
    r'\d', r'\d+', r'[0-9]+', r'\D+', r'.*',
]

# Print the first match of every pattern against the same sample string
for pattern in patterns:
    print(match_string('100GReetingS', pattern))

<_sre.SRE_Match object; span=(3, 4), match='G'>
<_sre.SRE_Match object; span=(3, 5), match='GR'>
<_sre.SRE_Match object; span=(5, 6), match='e'>
<_sre.SRE_Match object; span=(5, 11), match='eeting'>
<_sre.SRE_Match object; span=(0, 0), match=''>
<_sre.SRE_Match object; span=(0, 0), match=''>
<_sre.SRE_Match object; span=(3, 12), match='GReetingS'>
<_sre.SRE_Match object; span=(11, 12), match='S'>
<_sre.SRE_Match object; span=(4, 6), match='Re'>
<_sre.SRE_Match object; span=(3, 11), match='GReeting'>
<_sre.SRE_Match object; span=(0, 1), match='1'>
<_sre.SRE_Match object; span=(0, 3), match='100'>
<_sre.SRE_Match object; span=(0, 3), match='100'>
<_sre.SRE_Match object; span=(3, 12), match='GReetingS'>
<_sre.SRE_Match object; span=(0, 12), match='100GReetingS'>


In [3]:
''' Split text into strings (the size of the space differs between strings)'''
text = """111 PHY    Physics
216 MED   Medecine
106 ENG   English""" 

In [4]:
# Split on every run of one or more whitespace characters (r'\s+')
print(re.split(r'\s+', text))

['111', 'PHY', 'Physics', '216', 'MED', 'Medecine', '106', 'ENG', 'English']


In [5]:
# For this text, findall() with r'\S+' (runs of non-whitespace characters) returns the same tokens as the split() above
print(re.findall(r'\S+', text))

['111', 'PHY', 'Physics', '216', 'MED', 'Medecine', '106', 'ENG', 'English']


If the same pattern is used several times, compile it once with re.compile() and reuse the compiled object, rather than passing the pattern string to re.split() or re.findall() on every call.

In [6]:
# Find all numbers in text: \d stands for a digit and + requires at least one of them.
# findall() returns a list of all matches.

pattern = re.compile(r'\d+')
pattern.findall(text)

['111', '216', '106']

In [7]:
# re.search() scans the whole string and returns the span of the first run of digits (205 in text2),
# whereas re.match() only matches at the very start of the string and returns None otherwise.
# re.M (MULTILINE) does not change that: re.match() still ignores the starts of later lines.
text2 = """COM    Computers
205 MAT   Mathematics 189"""

print(re.search(r'\d+', text, flags=re.M))
print(re.match(r'\d+', text, flags=re.M)) # text starts with 111 ==> re.match matches this 111 at the start

print(re.search(r'\d+', text2, flags=re.M))
print(re.match(r'\d+', text2, flags=re.M)) # text2 starts with COM, a non-digit ==> re.match returns None

<_sre.SRE_Match object; span=(0, 3), match='111'>
<_sre.SRE_Match object; span=(0, 3), match='111'>
<_sre.SRE_Match object; span=(17, 20), match='205'>
None


In [8]:
''' Substitute one text for another - below, every run of whitespace (newlines included) is replaced by a single space '''
re.sub(r'\s+', ' ', text)

'111 PHY Physics 216 MED Medecine 106 ENG English'

In [9]:
''' Collapse every run of whitespace into a single space, but keep the newlines '''
# [^\S\n] reads "whitespace other than a newline". A lookahead such as (?!\n)\s+ only guards
# the first character of the run, so a run like ' \n' would still swallow the newline.
print(re.sub(r'[^\S\n]+', ' ', text))

111 PHY Physics
216 MED Medecine
106 ENG English


In [10]:
''' Find all elements of each line: the number, the three-letter code and the name '''
# One capturing group per field, so findall() returns one tuple per line
re.findall(r'([0-9]+)\s+([A-Z]{3})\s+([A-Za-z]+)', text)

[('111', 'PHY', 'Physics'),
 ('216', 'MED', 'Medecine'),
 ('106', 'ENG', 'English')]

In [11]:
''' Find a list of all elements(characters or digits) in a text '''
re.findall('[A-Za-z0-9]+', text)

['111', 'PHY', 'Physics', '216', 'MED', 'Medecine', '106', 'ENG', 'English']

In [12]:
''' Separate numbers from codes and specialities '''

# digits / exactly three upper-case letters / words of at least four letters
patterns = [r'\d+', r'[A-Z]{3}', r'[A-Za-z]{4,}']

for pattern in patterns:
    print(re.findall(pattern, text))

['111', '216', '106']
['PHY', 'MED', 'ENG']
['Physics', 'Medecine', 'English']


In [13]:
''' Find any character except for a new line '''

print(re.findall('.', 'Datascienceisnice!.'))
print('\n')
print(re.findall('...', 'Datascienceisnice!.'))

['D', 'a', 't', 'a', 's', 'c', 'i', 'e', 'n', 'c', 'e', 'i', 's', 'n', 'i', 'c', 'e', '!', '.']


['Dat', 'asc', 'ien', 'cei', 'sni', 'ce!']


In [14]:
''' Find literal dots by putting a backslash before the dot '''
# Unescaped, '.' matches any character; the raw string keeps the backslash for the regex engine
print(re.findall(r'\.', 'Datascienceisnice!.'))

['.']


In [15]:
''' Match runs of digits '''
re.findall(r'\d+', 'Jan 19, 2019')

['19', '2019']

In [16]:
''' Match runs of anything except digits '''
re.findall(r'\D+', 'Jan 19, 2019')

['Jan ', ', ']

In [17]:
''' Match runs of word characters: letters, digits and the underscore '''
re.findall(r'\w+', 'Jan 19, 2019')

['Jan', '19', '2019']

In [18]:
''' Match runs of non-word characters (anything other than letters, digits and the underscore) '''
re.findall(r'\W+', 'Jan 19, 2019')

[' ', ', ']

In [19]:
''' Match just the month '''
re.findall('[A-Za-z]+', 'Jan 19, 2019')

['Jan']

In [20]:
# `+` matches one or more occurrences of the preceding character (here the 'a')
re.findall('Rea+l', 'This is so Reaaaaaaaal')

['Reaaaaaaaal']

In [21]:
# `*` matches zero or more occurrences of the preceding character (here the 'a')
re.findall('Rea*l', 'This is so Reaaaaaaaal')

['Reaaaaaaaal']

In [22]:
# `?` matches zero or one occurrence of the preceding character: the 'e' is optional, so 'This' matches
re.findall('The?is', 'This is so Reaaaaaaaal')

['This']

In [23]:
emails = """zuck26@facebook.com
page33@google.com
jeff42@amazon.com"""

In [24]:
# Capture the user name, the domain and a three-letter top-level domain of each address;
# re.I makes the [a-z] class case-insensitive
re.findall(r'(\w+)@(\w*)\.([a-z]{3})', emails, flags=re.I)

[('zuck26', 'facebook', 'com'),
 ('page33', 'google', 'com'),
 ('jeff42', 'amazon', 'com')]

In [25]:
''' Find all the words starting by b or B '''
# Notice that \b is used as a boundary, asking for any word starting by B(or b by using flags=re.I to ignore cases), followed by the rest of the word
text = """Betty bought a bit of butter, But the butter was so bitter, So she bought some better butter, To make the bitter butter better.""" 
print(re.findall(r'\bB\w+', text, flags=re.I))

['Betty', 'bought', 'bit', 'butter', 'But', 'butter', 'bitter', 'bought', 'better', 'butter', 'bitter', 'butter', 'better']


In [26]:
''' Rewrite the sentence in a correct format '''
sentence = """A, very   very; irregular_sentence"""

# Option 1: split on runs of separators (comma, semicolon, underscore, whitespace) and re-join
print(' '.join(re.split(r'[,;_\s]+', sentence)))
# Option 2: keep only the runs of letters (this would also drop digits, if there were any)
print(' '.join(re.findall(r'[A-Za-z]+', sentence)))

A very very irregular sentence
A very very irregular sentence


In [27]:
# Clean a tweet step by step using split() and sub()
tweet = '''Good advice! RT @TheNextWeb: What I would do differently if I was learning to code today http://t.co/lbwej0pxOd cc: @garybernhardt #rstats'''
tw = re.split(r'http.+', tweet)       # drop the URL and everything after it
tw1 = re.split(r'@\w+:', tw[0])       # cut around the '@user:' mention
tw_final = re.sub(r'RT ', '', tw1[0]) + tw1[1]  # remove the retweet marker and glue both halves together
tw_final = re.sub(r'\s+', ' ', tw_final)        # collapse runs of whitespace
tw_final

'Good advice! What I would do differently if I was learning to code today '

In [28]:
# Same clean-up done with successive substitutions only
tweet1 = re.sub(r' RT| cc', '', tweet) # remove the ' RT' and ' cc' markers
tweet1 = re.sub(r'@\S+:', '', tweet1) # take off '@word:' mentions that end with a colon
tweet1 = re.sub(r'http\S+\s*', '', tweet1) # take off url
tweet1 = re.sub(r'#\w+', '', tweet1) # take off hashtag #word
tweet1 = re.sub(r'@\S+\s*', '', tweet1) # take off remaining @mentions
re.sub(r'\s+', ' ', tweet1) # collapse runs of whitespace into one space

'Good advice! What I would do differently if I was learning to code today '

In [29]:
r = """<HTML>
<HEAD>
<TITLE>Your Title Here</TITLE>
</HEAD>

<BODY>
<HR>
<a href="http://url.com">Link Name</a>
<H1>This is a Header</H1>
<H2>This is a Medium Header</H2>
<P>This is a paragraph! </P>
<P>This is a another paragraph!</P>
<B>This is a new sentence, in bold italics.</B>
<HR>
</BODY>
</HTML>"""


# An opening tag, its text captured lazily, then a closing tag whose name may have any length
# (a closing pattern of '</.?>' would only accept names of at most one character and skip TITLE/H1/H2).
# '.' does not cross newlines, so only elements opened and closed on the same line are reported.
tag_pattern = re.compile(r'<[^>]+>(.*?)</[^>]+>')
tag_pattern.findall(r)

['Your Title Here',
 'Link Name',
 'This is a Header',
 'This is a Medium Header',
 'This is a paragraph! ',
 'This is a another paragraph!',
 'This is a new sentence, in bold italics.']

In [30]:
text = "In persuasive or argumentative speaking, we try to convince others to agree with our facts, believe our claim, share our values, accept our conclusions, buy our product, or adopt our way of thinking, Price says. “One proven approach to convince your audience is cause-and-effect reasoning. It’s a method that helps your listeners see why things have happened or will happen as they do. It shows the inevitable linkage between what happens first and what happens next as a result. Cause-and-effect words make your claims sound objective and rational rather than biased and subjective[1].”"


In [31]:
# IGNORECASE lets [a-z]+ also split on the upper-case 'U'
print(re.split(r"[a-z]+", "0a9z123U", flags=re.IGNORECASE))

print(' \n ')
# A capturing group keeps the separators in the resulting list
print(re.split(r'(\W+)', text))

print(' \n ')
# Without the group the separators are dropped
print(re.split(r'\W+', text))

print(' \n ')
# NOTE: (\W*) can match the empty string. Older Python 3 releases warn about such patterns in
# split(), and Python 3.7+ also splits at the empty matches, so the result is version dependent.
print(re.split(r'(\W*)', 'hahaha ...'))

['0', '9', '123', '']
 
 
['In', ' ', 'persuasive', ' ', 'or', ' ', 'argumentative', ' ', 'speaking', ', ', 'we', ' ', 'try', ' ', 'to', ' ', 'convince', ' ', 'others', ' ', 'to', ' ', 'agree', ' ', 'with', ' ', 'our', ' ', 'facts', ', ', 'believe', ' ', 'our', ' ', 'claim', ', ', 'share', ' ', 'our', ' ', 'values', ', ', 'accept', ' ', 'our', ' ', 'conclusions', ', ', 'buy', ' ', 'our', ' ', 'product', ', ', 'or', ' ', 'adopt', ' ', 'our', ' ', 'way', ' ', 'of', ' ', 'thinking', ', ', 'Price', ' ', 'says', '. “', 'One', ' ', 'proven', ' ', 'approach', ' ', 'to', ' ', 'convince', ' ', 'your', ' ', 'audience', ' ', 'is', ' ', 'cause', '-', 'and', '-', 'effect', ' ', 'reasoning', '. ', 'It', '’', 's', ' ', 'a', ' ', 'method', ' ', 'that', ' ', 'helps', ' ', 'your', ' ', 'listeners', ' ', 'see', ' ', 'why', ' ', 'things', ' ', 'have', ' ', 'happened', ' ', 'or', ' ', 'will', ' ', 'happen', ' ', 'as', ' ', 'they', ' ', 'do', '. ', 'It', ' ', 'shows', ' ', 'the', ' ', 'inevitable', ' ', 'li

  return _compile(pattern, flags).split(string, maxsplit)


In [32]:
print(re.findall(r'[a-zA-Z]+', text))

['In', 'persuasive', 'or', 'argumentative', 'speaking', 'we', 'try', 'to', 'convince', 'others', 'to', 'agree', 'with', 'our', 'facts', 'believe', 'our', 'claim', 'share', 'our', 'values', 'accept', 'our', 'conclusions', 'buy', 'our', 'product', 'or', 'adopt', 'our', 'way', 'of', 'thinking', 'Price', 'says', 'One', 'proven', 'approach', 'to', 'convince', 'your', 'audience', 'is', 'cause', 'and', 'effect', 'reasoning', 'It', 's', 'a', 'method', 'that', 'helps', 'your', 'listeners', 'see', 'why', 'things', 'have', 'happened', 'or', 'will', 'happen', 'as', 'they', 'do', 'It', 'shows', 'the', 'inevitable', 'linkage', 'between', 'what', 'happens', 'first', 'and', 'what', 'happens', 'next', 'as', 'a', 'result', 'Cause', 'and', 'effect', 'words', 'make', 'your', 'claims', 'sound', 'objective', 'and', 'rational', 'rather', 'than', 'biased', 'and', 'subjective']


In [33]:
print(re.findall(r'[0-9]+', text))

['1']


In [34]:
re.split('[a-z]+', 'a23c489Hn', flags=re.IGNORECASE)

['', '23', '489', '']

In [35]:
re.split('[0-9]+', 'a23c489Hn', flags=re.IGNORECASE)

['a', 'c', 'Hn']

In [36]:
re.sub(r'AND', '&', 'Backet and spade', flags=re.IGNORECASE)

'Backet & spade'

In [37]:
re.sub(r'\sAND\s', '&', 'Backet and spade', flags=re.IGNORECASE)

'Backet&spade'

In [38]:
re.search('o', 'you')

<_sre.SRE_Match object; span=(1, 2), match='o'>

In [39]:
pattern = re.compile('o')
pattern.match('you', 1)

<_sre.SRE_Match object; span=(1, 2), match='o'>

In [40]:
# No exact matching at 0 index
pattern.match('you')

In [41]:
m = re.match(r'(\w+)', 'Issac Newton, physist')

In [42]:
m

<_sre.SRE_Match object; span=(0, 5), match='Issac'>

In [43]:
m.group(0)

'Issac'

In [44]:
m = re.match(r'(\w+) (\w+)', 'Issac Newton, physist')

In [45]:
m

<_sre.SRE_Match object; span=(0, 12), match='Issac Newton'>

In [46]:
m.group(0)

'Issac Newton'

In [47]:
m.group(1)

'Issac'

In [48]:
m.group(2)

'Newton'

In [49]:
m = re.match(r'(?P<first_name>\w+) (?P<last_name>\w+)', 'Issac Newton, physist')

In [50]:
m.group('first_name', 'last_name')

('Issac', 'Newton')

In [51]:
m[0]

'Issac Newton'

In [52]:
m.groupdict()

{'first_name': 'Issac', 'last_name': 'Newton'}

In [53]:
email = "tony@tiremove_thisger.net"

m = re.search('remove_this', email)
print(m)

<_sre.SRE_Match object; span=(7, 18), match='remove_this'>


In [54]:
email[:m.start()] + email[m.end():]

'tony@tiger.net'

In [55]:
# Phone-book style sample: one entry per line, separated by one or more newlines.
# (The interpreter continuation prompts "..." are not part of the data.)
text = """Ross McFluff: 834.345.1254 155 Elm Street

Ronald Heathmore: 892.345.3428 436 Finley Avenue
Frank Burger: 925.541.7625 662 South Dogwood Way


Heather Albrecht: 548.326.4584 919 Park Place"""

In [56]:
rows = re.split('\n+', text)
rows

['Ross McFluff: 834.345.1254 155 Elm Street',
 'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
 'Frank Burger: 925.541.7625 662 South Dogwood Way',
 'Heather Albrecht: 548.326.4584 919 Park Place']

In [57]:
# ':? ' splits on a space optionally preceded by a colon (the colon after the last name).
# maxsplit=3 stops after three splits, so the whole address stays in one piece.
# maxsplit is passed by keyword: passing it positionally is deprecated in recent Python versions.
m = [re.split(':? ', row, maxsplit=3) for row in rows]
m

[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]

In [58]:
# maxsplit=4 allows one more split, which separates the house number from the street name.
# maxsplit is passed by keyword: passing it positionally is deprecated in recent Python versions.
m = [re.split(':? ', row, maxsplit=4) for row in rows]
m

[['Ross', 'McFluff', '834.345.1254', '155', 'Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436', 'Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662', 'South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919', 'Park Place']]

In [59]:
text = "I was slowly driving until another driver hit me on the back. Luckly, I wasn't injured!"

In [60]:
re.findall(r'\w+ly', text)

['slowly', 'Luckly']

In [61]:
for m in re.finditer(r'\w+ly', text):
    print('%02d-%2d: %s' % (m.start(), m.end(), m.group(0)))

06-12: slowly
62-68: Luckly


In [2]:
# finditer() yields one match object per hit, giving both the matched text and its location

text = """234.345.4566
546--546-9876
546*645*7564
456-987-9000"""

# three digits, one or more separators (- . *), three digits, exactly one separator, four digits
pattern = re.compile(r'\d{3}[-.*]+\d{3}[-.*]\d{4}')

matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 12), match='234.345.4566'>
<_sre.SRE_Match object; span=(13, 26), match='546--546-9876'>
<_sre.SRE_Match object; span=(27, 39), match='546*645*7564'>
<_sre.SRE_Match object; span=(40, 52), match='456-987-9000'>


In [3]:
text = """
Mr. Dupont
Mr John
M. G
Ms Stone
Mrs. Clark """

# 'M', an optional r / s / rs, an optional literal dot, one whitespace character, then a capitalised word.
# The dot is escaped: an unescaped '.?' would accept any character there (e.g. 'Mrx Clark').
pattern = re.compile(r'M(|r|s|rs)\.?\s[A-Z]\w*')
matches = pattern.finditer(text)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 11), match='Mr. Dupont'>
<_sre.SRE_Match object; span=(12, 19), match='Mr John'>
<_sre.SRE_Match object; span=(20, 24), match='M. G'>
<_sre.SRE_Match object; span=(25, 33), match='Ms Stone'>
<_sre.SRE_Match object; span=(34, 44), match='Mrs. Clark'>


In [None]:
''' Matching emails '''
emails = """
cam.nicol@blogamil.com
zaf-123-ed@edumail.com
Fany_Stone@university.co.com
AndrewClark@univmail@my-mail.uk"""

# Two patterns for e-mail addresses: `pattern` lists the accepted endings, `pattern2` is generic.
# Inside a character class a hyphen between two characters denotes a range ('.-_' spans '.'
# through '_', which includes '@'), so the hyphen is placed last to be taken literally.
# The dot in 'co.com' is escaped for the same reason: it must be a literal dot.
pattern = re.compile(r'[A-Za-z0-9._-]+@[A-Za-z-]+\.(com|co\.com|uk)')
pattern2 = re.compile(r'[A-Za-z0-9._+-]+@[A-Za-z0-9-]+\.[A-Za-z0-9.-]+')

# The last sample line holds two '@' signs, so only its trailing 'univmail@my-mail.uk' part matches
matches = pattern2.finditer(emails)

for match in matches:
    print(match)

In [None]:
''' Matching urls '''

urls = """
https://www.google.com
http://cam.com
https://youtube.com
https://www.unicef.org
http://www.un.org
"""

# group 1: optional "www.", group 2: domain name, group 3: top-level domain with its leading dot
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Iterator over the match objects; a single group can be read with e.g. match.group(3)
matches = re.finditer(pattern, urls)

# Keep only the domain name and the top-level domain of every URL
sub_urls = re.sub(pattern, r'\2\3', urls)

print(sub_urls)