# <center>Regular Expressions</center>
### Documentation - https://docs.python.org/3.7/howto/regex.html

The "re" module is primarily used for string searching and manipulation, and is also frequently used for web scraping.

In [1]:
import re

The "re" package provides several methods to actually perform queries or manipulations on an input string.

### re.match()

In [2]:
# re.match only succeeds when the pattern matches at the very start of the string
print(re.match(pattern='a', string='abcdef'))

<_sre.SRE_Match object; span=(0, 1), match='a'>


In [3]:
# a longer pattern must likewise begin at index 0 of the string
print(re.match(pattern='ab', string='abcdef'))

<_sre.SRE_Match object; span=(0, 2), match='ab'>


In [4]:
# 'bc' occurs in the string, but not at the start, so match() gives None
print(re.match(pattern='bc', string='abcdef'))

None


### re.search()

In [5]:
# search scans the whole string and reports the first place the pattern occurs
re.search(pattern='ij', string='abcdef ghij')

<_sre.SRE_Match object; span=(9, 11), match='ij'>

In [6]:
# the pattern may sit inside a longer word ("sixes"); only the matched part is reported
re.search(pattern='six', string='Rohit sharma hit 6 sixes yesterday, what a man!')

<_sre.SRE_Match object; span=(19, 22), match='six'>

In [7]:
# no '5' anywhere in the string, so search() returns None and the cell shows no output
re.search(pattern='5', string='Rohit sharma hit 6 sixes yesterday, what a man!')

### re.findall()

In [8]:
# findall returns a list containing every non-overlapping match of the pattern
re.findall(pattern='hit', string='Rohit sharma hit 6 sixes yesterday, what a man!')

['hit', 'hit']

In [29]:
# collect every occurrence regardless of letter case (re.IGNORECASE is the long name of re.I)
re.findall(
    pattern='hit',
    string='roHit sharma hit 6 sixes yesterday',
    flags=re.IGNORECASE,
)

['Hit', 'hit']

Some useful flags : 

- re.A (ASCII-only matching)
- re.I (ignore case)
- re.M (multi-line)
- re.U (Unicode matching)

### re.split()

In [32]:
# Split on runs of whitespace. The pattern is written as a raw string so that
# the backslash in \s reaches the regex engine untouched; in a normal string
# literal "\s" is an invalid escape sequence and Python warns about it.
re.split(r'\s+', 'natural    language processing')

['natural', 'language', 'processing']

### re.sub()

In [11]:
# sub returns a new string in which every match of the pattern is replaced
re.sub(
    pattern='pressure',
    repl='pleasure',
    string='It was great pressure working with you!',
)


'It was great pleasure working with you!'

### re.compile()

In [12]:
# Compiling a regular expression once gives a reusable pattern object, so the
# pattern text does not have to be repeated for every call.
pattern = re.compile('hit')

# run the same compiled pattern against two sentences, one result per line
print(
    pattern.findall('Rohit Sharma hit 6 Sixes Yesterday!'),
    pattern.findall('Bahubali is a superhit film!'),
    sep='\n',
)

['hit', 'hit']
['hit']


___


Let's practice on a sentence

In [13]:
import nltk
from nltk import word_tokenize

In [14]:
# Sample text for the exercises below: a short prose passage followed by four
# "name: numbers address" lines, so it contains plain words, hyphenated words,
# upper-case words, dotted numbers and punctuation.
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
    though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
    well without bread-and-butter--Maybe it's always pepper that makes people hot-tempered,dejected'...
    Ross McFluff: 12-point 834.345 155 Elm Street golf
    Ronald Heathmore: 892.345 3428.436 Finley Avenue
    Frank Burger: 925.541.7625 662 South Dogwood Way
    Heather Albrecht: 548.326.4584 919 Park Place$"""

In [15]:
# Break the sample text into word and punctuation tokens; the resulting list
# is the input for the pattern-matching examples further down.
tokens = nltk.word_tokenize(text=raw)

# show the full token list
tokens

["'When",
 'I',
 "'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'",
 'I',
 'wo',
 "n't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'bread-and-butter',
 '--',
 'Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot-tempered',
 ',',
 'dejected',
 "'",
 '...',
 'Ross',
 'McFluff',
 ':',
 '12-point',
 '834.345',
 '155',
 'Elm',
 'Street',
 'golf',
 'Ronald',
 'Heathmore',
 ':',
 '892.345',
 '3428.436',
 'Finley',
 'Avenue',
 'Frank',
 'Burger',
 ':',
 '925.541.7625',
 '662',
 'South',
 'Dogwood',
 'Way',
 'Heather',
 'Albrecht',
 ':',
 '548.326.4584',
 '919',
 'Park',
 'Place',
 '$']

In [16]:
# One-time setup, left commented out on purpose.
# NOTE(review): presumably needed when nltk.word_tokenize raises a LookupError
# about missing "punkt" tokenizer data - uncomment, run once, then re-run the
# tokenize cell. Confirm the resource name against the installed NLTK version.
# nltk.download('punkt')

### Use symbols for pattern matching

| Symbol | Match Behaviour |
| --- | --- |
| . | matches any character except line breaks |
| * | matches 0 or more repetitions of the preceding character |
| + | matches 1 or more repetitions of the preceding character |
| ? | makes the preceding character optional (matches 0 or 1 occurrence) |
| \d | matches any single digit |
| \D | matches any single character that is not a digit |
| \w | matches any word character (letter, digit, or underscore) |
| \W | matches any single character that is not a word character |
| [XYZ] | matches any single character from the list |
| [XYZ]+ | matches one or more characters from the list |
| [^a-z] | matches any single character that is not a lowercase letter |
| $ | matches the end of the string |
| ^ | matches the beginning of the string |
| \s | matches a whitespace character (space, tab, line break) |
| \S | matches any single character that is not whitespace |

In [17]:
# splitting on the literal letter "s": two adjacent "s" characters leave an
# empty string between them in the result
re.split(pattern='s', string='natural language processing')

['natural language proce', '', 'ing']

In [18]:
# Split on every single whitespace character (no "+"), so consecutive spaces
# produce empty strings in the result. The pattern is a raw string because
# "\s" in a normal string literal is an invalid escape sequence.
re.split(r'\s', 'natural    language processing')

['natural', '', '', '', 'language', 'processing']

In [19]:
# A trailing $ anchors the pattern to the end of the string,
# so this keeps only the tokens that end in "ed".

[tok for tok in tokens if re.search('ed$', tok) is not None]

['hot-tempered', 'dejected']

In [20]:
# A leading ^ anchors the pattern to the beginning of the string,
# so this keeps only the tokens that start with "to".

[tok for tok in tokens if re.search('^to', tok) is not None]

['to', 'tone']

In [21]:
# "+" repeats the item before it one or more times:
# here one or more "p" followed by "er" at the end of the token.

[tok for tok in tokens if re.search('p+er$', tok) is not None]

['pepper', 'pepper']

In [43]:
# "." is a wildcard for any single character (other than a line break):
# a "t" followed by exactly two more characters at the end of the token.

[tok for tok in tokens if re.search('t..$', tok) is not None]

['bread-and-butter', 'dejected']

In [23]:
# Each bracketed set stands for exactly one character chosen from that set:
#   ^[ghi]  - the token starts with g, h, or i
#   [mno]   - the second character is m, n, or o
#   [jlk]   - the third character is j, l, or k
#   [def]$  - the fourth and final character is d, e, or f

[tok for tok in tokens if re.search('^[ghi][mno][jlk][def]$', tok) is not None]

['golf']

In [24]:
# [A-Z0-9] - Matches one character from a range of characters.
# Here: three groups of digits separated by literal dots. The dot is escaped
# because a bare "." is a wildcard, and the pattern is a raw string because
# "\." in a normal string literal is an invalid escape sequence.

[w for w in tokens if re.search(r'^[0-9]+\.[0-9]+\.[0-9]+$', w)]

['925.541.7625', '548.326.4584']

In [25]:
# keep the tokens that consist only of upper-case letters A-Z

[tok for tok in tokens if re.search('^[A-Z]+$', tok) is not None]

['I', 'I', 'AT', 'ALL']

In [54]:
# Counted repetition of the preceding item (write the braces without spaces):
#   {n}   - exactly n repeats, n being a non-negative integer
#   {n,}  - n or more repeats
#   {,n}  - at most n repeats
#   {m,n} - between m and n repeats, inclusive
# Below: digits, a hyphen, then 3 to 5 lower-case letters.

[tok for tok in tokens if re.search('^[0-9]+-[a-z]{3,5}$', tok) is not None]

['12-point']