In [58]:
import re
from string import punctuation

# Sample course catalogue used throughout the notebook: one course per line,
# written as "<code> <abbreviation> <title>" with a varying number of spaces.
text = (
    "101 COM    Computers\n"
    "205 MAT   Mathematics\n"
    "189 ENG   English"
)

## Why do we compile regex pattern?

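- Compiling parses the regular expression string and builds an in-memory representation of it (a pattern object).
- The overhead of compiling is significant compared to a single match.
- If you use a pattern repeatedly, keeping the compiled pattern object around saves that work on every call and gives the pattern a readable name. (The module-level functions such as `re.split` also cache recently used patterns, so the gain is usually modest.)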

### Example 1.

In [2]:
# One or more consecutive whitespace characters (spaces, tabs, newlines).
pattern = r"\s+"
regex = re.compile(pattern)

# Split the text wherever one or more whitespace characters are found.
listobj = regex.split(text)
print(listobj)

['101', 'COM', 'Computers', '205', 'MAT', 'Mathematics', '189', 'ENG', 'English']


### Example 2. Alternate Method without compiling the pattern

In [4]:
# Module-level shortcut: re.split() compiles the pattern on first use and the
# re module keeps recently used patterns in an internal cache, so this form is
# fine for one-off calls; prefer an explicitly compiled pattern when the same
# regex is reused in many places.
# The raw string (r'...') stops Python from treating "\s" as an (invalid)
# string escape sequence before the regex engine ever sees it.
listobj = re.split(r'\s+', text)
print(listobj)


['101', 'COM', 'Computers', '205', 'MAT', 'Mathematics', '189', 'ENG', 'English']


### Example 3.

In [6]:
# One or more consecutive digits.
pattern = r"\d+"
regex = re.compile(pattern)

# Collect every run of digits found in the text.
listobj = regex.findall(text)
print(listobj)

['101', '205', '189']


### Example 4. findall vs search vs match


- findall - returns the matched portions of the text as a list.
             
          scanned left-to-right
          non-overlapping matches of pattern 
          
- search - returns a match object that contains 
          
          [starting and ending positions of the first occurrence of the pattern]
          returns None (if the pattern not present)
          This method stops after the first match.
         
- match - returns a match object, but unlike search,
         
          it requires the pattern to be present at the beginning of the text itself.
          an occurrence in the middle of the text is not matched,
          in a nutshell, the string must start with the pattern,
          else it returns None

In [8]:
# findall(): every non-overlapping match, scanned left to right, as a list.
listobj = regex.findall(text)
print(listobj)

['101', '205', '189']


In [9]:
# search() scans the text and stops at the first match, i.e. "101".
matchobj = regex.search(text)

print(matchobj.start())   # 0      -> index where the match begins
print(matchobj.end())     # 3      -> index just past the end of the match
print(matchobj.span())    # (0, 3) -> (start, end) as a tuple
print(matchobj.group())   # 101    -> the matched text
print(matchobj.group(0))  # 101    -> group 0 is the whole match as well



0
3
(0, 3)
101
101


In [19]:
# match() only succeeds when the pattern occurs at the very start of the text.
matchobj = regex.match(text)
print(matchobj.group(0))
# NOTE: matchobj.group(1) would raise "IndexError: no such group" here, because
# the digit pattern compiled earlier has no capturing groups (group 0 is the
# whole match and is the only group available).

# The digits are not at the beginning of this string, so match() returns None.
matchobj = regex.match("Rohan 101")
print(matchobj)



101
None


### HACK 1

In [21]:
# With IGNORECASE the lowercase class [a-z] also matches uppercase letters,
# so every alphabetic word in the text is returned.
re.findall(r'[a-z]+', text, flags=re.IGNORECASE)

['COM', 'Computers', 'MAT', 'Mathematics', 'ENG', 'English']

### Example 5. sub

In [24]:
# One or more digits. Written as a raw string so that "\d" reaches the regex
# engine untouched instead of being parsed as an (invalid) string escape.
pattern = r"\d+"

regex = re.compile(pattern)

# sub() returns a new string in which every match is replaced; the input text
# itself is left unchanged.
string = regex.sub('COURSE_CODE', text)

print(string)


COURSE_CODE COM    Computers
COURSE_CODE MAT   Mathematics
COURSE_CODE ENG   English


In [26]:
# Same replacement through the module-level helper; the raw string keeps "\d"
# from being treated as an (invalid) string escape sequence.
print(re.sub(r'\d+', 'COURSE_CODE', text))

COURSE_CODE COM    Computers
COURSE_CODE MAT   Mathematics
COURSE_CODE ENG   English


### Example 6.

In [32]:
text1 = 'W!@isdmi&|C P@:an,@da'

# Split on runs of the unwanted characters (! @ , : whitespace ' | &).
# The result is stored in `parts` rather than `list`, so the built-in list()
# stays usable in later cells; the raw string keeps "\s" intact for the regex.
parts = re.split(r"[!@,:\s'|&]+", text1)
''.join(parts)  # 'WisdmiCPanda'

'WisdmiCPanda'

In [34]:
# {3}: exactly three consecutive uppercase letters.
listobj = re.findall(r'[A-Z]{3}', text)
print(listobj)

['COM', 'MAT', 'ENG']


In [35]:
# {2}: exactly two consecutive uppercase letters; matches do not overlap, so
# only the first two letters of each three-letter abbreviation are returned.
listobj = re.findall(r'[A-Z]{2}', text)
print(listobj)

['CO', 'MA', 'EN']


In [40]:
# Pairs of letters of either case; a trailing single letter of an odd-length
# word has no partner and is therefore dropped.
listobj = re.findall(r'[A-Za-z]{2}', text)
print(listobj)

['CO', 'Co', 'mp', 'ut', 'er', 'MA', 'Ma', 'th', 'em', 'at', 'ic', 'EN', 'En', 'gl', 'is']


In [44]:
# {4}: four consecutive uppercase letters; the printed result shows how many
# such runs the text contains.
stobj = re.findall(r'[A-Z]{4}', text)
print(stobj)

[]


In [42]:
# {1,}: one or more consecutive uppercase letters (same meaning as "+").
listobj = re.findall(r'[A-Z]{1,}', text)
print(listobj)

['COM', 'C', 'MAT', 'M', 'ENG', 'E']


#### course_pattern = '([0-9]+)\s*([A-Z]{3})\s*([A-Za-z]{4,})'

        '''
        ([0-9]+)
        ([A-Z]{3})
        ([A-Za-z]{4,})
        '''

### Example 7.

In [46]:
text2 = '01, Jan 2015'

# Exactly four consecutive digits, i.e. the year. The raw string keeps "\d"
# from being treated as an (invalid) string escape sequence.
pattern = r'\d{4}'

regex = re.compile(pattern)

regex.findall(text2)

['2015']

In [48]:
# {2,4}: between two and four consecutive digits (as many as possible).
# Raw string so that "\d" is not parsed as an (invalid) string escape.
re.findall(r'\d{2,4}', text2)

['01', '2015']

In [51]:
# "J?" matches zero or exactly one occurrence of "J", followed by "an".
re.findall(r'J?an', text2)

['Jan']

### Example 8. Word boundary

- \b is commonly used to detect and match the beginning or end of a word.
- **\bpanda** will match the *panda*  in **‘pandarohan’** and not in **rohanpanda.**
- In order to match the *panda* in **rohanpanda**, you should use **panda\b**

In [52]:
# A leading \b anchors "panda" to the start of a word, so it is found both as
# a standalone word and as the prefix of "pandarohan".
re.findall(r'\bpanda', 'wisdomic panda not wisdomic pandarohan')

['panda', 'panda']

In [54]:
# Boundaries on both sides: only the exact, standalone word "panda" matches.
re.findall(r'\bpanda\b', 'wisdomic panda not wisdomic pandarohan')


['panda']

### Exercise

1. Find Email Address

In [56]:
string = 'My name is CR7, and wisdomic@panda.com is my email. robagwe@gmail.com'

# <word characters> "@" <lowercase domain> "." <lowercase suffix>
pattern = r"\w+@[a-z]+\.[a-z]+"
regex = re.compile(pattern)

# Collect every e-mail-like substring in order of appearance.
listobj = regex.findall(string)
print(listobj)

['wisdomic@panda.com', 'robagwe@gmail.com']


### Sample Regex used frequently in data preprocessing

In [60]:

def clean_text(raw):
    """Strip ticker symbols, hyperlinks and punctuation from ``raw``.

    The steps are chained, each one working on the previous result:
      1. remove ticker symbols such as ``$AAPL``;
      2. remove http/https hyperlinks (everything up to the next whitespace);
      3. replace each run of ASCII punctuation with a single space.

    Returns the cleaned string; an empty input yields an empty string.
    """
    # Tickers go first: "$" is itself punctuation, so removing punctuation
    # earlier would leave the bare symbol name behind.
    cleaned = re.sub(r'\$\w*', '', raw)
    # \S+ stops at the first whitespace, so text following a link is kept and
    # links without a path component are removed too.
    cleaned = re.sub(r'https?://\S+', '', cleaned)
    # re.escape keeps "]", "^", "-" and the backslash literal inside the class.
    # `punctuation` is imported by name because this notebook reuses the name
    # `string` for a str variable.
    cleaned = re.sub('[' + re.escape(punctuation) + ']+', ' ', cleaned)
    return cleaned


string1 = ''

tmp = clean_text(string1)