### Regular Expressions - regex

In [2]:
# Normal Expression - plain Python substring test with the `in` operator, no regex involved

text = "My Phone number is 983-026-8966"

'Phone' in text

True

#### Regex Expression - Single Match

In [29]:
import re

# Sample sentence and the literal word to look for
text = "My Phone number is 983-026-8966"
pattern = 'Phone'

# re.search(thing to search, source) scans the source text for the pattern.
# The result is a Match object whose repr shows two things: the index range
# (span) where the match was found, and the matched text itself.
re.search(pattern=pattern, string=text)

<re.Match object; span=(3, 8), match='Phone'>

#### For Multiple Matches

In [10]:
# A sentence in which the search word occurs twice
pattern = 'Phone'
another_text = "My Phone number is 983-026-8966. I have another Phone number."

# re.search(thing to search, source) reports only the first match here;
# the second 'Phone' further along the sentence is ignored.
re.search(pattern=pattern, string=another_text)

<re.Match object; span=(3, 8), match='Phone'>

#### Use re.findall(thing to search, source) for multiple searches

In [13]:
# use re.findall(thing to search, source) for multiple searches -
# it returns a list holding the text of every match, in order of appearance
re.findall(pattern, another_text)

['Phone', 'Phone']

#### Use re.finditer() to iterate through the String

In [35]:
# Walk through every occurrence and print, one per line: where it was found
# (span), the matched text (group) and the Match object itself.
# The loop variable `match` stays bound to the last occurrence once the loop ends.
for match in re.finditer(pattern, another_text):
    print(match.span(), match.group(), match, sep='\n')

(3, 8)
Phone
<re.Match object; span=(3, 8), match='Phone'>
(48, 53)
Phone
<re.Match object; span=(48, 53), match='Phone'>


In [36]:
# `match` is the loop variable left over from the finditer loop above,
# so it still refers to the last match; start() is the index where it begins
match.start()

48

In [37]:
# Index just past the last matched character of that same (last) match
match.end()

53

In [38]:
# The text that the last match covered
match.group()

'Phone'

#### Using Regex to find different things - Character Identifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [90]:
# To find a Mobile Number
another_text = "My Phone number is 983-026-8966. I have another Phone number."

# Simplest approach: search for the exact digits, character for character.
# This only works because the number is already known.
finder = re.search(pattern='983-026-8966', string=another_text)

In [91]:
# Show the Match object produced by the literal search above
finder

<re.Match object; span=(19, 31), match='983-026-8966'>

#### Now If we use different characters as described in the table

In [92]:
# One identifier per character. \d accepts digits only, while \w (alphanumeric)
# also accepts letters and '_', so the \w positions are looser than a strict
# digits-only phone number would need.
finder = re.search(r'\d\d\d-\w\w\w-\d\d\w\w',another_text)

In [93]:
# Same span and text as the literal search, now found through character identifiers
finder

<re.Match object; span=(19, 31), match='983-026-8966'>

In [94]:
# Using Quantifiers - {n} repeats the preceding identifier exactly n times,
# so \d{3} stands in for \d\d\d and \w{4} for \w\w\w\w

finder = re.search(r'\d{3}-\d{3}-\w{4}',another_text)

In [95]:
# Show the Match object found with the quantifier-based pattern
finder

<re.Match object; span=(19, 31), match='983-026-8966'>

#### Using () for Grouping. Parentheses split the pattern into groups, which helps in separating the different parts of the data.
#### If we use re.compile() with grouping, we can call each group individually with .group(n)

In [107]:
# Compile the pattern once; every (...) pair is a group that can be read back by number.
# Note that `finder` now holds a compiled pattern, not a Match object as in the earlier cells.
finder = re.compile(r'(\w{3})-(\d{3})-(\d{4})')

# Search through the compiled pattern's own method instead of passing it to re.search()
output = finder.search(another_text)

In [108]:
# Show the Match object returned for the grouped pattern
output

<re.Match object; span=(19, 31), match='983-026-8966'>

In [109]:
# Calling .group() with no argument returns the entire matched text (all groups
# together as one string), not the groups separately; output.groups() gives the
# individual groups as a tuple
output.group()

'983-026-8966'

In [110]:
# Individual group - Indexing starts from 1, so group(1) is the first
# parenthesised part of the pattern (the leading three characters)
output.group(1)

'983'

In [111]:
# Putting re.compile() directly inside re.search() - one line instead of two;
# `output` is reassigned to the Match object of this search
output = re.search(re.compile(r'(\w{3})-(\d{3})-(\d{4})'),another_text)

In [112]:
# Show the Match object from the one-line compile-and-search
output

<re.Match object; span=(19, 31), match='983-026-8966'>

In [113]:
# Second group - the middle block of three digits
output.group(2)

'026'

#### Quantifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [79]:
# Using Different Quantifiers:
#   \d{2,4} - two to four digits
#   \d+     - one or more digits
#   \w{3,}  - three or more alphanumeric characters

finder = re.search(r'(\d{2,4})-(\d+)-(\w{3,})',another_text)

In [80]:
# Show the Match object found with the mixed quantifiers
finder

<re.Match object; span=(19, 31), match='983-026-8966'>

#### Using OR operator / PIPE ( | ) Operator

In [6]:
import re

# Baseline with a single literal pattern: re.search reports only the first 'love'
re.search('love', "I am in love with you love")

<re.Match object; span=(8, 12), match='love'>

In [7]:
# re.findall returns the text of every occurrence, so both 'love's are listed
re.findall('love',"I am in love with you love")

['love', 'love']

In [16]:
# Visit each 'love' in turn and print, one per line: the matched text,
# its (start, end) span and the Match object itself
for expressions in re.finditer('love', "I am in love with you love"):
    print(expressions.group(), expressions.span(), expressions, sep='\n')

love
(8, 12)
<re.Match object; span=(8, 12), match='love'>
love
(22, 26)
<re.Match object; span=(22, 26), match='love'>


##### OR / Pipe Operator

In [23]:
# Using '|' (pipe operator) or OR operator to choose from either of the two.
# The sentence contains no 'love', so the 'like' alternative is the one that matches.

re.search('love|like', "I like you")

<re.Match object; span=(2, 6), match='like'>

#### Using 'WILD CARD' (.) Operator

In [26]:
# Either two-letter fragment, 've' or 'ke'; findall lists the hits in the
# order they occur in the sentence
re.findall('ve|ke', "I like you as you like me and even when you don't like me")

['ke', 'ke', 've', 'ke']

In [34]:
# Using . as wild card - each dot stands for exactly one character of any kind
# (even a space) before or after the fixed letters. '..ve.' therefore means:
# any two characters, then 've', then any one character.

re.findall('..ve.|..ke.', "I like you as you like me and even when you don't like me")

['like ', 'like ', ' even', 'like ']

#### Using Character Identifiers with Wild Card and Pipe

In [45]:
# Using the following -
#    \w - alphanumeric
#    +  - occurs one or more times
#    .  - wild card (any single character)
#    |  - pipe (either the pattern on its left or the one on its right)
#
# The pattern is written as a raw string (r'...') so the backslash in \w reaches
# the regex engine untouched. In a normal string literal '\w' is an invalid
# escape sequence and Python emits a warning for it.

re.findall(r'\w+ve.|..ke', "I like you as you like me and even when you don't like me")

['like', 'like', 'even', 'like']

In [46]:
# Same pattern as a raw string (r'...'): without the r prefix, '\w' is an invalid
# escape sequence in a Python string literal and triggers a warning.
# Each iteration prints the span, the matched text and the Match object.
for words in re.finditer(r'\w+ve.|..ke', "I like you as you like me and even when you don't like me"):
    print(words.span())
    print(words.group())
    print(words)

(2, 6)
like
<re.Match object; span=(2, 6), match='like'>
(18, 22)
like
<re.Match object; span=(18, 22), match='like'>
(30, 34)
even
<re.Match object; span=(30, 34), match='even'>
(50, 54)
like
<re.Match object; span=(50, 54), match='like'>


### Some other symbols

Starts With - ^

Ends With - $

Exclude things/numbers/words/punctuation, etc. - r'[^\d\w]+' (inside square brackets a leading ^ means "anything except", unlike the "starts with" ^ above)

Join things - ' '.join(parameter)

Include things/words/punctuations,etc - r'[\w]+'

In [174]:
# Test sentence mixing digits, plain words, punctuation and hyphenated words
Sample = "1 sample 2 It has 20+ words. 2 tic-toc , tic-toc clock ? perfoms 2"

In [175]:
# Starts with - '^' anchors the pattern to the beginning of the string,
# so only the digit in the very first position is returned
re.findall(r'^\d',Sample)

['1']

In [176]:
# Ends with -'$' anchors the pattern to the end of the string,
# so only the digit in the very last position is returned
re.findall(r'\d$',Sample)

['2']

In [177]:
# Ends with -'$' again, this time through re.search: the Match object also
# shows where in the string the trailing digit sits (its span)
re.search(r'\d$',Sample)

<re.Match object; span=(65, 66), match='2'>

In [217]:
# Exclude - a negated class [^...] lists everything that must NOT be part of a match:
# digits (\d), '+', '.', ',', '?' and the space (dropping the space also removes the
# extra white space). '-' is not listed, so 'tic-toc' stays in one piece.
sentence = re.findall(r'[^\d+.,? ]+', Sample) 

In [218]:
# Show the list of tokens that survived the exclusion
sentence

['sample', 'It', 'has', 'words', 'tic-toc', 'tic-toc', 'clock', 'perfoms']

In [180]:
# Join things - glue the kept tokens back together with a single space between them
new_sentence =' '.join(sentence)

In [181]:
# Show the rebuilt sentence
new_sentence

'sample It has words tic-toc tic-toc clock perfoms'

In [182]:
# Excluding '-' from the sentence: [\w]+ keeps only runs of word characters,
# so each hyphenated word is split at the '-'.
# The pattern is a raw string (r'...') because '\w' in a normal string literal
# is an invalid escape sequence and triggers a warning.
re.findall(r'[\w]+', new_sentence)

['sample',
 'It',
 'has',
 'words',
 'tic',
 'toc',
 'tic',
 'toc',
 'clock',
 'perfoms']

In [183]:
# Find the word-character runs and join them with spaces in one step.
# The pattern is a raw string (r'...') because '\w' in a normal string literal
# is an invalid escape sequence and triggers a warning.
' '.join(re.findall(r'[\w]+', new_sentence))

'sample It has words tic toc tic toc clock perfoms'

In [189]:
# Start again from the hyphenated sentence and collect every run of word
# characters; '-' is not a word character, so 'tic-toc' comes back as 'tic' and 'toc'
new_sentence = 'sample It has words tic-toc tic-toc clock perfoms'
check = re.findall(pattern=r'\w+', string=new_sentence)

In [187]:
# Show the extracted words
check

['sample',
 'It',
 'has',
 'words',
 'tic',
 'toc',
 'tic',
 'toc',
 'clock',
 'perfoms']

In [195]:
# Pattern for hyphenated words: word characters, a literal '-', then word characters.
# Note: despite its name, `words` holds a regex pattern string, not a list of words.
words = r'[\w]+-[\w]+'

In [198]:
# Only the hyphenated tokens of the sentence match this pattern
re.findall(words,new_sentence)

['tic-toc', 'tic-toc']