In [8]:
import re

In [9]:
s1 = "foo123barhello"
print(re.search('123',s1))
# span() will return the range between the pattern lies
print(re.search('123',s1).span())

<re.Match object; span=(3, 6), match='123'>
(3, 6)


In [10]:
'''The real power of regex matching in Python emerges when <regex> contains special characters called metacharacters. These have a unique meaning to the regex matching engine and vastly enhance the capability of the search.'''

# '[0-9][0-9][0-9]' 3 decimal between 0-9

print(re.search('[0-9][0-9][0-9]',s1))
print(re.search('[0-9][0-9][0-9]',s1).span())
print(re.search('[0-9][0-9][0-9]', 'foo456bar'))
print(re.search('[0-9][0-9][0-9]', '12foo34'))


<re.Match object; span=(3, 6), match='123'>
(3, 6)
<re.Match object; span=(3, 6), match='456'>
None


In [11]:
# regex (.) dot metacharacter works like wildcard  (.) can be any literal one character. Matches any single character except newline
print(re.search('1.3',"foo153bar"))
print(re.search('1..3',"foo1753bar"))
print(re.search('1.2.3',"foo15283bar"))
print(re.search('1.3',"foo13bar"))
print(re.search('1..3',"foo15a3bar"))

<re.Match object; span=(3, 6), match='153'>
<re.Match object; span=(3, 7), match='1753'>
<re.Match object; span=(3, 8), match='15283'>
None
<re.Match object; span=(3, 7), match='15a3'>


In [12]:
# ^ Anchors a match at the start of a string. $ Anchors a match at the end of a string
print(re.search('^1.3',"123foo153bar"))
'''Here, we used re.match() function to search pattern within the test_string. The method returns a match object if the search is successful. If not, it returns None.'''

pattern = '^a...s$'
test_string = 'abyss'
result = re.match(pattern, test_string)

if result:
  print("Search successful.")
else:
  print("Search unsuccessful.")	



<re.Match object; span=(0, 3), match='123'>
Search successful.


In [13]:
# $ Anchors a match at the end of a string
print(re.match('^1..3','12f3'))
'''
MetaCharacters
Metacharacters are characters that are interpreted in a special way by a RegEx engine. Here's a list of metacharacters:

[] . ^ $ * + ? {} () \ |
'''

<re.Match object; span=(0, 4), match='12f3'>


"\nMetaCharacters\nMetacharacters are characters that are interpreted in a special way by a RegEx engine. Here's a list of metacharacters:\n\n[] . ^ $ * + ? {} () \\ |\n"

In [14]:
# [] - Square brackets
# Square brackets specifies a set of characters you wish to match.
'''Here, [abc] will match if the string you are trying to match contains any of the a, b or c.

You can also specify a range of characters using - inside square brackets.

[a-e] is the same as [abcde].
[1-4] is the same as [1234].
[0-39] is the same as [01239].
You can complement (invert) the character set by using caret ^ symbol at the start of a square-bracket.

[^abc] means any character except a or b or c.
[^0-9] means any non-digit character.'''

print(re.match('^abc',"abc de ca abc"))
print(re.match('[^abc]',"abc de ca abc"))


<re.Match object; span=(0, 3), match='abc'>
None


In [16]:
# '''* - Star
# The star symbol * matches zero or more occurrences of the pattern left to it.
# re.match()
# re.match() function will search the regular expression pattern and return the first occurrence. This method checks for a match only at the beginning of the string. So, if a match is found in the first line, it returns the match object. But if a match is found in some other line, it returns null.'''

print(re.search('ma*n',"womanknmanwomenmaaan"))
print(re.search('ma+n',"mnmawnmaaan"))
print(re.search('ma?n',"mn"))
print(re.search('ma?n',"masanman"))

print(re.findall('a{3,5}',"maaan maan mamaaaaaam aaaa aaaaa"))

print(re.findall('[0-9]{3,4}',"827 138 8851"))

print(re.search('ba[artz]', 'foobarqux'))
print(re.search('ba[artz]', 'foobazqux'))

<re.Match object; span=(2, 5), match='man'>
<re.Match object; span=(6, 11), match='maaan'>
<re.Match object; span=(0, 2), match='mn'>
<re.Match object; span=(5, 8), match='man'>
['aaa', 'aaaaa', 'aaaa', 'aaaaa']
['827', '138', '8851']
<re.Match object; span=(3, 6), match='bar'>
<re.Match object; span=(3, 6), match='baz'>


In [20]:
'''re.findall()
The re.findall() method returns a list of strings containing all matches.
'''

# Program to extract numbers from a string

import re

string = 'hello 12 hi 89. Howdy 34'
pattern = '\d+'

result = re.findall(pattern, string) 
print(result)

['12', '89', '34']


In [24]:
'''
re.split()
The re.split method splits the string where there is a match and returns a list of strings where the splits have occurred.
'''
import re

string = 'Twelve:12 Eighty nine:89.'
pattern = '\d+'

result = re.split(pattern, string) 
print(result)

# If the pattern is not found, re.split() returns a list containing the original string.
# You can pass maxsplit argument to the re.split() method. It's the maximum number of splits that will occur.

import re

string = 'Twelve:12 Eighty nine:89 Nine:9.'
pattern = '\d+'

# maxsplit = 1
# split only at the first occurrence
result = re.split(pattern, string, 1) 
print(result)



['Twelve:', ' Eighty nine:', '.']
['Twelve:', ' Eighty nine:89 Nine:9.']


In [28]:
'''
re.sub()
The syntax of re.sub() is:

re.sub(pattern, replace, string)
The method returns a string where matched occurrences are replaced with the content of replace variable.
If the pattern is not found, re.sub() returns the original string.
'''

# Program to remove all whitespaces
import re

# multiline string
string = 'abc 12de 23 \n f45 6'

# matches all whitespace characters
pattern = '\s+'

# empty string
replace = ''

new_string = re.sub(pattern, replace, string) 
print(new_string)

# You can pass count as a fourth parameter to the re.sub() method. If omited, it results to 0. This will replace all occurrences.


import re

# multiline string
string = 'abc 12de 23 \n f45 6'

# matches all whitespace characters
pattern = '\s+'
replace = ''

new_string = re.sub(r'\s+', replace, string, 1) 
print(new_string)

abc12de23f456
abc12de 23 
 f45 6


In [29]:
'''
re.subn()
The re.subn() is similar to re.sub() expect it returns a tuple of 2 items containing the new string and the number of substitutions made.'''


# Program to remove all whitespaces
import re

# multiline string
string = 'abc 12\
de 23 \n f45 6'

# matches all whitespace characters
pattern = '\s+'

# empty string
replace = ''

new_string = re.subn(pattern, replace, string) 
print(new_string)

('abc12de23f456', 4)


In [30]:
'''
re.search()
The re.search() method takes two arguments: a pattern and a string. The method looks for the first location where the RegEx pattern produces a match with the string.

If the search is successful, re.search() returns a match object; if not, it returns None.
'''


import re

string = "Python is fun"

# check if 'Python' is at the beginning
match = re.search('\APython', string)

if match:
  print("pattern found inside the string")
else:
  print("pattern not found")  

pattern found inside the string


In [64]:
# match
dir(match)


import re

string = '39801 356, 2102 1111'

# Three digit number followed by space followed by two digit number
pattern = '(\d{5}) (\d{2})'

# match variable contains a Match object.
match = re.search(pattern, string)
print(match)

if match:
  print(match.group())
else:
  print("pattern not found")

<re.Match object; span=(0, 8), match='39801 35'>
39801 35


In [82]:
s1 = '83.149.9.216 - - [17/May/2015:10:05:03 +0000] "GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1" 200 203023 "http://semicomplete.com/presentations/logstash-monitorama-2013/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36" '


z = re.search(r'\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}',s1)
print(z)
print(z.group())

<re.Match object; span=(0, 12), match='83.149.9.216'>
83.149.9.216


In [100]:
from collections import Counter

def apache_log_reader(logfile):
    myregex = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

    with open(logfile) as f:
        log = f.read()
        my_iplist = re.findall(myregex,log)
        # print("list: ",my_iplist)
        ipcount = Counter(my_iplist)
        for k, v in ipcount.items():
            print("IP Address " + "=> " + str(k) + " " + "Count "  + "=> " + str(v))

# Create entry point of our code
if __name__ == '__main__':
    apache_log_reader("/home/addy/workspace/for-placement/python/file-handling/demo.txt")

=> 1
IP Address => 95.153.95.223 Count => 1
IP Address => 46.65.248.177 Count => 6
IP Address => 123.125.71.82 Count => 1
IP Address => 79.101.87.86 Count => 16
IP Address => 88.61.50.114 Count => 1
IP Address => 65.52.104.233 Count => 5
IP Address => 89.238.233.38 Count => 7
IP Address => 109.195.177.171 Count => 6
IP Address => 77.241.193.88 Count => 10
IP Address => 193.77.158.208 Count => 6
IP Address => 178.222.167.128 Count => 2
IP Address => 213.87.123.186 Count => 1
IP Address => 89.212.62.88 Count => 6
IP Address => 173.176.91.196 Count => 1
IP Address => 201.22.249.225 Count => 3
IP Address => 118.92.157.140 Count => 1
IP Address => 212.227.66.11 Count => 1
IP Address => 199.247.189.63 Count => 6
IP Address => 46.53.98.24 Count => 1
IP Address => 5.0.375.55 Count => 1
IP Address => 194.14.211.19 Count => 6
IP Address => 20.137.2.50 Count => 1
IP Address => 176.198.189.108 Count => 1
IP Address => 180.76.6.49 Count => 1
IP Address => 128.118.108.67 Count => 32
IP Address => 21