In [2]:
#!/usr/bin/python
import re

# match checks for a match only at the beginning of the string, 
# search checks for a match anywhere in the string

line = "Cats are smarter than dogs"

# The same flag combination (multi-line + case-insensitive) is used for every lookup.
demo_flags = re.M | re.I

# re.match() only succeeds at position 0, so the bare word is tried first and
# a pattern spanning the sentence from its start is used as the fallback.
matchObj = re.match(r'dogs', line, demo_flags)
if matchObj is None:
    print("No match!! try with pattern now")
    matchObj = re.match(r'(.*) are (.*?) .*', line, demo_flags)
    print("match --> matchObj.group() : ", matchObj.group())
    print("match --> matchObj.group(1) : ", matchObj.group(1))
    print("match --> matchObj.group(2) : ", matchObj.group(2), "\n")
else:
    print("match --> matchObj.group() : ", matchObj.group())

# re.search() scans the whole string, so the same bare word is found here.
searchObj = re.search(r'dogs', line, demo_flags)
if searchObj is None:
    print("Nothing found!!")
else:
    print("search --> searchObj.group() : ", searchObj.group(), "\n")


# sub replaces occurrences of the RE pattern in string with repl,
# substituting all occurrences unless count is provided. This method returns the modified string.

phone = "2004-959-559 # This is Phone Number"

# Delete Python-style comments: everything from '#' up to the end of the string.
comment_pattern = re.compile(r'#.*$')
num = comment_pattern.sub("", phone)
print("Phone Num : ", num)

# Remove anything other than digits (applied to the original string, not the result above).
non_digit_pattern = re.compile(r'\D')
num = non_digit_pattern.sub("", phone)
print("Phone Num : ", num)

No match!! try with pattern now
match --> matchObj.group() :  Cats are smarter than dogs
match --> matchObj.group(1) :  Cats
match --> matchObj.group(2) :  smarter 

search --> searchObj.group() :  dogs 

Phone Num :  2004-959-559 
Phone Num :  2004959559


In [5]:
import re

Non-special chars match themselves. Exceptions are special characters:

\       Escape special char or start a sequence.
.       Match any char except newline, see re.DOTALL
^       Match start of the string, see re.MULTILINE
$       Match end of the string, see re.MULTILINE
[]      Enclose a set of matchable chars
R|S     Match either regex R or regex S.
()      Create capture group, & indicate precedence

Inside a set (after the opening '['), the only special chars are:

]   End the set, if not the 1st char
-   A range, eg. a-c matches a, b or c
^   Negate the set only if it is the 1st char

Quantifiers (append '?' for non-greedy):

{m}     Exactly m repetitions
{m,n}   From m (default 0) to n (default infinity)
*       0 or more. Same as {,}
+       1 or more. Same as {1,}
?       0 or 1. Same as {,1}

Special sequences:

\A  Start of string
\b  Match empty string at word (\w+) boundary
\B  Match empty string not at word boundary
\d  Digit
\D  Non-digit
\s  Whitespace [ \t\n\r\f\v], see LOCALE,UNICODE
\S  Non-whitespace
\w  Alphanumeric: [0-9a-zA-Z_], see LOCALE
\W  Non-alphanumeric
\Z  End of string
\g<id>  In sub() replacement strings only: insert prev named
        or numbered group, '<' & '>' are literal, e.g. \g<0>
        or \g<name> (not \g0 or \gname)
        
Special character escapes are much like those already escaped in Python string literals. 
Hence regex '\n' is same as regex '\\n':

\a  ASCII Bell (BEL)
\f  ASCII Formfeed
\n  ASCII Linefeed
\r  ASCII Carriage return
\t  ASCII Tab
\v  ASCII Vertical tab
\\  A single backslash
\xHH   Two digit hexadecimal character goes here
\OOO   Three digit octal char (or just use an
               initial zero, e.g. \0, \07)
\DD      Decimal number 1 to 99, match
              previous numbered group
    
Extensions. Do not cause grouping, except 'P<name>':

(?iLmsux)     Match empty string, sets the corresponding re.I, re.L, re.M, re.S, re.U, re.X flags
(?:...)              Non-capturing version of regular parens
(?P<name>...)     Create a named capturing group.
(?P=name)           Match whatever matched prev named group
(?#...)          A comment; ignored.
(?=...)         Lookahead assertion, match without consuming
(?!...)          Negative lookahead assertion
(?<=...)      Lookbehind assertion, match if preceded
(?<!...)       Negative lookbehind assertion
(?(id)y|n)    Match 'y' if group 'id' matched, else 'n'
 
Flags for re.compile(), etc. Combine with '|':

re.I == re.IGNORECASE   Ignore case
re.L == re.LOCALE       Make \w, \b, and \s locale dependent
re.M == re.MULTILINE    Multiline
re.S == re.DOTALL          Dot matches all (including newline)
re.U == re.UNICODE      Make \w, \b, \d, and \s unicode dependent
re.X == re.VERBOSE      Verbose (unescaped whitespace in pattern
                                              is ignored, and '#' marks comment lines)
 
Module level functions:

compile(pattern[, flags]) -> RegexObject
match(pattern, string[, flags]) -> MatchObject
search(pattern, string[, flags]) -> MatchObject
findall(pattern, string[, flags]) -> list of strings
finditer(pattern, string[, flags]) -> iter of MatchObjects
split(pattern, string[, maxsplit, flags]) -> list of strings
sub(pattern, repl, string[, count, flags]) -> string
subn(pattern, repl, string[, count, flags]) -> (string, int)
escape(string) -> string
purge() # the re cache
 
RegexObjects (returned from compile()):

.match(string[, pos, endpos]) -> MatchObject
.search(string[, pos, endpos]) -> MatchObject
.findall(string[, pos, endpos]) -> list of strings
.finditer(string[, pos, endpos]) -> iter of MatchObjects
.split(string[, maxsplit]) -> list of strings
.sub(repl, string[, count]) -> string
.subn(repl, string[, count]) -> (string, int)
.flags      # int, Passed to compile()
.groups     # int, Number of capturing groups
.groupindex # {}, Maps group names to ints
.pattern    # string, Passed to compile()
 
MatchObjects (returned from match() and search()):

.expand(template) -> string, Backslash & group expansion
.group([group1...]) -> string or tuple of strings, 1 per arg
.groups([default]) -> tuple of all groups, non-matching=default
.groupdict([default]) -> {}, Named groups, non-matching=default
.start([group]) -> int, Start/end of substring match by group
.end([group]) -> int, Group defaults to 0, the whole match
.span([group]) -> tuple (match.start(group), match.end(group))
.pos           int, Passed to search() or match()
.endpos    int, Passed to search() or match()
.lastindex int, Index of last matched capturing group
.lastgroup string, Name of last matched capturing group
.re              regex, As passed to search() or match()
.string       string, As passed to search() or match()
 
 
# Simple email expression. Doesn't allow numbers in the domain name
# and doesn't allow for top level domains that are fewer than 2 or more than 3 letters
# (which is fine until they allow more).
# Doesn't handle multiple '.' in the domain (joe@abc.co.uk)
#
# The pattern is written as a raw string so that \w and \. reach the regex
# engine unchanged; in a normal string literal they are invalid escape
# sequences and trigger a warning on recent Python versions.

p = re.compile(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$')


In [1]:
# ^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$
# Simple email expression. Doesn't allow numbers in the domain name 
# and doesn't allow for top level domains that are less than 2 or more than 3 letters 
# (which is fine until they allow more). 
# Doesn't handle multiple "." in the domain (joe@abc.co.uk)

# \b      matches the empty string at a word boundary, i.e. between a word character [a-zA-Z0-9_] and a non-word character (or the edge of the string)
# (?:...) non-capturing group: the pattern must match, but the matched text is not stored as a group
# ()      capturing group: the matched text is stored and can be retrieved afterwards

import re, os
# The os and os.path modules.
# The __file__ constant
# os.path.realpath(path) (returns "the canonical path of the specified filename, 
#                         eliminating any symbolic links encountered in the path")
# os.path.dirname(path) (returns "the directory name of pathname path")
# os.getcwd() (returns "a string representing the current working directory")
# os.chdir(path) ("change the current working directory to path")

#    __file__ will not work if invoked from an IDE (say IDLE)
#print(os.path.dirname(os.path.realpath(__file__)))

# Print the working directory; the relative path 'text.txt' below is resolved against it.
print(os.getcwd())

# Slurp the file first, then do the matching once the handle is closed.
with open('text.txt', 'r') as f:
    test_string = f.read()

# With MULTILINE, ^ and $ anchor at every line, so each line made up solely of
# digits is captured. (The earlier variant r'^([0-9]+)' without the flag could
# only capture digits at the very start of the file.)
regex = re.compile(r'^([0-9]+)$', re.MULTILINE)
result = regex.findall(test_string)
print(result)
    
# Swap day and month of a dd<sep>mm<sep>yyyy date.
# Groups: 1 = day, 2 = separator, 3 = month, 4 = year; the \2 back-reference
# requires the second separator to be the same character as the first.
regex = r'\b(0?[1-9]|[12]\d|3[01])([ \/\-])(0?[1-9]|1[012])\2(\d{4})'
test_str = "Today's date is 18/09/2017"
subst = r'\3\2\1\2\4'
result = re.sub(regex, subst, test_str)
print(result)

# Raw strings keep '\s' from being read as an (invalid) string escape, and
# count / maxsplit are passed by keyword because passing them positionally
# is deprecated since Python 3.13.
x = re.sub(r"\s", "_", test_str, count=2)       # replace only the first two whitespace chars
print(x)
x = re.split(r"\s", test_str, maxsplit=1)       # split once, at the first whitespace char
print(x)
x = re.split(r"\s", test_str)                   # split at every whitespace char
print(x)


C:\Users\Tiger KT\jupyter-notebook
['1234', '5362', '1']
Today's date is 09/18/2017
Today's_date_is 18/09/2017
["Today's", 'date is 18/09/2017']
["Today's", 'date', 'is', '18/09/2017']


In [1]:

# NOTE: the stray ">>> " REPL prompt that preceded this import was removed;
# it is only tolerated by IPython's prompt stripping and is not valid Python.
import os
import re
# The os and os.path modules.
# The __file__ constant
# os.path.realpath(path) (returns "the canonical path of the specified filename,
#                         eliminating any symbolic links encountered in the path")
# os.path.dirname(path) (returns "the directory name of pathname path")
# os.getcwd() (returns "a string representing the current working directory")
# os.chdir(path) ("change the current working directory to path")

#    __file__ will not work if invoked from an IDE (say IDLE)
#print(os.path.dirname(os.path.realpath(__file__)))

# get the current working directory in use
print(os.getcwd())

# Collect every line of text.txt that consists solely of digits
# (MULTILINE makes ^ and $ anchor at each line).
with open('text.txt', 'r') as f:
    test_string = f.read()
    regex = re.compile(r'^([0-9]+)$', re.MULTILINE)
    result = regex.findall(test_string)
    print(result)
    
# Swap day and month of a dd<sep>mm<sep>yyyy date.
# Groups: 1 = day, 2 = separator, 3 = month, 4 = year; the \2 back-reference
# requires the second separator to be the same character as the first.
regex = r'\b(0?[1-9]|[12]\d|3[01])([ \/\-])(0?[1-9]|1[012])\2(\d{4})'
test_str = "Today's date is 18/09/2017"
subst = r'\3\2\1\2\4'
result = re.sub(regex, subst, test_str)
print(result)

# Raw strings keep '\s' from being read as an (invalid) string escape, and
# count / maxsplit are passed by keyword because passing them positionally
# is deprecated since Python 3.13.
x = re.sub(r"\s", "_", test_str, count=2)       # replace only the first two whitespace chars
print(x)
x = re.split(r"\s", test_str, maxsplit=1)       # split once, at the first whitespace char
print(x)
x = re.split(r"\s", test_str)                   # split at every whitespace char
print(x)

C:\Users\kyung.lee\jupyter-notebook
['1234', '5362', '1']
Today's date is 09/18/2017
Today's_date_is 18/09/2017
["Today's", 'date is 18/09/2017']
["Today's", 'date', 'is', '18/09/2017']


In [None]:
#validate credit card numbers(16 or 19 including 3 '-'s)
def check(s):
    """Tell whether ``s`` is free of four identical characters in a row.

    Hyphens are removed before the comparison, so a run that straddles a
    hyphen still counts.

    Args:
        s: The string to inspect.

    Returns:
        False if, once hyphens are dropped, any character appears four times
        consecutively; True otherwise (including for strings shorter than four).
    """
    compact = s.replace('-', '')
    has_run_of_four = any(
        compact[start:start + 4] == compact[start] * 4
        for start in range(len(compact) - 3)
    )
    return not has_run_of_four

import re
# Two accepted layouts, both starting with 4, 5 or 6:
#   p1 - sixteen digits with no separators
#   p2 - four groups of four digits joined by single hyphens
p1 = re.compile('^[4-6][0-9]{15}$')
p2 = re.compile('^[4-6][0-9]{3}(-[0-9]{4}){3}$')

# The first input line gives how many card numbers follow; typing "exit" stops early.
total = int(input())
for _ in range(total):
    s = input()
    if s == "exit":
        break

    # check() is only consulted once one of the layouts has matched.
    layout_ok = p1.match(s) is not None or p2.match(s) is not None
    print("Valid" if layout_ok and check(s) else "Invalid")

In [None]:
import re

# \b on both sides restricts the hit to the standalone word "two".
x = 'one two three'
word_pattern = re.compile(r"\btwo\b")
y = word_pattern.search(x)

# First the matched text, then the match object's repr, one per line.
print(y.group(), y, sep="\n")


###################

import urllib.request
import operator

from collections import Counter

# Download wiki page; the context manager closes the HTTP response when done.
url = "https://en.wikipedia.org/wiki/Diplomatic_history_of_World_War_II"
with urllib.request.urlopen(url) as response:
    html = response.read()
    # Fall back to UTF-8 when the server does not declare a charset.
    charset = response.headers.get_content_charset() or "utf-8"

# Decode the bytes instead of calling str() on them: str(bytes) yields the
# b'...' repr, in which newlines and non-ASCII bytes show up as escapes such
# as \n or \x93. The letters/digits of those escapes sit right next to
# neighbouring years and defeat the \b anchors, so years were silently missed.
html_text = html.decode(charset, errors="replace")

# Find all mentioned years in the 20th or 21st century
regex = r"\b(?:19|20)\d{2}\b"     # to create a non-capturing group, use the syntax (?:pattern)
matches = re.findall(regex, html_text)

# Form a dict of the number of occurrences of each year.
# Counter tallies in a single pass (list.count per year rescanned the list each time).
year_counts = dict(Counter(matches))

# Iterate the years in descending order of frequency; printing is left disabled.
for year in sorted(year_counts, key=year_counts.get, reverse=True):
  #print(year, year_counts[year])
  pass

print(matches[0:5])


#########################

s = "aaa_bbb"

# (?:aaa) takes part in the match but is not stored as a group:
# group() is the whole match, group(1) is only the captured "_bbb".
prefixed = re.match(r"(?:aaa)(_bbb)", s)
print(prefixed.group())
print(prefixed.group(1))


#########################

# dd/mm/yyyy or dd-mm-yyyy; the \2 back-reference requires the second
# separator to be the same character as the first.
p = r'\b(0?[1-9]|[12]\d|3[01])([\/\-])(0?[1-9]|1[012])\2(\d{4})'
# Variant with an independent second separator (mixed separators would then be accepted):
#p = r'\b(0?[1-9]|[12]\d|3[01])([\/\-])(0?[1-9]|1[012])([\/\-])(\d{4})'
sample_dates = '29/07/2099  01-03-2011 22/12-10999'
print(re.findall(p, sample_dates))

########################

# Sample record string: ';' separates the categories, ':' separates a category
# name from its ','-separated items.
str1 = "fruits:apple,banana;pets:cat,dog,bird;colors:green,blue"
# Disabled experiments. As written they would fail: re.match() only matches at
# the start of the string, and neither "pets:" (in str1) nor "$"/"USD" (in
# str2) occurs at position 0, so re.match() returns None and .group() raises
# AttributeError. re.search() would be needed to find them mid-string.
#print(re.match(r"(?:pets:)([a-zA-Z,]+)", str1).group())
#m1 = re.match(r"(?:pets:)([a-zA-Z,]+)", str1)
#print(m1.group())
#str2 = "Average fast food wage is $9.08."
#m2 = re.match(r"(?:\$|(?:USD))([0-9]+)\.([0-9]{2})", str2)
#print(m2.group())



#####################
# phone number validation #

phonelist = ['800-5551212 x1234', '(800)5551212 x1234', '(800)-5551212xxxxx1234']

# Earlier, stricter attempts kept for reference:
#phonePattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})$')
#phonePattern = re.compile(r'^\(?(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$')

# Three digits, three digits, four digits and an optional trailing run of
# digits, each separated by any number of non-digits. There is no leading ^,
# so text before the number (e.g. 'work 1-(800) 555.1212 #12345') is skipped.
phonePattern = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$')

lgroups = []
for phone in phonelist:
    # Search the current list entry (the loop previously searched a fixed
    # sample string on every iteration and never looked at `phone`).
    found = phonePattern.search(phone)
    if found is None:
        # Report unparsable entries instead of failing on .groups() of None.
        print("No phone number found in:", phone)
        continue
    lgroups.append(found.groups())

print(lgroups)




In [None]:
import re

# Q?: Valid email addresses must follow these rules:

# It must have the username@websitename.extension format type.
# The username can only contain letters, digits, dashes and underscores.
# The website name can only have letters and digits.
# The maximum length of the extension is 3. 

def fun(s):
    """Return True if ``s`` is a valid email address, else False.

    An address is valid when it has the form ``username@websitename.extension``
    where:

    * the username contains only letters, digits, dashes and underscores,
    * the website name contains only letters and digits,
    * the extension is between 1 and 3 characters long.

    Args:
        s: The candidate address.

    Returns:
        bool: True when the whole string satisfies the rules above.
    """
    # fullmatch() anchors both ends, so trailing text after the extension or
    # additional dotted parts are rejected rather than silently ignored.
    pattern = r'[\w-]+@[a-zA-Z0-9]+\.[a-zA-Z0-9]{1,3}'
    return re.fullmatch(pattern, s) is not None

def filter_mail(emails):
    """Return a list of the entries of ``emails`` for which ``fun`` gives a truthy result.

    The relative order of the kept entries is unchanged.
    """
    return [address for address in emails if fun(address)]

if __name__ == '__main__':
    # First input line: number of addresses; then one address per line.
    n = int(input())
    emails = [input() for _ in range(n)]

    # These steps live inside the guard because `emails` only exists when the
    # code runs as a script; at module level they raised NameError on import.
    filtered_emails = filter_mail(emails)
    filtered_emails.sort()
    print(filtered_emails)