<a href="https://colab.research.google.com/github/digitechit07/Python-Tutorial-with-Excercise/blob/main/Python_RegEx_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **What are Python Regular Expressions (Regex)?**

A Python regex, then, enables you to specify a pattern for the string you are looking for. This search pattern may be applied to searching for characters, words or even patterns, such as an email address and a phone number. Regex is helpful whenever you have to match anything and get some information on the structure of the match or just need to replace things.

In [6]:
import re

pattern = re.compile(r"\d+")
matches = pattern.findall("There are 42 apples and 36 oranges.")
print(matches)  # Output: ['42', '36']

import re

text = "Price is $10.50"
pattern = r"\$10\.50"

match = re.search(pattern, text)
if match:
    print("Price found!")


import re

text = "John's birthday is on 12/08/1990 and his sister's is on 03/11/1995."
pattern = r"\b\d{2}/\d{2}/\d{4}\b"  # Matches date format dd/mm/yyyy

matches = re.findall(pattern, text)
print(matches)  # Output: ['12/08/1990', '03/11/1995']


import re

text = "Contact us at support@example.com or sales@company.org."
pattern = r"\b[\w.-]+@[\w.-]+\.\w{2,}\b"  # Matches email addresses

result = re.sub(pattern, "[EMAIL]", text)
print(result)  # Output: Contact us at [EMAIL] or [EMAIL].


import re

text = "Python is fun. John and Mary are learning Machine Learning."
pattern = r"\b[A-Z][a-z]*\b"  # Matches words starting with a capital letter

matches = re.findall(pattern, text)
print(matches)  # Output: ['Python', 'John', 'Mary', 'Machine', 'Learning']


pattern = r'word(?=\s)'  # Matches 'word' only if followed by a space
text = 'word followed by space'
match = re.search(pattern, text)


import re

pattern = r"hello"
text = "hello world"
result = re.match(pattern, text)
if result:
    print("Match successful:", result.group())  # Output: hello


text = "Python3.9"
match = re.search(r'\d+.\d+', text)
if match:
    print("Found version number:", match.group())  # Output: 3.9

contact_info = "user@domain.com,  support@company.org"
emails = re.findall(r'[\w\.-]+@[\w\.-]+', contact_info)
print(emails)  # ['user@domain.com', 'support@company.org']



log_entry = "2023-05-15 14:30:22 [ERROR] System crash"
match = re.match(r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\]', log_entry)
if match:
    date, time, level = match.groups()
    print(f"Error occurred on {date} {time}, Level: {level}")

html_content = "<p>Hello</p>"
# Greedy mode
print(re.findall(r'<p>(.*)</p>', html_content))
# Non-greedy mode
print(re.findall(r'<p>(.*?)</p>', html_content))

# Extract Python followed by a digit
code_text = "Python3 Python2 Python"
print(re.findall(r'Python(?=\d)', code_text))

# Extract Python not followed by a digit
print(re.findall(r'Python(?!\d)', code_text))


contact_text = "010-87654321, 13912345678"
phone_numbers = re.findall(r'\b\d{3}-\d{8}\b|\b1[3-9]\d{9}\b', contact_text)
print(phone_numbers)  # ['010-87654321', '13912345678']


def check_password_strength(password):
    """Validate password contains uppercase and lowercase letters and digits, length 8-20 characters"""
    pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[\w]{8,20}$'
    return re.match(pattern, password) is not None

print(check_password_strength("Secure123"))  # True
print(check_password_strength("weak"))       # False

## Simple pattern matching
text = "Hello, Python programming in LabEx!"
pattern = r"Python"
match = re.search(pattern, text)

if match:
    print("Pattern found!")
else:
    print("Pattern not found.")


## Compiling a regex pattern
compiled_pattern = re.compile(r'\d+')
text = "There are 42 apples in the basket"
matches = compiled_pattern.findall(text)
print(matches)  ## Output: ['42']


## Matching digits
text = "LabEx has 100 programming courses"
digits = re.findall(r'\d+', text)
print(digits)  ## Output: ['100']

## Matching word characters
words = re.findall(r'\w+', text)
print(words)  ## Finds all word sequences

import re

## Capturing groups
text = "Contact email: john.doe@labex.io"
pattern = r"(\w+)\.(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, text)

if match:
    username = match.group(1)
    lastname = match.group(2)
    domain = match.group(3)
    tld = match.group(4)
    print(f"Username: {username}, Domain: {domain}")


## Non-capturing groups
pattern = r"(?:Mr\.|Mrs\.) \w+ \w+"
names = re.findall(pattern, "Mr. John Smith and Mrs. Jane Doe")

## Password validation example
def validate_password(password):
    pattern = r'^(?=.*[A-Z])(?=.*[a-z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$'
    return re.match(pattern, password) is not None

## Test passwords
passwords = [
    "WeakPass",
    "StrongP@ssw0rd",
    "labex2023!"
]

for pwd in passwords:
    print(f"{pwd}: {validate_password(pwd)}")


## Case-insensitive matching
text = "Python in LabEx is AWESOME"
pattern = re.compile(r'python', re.IGNORECASE)
matches = pattern.findall(text)


## Extracting structured data
log_entry = "2023-06-15 14:30:45 [ERROR] Database connection failed"
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)'
match = re.match(pattern, log_entry)

if match:
    date, time, level, message = match.groups()
    print(f"Date: {date}, Time: {time}, Level: {level}")


import re

def validate_input(input_type, value):
    validators = {
        'email': r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        'phone': r'^\+?1?\d{10,14}$',
        'url': r'^https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/\S*)?$'
    }

    return re.match(validators[input_type], value) is not None

## LabEx input validation examples
print(validate_input('email', 'user@labex.io'))
print(validate_input('phone', '+1234567890'))
print(validate_input('url', 'https://labex.io'))


def parse_log_file(log_path):
    error_pattern = r'(\d{4}-\d{2}-\d{2}) .*\[ERROR\] (.+)'
    errors = []

    with open(log_path, 'r') as file:
        for line in file:
            match = re.search(error_pattern, line)
            if match:
                errors.append({
                    'date': match.group(1),
                    'message': match.group(2)
                })

    return errors

## Example log parsing in LabEx environment
#log_errors = parse_log_file('/var/log/application.log')







['42', '36']
Price found!
['12/08/1990', '03/11/1995']
Contact us at [EMAIL] or [EMAIL].
['Python', 'John', 'Mary', 'Machine', 'Learning']
Match successful: hello
Found version number: 3.9
['user@domain.com', 'support@company.org']
Error occurred on 2023-05-15 14:30:22, Level: ERROR
['Hello']
['Hello']
['Python', 'Python']
['Python']
['010-87654321', '13912345678']
True
False
Pattern found!
['42']
['100']
['LabEx', 'has', '100', 'programming', 'courses']
Username: john, Domain: labex
WeakPass: False
StrongP@ssw0rd: True
labex2023!: False
Date: 2023-06-15, Time: 14:30:45, Level: ERROR
True
True
True


# **How to Use Python Regex in Test Automation?**

Let’s take a practical approach to using Python regex in your test automation scripts. The key is knowing how to define a regex pattern, select the right re function, and integrate it into your automated tests.

Mastering regex components in the vast landscape of Python programming provides a powerful arsenal for automation testers. Let’s delve into these components with practical examples, showcasing how they can be combined to perform versatile string manipulations and pattern matching.

In [8]:
def process_text(text):
    ## Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    ## Standardize phone numbers
    text = re.sub(r'(\d{3})[-.]?(\d{3})[-.]?(\d{4})',
                  r'(\1) \2-\3', text)

    ## Mask sensitive information
    text = re.sub(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b',
                  '****-****-****-****', text)

    return text

sample_text = "Contact:  John   Doe 1234-5678-9012-3456 at 123.456.7890"
print(process_text(sample_text))


def clean_html_content(html_text):
    ## Remove HTML tags
    clean_text = re.sub(r'<[^>]+>', '', html_text)

    ## Decode HTML entities
    clean_text = re.sub(r'&[a-z]+;', ' ', clean_text)

    ## Normalize whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text


def extract_structured_data(text):
    ## Extract key-value pairs
    pattern = r'(\w+)\s*:\s*([^\n]+)'
    return dict(re.findall(pattern, text))

sample_data = """
Name: John Doe
Age: 30
Email: john@labex.io
Role: Developer
"""

structured_data = extract_structured_data(sample_data)
print(structured_data)


import re

bad_words_regex = re.compile(r'''([bB][aA][sS][tT][aA][rR][dD]|[fF][uU][cC][kK])''')
word = input("Enter your word")

if bad_words_regex.match(word) is not None:
	print("Moderate your language!")
else:
	print("Good boy")
'''
import re

from trieregex import TrieRegEx as TRE

words = ['bastard', 'fuck', 'losser']

# Add word(s)
tre = TRE(*words)  # word(s) can be added upon instance

# Create regex pattern from the trie
regex_pattern = tre.regex()

# Add boundary context and compile for matching
bad_words_regex = re.compile(f'\\b{regex_pattern}\\b')
'''
word = input("Enter your word")

if bad_words_regex.match(word) is not None:
	print("Moderate your language!")
else:
	print("Good boy")


from itertools import accumulate
from re import fullmatch


# this one function is copy-pasta from stackexchange
def baseN(num,b,numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    return ((num == 0) and numerals[0]) or (baseN(num // b, b, numerals).lstrip(numerals[0]) + numerals[num % b])


# used to determine whether the string arg is enclosed by parentheses
def need_par(arg):
    return not all(list(accumulate([0]+list(arg), lambda curr,x: curr+1 if x == "(" else (curr-1 if x == ")" else curr)))[1:-1])


# returns arg, enclosed in parentheses if neceassary
# TODO: sets too many parentheses, we only need them, if | is not already enclosed
def propar(arg):
    if need_par(arg) and "|" in arg:
        return "(" + arg + ")"
    return arg


class RegEq:

    def __init__(self, lhs, factor_dict):
        self.lhs = lhs
        self.factor_dict = factor_dict

    # use ardens lemma to simplify this equation
    def arden(self):
        if self.lhs in self.factor_dict:
            a,b = self.lhs, self.factor_dict[self.lhs]
            self.factor_dict = self.mult_rhs(("({})*" if need_par(b) else "{}*").format(b)).factor_dict
            self.factor_dict.pop(self.lhs, None)
        return(self)


    # prepends a character to every term in the equation
    def mult_rhs(eq, factor):
        return RegEq(eq.lhs, {key:(propar(factor) + propar(eq.factor_dict[key])) for key in eq.factor_dict})



    def add_rhs(fst, snd):
        temp_dict = {}
        for key in {**fst.factor_dict, **snd.factor_dict}:
            temp_val = "|".join([x for x in [fst.factor_dict.get(key), snd.factor_dict.get(key)] if x != None])
            temp_dict[key] = temp_val
        return RegEq(fst.lhs, temp_dict)

    def plugin(fst, snd):
        "plugs self into other and returns new equation"
        #print("plugging", fst, "into", snd)
        temp_dict = {key:value for key,value in snd.factor_dict.items() if key != fst.lhs}
        temp = RegEq(snd.lhs, temp_dict if temp_dict else {})
        return RegEq.add_rhs(temp, RegEq.mult_rhs(fst, snd.factor_dict[fst.lhs]))

    def __str__(self):
        return "R{} = ".format(self.lhs) + " + ".join(("{} R{}".format(self.factor_dict[i],i) for i in self.factor_dict))


class RegEqSystem:

    def __init__(self, start, eqns):
        self.start = start
        self.eqns = eqns

    def __str__(self):
        return "\n".join([str(eq) for eq in self.eqns.values()])

    def solvequeue(self):
        temp_q = [self.eqns[self.start].lhs]
        for cur_eq in temp_q:
            for cur_var in self.eqns[cur_eq].factor_dict:
                if not cur_var in temp_q and not cur_var == "":
                    temp_q.append(cur_var)
        return temp_q[::-1]

    def arden_all(self):
        for eq in self.eqns.values():
            eq.arden()

    def solve(self):
        sq = self.solvequeue()
        for plug_arg in sq[:]:
            self.arden_all()
            for plug_eq_rhs in sq:
                if plug_arg in self.eqns[plug_eq_rhs].factor_dict:
                    self.eqns[plug_eq_rhs] = self.eqns[plug_arg].plugin(self.eqns[plug_eq_rhs])
            sq.remove(plug_arg)
        return self.eqns[self.start].factor_dict[""]


def make_eq_sys(rem, quot, base = 2):
    eq_dict = {}
    for state in range(quot):
        state_dict = {}
        #{(state*base+inpt)%quot: str(inpt) for inpt in range(base)}
        #TODO: multiple arrows to the same state override each other, fix that
        for inpt in range(base):
            key = (state*base+inpt)%quot
            if key not in state_dict:
                state_dict[key] = str(inpt)
            else:
                state_dict[key] += ("|"+str(inpt))
        if state == rem:
            state_dict[""]=""
        eq_dict[state] = RegEq(state, state_dict)
    return RegEqSystem(0, eq_dict)


def make_test(rem, quot, base=2, testrange=301):
    expr = make_eq_sys(rem, quot, base).solve()
    test_reg = lambda x: bool(fullmatch(expr,x))
    filtered = [x for x in range(testrange) if test_reg(baseN(x,base))]
    print(filtered)
    return filtered == [x for x in range(testrange) if x%quot == rem]


print(make_eq_sys(0, 3).solve())
print()
print(make_eq_sys(0, 5).solve())
print()
print(make_eq_sys(0, 1).solve())




Contact: John Doe ****-****-****-**** at (123) 456-7890
{'Name': 'John Doe', 'Age': '30', 'Email': 'john@labex.io', 'Role': 'Developer'}
Enter your wordZahid
Good boy
Enter your wordHi
Good boy
(0*1(01*0)*1)*0*

((0*1(((((10)*(0|11))(01*01)*01*00)*)(((10)*(0|11))(01*01)*1)))*)0*

((0|1)*)


# **Choose the Right Regex Function**

Once you have your pattern, you’ll use one of the re functions to perform the search. The most used functions are:

re.match(): Matches the pattern at the start of the string.
re.search(): Searches the entire string for a match.
re.findall(): Finds all matches and returns them as a list.
re.sub(): Replaces all matches with a new string.

In [12]:
statement = "I Love You"

m = re.search("I love (\w+)", statement)
if m:
  print("He loves",m.group(1))
else:
  m = re.search("Ich liebe (\w+)", statement)
  if m:
    print("Er liebt",m.group(1))
  else:
    m = re.search("Je t'aime (\w+)", statement)
    if m:
      print("Il aime",m.group(1))





import re

class REMatcher(object):
    def __init__(self, matchstring):
        self.matchstring = matchstring

    def match(self,regexp):
        self.rematch = re.match(regexp, self.matchstring)
        return bool(self.rematch)

    def group(self,i):
        return self.rematch.group(i)


for statement in ("I love Mary",
                  "Ich liebe Margot",
                  "Je t'aime Marie",
                  "Te amo Maria"):

    m = REMatcher(statement)

    if m.match(r"I love (\w+)"):
        print("He loves",m.group(1))

    elif m.match(r"Ich liebe (\w+)"):
        print("Er liebt",m.group(1))

    elif m.match(r"Je t'aime (\w+)"):
        print("Il aime",m.group(1))

    else:
        print("???")




import re

for statement in ("I love Mary",
                  "Ich liebe Margot",
                  "Je t'aime Marie",
                  "Te amo Maria"):

    if m := re.match(r"I love (\w+)", statement):
        print("He loves", m.group(1))

    elif m := re.match(r"Ich liebe (\w+)", statement):
        print("Er liebt", m.group(1))

    elif m := re.match(r"Je t'aime (\w+)", statement):
        print("Il aime", m.group(1))

    else:
        print()




m0 = re.match("I love (\w+)", statement)
m1 = re.match("Ich liebe (\w+)", statement)
m2 = re.match("Je t'aime (\w+)", statement)
if m0:
  print("He loves", m0.group(1))
elif m1:
  print("Er liebt", m1.group(1))
elif m2:
  print("Il aime", m2.group(1))




pats = [
    ("I love (\w+)", "He Loves {0}" ),
    ("Ich liebe (\w+)", "Er Liebe {0}" ),
    ("Je t'aime (\w+)", "Il aime {0}")
 ]
for p1, p3 in pats:
    m = re.match(p1, statement)
    if m:
        print(p3.format(m.group(1)))
        break




pats = {
    "I love (\w+)" : "He Loves {0}",
    "Ich liebe (\w+)" : "Er Liebe {0}",
    "Je t'aime (\w+)" : "Il aime {0}",
}
for p1 in pats:
    m = re.match(p1, statement)
    if m:
        print(pats[p1].format(m.group(1)))
        break




if match := re.search('I love (\w+)', statement):
  print(f'He loves {match.group(1)}')
elif match := re.search("Ich liebe (\w+)", statement):
  print(f'Er liebt {match.group(1)}')
elif match := re.search("Je t'aime (\w+)", statement):
  print(f'Il aime {match.group(1)}')




alist={"I love ":"He loves","Je t'aime ":"Il aime","Ich liebe ":"Er liebt"}
for k in alist.keys():
    if k in statement:
       print(alist[k],statement.split(k)[1:])




def re_match_group(pattern, str, out_groups):
    del out_groups[:]
    result = re.match(pattern, str)
    if result:
        out_groups[:len(result.groups())] = result.groups()
    return result



groups = []
if re_match_group("I love (\w+)", statement, groups):
    print("He loves", groups[0])
elif re_match_group("Ich liebe (\w+)", statement, groups):
    print( "Er liebt", groups[0])
elif re_match_group("Je t'aime (\w+)", statement, groups):
    print( "Il aime", groups[0])


import unittest

RE_INT = re.compile(r'^[-+]?([1-9]\d*|0)$')


class TestRE(unittest.TestCase):
    def test_int(self):
        self.assertFalse(RE_INT.match('+'))
        self.assertFalse(RE_INT.match('-'))

        self.assertTrue(RE_INT.match('1'))
        self.assertTrue(RE_INT.match('+1'))
        self.assertTrue(RE_INT.match('-1'))
        self.assertTrue(RE_INT.match('0'))
        self.assertTrue(RE_INT.match('+0'))
        self.assertTrue(RE_INT.match('-0'))

        self.assertTrue(RE_INT.match('11'))
        self.assertFalse(RE_INT.match('00'))
        self.assertFalse(RE_INT.match('01'))
        self.assertTrue(RE_INT.match('+11'))
        self.assertFalse(RE_INT.match('+00'))
        self.assertFalse(RE_INT.match('+01'))
        self.assertTrue(RE_INT.match('-11'))
        self.assertFalse(RE_INT.match('-00'))
        self.assertFalse(RE_INT.match('-01'))

        self.assertTrue(RE_INT.match('1234567890'))
        self.assertTrue(RE_INT.match('+1234567890'))
        self.assertTrue(RE_INT.match('-1234567890'))




import re

num="12.345678"

y = re.findall('\.[0-9.]+',num)

print(y)


# define the text
text2 = """COM Computers 205 MAT Mathematics 189"""
# compile the regex and search the pattern
regex_num = re.compile('\d+')
s = regex_num.search(text2)
print('Starting Position: ', s.start())
print('Ending Position: ', s.end())
print(text2[s.start():s.end()])

#> Starting Position:  17
#> Ending Position:  20
#> 205

He loves Mary
Er liebt Margot
Il aime Marie
???
He loves Mary
Er liebt Margot
Il aime Marie

['.345678']
Starting Position:  14
Ending Position:  17
205


  m = re.search("I love (\w+)", statement)
  m = re.search("Ich liebe (\w+)", statement)
  m = re.search("Je t'aime (\w+)", statement)
  m0 = re.match("I love (\w+)", statement)
  m1 = re.match("Ich liebe (\w+)", statement)
  m2 = re.match("Je t'aime (\w+)", statement)
  ("I love (\w+)", "He Loves {0}" ),
  ("Ich liebe (\w+)", "Er Liebe {0}" ),
  ("Je t'aime (\w+)", "Il aime {0}")
  "I love (\w+)" : "He Loves {0}",
  "Ich liebe (\w+)" : "Er Liebe {0}",
  "Je t'aime (\w+)" : "Il aime {0}",
  if match := re.search('I love (\w+)', statement):
  elif match := re.search("Ich liebe (\w+)", statement):
  elif match := re.search("Je t'aime (\w+)", statement):
  if re_match_group("I love (\w+)", statement, groups):
  elif re_match_group("Ich liebe (\w+)", statement, groups):
  elif re_match_group("Je t'aime (\w+)", statement, groups):
  y = re.findall('\.[0-9.]+',num)
  regex_num = re.compile('\d+')


# **Lookahead/Lookbehind in Python Regex**

Lookahead and lookbehind are advanced regex features that let you check if a string is followed or preceded by another pattern, without including it in the match.