In [1]:
import re

### Basic patterns
* .(a period): Matches any single character
* \w-Matches a word character( a letter, a digit, or the underscore (). Equivalent to [a-zA-Z0-9])
    * Eg. \w\w\w matches abc or a1\_
* \W: Matches any non-word character, which is anything except letters, digits, and underscore.
* \s: Matches any whitespace character (space, tab, newline, etc.).
* \S: Matches any non-whitespace character
* \d: Matches any digit
* \D: Matches any non-digit
* \\: Escape character
* ^: Matches the start of a string
* $: Matches the end of a string

In [7]:
print("r-example:Hello\rWorld") 
'''
    First output: r-example"Hello
    Now the cursor will come at the starting position: |r-example"Hello
    World will be printed at the start and it overlaps the text: World(in place of 'r-exa')mple:Hello
'''

Worldmple:Hello


In [6]:
print("Hello World")
print("Hello\rWorld")  # r is carriage which makes the text comes at the starting position
print("n-example:Hello\nWorld")

Hello World
World
n-example:Hello
World


In [32]:
# start of a string
print(re.findall(r'^Hello', 'Hello World\nHello again'))

['Hello']


In [35]:
# escape character

## match a dot .
print(re.findall(r'\.', 'www.example.com'))
print(re.findall(r'\#d', 'Hi I am #, #deep'))

['.', '.']
['#d']


In [38]:
# end of a string
print(re.findall(r'world$', 'hello world, hi World'))
print(re.findall(r'World$', 'hello world, hi World'))

[]
['World']


### Repetition
* +- 1 or more occurrences of the pattern to its left
* *- 0 or more occurrences of the pattern to its left
* ?- match 0 or 1 occurrences of the pattern at the start of a string.
* {n}-Matches exactly n repetitions of the preceding element.
* {n,}-Matches n or more repetitions of the preceding element.
* {n,m}-Matches between n and m repetitions (inclusive) of the preceding

In [47]:
# '+'
re.findall(r'ab+', 'abbbbcdacbdab')

['abbbb', 'ab']

In [48]:
# '*'
re.findall(r'ab*', 'abcd abb c abbbbb cd ab a')

['ab', 'abb', 'abbbbb', 'ab', 'a']

In [50]:
# ^- all the words starts with digit
re.findall(r'^\d', '12 ab were 123')

['1']

In [52]:
re.findall(r'\d{2}', '12 2344546')

['12', '23', '44', '54']

In [53]:
re.findall(r'\d{2,}', '123 21314124')

['123', '21314124']

In [58]:
re.findall(r'\w{2,4}', 'hello world hi __ _  7 74 12345466544545')

['hell', 'worl', 'hi', '__', '74', '1234', '5466', '5445', '45']

In [59]:
# \b- word boundary
print(re.findall(r'\bapple','apple banana apples bananas'))  # At the beginning
print(re.findall(r'apple\b','apple banana apples bananas'))  # At the end
print(re.findall(r'\bapple\b','apple banana apples bananas'))  # Whole world

['apple', 'apple']
['apple']
['apple']


In [62]:
re.findall(r'\b\d{2,4}', '12345678 123 12345')

['1234', '123', '1234']

### Square Brackets
* It can be used to indicate a set of chars, so [abc] matches 'a' or 'b' or 'c'. 
* [^a-c] means character except a,b,c
* *Note*: Inside [] .(dot) matches a literal dot/period

In [8]:
re.findall(r'[^a-c.#$]', 'abcdef123@3.#$')  # Except a, b, c, ., #, $

['d', 'e', 'f', '1', '2', '3', '@', '3']

In [42]:
m=re.search(r'[\w.-]+@[\w.]+', 'abc-def@gmail.com')
if m:
    print(m.group())

abc-def@gmail.com


In [43]:
re.findall(r'[aeiou]', 'abcdefgh')

['a', 'e']

### Regex functions

In [11]:
# search:- Find the first match for a pattern
digit_match=re.search(r'\d+', 'abc123. Room 402')
if digit_match:
    print("Found:-", digit_match.group())
else:
    print("Match not found.")

Found:- 123


In [13]:
# findall:- finds all the matches and returns them as a list of strings, with each string representing one match.
num=re.findall(r'\d+', 'Room 123, Room 456, Room 789')
if num:
    print("Found:-", num)
else:
    print("Match not found.")

Found:- ['123', '456', '789']


In [31]:
# group:- It allows us to capture specific parts of the matched text separately.
match1=re.search(r'(\d{3})\-(\d)\-(\d)', '123-4-5')
match2=re.search(r'(\w+)@(\w+).(\w+)', 'abc@gmail.com')
print(match1, '\n', match2)
print(match1.group())
print(match1.group(1))
print(match2.group(2))
print(match2.group(3))

<re.Match object; span=(0, 7), match='123-4-5'> 
 <re.Match object; span=(0, 13), match='abc@gmail.com'>
123-4-5
123
gmail
com


In [2]:
line = 'asdf fjdk; afed, fjek,asdf, foo'
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [3]:
# re.search(pattern, string): Searches for the first occurrence of the pattern in the string
result = re.search(r'apple', 'I like apples and oranges')
print(result.group())  # Output: 'apple'

apple


In [4]:
# re.match(pattern, string): Matches the pattern at the beginning of the string.

result = re.match(r'apple', 'I like apples and oranges')
print(result.group())  # Output: None (because 'apple' doesn't occur at the start of the string)

AttributeError: 'NoneType' object has no attribute 'group'

In [5]:
# re.findall(pattern, string): Finds all occurrences of the pattern in the string.
results = re.findall(r'app\w*', 'I like apples and applesauce')
print(results)  # Output: ['apples', 'applesauce']

['apples', 'applesauce']


In [6]:
# re.sub(pattern, replacement, string): Replaces occurrences of the pattern in the string with the replacement.
new_string = re.sub(r'apple', 'orange', 'I like apples and applesauce')
print(new_string)  # Output: 'I like oranges and orangesauce'

I like oranges and orangesauce


### Regex Patterns:

In [7]:
# 1. Literals: Characters that match themselves (e.g., a, 5, apple).
text = "I have an apple, but I want more apples for my apple pie."

# Using a regex literal to find occurrences of the word 'apple'
matches = re.findall(r'apple', text)

print(matches)  # Output: ['apple', 'apple', 'apple']

['apple', 'apple', 'apple']


**Character Classes:**
* \d: Matches any digit (equivalent to [0-9]).
* \w: Matches any alphanumeric character or underscore (equivalent to [a-zA-Z0-9_]).
* \s: Matches any whitespace character (space, tab, newline).




**Quantifiers:**
* *: Matches zero or more occurrences.
* +: Matches one or more occurrences.
* ?: Matches zero or one occurrence.
* {n}: Matches exactly n occurrences.
* {n,}: Matches n or more occurrences.
* {n,m}: Matches between n and m occurrences.

In [8]:
text = "ab abb abbb"
matches = re.findall(r'ab*', text)
print(matches)  # Output: ['ab', 'abb', 'abbb']

['ab', 'abb', 'abbb']


In [9]:
matches = re.findall(r'ab+', text)
print(matches)  # Output: ['ab', 'abb', 'abbb']

['ab', 'abb', 'abbb']


In [10]:
text = "cat cats"
matches = re.findall(r'cats?', text)
print(matches)  # Output: ['cat', 'cats']

['cat', 'cats']


In [11]:
text = "aa aaa aaaa aaaaa"
matches = re.findall(r'a{3}', text)
print(matches)  # Output: ['aaa', 'aaa']

['aaa', 'aaa', 'aaa']


In [12]:
matches = re.findall(r'a{2,}', text)
print(matches)  # Output: ['aa', 'aaa', 'aaaa']

['aa', 'aaa', 'aaaa', 'aaaaa']


In [14]:
matches = re.findall(r'a{3,4}', text)
print(matches)  # Output: ['aa', 'aaa', 'aaaa']

['aaa', 'aaaa', 'aaaa']


**Anchors:**
* ^: Matches the start of a string.
* $: Matches the end of a string.

In [15]:
#  Match lines that start with 'Hello'
text = "Hello World\nHello Python\nHi there"
matches = re.findall(r'^Hello', text, flags=re.MULTILINE)
print(matches)  # Output: ['Hello', 'Hello']

['Hello', 'Hello']


In [19]:
#  pattern 'Python' is matched only when it appears at the end of a line.
text = "Hello Python\nPython is great\nPython programming"
matches = re.findall(r'Python$', text, flags=re.MULTILINE)
print(matches)  # Output: ['Python']

['Python']


In [18]:
text = "Hello World\nPrograaming Python is great\nProgramming with Python is fun"

# Using anchors to match lines that start with 'Python' and end with 'Python'
matches_start = re.findall(r'^Python', text, flags=re.MULTILINE)
matches_end = re.findall(r'Python$', text, flags=re.MULTILINE)

print(matches_start)  # Output: []
print(matches_end)    # Output: []

[]
[]


**Alternation:**
|: Matches either/or (e.g., apple|orange matches 'apple' or 'orange').

In [20]:
text = "I like apples, oranges, and bananas."

# Using alternation to match 'apples', 'oranges', or 'bananas'
matches = re.findall(r'apples|oranges|bananas', text)
print(matches)  # Output: ['apples', 'oranges', 'bananas']

['apples', 'oranges', 'bananas']
