# 1. Basics of Regex

A regex (regular expression) is a sequence of characters that defines a search pattern. In Python, regex patterns are used through the built-in `re` module.

In [1]:
import re

In [7]:
#  Matching Literal Strings
# A literal pattern: every character simply stands for itself.
text = "I have an apple and an orange. apple"
pattern = "apple"

# findall() collects every non-overlapping occurrence as a list of strings.
findall = re.findall(pattern, text)
print(findall)

# search() stops at the first occurrence and returns a Match object.
match = re.search(pattern, text)
print(match)


['apple', 'apple']
<re.Match object; span=(10, 15), match='apple'>


# Special Characters
In regex, some characters have special meanings:

   - . (dot) - Matches any single character (except newline).
   - \d - Matches any digit (0-9).
   - \D - Matches any non-digit character.
   - \w - Matches any word character (letters, digits, and underscore: a-z, A-Z, 0-9, _).
   - \W - Matches any non-word character (anything \w does not match).
   - \s - Matches any whitespace (space, tab, newline).
   - \S - Matches any non-whitespace character.

In [13]:
# Three consecutive \d tokens: the match consists of exactly three digits.
text = "The number is 123."
pattern = r"\d\d\d"
match = re.search(pattern, text)
print(match)
# A Match object is truthy; a failed search returns None, which is falsy.
print(bool(match))


<re.Match object; span=(14, 17), match='123'>
True


# Quantifiers

##### Quantifiers specify how many times a character or group should appear.

   - *(star) => Matches 0 or more times.
   - +(plus) => Matches 1 or more times.
   - ? => Matches 0 or 1 time.
   - {n} => Matches exactly n times.
   - {n,} => Matches n or more times.
   - {n,m} => Matches between n and m times.

In [14]:
# {2,4} is greedy: it consumes as many digits as it can, up to four.
text = "Year: 2023"
pattern = r"\d{2,4}"
match = re.search(pattern, text)
print(match)  # all four digits of "2023" are matched


<re.Match object; span=(6, 10), match='2023'>


# Anchors

Anchors match positions within the text, not characters.

   - ^ - Matches the start of a string.
   - $ - Matches the end of a string (or the position just before a trailing newline).
   - \b - Matches a word boundary.
   - \B - Matches a position that is not a word boundary.

In [None]:
# ^ pins the match to position 0, so "Hello" must open the string.
text = "Hello, world!"
pattern = r"^Hello"
match = re.search(pattern, text)
print(match)  # "Hello" is found at the very beginning


<re.Match object; span=(0, 5), match='Hello'>


# Character Classes

Character classes allow you to match one of a set of characters.

   - [abc] - Matches a, b, or c.
   - [a-z] - Matches any lowercase letter.
   - [^abc] - Matches anything except a, b, or c.

In [23]:
# A character class matches exactly one character out of the listed set.
text = "apple12"
pattern = r"[aeiou]"
matches = re.findall(pattern, text)
print(matches)  # the lowercase vowels, in order of appearance: ['a', 'e']


['a', 'e']


# Groups and Capturing

Grouping allows you to capture parts of a match, which is useful for extracting substrings.

   - (abc) => Groups together abc.
   - (?:abc) => Non-capturing group; groups without capturing.
   - (?P<name>abc) => Named capturing group, allowing you to access the group by name.

In [24]:
# Each pair of parentheses captures one run of digits; the dash between
# them is matched but not captured.
text = "Phone number: 123-456"
pattern = r"(\d+)-(\d+)"
match = re.search(pattern, text)
print(match.groups())  # tuple of the captured groups: ('123', '456')


('123', '456')


# Lookaheads and Lookbehinds

Lookaheads and lookbehinds allow you to match a pattern based on what's ahead or behind it, without including those in the match.

   - Lookahead: (?=...) (positive) or (?!...) (negative)
   - Lookbehind: (?<=...) (positive) or (?<!...) (negative)

In [25]:
# The lookahead requires " dollars" (with its leading space) right after
# the digits, but leaves it out of the matched text.
text = "Price: 100 dollars"
pattern = r"\d+(?= dollars)"
match = re.search(pattern, text)
print(match)  # only "100" is part of the match

<re.Match object; span=(7, 10), match='100'>


# Regex Flags

Flags modify the behavior of the regex.

   - re.IGNORECASE (re.I) - Case-insensitive matching.
   - re.MULTILINE (re.M) - ^ and $ match the start/end of each line.
   - re.DOTALL (re.S) - Dot . matches newline as well.

In [26]:
# The pattern is lowercase while the text is capitalized; the IGNORECASE
# flag makes the comparison case-insensitive.
text = "Hello"
pattern = r"hello"
match = re.search(pattern, text, flags=re.IGNORECASE)
print(match)  # "Hello" is matched despite the case difference

<re.Match object; span=(0, 5), match='Hello'>


In [None]:
# Practical Examples

# Validating an Email Address

# Simplified address shape: local part, "@", domain, ".", top-level part.
pattern = r"^[\w\.-]+@[\w\.-]+\.\w+$"
email = "d.debasish003@gmail.com"
# fullmatch() succeeds only when the whole string fits the pattern.
# match() with a trailing "$" would also accept an address followed by a
# newline, because "$" matches just before a final "\n".
is_valid = re.fullmatch(pattern, email)
print(bool(is_valid))  # True if valid, False otherwise



True


In [31]:
# Extracting Dates (dd-mm-yyyy format)

# Two digits, dash, two digits, dash, four digits; the \b on each side
# keeps the date from being matched inside a longer run of word characters.
text = "The event is on 12-05-2023."
pattern = r"\b\d{2}-\d{2}-\d{4}\b"
dates = re.findall(pattern, text)
print(dates)  # ['12-05-2023']


['12-05-2023']


In [32]:
# Extracting All Words Starting with a Capital Letter

# One uppercase letter followed by any number of lowercase letters,
# delimited by word boundaries on both sides.
text = "John went to New York."
pattern = r"\b[A-Z][a-z]*\b"
capitalized_words = re.findall(pattern, text)
print(capitalized_words)  # ['John', 'New', 'York']


['John', 'New', 'York']


In [None]:
# Example 1:
# On a Linux VM we have a directory of log files. Search each file for entries of the form: Processed Record {name: anyname, type:shirt,....}
# Count the number of occurrences of this pattern in each file and print the counts.

import os
import re

def count_pattern_in_logs(directory, pattern):
    """
    Count regex matches in every ``.log`` file directly inside a directory.

    One "File: <name> - Count: <n>" line is printed per log file, in sorted
    file-name order. A file that cannot be read is reported and skipped, and
    a directory that cannot be listed is reported instead of raising.

    :param directory: Path of the directory containing the log files.
    :param pattern: Regex pattern (as a string) to count in each file.
    :return: Dict mapping each successfully read log file name to its
        number of matches (empty if the directory could not be listed).
    """
    # Compile once; the same pattern is applied to every file.
    regex = re.compile(pattern)
    counts = {}

    try:
        # Sorted so the report order does not depend on the file system.
        filenames = sorted(os.listdir(directory))
    except OSError as e:
        print(f"Error listing {directory}: {e}")
        return counts

    for filename in filenames:
        if not filename.endswith(".log"):
            continue

        file_path = os.path.join(directory, filename)
        try:
            # Explicit encoding avoids platform-dependent decoding, and
            # errors="replace" keeps stray bytes in a log from aborting the read.
            with open(file_path, 'r', encoding="utf-8", errors="replace") as file:
                content = file.read()
        except OSError as e:
            # Covers unreadable files and directories named "*.log".
            print(f"Error reading {filename}: {e}")
            continue

        counts[filename] = len(regex.findall(content))
        print(f"File: {filename} - Count: {counts[filename]}")

    return counts

# Placeholder inputs: point log_directory at the folder holding the .log files.
search_pattern = r"Processed Record \{name:.*?type:shirt.*?\}"
log_directory = "/path/to/log/files"

# Report the per-file match counts.
count_pattern_in_logs(directory=log_directory, pattern=search_pattern)

In [None]:
# Example 2:
# Given a file consisting of many lines, each line consisting of many words, find longest word that matches a given pattern. 
# (No constraints provided, no optimisation needed, as mentioned above, focus was on code structure, maintainability and extensibility)

import re
from typing import Optional

class LongestWordFinder:
    """
    Find the longest whitespace-delimited word in a text file that fully
    matches a regex pattern.
    """

    def __init__(self, file_path: str, pattern: str, encoding: str = "utf-8"):
        """
        Initialize the finder with the file path and regex pattern.

        :param file_path: Path to the text file.
        :param pattern: Regex pattern to match words.
        :param encoding: Text encoding used to read the file.
        """
        self.file_path = file_path
        self.pattern = re.compile(pattern)
        self.encoding = encoding

    def _matching_words(self, lines):
        """Yield, in file order, every word that fully matches the pattern."""
        for line in lines:
            for word in line.split():
                if self.pattern.fullmatch(word):
                    yield word

    def find_longest_word(self) -> Optional[str]:
        """
        Find the longest word in the file matching the pattern.

        Words are the whitespace-separated tokens of each line, and a word
        counts only if the pattern matches it in full. When several words
        share the maximum length, the first one in the file wins.

        :return: The longest matching word, or None if no word matches or
            the file cannot be read (an error message is printed then).
        """
        try:
            # Explicit encoding keeps the result independent of the platform default.
            with open(self.file_path, 'r', encoding=self.encoding) as file:
                # max() keeps the first of equally long candidates and
                # falls back to None when nothing matched.
                return max(self._matching_words(file), key=len, default=None)
        except FileNotFoundError:
            print(f"Error: The file '{self.file_path}' was not found.")
            return None
        except (OSError, UnicodeDecodeError) as e:
            # Only I/O and decoding failures are expected here; anything
            # else is a programming error and is allowed to propagate.
            print(f"An error occurred: {e}")
            return None

# Example usage
if __name__ == "__main__":
    # Demo inputs: the text file to scan and a pattern that accepts
    # purely alphabetic words.
    file_path = "sample.txt"
    pattern = r"[a-zA-Z]+"

    # Build the finder and run the search.
    finder = LongestWordFinder(file_path, pattern)
    result = finder.find_longest_word()

    # A falsy result means no word matched or the file could not be read.
    if not result:
        print("No matching word found.")
    else:
        print(f"The longest matching word is: {result}")
