# NLP Basics: Learning how to use regular expressions

### Using regular expressions in Python

Python's built-in `re` module is the standard way to work with regular expressions. More details can be found in the [official `re` documentation](https://docs.python.org/3/library/re.html).

In [61]:
import re

# Three versions of the same sentence, used as inputs by the cells below.
# Clean: words separated by exactly one space.
re_test = 'This is a made up string to test 2 different regex methods'
# Messy whitespace: some words are separated by runs of several spaces.
re_test_messy = 'This      is a made up     string to test 2    different regex methods'
# Messy punctuation: no whitespace at all; words are separated by punctuation characters.
re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

### Splitting a sentence into a list of words

In [62]:
# Split on every single whitespace character (\s); the words in re_test are one space apart.
re.split(r'\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [14]:
# \s matches one whitespace character at a time, so every extra space in a run
# produces an empty string in the result. Raw string (r'...') keeps the
# backslash literal and avoids Python's invalid-escape-sequence warning.
re.split(r'\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [15]:
# \s+ treats a whole run of whitespace as a single separator, so no empty
# strings appear. Raw string avoids the invalid-escape-sequence warning.
re.split(r'\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [16]:
# re_test_messy1 contains no whitespace, so splitting on \s+ leaves the string
# in one piece. Raw string avoids the invalid-escape-sequence warning.
re.split(r'\s+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [30]:
# \W+ matches runs of non-word characters (anything other than letters, digits
# and underscore), so the punctuation acts as the separator.
# Raw string avoids the invalid-escape-sequence warning.
re.split(r'\W+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [18]:
# Instead of splitting on separators, collect the tokens directly:
# \S+ matches each run of non-whitespace characters.
# Raw string avoids the invalid-escape-sequence warning.
re.findall(r'\S+', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [19]:
# \S+ skips over whitespace runs of any length, so the extra spaces have no effect.
# Raw string avoids the invalid-escape-sequence warning.
re.findall(r'\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [20]:
# re_test_messy1 has no whitespace, so the whole string is one non-whitespace run.
# Raw string avoids the invalid-escape-sequence warning.
re.findall(r'\S+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [42]:
# \w+ matches runs of word characters (letters, digits, underscore), which
# pulls the words out from between the punctuation.
# Raw string avoids the invalid-escape-sequence warning.
re.findall(r'\w+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

### Replacing a specific string

In [130]:
# Three near-identical sentences that differ only in the token after "follow"
# (PEP8 / PEP7 / PEEP8), to use as inputs for replacement patterns.
pep8_test = 'I try to follow PEP8 guidelines!'
pep7_test = 'I try to follow PEP7 guidelines!'
peep8_test = 'I try to follow PEEP8 guidelines!'

In [132]:
import re


# Replace every "8" in pep8_test with "9" using re.sub(pattern, replacement, string).
pattern = r'[8]'  # character class containing only the digit 8 (equivalent to r'8')
replacement = "9"  # text substituted for each match

result = re.sub(pattern, replacement, pep8_test)

print(result)

I try to follow PEP9 guidelines!


### Other examples of regex methods

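- `re.search()` – scans the string and returns the first match found anywhere in it
- `re.match()` – matches only at the beginning of the string
- `re.fullmatch()` – succeeds only if the entire string matches the pattern
- `re.finditer()` – returns an iterator of match objects for all non-overlapping matches
- `re.escape()` – escapes special characters in a string so it can be used as a literal pattern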

In [None]:
import re

# Multi-line sample text that mixes ordinary words with several numbers.
sample_data = """
This is a sample text with numbers.
The year 2025 is coming soon.
The code for the project is 123456.
There are also years like 1990, 2005, and 2025 mentioned.
"""

# Three character-class patterns, each matching runs of one or more characters:
pattern = r'[0-9]+'           # digits only
second_pattern = r'[nlp]+'    # the letters n, l, p only
third_pattern = r'[nlp0-9]+'  # the letters n, l, p or digits

# Collect every non-overlapping match of each pattern, in pattern order.
numbers, nlp, course_ = (
    re.findall(regex, sample_data)
    for regex in (pattern, second_pattern, third_pattern)
)

# Show the three result lists, one per line.
print(numbers, nlp, course_, sep='\n')

# Report whether the digit run '2025' was among the numbers found.
print("Found 2025 in the data!" if '2025' in numbers else "2025 is not found.")


['2025', '123456', '1990', '2005', '2025', '8']
['pl', 'n', 'n', 'n', 'p', 'l', 'l', 'n', 'n', 'n', 'p', 'p']
['p8']
Found 2025 in the data!
