# Regex Exercises

In [1]:
import re
import pandas as pd
import numpy as np

# 1. Write a function named is_vowel. 

It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(character):
    # Starts with a character from this character class
    # Ends with a character from this character class
    regex = r'^[aeiouAEIOU]$'
    return bool(re.search(regex, character))

In [3]:
is_vowel("aeSfgxhcjv")

False

## 2. Write a function named is_valid_username that accepts a string as input. 

A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [4]:
user_name = "1code1up"

In [5]:
start_user_name = re.findall(r"^[^a-z]", user_name)
len(start_user_name)

1

In [6]:
is_character_ok = re.findall(r"[^a-z\d_]", user_name)
is_character_ok

[]

In [7]:
def is_valid_username(user_name):
    start_user_name = re.findall(r"^[^a-z]", user_name)
    is_character_ok = re.findall(r"[^a-z\d_]", user_name)
    if len(user_name) > 32:
        print("Max size for user name is 32 characters. Please make it shorter")
    elif len(start_user_name) != 0:
        print("Invalid username, please start with a lowercase letter")
    elif len(is_character_ok) != 0:
        print("Invalid username, please only use lowercase letters")
    else:
        print("This is a valid username")

In [8]:
is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')

Max size for user name is 32 characters. Please make it shorter


In [9]:
is_valid_username('codeup')

This is a valid username


In [10]:
is_valid_username('Codeup')

Invalid username, please start with a lowercase letter


In [11]:
is_valid_username('codeup123')

This is a valid username


In [12]:
is_valid_username('1codeup')

Invalid username, please start with a lowercase letter


## 3. Write a regular expression to capture phone numbers. It should match all of the following:

In [13]:
phone = "my phone number is (210) 867 5309, +1 210.867.5309, 867-5309, 210-867-5309"

In [14]:
phone_number = "My phone number is 210-867-5309, in the UK it was +44 886 44551 0077"

In [15]:
rx = r"\+?\d+?(?:[- ]+\d+)+"
re.findall(rx, phone_number)

['210-867-5309', '+44 886 44551 0077']

In [16]:
re.findall(r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})", phone)

['(210) 867 5309', '210.867.5309', '867-5309', '210-867-5309']

### 4. Use regular expressions to convert the dates below to the standardized year-month-day format.

In [17]:
dates = ["02/04/19", "02/05/19", "02/06/19", "02/07/19", "02/08/19", "02/09/19", "02/10/19"]

In [18]:
for i in dates:
    new_date = re.sub(r"(\d{2})/(\d{2})/(\d{2})", r"20\3-\2-\1", i)
    print(new_date)

2019-04-02
2019-05-02
2019-06-02
2019-07-02
2019-08-02
2019-09-02
2019-10-02


### 5. Write a regex to extract the various parts of these logfile lines:

In [19]:
line = 'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58'

In [20]:
regex = '''
^
(?P<type_request>GET)
\s
(?P<destination>.+?)
\s
\[
(?P<date>.+)
\]
\s
(?P<request_info>.+?)
\s
\{
(?P<status>.+?)
\}
\s
\d+
\s
\"
?(?P<request_number>.+)
\"
\s
(?P<ip>.+)

'''

In [21]:
pd.Series(line).str.extract(regex, re.VERBOSE)

Unnamed: 0,type_request,destination,date,request_info,status,request_number,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,python-requests/2.21.0,97.105.19.58


In [22]:
access_logs = '''
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
'''

In [23]:
lines = pd.Series(access_logs.strip().split("\n"))

In [24]:
lines

0    GET /api/v1/sales?page=86 [16/Apr/2019:193452+...
1    POST /users_accounts/file-upload [16/Apr/2019:...
2    GET /api/v1/items?page=3 [16/Apr/2019:193453+0...
dtype: object

In [25]:
regex = '''
^
(?P<type_request>\w{3,4})
\s
(?P<destination>.+?)
\s
\[
(?P<date>.+)
\]
\s
(?P<request_info>.+?)
\s
\{
(?P<status>.+?)
\}
\s
\d+
\s
\"
?(?P<request_number>.+)
\"
\s
(?P<ip>.+)

'''

In [26]:
pd.Series(lines).str.extract(regex, re.VERBOSE)

Unnamed: 0,type_request,destination,date,request_info,status,request_number,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,python-requests/2.21.0,97.105.19.58


### 6.You can find a list of words on your mac at /usr/share/dict/words. 

Use this file to answer the following questions:

In [27]:
with open('/usr/share/dict/words') as f:
    words = f.read().split('\n')

In [28]:
df = pd.Series(words).str.lower()

### How many words have at least 3 vowels?

In [29]:
counter = 0
for i in words:
    vowels = re.findall(r"[aeoiu]", i)
    if len(vowels) >= 3:
        counter += 1
print(f"There are {counter:,} words with at least 3 vowels")

There are 190,293 words with at least 3 vowels


### How many words have at least 3 vowels in a row?

In [30]:
counter = 0
for i in words:
    vowels = re.findall(r"[aieou]{3,}", i)
    if len(vowels) > 0:
        counter += 1
print(f"There are {counter:,} with at least 3 vowels in a row")

There are 6,156 with at least 3 vowels in a row


### How many words have at least 4 consonants in a row?

In [31]:
counter = 0
for i in words:
    vowels = re.findall(r"[^aieou]{4,}", i)
    if len(vowels) > 0:
        counter += 1
print(f"There are {counter:,} with at least 4 consonants in a row")

There are 19,743 with at least 4 consonants in a row


### How many words start and end with the same letter?

In [32]:
counter = 0
for i in words:
    start = re.findall(r"^.", i)
    end = re.findall(r".$", i)
    if start == end:
        counter += 1
print(f"There are {counter:,} that start and end on the same letter")

There are 9,970 that start and end on the same letter


### How many words start and end with a vowel?

In [33]:
counter = 0
for i in words:
    start = re.findall(r"^[aeiou]", i)
    end = re.findall(r"[aeiou]$", i)
    if len(start) == 1 and len(end) == 1:
        counter += 1

print(f"There are {counter:,} that start and end with a vowel")    

There are 12,356 that start and end with a vowel


### How many words contain the same letter 3 times in a row?

In [34]:
counter = 0
for i in words:
    letters = re.findall(r"(\w)\1\1", i)
    if len(letters) > 0:
        counter += 1
print(f"There are {counter:,} words that contain the same letter 3 times in a row")

There are 7 words that contain the same letter 3 times in a row


## Any fun patters?

### Words with all 5 vowels

In [35]:
test = "eucosii"

In [36]:
a = re.findall(r"[a]{1,}", test)
e = re.findall(r"[e]{1,}", test)
i = re.findall(r"[i]{1,}", test)
o = re.findall(r"[o]{1,}", test)
u = re.findall(r"[u]{1,}", test)
vowels = a + e + i + o + u

In [37]:
len(vowels) >= 5

False

In [38]:
counter = 0
for word in words:
    a = re.findall(r"[a]{1,}", word)
    e = re.findall(r"[e]{1,}", word)
    i = re.findall(r"[i]{1,}", word)
    o = re.findall(r"[o]{1,}", word)
    u = re.findall(r"[u]{1,}", word)
    vowels = a + e + i + o + u
    if len(vowels) >= 5:
        counter += 1

print(f"There are {counter:,} words that have all five vowels")

There are 66,727 words that have all five vowels


### Words with daniel in them?

In [39]:
counter = 0
words_name = []
for i in words:
    word = re.findall(r"daniel", i)
    if len(word) > 0:
        counter += 1
        words_name.append(word)
print(counter)

1


In [40]:
words_name

[['daniel']]