In [1]:
import re
import pandas as pd


Exercise 1
Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicitly mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:

# A lone vowel matches the anchored alternation, so a Match object is displayed.
re.search(pattern=r"^(a|e|i|o|u)$", string="a", flags=re.IGNORECASE)

<re.Match object; span=(0, 1), match='a'>

In [3]:

# Several vowels in a row do not match (the anchors allow exactly one), so the result is None.
re.search(pattern=r"^(a|e|i|o|u)$", string="aeiou", flags=re.IGNORECASE)

In [4]:
def is_vowel(string):
    """Return True if ``string`` is exactly one vowel (a, e, i, o, u), ignoring case.

    ``re.fullmatch`` is used instead of ``re.search`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which would wrongly
    accept a vowel followed by a newline character.
    """
    return bool(re.fullmatch(r"[aeiou]", string, re.IGNORECASE))


assert is_vowel("a")
assert is_vowel("E")
assert not is_vowel("aaa")
assert not is_vowel("aeiou")
assert not is_vowel("a\n")  # a trailing newline must not sneak through

Exercise 2
Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [5]:
def is_valid_username(string):
    """Return True if ``string`` is a valid username, otherwise False.

    A valid username starts with a lowercase letter, continues with lowercase
    letters, digits or underscores only, and is at most 32 characters long
    (one leading letter plus up to 31 further characters).

    ``re.fullmatch`` is used instead of ``re.search`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which would wrongly
    accept a username that ends in a newline character.
    """
    pattern = r"[a-z][a-z0-9_]{0,31}"
    return bool(re.fullmatch(pattern, string))


assert is_valid_username("codeup")
assert is_valid_username("codeup123")
assert not is_valid_username("123Codeup")
assert not is_valid_username("CodeupCodeup!")
assert not is_valid_username("aaaCODEUPCODEUPaaaaaaaaaaaaaaaaaaaaaaaaaa")
assert not is_valid_username("codeup\n")  # a trailing newline must not sneak through

Exercise 3
Write a regular expression to capture phone numbers. It should match all of the following:

(210) 867 5309
+1 210.867.5309
867-5309
210-867-5309
Problem solving process:
Put the subject strings in order of increasing complexity
Solve them one at a time and build an iterative solution
Add optionality as the pattern increases in parts

In [6]:
# Phone-number pattern with named groups for each part of the number.
# The pattern is a raw string so that regex escapes such as \d and \+ are not
# interpreted (and warned about) as invalid Python string escapes.
phone_regex = re.compile(
    r"""
    ^
    (?P<country_code>\+\d+)?     # optional country code, e.g. +1
    \D*?                         # any separator characters
    (?P<area_code>\d{3})?        # optional three-digit area code
    \D*?                         # any separator characters
    (?P<exchange_code>\d{3})     # required three-digit exchange code
    \D*?                         # any separator characters
    (?P<line_number>\d{4})       # required four-digit line number
    """,
    re.VERBOSE,
)

In [7]:
# Sample phone numbers in several formats, one per row, in a single "number" column.
df = pd.DataFrame(
    {
        "number": [
            "(210) 867 5309",
            "+1 210.867.5309",
            "867-5309",
            "210-867-5309",
            "2108675309",
        ]
    }
)
df

Unnamed: 0,number
0,(210) 867 5309
1,+1 210.867.5309
2,867-5309
3,210-867-5309
4,2108675309


In [8]:
# Preview: each named group in phone_regex becomes its own column.
df["number"].str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [9]:
# Attach the extracted parts next to the original number.
# Build from the "number" column only so that re-running this cell does not
# append a second copy of the extracted columns to an already widened frame.
df = pd.concat([df[["number"]], df["number"].str.extract(phone_regex)], axis=1)
df

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


Exercise 4
Use regular expressions to convert the dates below to the standardized year-month-day format.

02/04/19
02/05/19
02/06/19
02/07/19
02/08/19
02/09/19
02/10/19

In [10]:
# The seven sample dates in month/day/two-digit-year form: 02/04/19 through 02/10/19.
dates = [f"02/{day:02d}/19" for day in range(4, 11)]

# One row per date, kept in a column named "original".
df = pd.DataFrame(data={"original": dates})
df

Unnamed: 0,original
0,02/04/19
1,02/05/19
2,02/06/19
3,02/07/19
4,02/08/19
5,02/09/19
6,02/10/19


In [11]:
# Date pattern with one named group per component: two digits each for
# month, day and year, separated by slashes.
pattern = re.compile(
    r"""
(?P<month>\d{2})/
(?P<day>\d{2})/
(?P<year>\d{2})
""",
    flags=re.VERBOSE,
)

In [12]:
# Attach the extracted month/day/year columns next to the original date.
# Build from the "original" column only so that re-running this cell does not
# append a second copy of the extracted columns to an already widened frame.
df = pd.concat([df[["original"]], df["original"].str.extract(pattern)], axis=1)
df

Unnamed: 0,original,month,day,year
0,02/04/19,2,4,19
1,02/05/19,2,5,19
2,02/06/19,2,6,19
3,02/07/19,2,7,19
4,02/08/19,2,8,19
5,02/09/19,2,9,19
6,02/10/19,2,10,19


In [13]:
# Reassemble the extracted pieces in year, month, day order, joined by "/".
# NOTE(review): the exercise asks for a standardized year-month-day format;
# this yields a two-digit year with slashes (e.g. 19/02/04) - confirm whether
# a four-digit, dash-separated form is intended.
df["new_format"] = df["year"].str.cat([df["month"], df["day"]], sep="/")
df

Unnamed: 0,original,month,day,year,new_format
0,02/04/19,2,4,19,19/02/04
1,02/05/19,2,5,19,19/02/05
2,02/06/19,2,6,19,19/02/06
3,02/07/19,2,7,19,19/02/07
4,02/08/19,2,8,19,19/02/08
5,02/09/19,2,9,19,19/02/09
6,02/10/19,2,10,19,19/02/10


Exercise 5
Write a regex to extract the various parts of these logfile lines:

GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58

POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58

GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [14]:
# The three sample log lines from the exercise, one string per request.
lines = [
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58',
]
lines

['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
 'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
 'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']

In [15]:
# Warm-up: isolate just the byte count, which sits between the closing brace
# of the status code and the opening quote of the user agent.
# The pattern is a raw string so that regex escapes such as \s and \d are not
# interpreted (and warned about) as invalid Python string escapes.
x = re.compile(r"""
\}\s(?P<bytes>\d+)\s"
""", re.VERBOSE)
match = x.search(lines[0])
match

<re.Match object; span=(65, 75), match='} 510348 "'>

In [16]:

# Pull the named "bytes" group out of the match found above.
match["bytes"]

'510348'

In [17]:
# Full log-line pattern with one named group per field.
# Compared with a GET/POST-only, word-character-only version this also accepts
# the other common HTTP verbs and any non-whitespace path (so query strings
# with "&", ".", "%" and similar still parse), and the timestamp stops at the
# first "]" so it cannot swallow later fields that happen to contain "]".
log_pattern = re.compile(r"""
(?P<method>GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS)   # HTTP verb
\s
(?P<path>/\S*)                    # request path, including any query string
\s
\[(?P<timestamp>[^\]]+)\]         # bracketed timestamp, up to the first closing bracket
\s
(?P<http_version>HTTP/\d+\.\d+)   # protocol version, e.g. HTTP/1.1
\s
\{(?P<status_code>\d+)\}          # status code wrapped in braces
\s
(?P<bytes>\d+)                    # response size
\s
"(?P<user_agent>.+)"              # quoted user agent
\s
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})   # dotted-quad IP address
$
""", re.VERBOSE)

In [18]:
# Parse every log line into a dict keyed by the pattern's group names.
rows = [log_pattern.search(entry).groupdict() for entry in lines]
rows

[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]

In [19]:
# One row per parsed log line; the dict keys become the column names.
df = pd.DataFrame(data=rows)
df

Unnamed: 0,method,path,timestamp,http_version,status_code,bytes,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


In [20]:
# Goal: a boolean column telling whether each path contains a query string.
# For a single string the simplest check is whether it contains a "?".
def is_query_string(string):
    """Return True when ``string`` contains a "?" character, otherwise False."""
    contains_question_mark = "?" in string
    return contains_question_mark
# Run the check on every path; an inline lambda doing `"?" in string` would work equally well.
df["has_query_string"] = df.path.apply(is_query_string)
df

Unnamed: 0,method,path,timestamp,http_version,status_code,bytes,user_agent,ip,has_query_string
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58,True
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58,False
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58,True
