In [1]:
import re
import pandas as pd
import numpy as np

In [2]:
s = 'My 1st string'

In [3]:
# re.search picks up one occurence of what we are looking for. One match per line which uses a group() method 
# Good for finding missing values.

In [4]:
re.search(r'..t', s).group()  # r means raw string, group() method returns matches

'1st'

In [5]:
re.search(r'sti', s).group() # when there is no match, we get an error

AttributeError: 'NoneType' object has no attribute 'group'

In [5]:
match = re.search(r'sti', s)
if match:
    print(match.group())
else:
    print("No match")

No match


In [18]:
# read data into a list. Each line is a list element.
with open('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/homicides.txt') as f:
    data= [row for row in f]   # list comprehension, read every line into a list's elements

In [254]:
# check number of rows
len(data)

1250

In [19]:
type(data)

list

In [20]:
data[0:5]

['39.311024, -76.674227, iconHomicideShooting, \'p2\', \'<dl><dt>Leon Nelson</dt><dd class="address">3400 Clifton Ave.<br />Baltimore, MD 21216</dd><dd>black male, 17 years old</dd><dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd></dl>\'\n',
 '39.312641, -76.698948, iconHomicideShooting, \'p3\', \'<dl><dt>Eddie Golf</dt><dd class="address">4900 Challedon Road<br />Baltimore, MD 21207</dd><dd>black male, 26 years old</dd><dd>Found on January 2, 2007</dd><dd>Victim died at scene</dd><dd>Cause: shooting</dd></dl>\'\n',
 '39.309781, -76.649882, iconHomicideBluntForce, \'p4\', \'<dl><dt>Nelsene Burnette</dt><dd class="address">2000 West North Ave<br />Baltimore, MD 21217</dd><dd>black female, 44 years old</dd><dd>Found on January 2, 2007</dd><dd>Victim died at scene</dd><dd>Cause: blunt force</dd></dl>\'\n',
 '39.363925, -76.598772, iconHomicideAsphyxiation, \'p5\', \'<dl><dt>Thomas MacKenney</dt><dd class="address">5900 Northwood Drive<br />Balti

In [9]:
age = []
for row in data:
    match = re.search(r'\d+ years? old', row)
    if match:
        age.append(match.group())
    else:
        age.append('0 years old')

In [10]:
print(age[0:10])

['17 years old', '26 years old', '44 years old', '21 years old', '61 years old', '46 years old', '27 years old', '21 years old', '16 years old', '21 years old']


In [11]:
age_numbers = []
for element in age:
    match = re.search(r'\d+', element)
    if match:
        age_numbers.append(match.group())
    else:
        print('Missing data')

In [12]:
print(age_numbers[0:20])

['17', '26', '44', '21', '61', '46', '27', '21', '16', '21', '34', '25', '23', '30', '26', '36', '21', '27', '30', '19']


In [13]:
age_numbers.remove('0')

In [14]:
# convert string elements in a list to int
ages = [eval(i) for i in age_numbers]

In [15]:
average = sum(ages) / len(ages)
average

29.67974379503603

# Parenthesis
They create match groups

In [16]:
s

'My 1st string'

In [18]:
re.search(r'\d..', s).group()

'1st'

In [21]:
# create match groups using ()
re.search(r'(\d)(..)', s).group()

'1st'

In [22]:
# create match groups using () and access the first group
re.search(r'(\d)(..)', s).group(1)

'1'

In [26]:
age_nums = []
for row in data:
    match = re.search(r'(\d+) years? old', row)
    if match:
        age_nums.append(int(match.group(1)))
    else:
        age_nums.append(0)

In [27]:
age_nums[1:10]

[26, 44, 21, 61, 46, 27, 21, 16, 21]

# Character classes using []

In [None]:
# [bf-] means b or f or dash
# if a period '.' is in the character class, it means 'period' NOT 'any character'!
# to match period '.' outside character class --> escape it --> \.  It's iused for any special characters

In [None]:
# gosia-palenga@gmail.com --> [\w.-]+@[\w.-]+

In [28]:
# Cause: blunt force  --> [\w\s]+

# Find multiple matches

In [30]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/mailbox.txt') as f:
    messages = f.read()   # read function reads the whole file as a single string
    
len(messages)  # number of characters in the string

94626

In [32]:
type(messages)

str

In [35]:
print(messages[0:500])

From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008
Return-Path: <postmaster@collab.sakaiproject.org>
Received: from murder (mail.umich.edu [141.211.14.90])
	 by frankenstein.mail.umich.edu (Cyrus v2.3.8) with LMTPA;
	 Sat, 05 Jan 2008 09:14:16 -0500
X-Sieve: CMU Sieve 2.3
Received: from murder ([unix socket])
	 by mail.umich.edu (Cyrus v2.2.12) with LMTPA;
	 Sat, 05 Jan 2008 09:14:16 -0500
Received: from holes.mr.itd.umich.edu (holes.mr.itd.umich.edu [141.211.14.79])
	by flawless.mail.umic


In [None]:
# find.all is good with strings. Read the file into a string (read() function) and use findall to get all the email addresses 
# from a string. There might be no email in each line or there can be two or more emails in one line.
# findall returns a list of all matching strings

In [43]:
emails = re.findall(r'[\w.-_]+@[\w.-_]+[\.][\w]+', messages)
emails

['stephen.marquard@uct.ac.za',
 '<postmaster@collab.sakaiproject.org',
 '<200801051412.m05ECIaH010327@nakamura.uits.iupui.edu',
 '<source@collab.sakaiproject.org',
 '<source@collab.sakaiproject.org',
 '<source@collab.sakaiproject.org',
 'source@collab.sakaiproject.org',
 'stephen.marquard@uct.ac.za',
 'source@collab.sakaiproject.org',
 'stephen.marquard@uct.ac.za',
 'stephen.marquard@uct.ac.za',
 'louis@media.berkeley.edu',
 '<postmaster@collab.sakaiproject.org',
 '<200801042308.m04N8v6O008125@nakamura.uits.iupui.edu',
 '<source@collab.sakaiproject.org',
 '<source@collab.sakaiproject.org',
 '<source@collab.sakaiproject.org',
 'source@collab.sakaiproject.org',
 'louis@media.berkeley.edu',
 'source@collab.sakaiproject.org',
 'louis@media.berkeley.edu',
 'louis@media.berkeley.edu',
 'zqian@umich.edu',
 '<postmaster@collab.sakaiproject.org',
 '<200801042109.m04L92hb007923@nakamura.uits.iupui.edu',
 '<source@collab.sakaiproject.org',
 '<source@collab.sakaiproject.org',
 '<source@collab.saka

In [44]:
type(emails)

list

In [47]:
emails = re.findall(r'([\w.-_]+)@[\w.-_]+[\.][\w]+', messages)

In [48]:
emails[0:10]  # findall with one match group returns a list of strings

['stephen.marquard',
 '<postmaster',
 '<200801051412.m05ECIaH010327',
 '<source',
 '<source',
 '<source',
 'source',
 'stephen.marquard',
 'source',
 'stephen.marquard']

In [49]:
emails = re.findall(r'([\w.-_]+)@([\w.-_]+)[\.][\w]+', messages)

In [51]:
# findall with more match groups () returns a tuple
# to convert a tuple to a DataFrame use list() --> pd.DataFrame(list(email))
emails[0:5] 

[('stephen.marquard', 'uct.ac'),
 ('<postmaster', 'collab.sakaiproject'),
 ('<200801051412.m05ECIaH010327', 'nakamura.uits.iupui'),
 ('<source', 'collab.sakaiproject'),
 ('<source', 'collab.sakaiproject')]

# Homework 1 

In [235]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/faa.txt') as f:
    tower = f.read()

In [236]:
print(tower[0:210])

FAA Contract Tower Closure List
(149 FCTs)
3-22-2013
LOC
ID Facility Name City State
DHN DOTHAN RGNL DOTHAN AL
TCL TUSCALOOSA RGNL TUSCALOOSA AL
FYV DRAKE FIELD FAYETTEVILLE AR
TXK TEXARKANA RGNL-WEBB FIELD TEX


Your assignment is to create a list of tuples containing the tower IDs and the states they are located in.

Here is the expected output:

faa = [('DHN', 'AL'), ('TCL', 'AL'), ..., ('PKB', 'WV')]

In [243]:
# faa = re.findall(r'(\b[A-Z][A-Z][A-Z]\b)\s[A-Z\-]+\s[A-Z\-]+\s[A-Z\-]+(\s\b[A-Z][A-Z]\b)', tower)
# faa = re.findall(r'([A-Z][A-Z][A-Z]) .+ ([A-Z][A-Z])', tower)  

# (.+) anything between two matche groups
# a period (.) matches any character and won't jump to a new line
# {2,3}  --> 2 or 3

faa = re.findall(r'([A-Z]{3}) .+ ([A-Z]{2})', tower)
faa

[('DHN', 'AL'),
 ('TCL', 'AL'),
 ('FYV', 'AR'),
 ('TXK', 'AR'),
 ('GEU', 'AZ'),
 ('GYR', 'AZ'),
 ('IFP', 'AZ'),
 ('RYN', 'AZ'),
 ('FUL', 'CA'),
 ('MER', 'CA'),
 ('OXR', 'CA'),
 ('RAL', 'CA'),
 ('RNM', 'CA'),
 ('SAC', 'CA'),
 ('SDM', 'CA'),
 ('SNS', 'CA'),
 ('VCV', 'CA'),
 ('WHP', 'CA'),
 ('WJF', 'CA'),
 ('BDR', 'CT'),
 ('DXR', 'CT'),
 ('GON', 'CT'),
 ('HFD', 'CT'),
 ('HVN', 'CT'),
 ('OXC', 'CT'),
 ('APF', 'FL'),
 ('BCT', 'FL'),
 ('EVB', 'FL'),
 ('FMY', 'FL'),
 ('HWO', 'FL'),
 ('LAL', 'FL'),
 ('LEE', 'FL'),
 ('OCF', 'FL'),
 ('OMN', 'FL'),
 ('PGD', 'FL'),
 ('SGJ', 'FL'),
 ('SPG', 'FL'),
 ('SUA', 'FL'),
 ('TIX', 'FL'),
 ('ABY', 'GA'),
 ('AHN', 'GA'),
 ('LZU', 'GA'),
 ('MCN', 'GA'),
 ('RYY', 'GA'),
 ('DBQ', 'IA'),
 ('IDA', 'ID'),
 ('LWS', 'ID'),
 ('PIH', 'ID'),
 ('SUN', 'ID'),
 ('ALN', 'IL'),
 ('BMI', 'IL'),
 ('DEC', 'IL'),
 ('MDH', 'IL'),
 ('UGN', 'IL'),
 ('BAK', 'IN'),
 ('GYY', 'IN'),
 ('HUT', 'KS'),
 ('IXD', 'KS'),
 ('MHK', 'KS'),
 ('OJC', 'KS'),
 ('TOP', 'KS'),
 ('OWB', 'KY'),
 ('PAH',

In [239]:
# .+  -->  .{1,}
# up to 3  -->  {,3}

# quantifier = re.findall(r'([A-Z]{3}) .{1,0} ([A-Z]{3,5})', tower)[0:10]
quantifier = re.findall(r'([A-Z]{3}) .+ ([A-Z]{3,5})', tower)[0:5]
quantifier
# .+ is greedy 
# CITY is pulled as it's between 3 and 5 characters

[('DHN', 'DOTHA'),
 ('TCL', 'TUSCA'),
 ('FYV', 'FAYET'),
 ('TXK', 'TEXAR'),
 ('GEU', 'GLEND')]

In [244]:
num = int(re.search(r'([\d]+)\sFCTs', tower).group(1))
num

149

In [246]:
# sanity check
assert num > 148
# assert(num == len(faa))

# Homework 2

In [5]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/reputation.txt') as f:
    rep = f.read()
    
print(rep[1:310])

otal votes: 36
 2  12201376 (5)
-- 2012-08-30 rep +5    = 6         
 2  13822612 (10)
-- 2012-12-11 rep +10   = 16        
 2  13822612 (10)
-- 2013-03-20 rep +10   = 26        
-- 2013-12-05 rep 0     = 26        
-- 2014-01-25 rep 0     = 26        
 16  7141669 (2)
-- 2014-03-19 rep +2    = 28        
 1


Your assignment is to create a list of tuples containing only these dated entries, including the date, reputation change (regardless of whether it is positive/negative/zero), and running total.

Here is the expected output:

rep = [('2012-08-30', '+5', '6'), ('2012-12-11', '+10', '16'), ...,  ('2015-10-14', '-1', '317')]

In [252]:
# space with + --> ' +' --> we want to capture 1 or more spaces
reputation = re.findall(r'-- (\d{4}\-\d{2}\-\d{2}) rep ([\+\-]?\d+) +\= (\d+)', rep)[0:10]
reputation

[('2012-08-30', '+5', '6'),
 ('2012-12-11', '+10', '16'),
 ('2013-03-20', '+10', '26'),
 ('2013-12-05', '0', '26'),
 ('2014-01-25', '0', '26'),
 ('2014-03-19', '+2', '28'),
 ('2014-05-11', '+2', '30'),
 ('2014-05-12', '+12', '42'),
 ('2014-06-12', '+10', '52'),
 ('2014-06-26', '+10', '62')]

In [227]:
pd.DataFrame(list(reputation), columns=['date', 'reputation_change', 'running_total'])

Unnamed: 0,date,reputation_change,running_total
0,2012-08-30,5,6
1,2012-12-11,10,16
2,2013-03-20,10,26
3,2013-12-05,0,26
4,2014-01-25,0,26
5,2014-03-19,2,28
6,2014-05-11,2,30
7,2014-05-12,12,42
8,2014-06-12,10,52
9,2014-06-26,10,62


# Greedy / Lazy quantifiers

In [None]:
# To get Cause from '<dd>Cause: blunt force</dd></dl>'  --> Cause: (.+?)<  --> get word character or space, one or more

In [256]:
with open('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/homicides.txt') as f:
    hom_data = f.read()

In [257]:
hom = re.findall(r'Cause: (.+?)<', hom_data)[1:10]
hom

['shooting',
 'blunt force',
 'asphyxiation',
 'blunt force',
 'shooting',
 'shooting',
 'shooting',
 'shooting',
 'shooting']

# Alternatives

In [258]:
# \d+ .+ (Ave|St|Dr|Hwy)

In [261]:
with open('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/yelp.csv') as f:
    reviews = f.read()

In [271]:
print(reviews[0:1000])

business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,"My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best ""toast"" I've ever had.

Anyway, I ca

In [273]:
re.findall(r'(Mr\.|Mr|Mister|Mrs|Miss) (\w+)', reviews, flags=re.IGNORECASE)[1:10]  # created 2 match groups

[('Mr.', 'Mustachio'),
 ('miss', 'food'),
 ('miss', 'with'),
 ('miss', 'the'),
 ('miss', 'the'),
 ('miss', 'if'),
 ('miss', 'the'),
 ('miss', 'the'),
 ('miss', 'the')]

# Flags

flags = re.IGNORECASE

# Substitution

In [276]:
# find a pattern and replace it with a string
re.sub(r'', r'', reviews) [0:10]

'business_i'

In [277]:
s = 'I told Mister Pebble that I like Mister Pipp'

re.sub(r'Mister', r'Mr.', s)

'I told Mr. Pebble that I like Mr. Pipp'

In [2]:
s = 'my twitter is @jimmy, my emails are john@hotmail.com and jim@yahoo.com'

In [291]:
re.findall(r'@(\w+\.\w+)', s)

['hotmail.com', 'yahoo.com']

In [292]:
re.sub(r'@(\w+\.\w+)', r'@gmail.com', s)

'my twitter is @jimmy, my emails are john@gmail.com and jim@gmail.com'

In [290]:
re.findall(r'\w+@[\w\.]+', s)

['john@hotmail.com', 'jim@yahoo.com']

In [293]:
re.sub(r'(\w+)@[\w.]+', r'\1@gmail.com', s)    #\1 refers to 1st matching group, which is (\w+)

'my twitter is @jimmy, my emails are john@gmail.com and jim@gmail.com'

# Anchors ^$
They enforce the patter, If it's not there at the start/end of the string, it'll return error

In [4]:
re.search(r'^@\w+', s).group()

AttributeError: 'NoneType' object has no attribute 'group'

# Multiline

In [19]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/faa.txt') as f:
    tower = f.read()
    
print(tower[0:200])

FAA Contract Tower Closure List
(149 FCTs)
3-22-2013
LOC
ID Facility Name City State
DHN DOTHAN RGNL DOTHAN AL
TCL TUSCALOOSA RGNL TUSCALOOSA AL
FYV DRAKE FIELD FAYETTEVILLE AR
TXK TEXARKANA RGNL-WEBB


In [20]:
print(re.sub(r'AL$', r'ALABAMA', tower, flags=re.MULTILINE))

FAA Contract Tower Closure List
(149 FCTs)
3-22-2013
LOC
ID Facility Name City State
DHN DOTHAN RGNL DOTHAN ALABAMA
TCL TUSCALOOSA RGNL TUSCALOOSA ALABAMA
FYV DRAKE FIELD FAYETTEVILLE AR
TXK TEXARKANA RGNL-WEBB FIELD TEXARKANA AR
GEU GLENDALE MUNI GLENDALE AZ
GYR PHOENIX GOODYEAR GOODYEAR AZ
IFP LAUGHLIN/BULLHEAD INTL BULLHEAD CITY AZ
RYN RYAN FIELD TUCSON AZ
FUL FULLERTON MUNI FULLERTON CA
MER CASTLE ATWATER CA
OXR OXNARD OXNARD CA
RAL RIVERSIDE MUNI RIVERSIDE CA
RNM RAMONA RAMONA CA
SAC SACRAMENTO EXECUTIVE SACRAMENTO CA
SDM BROWN FIELD MUNI SAN DIEGO CA
SNS SALINAS MUNI SALINAS CA
VCV SOUTHERN CALIFORNIA LOGISTICS VICTORVILLE CA
WHP WHITEMAN LOS ANGELES CA
WJF GENERAL WM J FOX AIRFIELD LANCASTER CA
BDR IGOR I SIKORSKY MEMORIAL BRIDGEPORT CT
DXR DANBURY MUNI DANBURY CT
GON GROTON-NEW LONDON GROTON (NEW LONDON) CT
HFD HARTFORD-BRAINARD HARTFORD CT
HVN TWEED-NEW HAVEN NEW HAVEN CT
OXC WATERBURY-OXFORD OXFORD CT
APF NAPLES MUNI NAPLES FL
BCT BOCA RATON BOCA RATON FL
EVB NEW SMYRNA BEACH

# Intermedite Regualar Expressions Exercise 1

In [1]:
# with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/imdb_100.csv') as f:
#     imdb = f.read()

In [7]:
imdb = pd.read_csv('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/imdb_100.csv')
imdb.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [15]:
# convert title Series to a list
titles = imdb['title'].to_list()
type(titles)

list

In [39]:
print(titles)

['The Shawshank Redemption', 'The Godfather', 'The Godfather: Part II', 'The Dark Knight', 'Pulp Fiction', '12 Angry Men', 'The Good, the Bad and the Ugly', 'The Lord of the Rings: The Return of the King', "Schindler's List", 'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring', 'Inception', 'Star Wars: Episode V - The Empire Strikes Back', 'Forrest Gump', 'The Lord of the Rings: The Two Towers', 'Interstellar', "One Flew Over the Cuckoo's Nest", 'Seven Samurai', 'Goodfellas', 'Star Wars', 'The Matrix', 'City of God', "It's a Wonderful Life", 'The Usual Suspects', 'Se7en', 'Life Is Beautiful', 'Once Upon a Time in the West', 'The Silence of the Lambs', 'Leon: The Professional', 'City Lights', 'Spirited Away', 'The Intouchables', 'Casablanca', 'Whiplash', 'American History X', 'Modern Times', 'Saving Private Ryan', 'Raiders of the Lost Ark', 'Rear Window', 'Psycho', 'The Green Mile', 'Sunset Blvd.', 'The Pianist', 'The Dark Knight Rises', 'Gladiator', 'Terminator 2: Judgmen

In [78]:
title = []
for row in titles:
    match = re.findall(r'^The |^A |^An ', row)
    if match:
        title.append(match)
    else:
        title.append(row)
        
title[0:10]

[['The '],
 ['The '],
 ['The '],
 ['The '],
 'Pulp Fiction',
 '12 Angry Men',
 ['The '],
 ['The '],
 "Schindler's List",
 'Fight Club']

In [76]:
# title = []
# for row in titles:
#     match = re.sub(r'^The |^A |^An ', r'',  row)
#     if match:
#         title.append(match)
#     else:
#         title.append(row)
        
# title


title = [re.sub(r'^The |^A |^An ', r'', title) for title in titles]
title[0:10]


['Shawshank Redemption',
 'Godfather',
 'Godfather: Part II',
 'Dark Knight',
 'Pulp Fiction',
 '12 Angry Men',
 'Good, the Bad and the Ugly',
 'Lord of the Rings: The Return of the King',
 "Schindler's List",
 'Fight Club']

In [77]:
# Divide the title into 2 match groups. One with the article and one with the title. Article is group 1 and title group 2
title = [re.sub(r'^(The |A |An )(.+)', r'\2, \1', title) for title in titles]
title[0:10]

['Shawshank Redemption, The ',
 'Godfather, The ',
 'Godfather: Part II, The ',
 'Dark Knight, The ',
 'Pulp Fiction',
 '12 Angry Men',
 'Good, the Bad and the Ugly, The ',
 'Lord of the Rings: The Return of the King, The ',
 "Schindler's List",
 'Fight Club']

# Verbose

In [79]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/reputation.txt') as f:
    rep = f.read()

In [81]:
print(rep[0:200])

total votes: 36
 2  12201376 (5)
-- 2012-08-30 rep +5    = 6         
 2  13822612 (10)
-- 2012-12-11 rep +10   = 16        
 2  13822612 (10)
-- 2013-03-20 rep +10   = 26        
-- 2013-12-05 rep 0 


In [84]:
# re.VERBOSE --> Escape spaces \

print(re.findall(r'''
                 --\                            # two dashes and a space      
                 (\d{4}\-\d{2}\-\d{2})\         # match group 1 is a date, then a space
                 rep\                           # rep, then a space
                 ([\+\-]?\d+)\ +                # match group 2 is rep change with optional sign, then multiple spaces
                 \=\                            # equal sign, then a space
                 (\d+)                          # match group 3 is running total
                 ''', rep, flags=re.VERBOSE))

[('2012-08-30', '+5', '6'), ('2012-12-11', '+10', '16'), ('2013-03-20', '+10', '26'), ('2013-12-05', '0', '26'), ('2014-01-25', '0', '26'), ('2014-03-19', '+2', '28'), ('2014-05-11', '+2', '30'), ('2014-05-12', '+12', '42'), ('2014-06-12', '+10', '52'), ('2014-06-26', '+10', '62'), ('2014-07-05', '0', '62'), ('2014-09-02', '0', '62'), ('2014-09-03', '+10', '72'), ('2014-10-28', '0', '72'), ('2014-11-14', '+10', '82'), ('2014-11-18', '+2', '84'), ('2014-12-08', '+2', '86'), ('2014-12-09', '+10', '96'), ('2014-12-12', '+2', '98'), ('2014-12-24', '+10', '108'), ('2015-02-03', '0', '108'), ('2015-02-20', '+10', '118'), ('2015-03-28', '+10', '128'), ('2015-04-26', '+10', '138'), ('2015-05-05', '+10', '148'), ('2015-05-26', '+10', '158'), ('2015-05-27', '+20', '178'), ('2015-06-09', '0', '178'), ('2015-07-03', '+10', '188'), ('2015-07-06', '0', '188'), ('2015-07-22', '+110', '298'), ('2015-08-21', '+10', '308'), ('2015-09-07', '+10', '318'), ('2015-10-14', '-1', '317')]


# Intermediate Regex Exercise 2

In [2]:
with open ('C:/Users/go27s/OneDrive/Documents/regex_course/regex-course/data/faa.txt') as f:
    tower = f.read()
    
print(tower[0:200])

FAA Contract Tower Closure List
(149 FCTs)
3-22-2013
LOC
ID Facility Name City State
DHN DOTHAN RGNL DOTHAN AL
TCL TUSCALOOSA RGNL TUSCALOOSA AL
FYV DRAKE FIELD FAYETTEVILLE AR
TXK TEXARKANA RGNL-WEBB


In [3]:
faa = re.findall(r'''
                ([A-Z]{3})\      # match group 1 is 3 character ID, then a space
                .+\              # multiple characters incl spaces, then a space
                ([A-Z]{2})       # match group 2 is the 2 character State
                '''     
                 , tower
                 , flags=re.VERBOSE)
faa[0:10]

[('DHN', 'AL'),
 ('TCL', 'AL'),
 ('FYV', 'AR'),
 ('TXK', 'AR'),
 ('GEU', 'AZ'),
 ('GYR', 'AZ'),
 ('IFP', 'AZ'),
 ('RYN', 'AZ'),
 ('FUL', 'CA'),
 ('MER', 'CA')]

In [6]:
print(rep[0:100])

total votes: 36
 2  12201376 (5)
-- 2012-08-30 rep +5    = 6         
 2  13822612 (10)
-- 2012-12-1


In [7]:
s = '-- 2012-08-30 rep +5    = 6'

In [8]:
re.search(r'\d{4}-\d{2}-\d{2}', s).group()

'2012-08-30'

In [9]:
date = re.compile(r'\d{4}-\d{2}-\d{2}')

In [10]:
re.search(date, rep).group()

'2012-08-30'

In [11]:
print(re.findall(date, rep))

['2012-08-30', '2012-12-11', '2013-03-20', '2013-12-05', '2014-01-25', '2014-03-19', '2014-05-11', '2014-05-12', '2014-06-12', '2014-06-26', '2014-07-05', '2014-09-02', '2014-09-03', '2014-10-28', '2014-11-14', '2014-11-18', '2014-12-08', '2014-12-09', '2014-12-12', '2014-12-24', '2015-02-03', '2015-02-20', '2015-03-28', '2015-04-26', '2015-05-05', '2015-05-26', '2015-05-27', '2015-06-09', '2015-07-03', '2015-07-06', '2015-07-22', '2015-08-21', '2015-09-07', '2015-10-14', '2015-11-08', '2015-11-14', '2015-11-01', '2015-11-30', '2015-10-01', '2015-12-31', '2015-01-01', '2015-12-31']


# span() method
finds the position of the pattern

In [14]:
re.search(date, s).span()

(3, 13)

# split() method
splits the string based space, character, ets. For example s.split(' ').
if we split by a character, this character gets removed from the pattern!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [17]:
'hello there'.split(' ')

['hello', 'there']

# re.split()
we can split a string based on a regular expression pattern

In [19]:
re.split(date, s)

['-- ', ' rep +5    = 6']

# Lookarounds

Lookahead --> (?=)

In [52]:
s = 'Quicksand is a word that makes it sounds like sand is quick. However , to say sandquick or even quick-sand would just be wrong.'

In [56]:
re.findall(r'''
           \bsand(?=\w+)   # we're looking for a word that starts with sand which is followed by any word characters
           '''
           , s
          ,flags=re.VERBOSE)

['sand']

# Intermediate Homework

In [2]:
ufo = pd.read_csv('https://raw.githubusercontent.com/planetsig/ufo-reports/master/csv-data/ufo-scrubbed-geocoded-time-standardized.csv'
                  , header=None, nrows=100)

ufo.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611


In [3]:
time = ufo[6].to_list()
time[0:10]

['45 minutes',
 '1-2 hrs',
 '20 seconds',
 '1/2 hour',
 '15 minutes',
 '5 minutes',
 'about 3 mins',
 '20 minutes',
 '3  minutes',
 'several minutes']

clean_durations = [('45 minutes', '45', 'min'), ('1-2 hrs', '1', 'hr'), ('20 seconds', '20', 'sec'), ...]

In [42]:
# title = [re.sub(r'^(The |A |An )(.+)', r'\2, \1', title) for title in titles]
# title[0:10]


match= [re.findall(r'\d+', t) for t in time]
match[0:100]

[['45'],
 ['1', '2'],
 ['20'],
 ['1', '2'],
 ['15'],
 ['5'],
 ['3'],
 ['20'],
 ['3'],
 [],
 ['5'],
 ['3'],
 ['30'],
 ['3'],
 ['30'],
 ['20'],
 ['2'],
 ['20', '30'],
 ['20'],
 ['45'],
 ['20'],
 [],
 ['5', '6'],
 ['1'],
 ['3'],
 ['30'],
 ['30'],
 ['5'],
 ['15'],
 ['4', '5'],
 ['3'],
 ['30'],
 ['3'],
 ['5'],
 ['3', '5'],
 ['2'],
 ['1'],
 [],
 ['15', '20'],
 ['10'],
 ['3'],
 ['10'],
 [],
 ['1'],
 ['2'],
 ['5'],
 ['1'],
 ['3'],
 ['2'],
 ['30'],
 ['10'],
 ['1'],
 ['10'],
 ['1', '39'],
 ['30'],
 ['20'],
 ['8'],
 ['1'],
 ['1'],
 ['2'],
 ['5'],
 ['1'],
 ['2'],
 ['1'],
 ['3'],
 ['5'],
 ['5'],
 ['1'],
 ['4'],
 ['30'],
 ['5'],
 ['1'],
 ['5'],
 ['10', '15'],
 ['30'],
 ['10'],
 ['45'],
 ['1'],
 ['10'],
 ['2'],
 ['2'],
 ['15'],
 ['1'],
 ['5', '10'],
 ['10'],
 ['1'],
 ['45'],
 ['60', '90'],
 ['3'],
 ['5'],
 [],
 ['4'],
 ['45'],
 ['3'],
 ['10'],
 ['30'],
 ['45'],
 ['15'],
 ['30'],
 ['4', '5']]