### Regular Expressions

In [1]:
import re

In [2]:
text = "This is a good day"

# re.search returns a Match object (truthy) when the pattern occurs anywhere
# in the string and None (falsy) otherwise, so it can drive a conditional.
found = re.search("good", text)
print("Very good!" if found else "Bad :(")

Very good!


In [3]:
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

# Split the string at every occurrence of "Amy". The delimiter itself is
# dropped, and because the string starts with "Amy" the first piece is "".
re.split(r"Amy", text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [4]:
# findall returns every non-overlapping match as a list of strings,
# so the length of the list is the number of times "Amy" occurs.
re.findall(r"Amy", text)

['Amy', 'Amy', 'Amy']

In [5]:
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

# "^" anchors the pattern to the start of the string, so only an "Amy" at
# position 0 can match; search returns a Match object (or None if absent).
re.search(r"^Amy", text)

<re.Match object; span=(0, 3), match='Amy'>

In [6]:
# One letter per grade; reused by the pattern examples in the cells below.
grades = "ACAAAABCBCBAAACAAAAAAA"

# findall yields one list entry per match, so len() counts the B grades.
len(re.findall(r"B", grades))

3

In [7]:
# [AB] is a character set: it matches a single character that is A or B,
# so this counts the A and B grades together.
len(re.findall(r"[AB]", grades))

18

In [8]:
# Two sets in sequence: an A immediately followed by one character in the
# range B through C, i.e. every place an A is followed by a lower grade.
re.findall(r"[A][B-C]", grades)

['AC', 'AB', 'AC']

In [9]:
# "|" is alternation: match either the two-character sequence AB or AC.
re.findall(r"AB|AC", grades)

['AC', 'AB', 'AC']

In [10]:
# A leading "^" inside brackets negates the set: match any single
# character that is not A.
re.findall(r"[^A]", grades)

['C', 'B', 'C', 'B', 'C', 'B', 'C']

In [11]:
# The two carets mean different things: outside the brackets "^" anchors to
# the start of the string, inside it negates the set. The pattern therefore
# asks for a first character that is not A; grades starts with A, so the
# result is empty.
re.findall(r"^[^A]", grades)

[]

#### Quantifiers

In [12]:
# {1,} means "at least one, no upper bound". Quantifiers are greedy, so
# each match is a whole uninterrupted run of A's.
re.findall(r"A{1,}", grades)

['A', 'AAAA', 'AAA', 'AAAAAAA']

In [13]:
# Exactly one A followed by up to two more A's: each match is a run of one
# to three A's, so longer runs are broken into chunks of at most three.
re.findall(r"A{1,1}A{0,2}", grades)

['A', 'AAA', 'A', 'AAA', 'AAA', 'AAA', 'A']

In [14]:
# Pitfall: the space after the comma stops "{2, 2}" from being parsed as a
# repetition count. The pattern then looks for the literal text "A{2, 2}",
# which does not occur in grades, so the result is an empty list.
# Write A{2,2} (no space) to mean "exactly two A's".
re.findall(r"A{2, 2}", grades)

[]

In [15]:
# Literal "AA". Matches never overlap, so a run of four A's contributes two
# matches and a run of three contributes only one.
re.findall(r"AA", grades)

['AA', 'AA', 'AA', 'AA', 'AA', 'AA']

In [16]:
# {2} with a single number means "exactly two", which is the same as
# writing the literal pattern AA.
re.findall(r"A{2}", grades)

['AA', 'AA', 'AA', 'AA', 'AA', 'AA']

In [17]:
# One to ten A's, then one to ten B's, then one to ten C's, in that order:
# finds a stretch where the grades go from A down through B to C.
re.findall(r"A{1,10}B{1,10}C{1,10}", grades)

['AAAABC']

In [18]:
# Read the whole of datasets/ferpa.txt into a single string for the pattern
# examples below. The encoding is passed explicitly so that decoding does not
# depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) -- confirm.
with open("datasets/ferpa.txt", "r", encoding="utf-8") as file:
    wiki = file.read()

wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [19]:
# One to 100 ASCII letters immediately followed by the literal text "[edit]".
# The square brackets are escaped because an unescaped "[" would open a
# character set. Spaces are not in the set, so only the last word before
# "[edit]" is captured.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [20]:
# \w is shorthand for any word character (letters, digits and underscore),
# replacing the hand-written a-zA-Z range.
re.findall(r"[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [21]:
# "*" means zero or more repetitions, so no explicit bounds are needed.
# Because zero is allowed, a bare "[edit]" with no word before it would
# also match.
re.findall(r"[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [22]:
# Adding a space to the set lets the match extend backwards across several
# words, so the whole heading in front of "[edit]" is captured.
re.findall(r"[\w ]*\[edit\]", wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [23]:
# Collect every "<heading>[edit]" string first, then print only the part of
# each one that precedes the opening bracket.
edit_headings = re.findall(r"[\w ]*\[edit\]", wiki)
for title in edit_headings:
    heading_text = re.split(r"[\[]", title)[0]
    print(heading_text)

Overview
Access to public records
Student medical records


#### Groups

In [24]:
# Parentheses create capture groups. When the pattern contains more than one
# group, findall returns a list of tuples with one element per group instead
# of the full matched text.
re.findall(r"([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [25]:
# Compile the two-group pattern once, then walk its matches one at a time;
# groups() returns all captured groups of a match as a tuple.
section_pattern = re.compile(r"([\w ]*)(\[edit\])")
for item in section_pattern.finditer(wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [26]:
# group(1) is the first parenthesised group (the heading text);
# group(0) would be the entire match including "[edit]".
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [27]:
# (?P<name>...) gives a group a name, so it can be fetched by that name
# rather than by its position in the pattern.
for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit>\[edit\])", wiki):
    print(item.group("title"))

Overview
Access to public records
Student medical records


In [28]:
# `item` is not defined in this cell: it is the loop variable left over from
# the preceding named-group loop, so it holds that loop's last match.
# groupdict() returns the named groups of a match as a {name: text} dict.
print(item.groupdict())

{'title': 'Student medical records', 'edit': '[edit]'}


#### Look-ahead and Look-behind

In [29]:
# (?=...) is a look-ahead assertion: "[edit]" must follow the title, but it
# is not consumed and is therefore not part of the match.
# Because [\w ]* may match zero characters, a zero-width match is also
# reported at the position immediately before each "[edit]"; using "+"
# instead of "*" would suppress those empty matches.
for item in re.finditer(r"(?P<title>[\w ]*)(?=\[edit\])", wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(8, 8), match=''>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(2739, 2739), match=''>
<re.Match object; span=(3692, 3715), match='Student medical records'>
<re.Match object; span=(3715, 3715), match=''>
