In [1]:
# Import the regular expression libraries
import re

In [3]:
# There are several main processing functions in re that you might use. match() checks for a match
# that is at the beginning of the string and returns a match object (or None if there is none). Similarly,
# search() checks for a match anywhere in the string. Either result can be used as a boolean in a condition.

# Build a short sample sentence to search through
text = "This is a good day"

# search() gives back a truthy match object when "good" occurs anywhere in text,
# and None otherwise, so it can drive the choice of message directly
print("Wonderful!" if re.search("good", text) else "Alas :")

Wonderful!


In [4]:
# Beyond yes/no checks, regex can also cut a string into pieces. Breaking text into substrings
# according to a pattern is called tokenizing, and it is a core activity in natural language processing.

# split() and findall() both scan the string and hand back the chunks they find. Let's try an example.
text = "Amy works diligently. Amy gets good grades. Our student Amy is Successful."

# A somewhat contrived demonstration: cut the text at every occurrence of "Amy"
re.compile("Amy").split(text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is Successful.']

In [5]:
# findall() collects every occurrence of "Amy"; the length of the list is the number of mentions
re.findall(pattern="Amy", string=text)

['Amy', 'Amy', 'Amy']

In [24]:
# The regex specification standard defines a markup language to describe patterns in text. Let's start with anchors.
# Anchors specify the start and/or the end of the string that you are trying to match. The caret character ^
# means start and the dollar sign character $ means end. If you put ^ before a string, it means that the text
# the regex processor retrieves must start with the string you specify. For ending, you have to put the $
# character after the string; it means that the text the regex processor retrieves must end with the string you specify.

# Sample sentence that opens with the name "Amy"
text = "Amy works diligently. Amy gets good grades. Our student Amy is Successful."

# The leading ^ pins the pattern to the very start of the string
re.compile("^Amy").search(text)

<re.Match object; span=(0, 3), match='Amy'>

In [None]:
# Notice that re.search() actually returned to us a new object, called a re.Match object. A re.Match object
# always has a boolean value of True, as something was found, so you can always evaluate it in an if statement like we did earlier.
# The rendering of the match object also tells you what pattern was matched, in this case, the word "Amy", and the location the match was in.



## Patterns and Character Classes

In [13]:
# One learner's grades for every assignment in a single course over a semester
grades = 'ACAAAABCBABAA'

# To answer "How many B's are in the grade list?", match the literal B and count the hits
re.findall(pattern="B", string=grades)

['B', 'B', 'B']

In [14]:
# The set [AB] matches a single character that is either A or B, so this collects every A and every B
re.compile("[AB]").findall(grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'A', 'A']

In [15]:
# The [...] construct used above is the set operator. It also accepts a range of characters, ordered
# alphanumerically: [B-C] means B or C. This pattern finds every A that is directly followed by a B or a C.
re.findall(pattern="[A][B-C]", string=grades)

['AC', 'AB', 'AB']

In [23]:
# A caret placed first inside a set negates it: [^A] matches any single character other than A.
# Here that pulls out only the grades that were not A's.
re.compile("[^A]").findall(grades)

['C', 'B', 'C', 'B', 'B']

In [20]:
# Outside the set, ^ anchors to the start of the string; inside the set, it negates it
re.findall(pattern="^[^A]", string=grades)

[]

In [None]:
# It's an empty list, because the regex says that we want to match any value at the beginning of the string
# which is not an A. Our string, though, starts with an A, so there is no match found.

## Quantifiers

In [25]:
# A quantifier states how many times a pattern has to repeat for it to count as a match.
# The basic form is e{m,n}: e is the expression or character being matched, m is the minimum
# number of repetitions and n is the maximum.

# How many times has this student been on a streak of back-to-back A's (between 2 and 10 in a row)?
re.compile("A{2,10}").findall(grades)

['AAAA', 'AA']

In [26]:
# The previous result shows two streaks: one of four A's in a row and another of two A's.

# A{1,1}A{1,1} means exactly one A followed by exactly one A, i.e. non-overlapping pairs "AA".
# Note that spaces are not allowed inside the braces.
re.findall(pattern="A{1,1}A{1,1}", string=grades)

# When the braces hold a single number, it is used as both m and n

['AA', 'AA', 'AA']

In [31]:
# Load the FERPA article text. The encoding is given explicitly so the read does not depend on
# the platform's default codec (the other file read in this notebook also uses UTF-8).
with open("ferpa.txt", "r", encoding="utf-8") as file:
    wiki = file.read()

# Display the loaded text
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [33]:
# Search through the document to get a list of all the headers.
# The pattern is a raw string (r"...") so that \[ and \] reach the regex engine untouched
# instead of being treated as (invalid) Python string escapes.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)
# [a-zA-Z] accepts any lower- or upper-case letter a-z,
# {1,100} asks for between 1 and 100 of those characters,
# and \[edit\] requires them to be followed by the literal text "[edit]"


['Overview[edit]', 'records[edit]', 'records[edit]']

In [34]:
# It didn't quite work. It found every header, but only the last word of each one.

# Shorten the pattern using \w, which matches any word character (letters, digits and underscore).
# A raw string keeps the backslashes intact for the regex engine.

re.findall(r"[\w]{1,100}\[edit\]", wiki)

# \w is a metacharacter
# \s matches any whitespace character

['Overview[edit]', 'records[edit]', 'records[edit]']

In [37]:
# Now that we have shortened the regex, let's improve it by adding a space to the set
re.findall(r"[\w ]*\[edit\]", wiki)
# This looks for word characters or spaces ([\w ]) repeated 0 or more times (*), followed by the
# literal string "[edit]". The raw string keeps \w and \[ from being read as Python string escapes.

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [42]:
# Print each header title without its trailing "[edit]" marker.
# Raw strings keep the regex backslashes from being read as Python string escapes.
for title in re.findall(r"[\w ]*\[edit\]", wiki):
    # Splitting on "[" leaves the title text as the first piece
    print(re.split(r"[\[]", title)[0])

Overview
Access to public records
Student medical records


## Groups

In [43]:
# Match several patterns, called groups, at the same time and then refer to the groups you want.
# To group patterns together, wrap them in parentheses. With groups present, findall() returns
# one tuple per match. The raw string keeps the regex backslashes intact.
re.findall(r"([\w ]*)(\[edit\])", wiki)


[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [46]:
# finditer() yields one match object per hit; groups() returns that match's captured groups as a tuple.
# Raw string so the regex backslashes are not treated as Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [56]:
# groups() returns a tuple of all groups. An individual group is available through group(number),
# where group(0) is the whole match and higher numbers select the portion of the match we are interested in.
# Raw string so the regex backslashes are not treated as Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [60]:
# Labelling groups lets us look at the result as a dictionary, which is pretty useful.
# The syntax is (?P<name>...): the parentheses start the group, ?P marks this as an
# extension to basic regexes, and <name> is the dictionary key we want to use, wrapped in <>.
# Raw string so the regex backslashes are not treated as Python string escapes.

for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wiki):
    print(item.groupdict()['title'])

Overview
Access to public records
Student medical records


In [65]:
# Print the whole dictionary of named groups. `item` is not defined in this cell: it is the loop
# variable left over from the finditer loop in the previous cell, so this shows the last match only.
print(item.groupdict())

{'title': 'Student medical records', 'edit_link': '[edit]'}


## Look-ahead and Look-behind

In [67]:
# Look-ahead and look-behind let the pattern describe text that comes before or after the text we want
# to isolate, without including it in the match. In our headers we want the text that comes before the
# [edit] rendering, but we don't care about the [edit] text itself. So far we have been throwing [edit]
# away after matching; with the look-ahead syntax (?=...) we can require it without capturing it.
# A raw string keeps the regex backslashes intact.

for item in re.finditer(r"(?P<title>[\w ]+)(?=\[edit\])", wiki):
    # The pattern has one named group, title, made of one or more word characters or spaces,
    # followed by a look-ahead that requires "[edit]" to come next but leaves it out of the
    # resulting match object
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


## Example: Wikipedia Data

In [74]:
# Read the whole article into memory; the file is decoded as UTF-8
with open("./buddhist.txt", mode="r", encoding="utf-8") as file:
    wiki_2 = file.read()


UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1589: character maps to <undefined>