In [4]:
import re

In [7]:
text = "This is a good day"

if re.search ("good", text): 
    print("Wonderful")
else:
    print("Cry about it") 

Wonderful


In [9]:
#In addition to checking for the conditionals, we can segment a string.
#The work that regex does here is called tokenizing,
#where the string is separated into substrings based on patterns.

# The findall() and split() functions will parse the string for us and return chunks. Eg. below
text = " The duck works diligently. The duck gets good food. Our duck is succesful." 

re.split("duck", text)

[' The ', ' works diligently. The ', ' gets good food. Our ', ' is succesful.']

In [10]:
re.findall("duck", text) 

['duck', 'duck', 'duck']

In [11]:
# .search() looks for some pattern and returns a boolean. 
# .split() will use a pattern for creating a list of substrings
# .findall() look for a pattern and pull out all the occurences

In [18]:
#  the regext specification standard defines a markup language to describe patterns in text. 
# ANchors specify the start and/or the end of the string that you are trying to match. 
# Caret character ^ means start and the dollar sign character $ means end. 

#Sample
text =  " The duck works diligently. The duck gets good food. Our duck is succesful." 
re.search("^duck", text) 

In [19]:
# re.search() actually returned a new object, called re.Match object


## Pattern and Character CLasses

In [84]:
# sample

grades= "BCAAABACAABCBCAAB"

re.findall("B", grades) 

['B', 'B', 'B', 'B', 'B']

In [85]:
re.findall("[AB]", grades) 

# "AB" can be read so we put characters A and B in square brackets

['B', 'A', 'A', 'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'A', 'B']

In [86]:
# Set operators. range of charaters, which are ordered alphanumerically. 
# basically its like an OR conditioning function
re.findall("[A][B-C]", grades) 

['AB', 'AC', 'AB', 'AB']

In [87]:
#but like its also possible you can write like this pattern using a pipe operator (OR) 

re.findall("AB|AC", grades)

['AB', 'AC', 'AB', 'AB']

In [88]:
# we can also like exclude a specific token
re.findall("[^A]", grades) 

['B', 'C', 'B', 'C', 'B', 'C', 'B', 'C', 'B']

In [89]:
#regex says that we want to match any value at the beginning of the string which is not an A. 
#note only at the beginning
re.findall("^[^A]", grades) 

['B']

## Quantifiers

In [90]:
# Quantifiers are the number of time you want a pattern to be matched in order to match. 
# The most basic quantifier is expressed as e{m,n}, 
# where e is the expression or character matching, m - minimum, n - maximum number it was match

re.findall("A{2,10}", grades)


['AAA', 'AA', 'AA']

In [91]:
re.findall("A{1,1}A{1,1}", grades)

['AA', 'AA', 'AA']

In [94]:
re.findall("A{2, 2}", grades)

[]

In [95]:
re.findall("AA", grades)

['AA', 'AA', 'AA']

In [96]:
re.findall("A{2}", grades)

['AA', 'AA', 'AA']

In [97]:
re.findall("A{1,10}B{1,10}C{1,10}", grades)

['AABC']

In [139]:
wikidata = "Some sample text that includes an [edit] tag for testing purposes."

In [140]:
matches = re.findall(r"[a-zA-Z]{1,100}\[edit\]", wikidata)
print(matches)

[]


In [146]:
#btw \w is a metacharater, and indicates a special pattern of any letter or digit.
re.findall("[\w] {1,100}\[edit\]", wikidata) 

  re.findall("[\w] {1,100}\[edit\]", wikidata)


['n [edit]']

In [147]:
re.findall("[\w]*\[edit\]",wikidata)

  re.findall("[\w]*\[edit\]",wikidata)


['[edit]']

In [None]:
re.findall("[\w ]*\[edit\]",wikidata)

In [151]:
# The code finds all sequences of word characters and spaces followed by [edit] in the string, splits each match at the [, and prints the part before [edit].
for title in re.findall("[\w ]*\[edit\]",wikidata): 
    print(re.split("[\[]", title)[0]) 

Some sample text that includes an 


  for title in re.findall("[\w ]*\[edit\]",wikidata):
  print(re.split("[\[]", title)[0])


## Groups 

In [161]:
#you can also match different patterns, called groups. to group patterns together you can use parentheses

In [162]:
re.findall("([\w ]*\[edit\])",wikidata)

  re.findall("([\w ]*\[edit\])",wikidata)


['Some sample text that includes an [edit]']

In [163]:
for item in re.finditer("([\w ]*\[edit\])",wikidata):
    print(item.groups()) 


('Some sample text that includes an [edit]',)


  for item in re.finditer("([\w ]*\[edit\])",wikidata):


In [164]:
for item in re.finditer("([\w ]*\[edit\])",wikidata):
    print(item.group(1)) 

Some sample text that includes an [edit]


  for item in re.finditer("([\w ]*\[edit\])",wikidata):


In [168]:
#giving a name of the group is (?P<name>), parethesis starts the group
# ?P indicates that this is an extension to basic regexes, and <name> is the dictionary key we want to weapped in <>

for item in re.finditer("(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wikidata): 
    print(item.groupdict()['title'])

Some sample text that includes an 


  for item in re.finditer("(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wikidata):


In [169]:
print(item.groupdict())

{'title': 'Some sample text that includes an ', 'edit_link': '[edit]'}


## Look-ahead and Look-behind

In [173]:
# the look-ahead assertion (?=\[edit\]) is used to ensure that only the text immediately preceding [edit] is matched and captured by the named group title, without including [edit] itself in the match.
for item in re.finditer("(?P<title>[\w ]+)(?=\[edit\])", wikidata):
    print(item) 

<re.Match object; span=(0, 34), match='Some sample text that includes an '>


  for item in re.finditer("(?P<title>[\w ]+)(?=\[edit\])", wikidata):
