# Data Cleansing
***

## Regular Expressions
***

https://docs.python.org/3/library/re.html

https://realpython.com/regex-python/

https://developers.google.com/edu/python/regular-expressions

## Python's re module
***

In [43]:
import re

\w = '[a-zA-Z0-9_]'

\W = '[^a-zA-Z0-9_]'

In [44]:
# r means the raw string, do not escape any of the characters
# \W is often loosely described as matching whitespace (Ian), but more precisely:
# \W Matches any character which is not a word character (https://docs.python.org/3/library/re.html)

# \w Matches Unicode word characters (https://docs.python.org/3/library/re.html)

# + means match 1 or more repetitions

# Sample text to break apart.
original = 'Words, words, words.'

# Raw string (r'...'): the backslash is kept literally, so the re module
# receives \W+ exactly as written -- one or more non-word characters.
pattern = r'\W+'

# Compile the pattern, then cut the text at every run of non-word characters.
# The trailing '.' is itself a separator, which leaves an empty final piece.
result = re.compile(pattern).split(original)

# Show the resulting list of pieces.
print(result)

['Words', 'words', 'words', '']


In [45]:
# Sample text to break apart.
original = 'Words, words, words.'

# Wrapping \W+ in a capturing group makes split() hand back the separators
# as well as the text between them.
pattern = r'(\W+)'

# Cut the text at every run of non-word characters; each captured separator
# appears as its own element between the surrounding pieces.
result = re.split(pattern=pattern, string=original)

# Show the pieces together with their separators.
print(result)

['Words', ', ', 'words', ', ', 'words', '.', '']


In [46]:
# Split at most once: everything after the first separator stays in one piece.
# maxsplit is passed by keyword because passing it positionally is deprecated
# since Python 3.13.
re.split(r'\W+', 'Words, words, words.', maxsplit=1)

['Words', 'words, words.']

In [47]:
# Split it on letters a to f (ignore case)
re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

## Real Python
***

In [48]:
'abccba' == 'abccba'

True

In [49]:
'abccba' == 'cbaabc'

False

In [50]:
'abc' in 'cbaabc'

True

In [51]:
'cbaabc'.index('a')

2

In [52]:
'cbaabc'[2]

'a'

In [53]:
'cbaaabc'.find('aa')

2

In [54]:
s = 'foo123bar'

re.search('123', s)

<re.Match object; span=(3, 6), match='123'>

In [55]:
re.search(r'[0-9][0-9][0-9]', 'foo456bar')

<re.Match object; span=(3, 6), match='456'>

In [56]:
re.search(r'[0-9][0-9][0-9]', '234baz')

<re.Match object; span=(0, 3), match='234'>

In [57]:
re.search(r'[0-9][0-9][0-9]', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [58]:
print(re.search(r'[0-9][0-9][0-9]', '12foo34'))

None


In [59]:
re.search(r'[0-9]{3}', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [60]:
# Only gives first match
print(re.search(r'[0-9][0-9][0-9]', '122foo344'))

<re.Match object; span=(0, 3), match='122'>


In [61]:
if 'Hello':
    print ("yes")
else:
    print("no")

yes


In [62]:
if '':
    print ("yes")
else:
    print("no")

no


## Google for Education
***

. (a period) -- matches any single character except newline '\n'

***

In [63]:
import re

# The sample lives in `text`, not `str`: rebinding `str` would hide the
# built-in type from every cell that runs afterwards.
text = 'an example word:cat!!'

# Look for the literal 'word:' followed by exactly three word characters.
match = re.search(r'word:\w\w\w', text)

# search() returns None when nothing matches, so the result itself can be
# used as the success test.
if match:
    print('found', match.group())  ## 'found word:cat'
else:
    print('did not find')

found word:cat


In [64]:
# + -- 1 or more occurrences of the pattern. Ref: https://developers.google.com/edu/python/regular-expressions

string = 'aaaabaa'
pattern = r'a+'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [65]:
# * -- 0 or more occurrences of the pattern. Ref: https://developers.google.com/edu/python/regular-expressions

string = 'aaaabaa'
pattern = r'a*'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [66]:
## i+ = one or more i's, as many as possible.
match = re.search(r'pi+', 'piiig') # found, match.group() == "piii"
match

<re.Match object; span=(0, 4), match='piii'>

In [67]:
## Finds the first/leftmost solution, and within it drives the +
## as far as possible (aka 'leftmost and largest').
## In this example, note that it does not get to the second set of i's.
match = re.search(r'i+', 'piigiiii') # found, match.group() == "ii"
match

<re.Match object; span=(1, 3), match='ii'>

In [68]:
## \s* = zero or more whitespace chars
## Here look for 3 digits, possibly separated by whitespace.
match = re.search(r'\d\s*\d\s*\d', 'xx1 2   3xx') # found, match.group() == "1 2   3"
print(match)
match = re.search(r'\d\s*\d\s*\d', 'xx12  3xx') # found, match.group() == "12  3"
print(match)
match = re.search(r'\d\s*\d\s*\d', 'xx123xx') # found, match.group() == "123"
print(match)

<re.Match object; span=(2, 9), match='1 2   3'>
<re.Match object; span=(2, 7), match='12  3'>
<re.Match object; span=(2, 5), match='123'>


In [69]:
## ^ = matches the start of string, so this fails:
match = re.search(r'^b\w+', 'foobar') # not found, match == None
match

In [70]:
## but without the ^ it succeeds:
match = re.search(r'b\w+', 'foobar') # found, match.group() == "bar"
match

<re.Match object; span=(3, 6), match='bar'>

In [71]:
# The sample lives in `text`, not `str`: rebinding `str` would hide the
# built-in type from every cell that runs afterwards.
text = 'purple alice-b@google.com monkey dishwasher'

# \w matches neither '-' nor '.', so the match starts after the hyphen and
# stops before the dot.
match = re.search(r'\w+@\w+', text)
if match:
    print(match.group())  ## 'b@google'

b@google


***
## Exercise 1
***

Write a Python function to remove all non-alphanumeric characters from a string.

***

In [72]:
# Answer 1 removes spaces also

# According to https://docs.python.org/3/library/re.html:
#    \W is equivalent to [^a-zA-Z0-9_] when the ASCII flag is used (by default it also excludes Unicode word characters)
#    re.sub(pattern, repl, string)
#    Return the string obtained by replacing occurrences of pattern in string by the replacement repl
# I am replacing occurrences of non-word characters with the empty string, so essentially removing them.

def letters_numbers_only(text):
    """Return a copy of text containing only its letters and digits.

    Whitespace, punctuation, symbols and underscores are all dropped.
    """
    # \W covers every non-word character; the underscore counts as a word
    # character, so it is listed explicitly to be removed as well.
    unwanted = re.compile(r'[\W_]')
    cleaned = unwanted.sub('', text)
    return cleaned

letters_numbers_only("This has numbers 123, letters and symbols. Remove everything, such as $, even spaces, except letters and numbers")

'Thishasnumbers123lettersandsymbolsRemoveeverythingsuchasevenspacesexceptlettersandnumbers'

In [73]:
# Answer 2 does not remove spaces (Debate on whether space is an alphanumeric character)

# \s -- (lowercase s) matches a single whitespace character, ref: https://developers.google.com/edu/python/regular-expressions

def letters_numbers_spaces_only(text):
    """Return a copy of text containing only letters, digits and whitespace.

    Punctuation, symbols and underscores are removed; whitespace characters
    (spaces, tabs, newlines) are left where they are.
    """
    # [^\w\s] removes anything that is neither a word character nor
    # whitespace. \w also matches the underscore, so '_' needs its own
    # alternative or it would survive in the output.
    return re.sub(r'[^\w\s]|_', '', text)

letters_numbers_spaces_only("This has numbers 123, letters and symbols. Remove everything, such as $, except letters and numbers")

'This has numbers 123 letters and symbols Remove everything such as  except letters and numbers'