# Data Cleansing
***

## Regular Expressions
***

https://docs.python.org/3/library/re.html

https://realpython.com/regex-python/

https://realpython.com/regex-python-part-2/

https://developers.google.com/edu/python/regular-expressions

## Python's re module
***

In [2]:
import re

\w = '[a-zA-Z0-9_]'

\W = '[^a-zA-Z0-9_]'

In [3]:
# r at the start marks a raw string literal: backslashes are kept as they are instead of starting escape sequences
# \W in this example ends up matching the punctuation and whitespace between the words (Ian)
# \W Matches any character which is not a word character (https://docs.python.org/3/library/re.html)

# \w Matches Unicode word characters (https://docs.python.org/3/library/re.html)

# + means match 1 or more repetitions

# A string to be manipulated.
original = 'Words, words, words.'

# The pattern/regular expression to use on the above string.

pattern = r'\W+' # in other words set up the string \W+, do not escape it, we need it as it is when passing it to split

# Splits a string into substrings using a regular expression.
result = re.split(pattern, original)

# Print the result.
print(result)

['Words', 'words', 'words', '']


In [4]:
# A string to be manipulated.
original = 'Words, words, words.'

# The pattern/regular expression to use on the above string.
pattern = r'(\W+)'

# Splits a string into substrings using a regular expression.
result = re.split(pattern, original)

# Print the result.
print(result)

['Words', ', ', 'words', ', ', 'words', '.', '']


In [5]:
# Stop after the first separator so the remainder of the string stays intact.
# maxsplit is passed by keyword: passing it positionally is deprecated since
# Python 3.13.
re.split(r'\W+', 'Words, words, words.', maxsplit=1)

['Words', 'words, words.']

In [6]:
# Split it on letters a to f (ignore case)
re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

## Real Python
***

In [7]:
'abccba' == 'abccba'

True

In [8]:
'abccba' == 'cbaabc'

False

In [9]:
'abc' in 'cbaabc'

True

In [10]:
'cbaabc'.index('a')

2

In [11]:
'cbaabc'[2]

'a'

In [12]:
'cbaaabc'.find('aa')

2

In [13]:
s = 'foo123bar'

re.search('123', s)

<re.Match object; span=(3, 6), match='123'>

In [14]:
re.search(r'[0-9][0-9][0-9]', 'foo456bar')

<re.Match object; span=(3, 6), match='456'>

In [15]:
re.search(r'[0-9][0-9][0-9]', '234baz')

<re.Match object; span=(0, 3), match='234'>

In [16]:
re.search(r'[0-9][0-9][0-9]', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [17]:
print(re.search(r'[0-9][0-9][0-9]', '12foo34'))

None


In [18]:
re.search(r'[0-9]{3}', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [19]:
# Only gives first match
print(re.search(r'[0-9][0-9][0-9]', '122foo344'))

<re.Match object; span=(0, 3), match='122'>


In [20]:
if 'Hello':
    print ("yes")
else:
    print("no")

yes


In [21]:
if '':
    print ("yes")
else:
    print("no")

no


## Google for Education
***

. (a period) -- matches any single character except newline '\n'

***

In [22]:
import re

# Named `text` rather than `str` so the built-in str type is not shadowed
# for the rest of the notebook session.
text = 'an example word:cat!!'
# 'word:' followed by exactly three word characters.
match = re.search(r'word:\w\w\w', text)

# re.search() returns None when nothing matches, so the match object
# doubles as the success test.
if match:
    print('found', match.group())  ## 'found word:cat'
else:
    print('did not find')

found word:cat


In [23]:
# + -- 1 or more occurrences of the pattern. Ref: https://developers.google.com/edu/python/regular-expressions

string = 'aaaabaa'
pattern = r'a+'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [24]:
# * -- 0 or more occurrences of the pattern. Ref: https://developers.google.com/edu/python/regular-expressions

string = 'aaaabaa'
pattern = r'a*'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [25]:
## i+ = one or more i's, as many as possible.
match = re.search(r'pi+', 'piiig') # found, match.group() == "piii"
match

<re.Match object; span=(0, 4), match='piii'>

In [26]:
## Finds the first/leftmost solution, and within it drives the +
## as far as possible (aka 'leftmost and largest').
## In this example, note that it does not get to the second set of i's.
match = re.search(r'i+', 'piigiiii') # found, match.group() == "ii"
match

<re.Match object; span=(1, 3), match='ii'>

In [27]:
## \s* = zero or more whitespace chars
## Here look for 3 digits, possibly separated by whitespace.
match = re.search(r'\d\s*\d\s*\d', 'xx1 2   3xx') # found, match.group() == "1 2   3"
print(match)
match = re.search(r'\d\s*\d\s*\d', 'xx12  3xx') # found, match.group() == "12  3"
print(match)
match = re.search(r'\d\s*\d\s*\d', 'xx123xx') # found, match.group() == "123"
print(match)

<re.Match object; span=(2, 9), match='1 2   3'>
<re.Match object; span=(2, 7), match='12  3'>
<re.Match object; span=(2, 5), match='123'>


In [28]:
## ^ = matches the start of string, so this fails:
match = re.search(r'^b\w+', 'foobar') # not found, match == None
match

In [29]:
## but without the ^ it succeeds:
match = re.search(r'b\w+', 'foobar') # found, match.group() == "bar"
match

<re.Match object; span=(3, 6), match='bar'>

In [30]:
# Named `text` rather than `str` so the built-in str type is not shadowed.
text = 'purple alice-b@google.com monkey dishwasher'
# \w matches neither '-' nor '.', so the match covers only the word
# characters directly around the '@'.
match = re.search(r'\w+@\w+', text)
if match:
    print(match.group())  ## 'b@google'

b@google


***
## Exercise 1
***

Write a Python function to remove all non-alphanumeric characters from a string.

***

In [31]:
# Answer 1 removes spaces also

# According to https://docs.python.org/3/library/re.html:
#    \W is the equivalent of [^a-zA-Z0-9_] (when the ASCII flag is used)
#    re.sub(pattern, repl, string)
#    Return the string obtained by replacing occurrences of pattern in string by the replacement repl
# I am replacing occurrences of non-word characters (and underscores) with the empty string, so essentially removing them.

def letters_numbers_only(text):
    """Return text with every non-alphanumeric character removed.

    Whitespace, punctuation, symbols and the underscore are all dropped.
    """
    # \W covers everything that is not a word character; "_" is listed as
    # well because \w treats the underscore as a word character.
    # Splitting on each unwanted character and gluing the pieces back
    # together leaves only the characters that were kept.
    fragments = re.split(r'[\W_]', text)
    return ''.join(fragments)

letters_numbers_only("This has numbers 123, letters and symbols. Remove everything, such as $, even spaces, except letters and numbers")

'Thishasnumbers123lettersandsymbolsRemoveeverythingsuchasevenspacesexceptlettersandnumbers'

In [32]:
# Answer 2 does not remove spaces (Debate on whether space is an alphanumeric character)

# \s -- (lowercase s) matches a single whitespace character, ref: https://developers.google.com/edu/python/regular-expressions

def letters_numbers_spaces_only(text):
    """Return text keeping only alphanumeric and whitespace characters.

    Punctuation, symbols and the underscore are removed; letters, digits
    and whitespace are left exactly where they were.
    """
    # [^\w\s] removes everything that is neither a word character nor
    # whitespace. "_" needs its own alternative because \w counts the
    # underscore as a word character, so it would otherwise be kept.
    return re.sub(r'[^\w\s]|_', '', text)

letters_numbers_spaces_only("This has numbers 123, letters and symbols. Remove everything, such as $, except letters and numbers")

'This has numbers 123 letters and symbols Remove everything such as  except letters and numbers'

## Second part of Real Python's Regexes

#### Remember

The 'r' at the start of the pattern string designates a Python "raw" string, which passes backslashes through unchanged.

In [33]:
# + means 1 or more.

re.search(r'(\d+)', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [34]:
re.search(r'[a-z]+', '123FOO456', flags=re.IGNORECASE)

<re.Match object; span=(3, 6), match='FOO'>

In [35]:
print(re.search(r'\d+', 'foo.bar'))

None


In [36]:
re.search(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [37]:
re.search(r'\d+', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [38]:
re.match(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [39]:
# Match only looks at beginning of string

print(re.match(r'\d+', 'foo123bar'))

None


In [56]:
print(re.fullmatch(r'\d+', '123foobar'))

None


In [40]:
re.fullmatch(r'\d+', '123')

<re.Match object; span=(0, 3), match='123'>

In [68]:
# ^ means a match at the start of the string
# + means 1 or more
# $ Anchors a match at the end of a string

# Essentially looking for a string that contains only digits

re.search(r'^\d+$', '123')

<re.Match object; span=(0, 3), match='123'>

In [42]:
print(re.search(r'^\d+', 'foo123bar'))

None


In [44]:
print(re.search(r'^\d+', '123foobar'))

<re.Match object; span=(0, 3), match='123'>


In [46]:
print(re.search(r'^\d+$', '123foobar'))

None


In [47]:
re.search(r'\d+', '123foo456bar789.')

<re.Match object; span=(0, 3), match='123'>

In [48]:
re.match(r'\d+', '123foo456bar789.')

<re.Match object; span=(0, 3), match='123'>

In [49]:
re.fullmatch(r'\d+', '123foo456bar789.')

In [50]:
re.findall(r'\d+', '123foo456bar789.')

['123', '456', '789']

https://realpython.com/introduction-to-python-generators/

In [51]:
matches = re.finditer(r'\d+', '123foo456bar789.')
matches

<callable_iterator at 0x29aa9453220>

In [52]:
next(matches)

<re.Match object; span=(0, 3), match='123'>

In [53]:
next(matches)

<re.Match object; span=(6, 9), match='456'>

In [54]:
next(matches)

<re.Match object; span=(12, 15), match='789'>

In [55]:
# Calling next() on an exhausted iterator raises StopIteration. Catch only
# that exception so unrelated errors (e.g. `matches` not being defined)
# are not silently reported as "None".
try:
    next(matches)
except StopIteration:
    print(None)

None


In [69]:
matches = re.finditer(r'\d+', '123foo456bar789.')

for match in matches:
    print(match)

<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(6, 9), match='456'>
<re.Match object; span=(12, 15), match='789'>


## re.sub()
---

In [70]:
s = 'foo.123.bar.789.baz'

In [71]:
re.sub(r'\d+', '#', s)

'foo.#.bar.#.baz'

In [72]:
re.sub('[a-z]+', '(*)', s)

'(*).123.(*).789.(*)'

In [73]:
# swaps digit strings with alpha strings in string.
re.sub(r'([a-z]+)([0-9]+)', r'\2\1', 'foo123bar456')

'123foo456bar'

In [74]:
# swaps first and last strings
re.sub(r'(\w+),bar,baz,(\w+)', r'\2,bar,baz,\1', 'foo,bar,baz,qux')

'qux,bar,baz,foo'

## Compiling
---

In [77]:
my_regex = re.compile(r'([0-9]+)')
my_regex

re.compile(r'([0-9]+)', re.UNICODE)

In [78]:
my_regex.search('foo123bar456')

<re.Match object; span=(3, 6), match='123'>

In [79]:
my_regex.findall('foo123bar456')

['123', '456']

In [80]:
my_regex.sub(r'...', 'foo123bar456')

'foo...bar...'

## Regular Expressions on iris
---

In [120]:
# https://stackoverflow.com/a/1393367
# Getting the Iris dataset

import urllib.request

url = r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Use the response as a context manager so the HTTP connection is closed
# as soon as all lines have been read.
with urllib.request.urlopen(url) as response:
    # Each line arrives as bytes: decode it to text and strip the
    # surrounding whitespace, including the trailing newline.
    iris = [line.decode('utf-8').strip() for line in response]

iris

['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3.0,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5.0,3.6,1.4,0.2,Iris-setosa',
 '5.4,3.9,1.7,0.4,Iris-setosa',
 '4.6,3.4,1.4,0.3,Iris-setosa',
 '5.0,3.4,1.5,0.2,Iris-setosa',
 '4.4,2.9,1.4,0.2,Iris-setosa',
 '4.9,3.1,1.5,0.1,Iris-setosa',
 '5.4,3.7,1.5,0.2,Iris-setosa',
 '4.8,3.4,1.6,0.2,Iris-setosa',
 '4.8,3.0,1.4,0.1,Iris-setosa',
 '4.3,3.0,1.1,0.1,Iris-setosa',
 '5.8,4.0,1.2,0.2,Iris-setosa',
 '5.7,4.4,1.5,0.4,Iris-setosa',
 '5.4,3.9,1.3,0.4,Iris-setosa',
 '5.1,3.5,1.4,0.3,Iris-setosa',
 '5.7,3.8,1.7,0.3,Iris-setosa',
 '5.1,3.8,1.5,0.3,Iris-setosa',
 '5.4,3.4,1.7,0.2,Iris-setosa',
 '5.1,3.7,1.5,0.4,Iris-setosa',
 '4.6,3.6,1.0,0.2,Iris-setosa',
 '5.1,3.3,1.7,0.5,Iris-setosa',
 '4.8,3.4,1.9,0.2,Iris-setosa',
 '5.0,3.0,1.6,0.2,Iris-setosa',
 '5.0,3.4,1.6,0.4,Iris-setosa',
 '5.2,3.5,1.5,0.2,Iris-setosa',
 '5.2,3.4,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.6,0.2,Iris-setosa',
 '4.8,3.1,1.6,0.2,Iris-setosa',
 '5.4,3.

In [125]:
strip_iris = re.compile(r'Iris-([a-z]+)')

In [126]:
[strip_iris.sub(r'\1', line) for line in iris]

['5.1,3.5,1.4,0.2,setosa',
 '4.9,3.0,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa',
 '5.0,3.6,1.4,0.2,setosa',
 '5.4,3.9,1.7,0.4,setosa',
 '4.6,3.4,1.4,0.3,setosa',
 '5.0,3.4,1.5,0.2,setosa',
 '4.4,2.9,1.4,0.2,setosa',
 '4.9,3.1,1.5,0.1,setosa',
 '5.4,3.7,1.5,0.2,setosa',
 '4.8,3.4,1.6,0.2,setosa',
 '4.8,3.0,1.4,0.1,setosa',
 '4.3,3.0,1.1,0.1,setosa',
 '5.8,4.0,1.2,0.2,setosa',
 '5.7,4.4,1.5,0.4,setosa',
 '5.4,3.9,1.3,0.4,setosa',
 '5.1,3.5,1.4,0.3,setosa',
 '5.7,3.8,1.7,0.3,setosa',
 '5.1,3.8,1.5,0.3,setosa',
 '5.4,3.4,1.7,0.2,setosa',
 '5.1,3.7,1.5,0.4,setosa',
 '4.6,3.6,1.0,0.2,setosa',
 '5.1,3.3,1.7,0.5,setosa',
 '4.8,3.4,1.9,0.2,setosa',
 '5.0,3.0,1.6,0.2,setosa',
 '5.0,3.4,1.6,0.4,setosa',
 '5.2,3.5,1.5,0.2,setosa',
 '5.2,3.4,1.4,0.2,setosa',
 '4.7,3.2,1.6,0.2,setosa',
 '4.8,3.1,1.6,0.2,setosa',
 '5.4,3.4,1.5,0.4,setosa',
 '5.2,4.1,1.5,0.1,setosa',
 '5.5,4.2,1.4,0.2,setosa',
 '4.9,3.1,1.5,0.1,setosa',
 '5.0,3.2,1.2,0.2,setosa',
 '5.5,3.5,1.3,0.2,setosa',
 

In [127]:
# Sets up the layout for the data
strip_iris = re.compile(r'([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),Iris-([a-z]+)')

In [128]:
# Reverses the order of the columns and, using the pattern compiled above, removes 'Iris-' from the names of the flowers.
[strip_iris.sub(r'\5,\4,\3,\2,\1', line) for line in iris if line]

['setosa,0.2,1.4,3.5,5.1',
 'setosa,0.2,1.4,3.0,4.9',
 'setosa,0.2,1.3,3.2,4.7',
 'setosa,0.2,1.5,3.1,4.6',
 'setosa,0.2,1.4,3.6,5.0',
 'setosa,0.4,1.7,3.9,5.4',
 'setosa,0.3,1.4,3.4,4.6',
 'setosa,0.2,1.5,3.4,5.0',
 'setosa,0.2,1.4,2.9,4.4',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.5,3.7,5.4',
 'setosa,0.2,1.6,3.4,4.8',
 'setosa,0.1,1.4,3.0,4.8',
 'setosa,0.1,1.1,3.0,4.3',
 'setosa,0.2,1.2,4.0,5.8',
 'setosa,0.4,1.5,4.4,5.7',
 'setosa,0.4,1.3,3.9,5.4',
 'setosa,0.3,1.4,3.5,5.1',
 'setosa,0.3,1.7,3.8,5.7',
 'setosa,0.3,1.5,3.8,5.1',
 'setosa,0.2,1.7,3.4,5.4',
 'setosa,0.4,1.5,3.7,5.1',
 'setosa,0.2,1.0,3.6,4.6',
 'setosa,0.5,1.7,3.3,5.1',
 'setosa,0.2,1.9,3.4,4.8',
 'setosa,0.2,1.6,3.0,5.0',
 'setosa,0.4,1.6,3.4,5.0',
 'setosa,0.2,1.5,3.5,5.2',
 'setosa,0.2,1.4,3.4,5.2',
 'setosa,0.2,1.6,3.2,4.7',
 'setosa,0.2,1.6,3.1,4.8',
 'setosa,0.4,1.5,3.4,5.4',
 'setosa,0.1,1.5,4.1,5.2',
 'setosa,0.2,1.4,4.2,5.5',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.2,3.2,5.0',
 'setosa,0.2,1.3,3.5,5.5',
 

In [130]:
# Get the Iris dataset in a list without stripping off metadata.

import urllib.request

url = r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Use the response as a context manager so the HTTP connection is closed
# once the lines have been collected. The lines are kept as raw bytes.
with urllib.request.urlopen(url) as response:
    iris = list(response)
iris

[b'5.1,3.5,1.4,0.2,Iris-setosa\n',
 b'4.9,3.0,1.4,0.2,Iris-setosa\n',
 b'4.7,3.2,1.3,0.2,Iris-setosa\n',
 b'4.6,3.1,1.5,0.2,Iris-setosa\n',
 b'5.0,3.6,1.4,0.2,Iris-setosa\n',
 b'5.4,3.9,1.7,0.4,Iris-setosa\n',
 b'4.6,3.4,1.4,0.3,Iris-setosa\n',
 b'5.0,3.4,1.5,0.2,Iris-setosa\n',
 b'4.4,2.9,1.4,0.2,Iris-setosa\n',
 b'4.9,3.1,1.5,0.1,Iris-setosa\n',
 b'5.4,3.7,1.5,0.2,Iris-setosa\n',
 b'4.8,3.4,1.6,0.2,Iris-setosa\n',
 b'4.8,3.0,1.4,0.1,Iris-setosa\n',
 b'4.3,3.0,1.1,0.1,Iris-setosa\n',
 b'5.8,4.0,1.2,0.2,Iris-setosa\n',
 b'5.7,4.4,1.5,0.4,Iris-setosa\n',
 b'5.4,3.9,1.3,0.4,Iris-setosa\n',
 b'5.1,3.5,1.4,0.3,Iris-setosa\n',
 b'5.7,3.8,1.7,0.3,Iris-setosa\n',
 b'5.1,3.8,1.5,0.3,Iris-setosa\n',
 b'5.4,3.4,1.7,0.2,Iris-setosa\n',
 b'5.1,3.7,1.5,0.4,Iris-setosa\n',
 b'4.6,3.6,1.0,0.2,Iris-setosa\n',
 b'5.1,3.3,1.7,0.5,Iris-setosa\n',
 b'4.8,3.4,1.9,0.2,Iris-setosa\n',
 b'5.0,3.0,1.6,0.2,Iris-setosa\n',
 b'5.0,3.4,1.6,0.4,Iris-setosa\n',
 b'5.2,3.5,1.5,0.2,Iris-setosa\n',
 b'5.2,3.4,1.4,0.2,I

## Exercise 2
---

Adapt the above code to capitalise the first letter of the iris species, using regular expressions.

---

In [135]:
# Get the Iris dataset again to have a fresh set of data removing the meta-data.

import urllib.request

url = r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Use the response as a context manager so the HTTP connection is closed
# as soon as all lines have been read.
with urllib.request.urlopen(url) as response:
    # Each line arrives as bytes: decode it to text and strip the
    # surrounding whitespace, including the trailing newline.
    iris = [line.decode('utf-8').strip() for line in response]

In [136]:
# Sets up the pattern for the data
#strip_iris = re.compile(r'([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),Iris-([a-z]+)')
strip_iris = re.compile(r'([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),([0-9]\.[0-9]),Iris-([a-z]+)')


In [138]:

# Reverses the order of the columns, removes 'Iris-' from the names of the flowers
# and joins the fields with ':' instead of ','.
# Note: this step does not capitalise the species name yet; that is done in a later cell.
[strip_iris.sub(r'\5:\4:\3:\2:\1', line) for line in iris if line]

['setosa:0.2:1.4:3.5:5.1',
 'setosa:0.2:1.4:3.0:4.9',
 'setosa:0.2:1.3:3.2:4.7',
 'setosa:0.2:1.5:3.1:4.6',
 'setosa:0.2:1.4:3.6:5.0',
 'setosa:0.4:1.7:3.9:5.4',
 'setosa:0.3:1.4:3.4:4.6',
 'setosa:0.2:1.5:3.4:5.0',
 'setosa:0.2:1.4:2.9:4.4',
 'setosa:0.1:1.5:3.1:4.9',
 'setosa:0.2:1.5:3.7:5.4',
 'setosa:0.2:1.6:3.4:4.8',
 'setosa:0.1:1.4:3.0:4.8',
 'setosa:0.1:1.1:3.0:4.3',
 'setosa:0.2:1.2:4.0:5.8',
 'setosa:0.4:1.5:4.4:5.7',
 'setosa:0.4:1.3:3.9:5.4',
 'setosa:0.3:1.4:3.5:5.1',
 'setosa:0.3:1.7:3.8:5.7',
 'setosa:0.3:1.5:3.8:5.1',
 'setosa:0.2:1.7:3.4:5.4',
 'setosa:0.4:1.5:3.7:5.1',
 'setosa:0.2:1.0:3.6:4.6',
 'setosa:0.5:1.7:3.3:5.1',
 'setosa:0.2:1.9:3.4:4.8',
 'setosa:0.2:1.6:3.0:5.0',
 'setosa:0.4:1.6:3.4:5.0',
 'setosa:0.2:1.5:3.5:5.2',
 'setosa:0.2:1.4:3.4:5.2',
 'setosa:0.2:1.6:3.2:4.7',
 'setosa:0.2:1.6:3.1:4.8',
 'setosa:0.4:1.5:3.4:5.4',
 'setosa:0.1:1.5:4.1:5.2',
 'setosa:0.2:1.4:4.2:5.5',
 'setosa:0.1:1.5:3.1:4.9',
 'setosa:0.2:1.2:3.2:5.0',
 'setosa:0.2:1.3:3.5:5.5',
 

In [144]:
iris2 = [strip_iris.sub(r'\5,\4,\3,\2,\1', line) for line in iris if line]
iris2

['setosa,0.2,1.4,3.5,5.1',
 'setosa,0.2,1.4,3.0,4.9',
 'setosa,0.2,1.3,3.2,4.7',
 'setosa,0.2,1.5,3.1,4.6',
 'setosa,0.2,1.4,3.6,5.0',
 'setosa,0.4,1.7,3.9,5.4',
 'setosa,0.3,1.4,3.4,4.6',
 'setosa,0.2,1.5,3.4,5.0',
 'setosa,0.2,1.4,2.9,4.4',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.5,3.7,5.4',
 'setosa,0.2,1.6,3.4,4.8',
 'setosa,0.1,1.4,3.0,4.8',
 'setosa,0.1,1.1,3.0,4.3',
 'setosa,0.2,1.2,4.0,5.8',
 'setosa,0.4,1.5,4.4,5.7',
 'setosa,0.4,1.3,3.9,5.4',
 'setosa,0.3,1.4,3.5,5.1',
 'setosa,0.3,1.7,3.8,5.7',
 'setosa,0.3,1.5,3.8,5.1',
 'setosa,0.2,1.7,3.4,5.4',
 'setosa,0.4,1.5,3.7,5.1',
 'setosa,0.2,1.0,3.6,4.6',
 'setosa,0.5,1.7,3.3,5.1',
 'setosa,0.2,1.9,3.4,4.8',
 'setosa,0.2,1.6,3.0,5.0',
 'setosa,0.4,1.6,3.4,5.0',
 'setosa,0.2,1.5,3.5,5.2',
 'setosa,0.2,1.4,3.4,5.2',
 'setosa,0.2,1.6,3.2,4.7',
 'setosa,0.2,1.6,3.1,4.8',
 'setosa,0.4,1.5,3.4,5.4',
 'setosa,0.1,1.5,4.1,5.2',
 'setosa,0.2,1.4,4.2,5.5',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.2,3.2,5.0',
 'setosa,0.2,1.3,3.5,5.5',
 

In [153]:
def convert_into_uppercase(a):
    """Replacement callback for re.sub(): upper-case the second group.

    a is the match object handed over by re.sub(). The text captured by
    group 1 is returned unchanged, followed by the text of group 2
    converted to upper case.
    """
    leading, character = a.group(1, 2)
    return leading + character.upper()

# The pattern is a raw string so that \s and \S reach the regex engine
# untouched; in a plain string literal they are invalid escape sequences
# and trigger a warning.
# Group 1 is the start of the line or a whitespace character, group 2 the
# non-whitespace character that follows; convert_into_uppercase upper-cases
# group 2.
[re.sub(r"(^|\s)(\S)", convert_into_uppercase, line) for line in iris2 if line]


['Setosa,0.2,1.4,3.5,5.1',
 'Setosa,0.2,1.4,3.0,4.9',
 'Setosa,0.2,1.3,3.2,4.7',
 'Setosa,0.2,1.5,3.1,4.6',
 'Setosa,0.2,1.4,3.6,5.0',
 'Setosa,0.4,1.7,3.9,5.4',
 'Setosa,0.3,1.4,3.4,4.6',
 'Setosa,0.2,1.5,3.4,5.0',
 'Setosa,0.2,1.4,2.9,4.4',
 'Setosa,0.1,1.5,3.1,4.9',
 'Setosa,0.2,1.5,3.7,5.4',
 'Setosa,0.2,1.6,3.4,4.8',
 'Setosa,0.1,1.4,3.0,4.8',
 'Setosa,0.1,1.1,3.0,4.3',
 'Setosa,0.2,1.2,4.0,5.8',
 'Setosa,0.4,1.5,4.4,5.7',
 'Setosa,0.4,1.3,3.9,5.4',
 'Setosa,0.3,1.4,3.5,5.1',
 'Setosa,0.3,1.7,3.8,5.7',
 'Setosa,0.3,1.5,3.8,5.1',
 'Setosa,0.2,1.7,3.4,5.4',
 'Setosa,0.4,1.5,3.7,5.1',
 'Setosa,0.2,1.0,3.6,4.6',
 'Setosa,0.5,1.7,3.3,5.1',
 'Setosa,0.2,1.9,3.4,4.8',
 'Setosa,0.2,1.6,3.0,5.0',
 'Setosa,0.4,1.6,3.4,5.0',
 'Setosa,0.2,1.5,3.5,5.2',
 'Setosa,0.2,1.4,3.4,5.2',
 'Setosa,0.2,1.6,3.2,4.7',
 'Setosa,0.2,1.6,3.1,4.8',
 'Setosa,0.4,1.5,3.4,5.4',
 'Setosa,0.1,1.5,4.1,5.2',
 'Setosa,0.2,1.4,4.2,5.5',
 'Setosa,0.1,1.5,3.1,4.9',
 'Setosa,0.2,1.2,3.2,5.0',
 'Setosa,0.2,1.3,3.5,5.5',
 