_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 7 - Data Cleaning and Preparation
### Part 3 - String Manipulation

In [1]:
import numpy as np
import pandas as pd

String Object Methods

In [2]:
# built-in str methods, no pandas involved
val = 'a,b,  guido'

# split on the comma delimiter
raw_pieces = val.split(',')
print(raw_pieces)

# split, then trim the whitespace around every piece
pieces = list(map(str.strip, raw_pieces))
print(pieces)

# glue the pieces back together with a different delimiter
joined = '::'.join(pieces)
print('Join on double colon:', joined)

# membership test with the `in` operator
print('Is guido in val?', 'guido' in val)

# index() gives the position of the first hit and raises ValueError on a miss
print('First comma position is @', val.index(','))
try:
    colon_at = val.index(':')
except ValueError as err:
    print('There is no colon in val:', err)
else:
    print(colon_at)

# find() signals a miss with -1 instead of raising
if val.find(':') < 0:
    print('There is no colon in val')

# count occurrences of a substring
comma_count = val.count(',')
print('There are', comma_count, 'commas in val')

# replace() substitutes every occurrence; an empty replacement deletes them
print('double colon for each comma:', val.replace(',', '::'))
print('remove all commas:', val.replace(',', ''))

['a', 'b', '  guido']
['a', 'b', 'guido']
Join on double colon: a::b::guido
Is guido in val? True
First comma position is @ 1
There is no colon in val: substring not found
There is no colon in val
There are 2 commas in val
double colon for each comma: a::b::  guido
remove all commas: ab  guido


Regular Expressions

In [3]:
import re

In [4]:
# split words that are separated by runs of mixed whitespace
# (spaces, tab, newline, carriage return)
text = "foo   bar\t baz \nqux\rtail"
# the pattern is a raw string: in a normal literal '\s' is an invalid string
# escape, which Python reports with a warning and plans to reject
print(re.split(r'\s+', text))

# compiling once gives a reusable pattern object with the same methods
regex = re.compile(r'\s+')
print(regex.split(text))
# findall() returns the delimiters themselves, i.e. every whitespace run
print('All patterns matching the regex:', regex.findall(text))

['foo', 'bar', 'baz', 'qux', 'tail']
['foo', 'bar', 'baz', 'qux', 'tail']
All patterns matching the regex: ['   ', '\t ', ' \n', '\r']


In [5]:
# how findall(), search(), match() and sub() differ on the same text

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# raw string holding a deliberately simple email-address pattern
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# IGNORECASE lets the upper-case character classes match lower-case text
regex = re.compile(pattern, flags=re.IGNORECASE)

print('findall()', regex.findall(text))

# search() stops at the first hit anywhere in the text
match = regex.search(text)
first_start, first_end = match.span()
print('search():', match, 'that means:', text[first_start:first_end])

# match() only looks at the very beginning of the text, which is a name here
print('no match(), given implicit ^:', regex.match(text))

print('sub(), from pattern to OMISSIS:')
redacted = regex.sub('OMISSIS', text)
print(redacted)
print('---')

# same pattern, with parentheses capturing user name, domain and suffix
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

match = regex.match('wesm@bright.net')
print('match() groups:', match.group(0), '->', match.groups())
# with capture groups findall() yields one tuple per address
print('findall() groups as tuples:', regex.findall(text))

print('\nsub() access to group components:')
# \1, \2 and \3 refer back to the three captured groups
labelled = regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)
print(labelled)

findall() ['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
search(): <_sre.SRE_Match object; span=(5, 20), match='dave@google.com'> that means: dave@google.com
no match(), given implicit ^: None
sub(), from pattern to OMISSIS:
Dave OMISSIS
Steve OMISSIS
Rob OMISSIS
Ryan OMISSIS

---
match() groups: wesm@bright.net -> ('wesm', 'bright', 'net')
findall() groups as tuples: [('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com'), ('ryan', 'yahoo', 'com')]

sub() access to group components:
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



Vectorized String Functions in pandas

In [46]:
# Series.str applies string methods element-wise and lets missing values
# pass through as NaN instead of raising

data = pd.Series(
    {'Dave': 'dave@google.com',
     'Steve': 'steve@gmail.com',
     'Rob': 'rob@gmail.com',
     'Wes': np.nan}
)

print(data)
print('-- isnull()')
print(data.isnull())

print('-- str contains()')
print(data.str.contains('gmail'))

print('-- emulate in plain python')


def contains_gmail(value):
    """Plain-Python stand-in for ``Series.str.contains('gmail')`` on one element.

    Parameters
    ----------
    value : object
        A single element of the Series.

    Returns
    -------
    bool or object
        ``True``/``False`` when ``value`` is a string (including ``str``
        subclasses), depending on whether it contains ``'gmail'``; any other
        object, such as a missing value, is returned unchanged.
    """
    if isinstance(value, str):
        return 'gmail' in value
    # non-strings (the NaN placeholder here) are handed back untouched
    return value


# the original short name stays bound to the helper
y = contains_gmail
print(list(map(y, data)))

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
-- isnull()
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
-- str contains()
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object
-- emulate in plain python
[False, True, True, nan]


In [63]:
# regular-expression methods reached through the .str accessor
print('Using pattern', pattern)
found = data.str.findall(pattern, flags=re.IGNORECASE)
matched = data.str.match(pattern, flags=re.IGNORECASE)
print(found, matched, sep='\n')

# element retrieval: .str.get(i) and .str[i] both pick one character per value
first_chars = data.str.get(0)
last_chars = data.str[-1]
print('--', first_chars, last_chars, '--', sep='\n')

# slicing works through .str as well
leading_five = data.str[:5]
print(leading_five)

Using pattern ([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object
--
Dave       d
Rob        r
Steve      s
Wes      NaN
dtype: object
Dave       m
Rob        m
Steve      m
Wes      NaN
dtype: object
--
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object
