#Examples from the Text

In [32]:
import pandas as pd

sample_string = 'take 5'
print(sample_string[4:5].isspace())

# Experiment on the sample string below using the other methods listed above.
# What does each do? When would they be useful? What hapens when you try them
# on different slices of the sample string, like sample_string[:4]?


True


In [36]:
money = pd.Series([400, 111, '$20', 57, 'q7', 'Lots'])

# Running `money.isdigit()` throws an error because .isdigit() is a string
# attribute, _not_ a series attribute. Uncomment the line below to see.

# print(money.isdigit())

# Instead, let's define a new function that takes a string as an argument
# and returns True if the string is all digits, otherwise False.

def is_a_string(x):
    # First make sure we're operating on a string, then use our string method.
    return str(x).isalnum()

# Now let's apply our custom function to each element in our series.
print(money.apply(is_a_string))

0     True
1     True
2    False
3     True
4     True
5     True
dtype: bool


In [38]:
# Dirty list
money = pd.Series([400, 111, '$20', 57, 'Lots'])

# Here's a lambda function that mirrors the is_a_digit function above.
# Read this print statement carefully and compare to the previous one.
print(money.apply(lambda x: str(x).isalnum()))

0     True
1     True
2    False
3     True
4     True
dtype: bool


In [40]:
# We're using list() on the result because filter() returns an iterator.

print('Filtering the whole series:')
print(list(filter(lambda x: str(x).isalnum(), money)))

print('\nApplying filter() to each value in the series:')
print(money.apply(lambda x: ''.join(list(filter(str.isalnum, str(x))))))

Filtering the whole series:
[400, 111, 57, 'Lots']

Applying filter() to each value in the series:
0     400
1     111
2      20
3      57
4    Lots
dtype: object


In [44]:
# Create a series of dirty, annoying strings.
words = pd.Series([
    'MollyMalone$molmal@gmail.com',
    'JeffreyJones$jefjo@hotmail.com',
    'DeadParrot$fjords@gmail.com'
])

# Split on '$'. We'll use the Pandas split method.
word_split = words.str.split('@', expand=True)
names = word_split[0]
emails = word_split[1]
print(word_split)

                    0            1
0  MollyMalone$molmal    gmail.com
1  JeffreyJones$jefjo  hotmail.com
2   DeadParrot$fjords    gmail.com


In [9]:
# Splitting on capital letters.
# Just because we can doesn't mean we should:
print(names.str.split('[A-Z]', expand=True))

  0       1      2
0      olly  alone
1    effrey   ones
2       ead  arrot


In [64]:
import re

words = pd.Series([
    'MollyMalone$molmal@gmail.com',
    'JeffreyJones$jefjo@hotmail.com',
    'DeadParrot$fjords@gmail.com'])

# We expect the first name to follow the first capital letter.
firstname = words.apply(lambda x: re.findall('[A-Z][a-z]*[A-Z][a-z]{2}', x)[0])

# We expect the last name to follow the second capital letter.
lastname = words.apply(lambda x: re.findall('[A-Z][a-z]*', x)[1])

print(firstname, '\n')
print(lastname)

0      MollyMal
1    JeffreyJon
2       DeadPar
dtype: object 

0    Malone
1     Jones
2    Parrot
dtype: object


In [67]:
print(emails)

0      gmail.com
1    hotmail.com
2      gmail.com
Name: 1, dtype: object


In [69]:
print(emails.str.replace('.', '$'),'\n')

print(emails.str.replace('.com', ''))

0      gmail$com
1    hotmail$com
2      gmail$com
Name: 1, dtype: object 

0      gmail
1    hotmail
2      gmail
Name: 1, dtype: object


In [12]:
print(names.str.lower(), '\n')
print(names.str.upper(), '\n')
print(names.str.capitalize())

0     mollymalone
1    jeffreyjones
2      deadparrot
Name: 0, dtype: object 

0     MOLLYMALONE
1    JEFFREYJONES
2      DEADPARROT
Name: 0, dtype: object 

0     Mollymalone
1    Jeffreyjones
2      Deadparrot
Name: 0, dtype: object


In [13]:
spacy = '   What, on earth, is going on here?      '
print(spacy)
print(spacy.strip())

   What, on earth, is going on here?      
What, on earth, is going on here?


In [14]:
# Series of strings with annoying whitespace.
words = pd.Series([' duck', 'duck ', ' duck ', 'goose'])
print(words[0] == words[1])

stripped = words.str.strip()
print(stripped[0] == stripped[1])

False
True


#Drills - Cleaning Data - Lesson 3.5