# CH 7 - Data Cleaning and Preparation

## 7.1 Handling Missing Data

In [2]:
import pandas as pd
import numpy as np

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# The built-in Python None value is also treated as NA in object arrays:
string_data[0] = None

string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [6]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

* With DataFrame objects, things are a bit more complex. You may want to drop rows
or columns that are all NA or only those containing any NAs. dropna by default drops
any row containing a missing value:

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

cleaned = data.dropna()

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [12]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [13]:
data[4] = np.nan

data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
#Suppose  you want to keep only rows containing a certain number 
#of observations. You can indicate this with the thresh argument:

df = pd.DataFrame(np.random.randn(7,3))

df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

df

Unnamed: 0,0,1,2
0,1.672793,,
1,1.004658,,
2,-1.114168,,-0.952043
3,-0.389834,,1.385255
4,-1.518672,1.363134,0.737251
5,-0.689001,-0.48932,0.377504
6,-0.898161,2.872587,0.276567


In [15]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.114168,,-0.952043
3,-0.389834,,1.385255
4,-1.518672,1.363134,0.737251
5,-0.689001,-0.48932,0.377504
6,-0.898161,2.872587,0.276567


### Filling In Missing Data

In [17]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.672793,0.0,0.0
1,1.004658,0.0,0.0
2,-1.114168,0.0,-0.952043
3,-0.389834,0.0,1.385255
4,-1.518672,1.363134,0.737251
5,-0.689001,-0.48932,0.377504
6,-0.898161,2.872587,0.276567


In [18]:
# Calling fillna with a dict, you can use a different 
# fill value for each column:

df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,1.672793,0.5,0.0
1,1.004658,0.5,0.0
2,-1.114168,0.5,-0.952043
3,-0.389834,0.5,1.385255
4,-1.518672,1.363134,0.737251
5,-0.689001,-0.48932,0.377504
6,-0.898161,2.872587,0.276567


In [20]:
# The same interpolation methods available for reindexing 
# can be used with fillna :
df = pd.DataFrame(np.random.randn(6,3))

df.iloc[4:, 2] = np.nan
df.iloc[2:, 1] = np.nan

df

Unnamed: 0,0,1,2
0,-0.637441,-0.658749,-0.322327
1,0.675118,0.784179,0.808365
2,-0.71448,,0.481113
3,1.482924,,-0.032358
4,1.469247,,
5,-0.853348,,


In [21]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.637441,-0.658749,-0.322327
1,0.675118,0.784179,0.808365
2,-0.71448,0.784179,0.481113
3,1.482924,0.784179,-0.032358
4,1.469247,0.784179,-0.032358
5,-0.853348,0.784179,-0.032358


In [22]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.637441,-0.658749,-0.322327
1,0.675118,0.784179,0.808365
2,-0.71448,0.784179,0.481113
3,1.482924,0.784179,-0.032358
4,1.469247,,-0.032358
5,-0.853348,,-0.032358


In [23]:
# Fill the mean
df.fillna(data.mean())

Unnamed: 0,0,1,2
0,-0.637441,-0.658749,-0.322327
1,0.675118,0.784179,0.808365
2,-0.71448,6.5,0.481113
3,1.482924,6.5,-0.032358
4,1.469247,6.5,3.0
5,-0.853348,6.5,3.0


## 7.2 Data Transformation

### Removing Duplicates

In [32]:
data = pd.DataFrame({'k1': ['one','two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [28]:
# The DataFrame method duplicated returns a boolean Series indicating whether each
# row is a duplicate (has been observed in a previous row) or not:

data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [29]:
# Relatedly, drop_duplicates returns a DataFrame where the duplicated array is False :

data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [34]:
data['v1'] = range(7)

data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [35]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [37]:
# duplicated and drop_duplicates by default keep the first observed 
# value combination. Passing keep='last' will return the last one:

data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [39]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 
                              'Pastrami','corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [40]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [42]:
# The map method on a Series accepts a function or dict-like 
# object containing a mapping

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

lowercased = data['food'].str.lower()

lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [43]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [44]:
# We could also have passed a function that does all the work:

data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [57]:
data['ounces'].map(lambda x: int(x))

0     4
1     3
2    12
3     6
4     7
5     8
6     3
7     5
8     6
Name: ounces, dtype: int64

### Replacing Values

In [58]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [59]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [60]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [61]:
# If you want to replace multiple values at once, you instead pass 
# a list and then the substitute value:

data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [62]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [63]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

* Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects.

In [64]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [65]:
transform = lambda x: x[:4].upper()

In [66]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [67]:
# assign to index
data.index = data.index.map(transform)

In [68]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [69]:
# If you want to create a transformed version of a dataset 
# without modifying the original, a useful method is rename :
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [71]:
# rename can be used in conjunction with a dict-like object 
# providing new values for a subset of the axis labels:

data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

In [72]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [73]:
bins = [18, 25, 35, 60, 100]

In [75]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

* The object pandas returns is a special Categorical object. The output you see
describes the bins computed by pandas.cut . You can treat it like an array of strings
indicating the bin name; internally it contains a categories array specifying the distinct category names along with a labeling for the ages data in the codes attribute:

In [76]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [77]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [78]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [79]:
# You can change which side is closed by passing right=False :

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [80]:
# You can also pass your own bin names by passing a list or array to the labels option:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [83]:
# If you pass an integer number of bins to cut instead of explicit bin edges, 
# it will compute equal-length bins based on the minimum and maximum values in the data.
data = np.random.rand(20)
data

array([0.66777869, 0.62424082, 0.48840712, 0.53323403, 0.51617712,
       0.59249142, 0.35825734, 0.10351604, 0.63489895, 0.03093074,
       0.98725407, 0.12613631, 0.43969467, 0.94737019, 0.05399104,
       0.54131029, 0.75888891, 0.04042185, 0.43349627, 0.72185428])

In [85]:
dt_cut = pd.cut(data, 4, precision=2)
dt_cut

[(0.51, 0.75], (0.51, 0.75], (0.27, 0.51], (0.51, 0.75], (0.51, 0.75], ..., (0.51, 0.75], (0.75, 0.99], (0.03, 0.27], (0.27, 0.51], (0.51, 0.75]]
Length: 20
Categories (4, interval[float64]): [(0.03, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]

In [87]:
dt_cut.value_counts()

(0.03, 0.27]    5
(0.27, 0.51]    4
(0.51, 0.75]    8
(0.75, 0.99]    3
dtype: int64

* A closely related function, qcut , bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:

In [92]:
data = np.random.randn(1000)
data[:5]

array([ 0.03247455, -1.59741335, -0.95111905,  0.00331238,  0.75751901])

In [93]:
cats = pd.qcut(data, 4) # cut into quartiles
cats

[(-0.651, 0.0451], (-3.13, -0.651], (-3.13, -0.651], (-0.651, 0.0451], (0.676, 3.063], ..., (-3.13, -0.651], (0.676, 3.063], (-0.651, 0.0451], (0.676, 3.063], (0.0451, 0.676]]
Length: 1000
Categories (4, interval[float64]): [(-3.13, -0.651] < (-0.651, 0.0451] < (0.0451, 0.676] < (0.676, 3.063]]

In [94]:
pd.value_counts(cats)

(0.676, 3.063]      250
(0.0451, 0.676]     250
(-0.651, 0.0451]    250
(-3.13, -0.651]     250
dtype: int64

In [95]:
# You can pass your own quantiles (numbers between 0 and 1, inclusive):
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.28, 0.0451], (-3.13, -1.28], (-1.28, 0.0451], (-1.28, 0.0451], (0.0451, 1.225], ..., (-3.13, -1.28], (0.0451, 1.225], (-1.28, 0.0451], (1.225, 3.063], (0.0451, 1.225]]
Length: 1000
Categories (4, interval[float64]): [(-3.13, -1.28] < (-1.28, 0.0451] < (0.0451, 1.225] < (1.225, 3.063]]

### Detecting and Filtering Outliers

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.DataFrame(np.random.randn(1000,4))

In [4]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.016048,-0.001044,0.025319,-0.028931
std,0.990459,0.987315,1.011005,0.997131
min,-3.509042,-2.943107,-2.86369,-2.749482
25%,-0.659826,-0.677699,-0.608577,-0.736194
50%,-0.027524,-0.013884,0.025646,-0.01256
75%,0.666134,0.705386,0.715049,0.6623
max,3.153429,3.116783,2.820931,2.883412


In [13]:
# Suppose you wanted to find values in one of the columns 
# exceeding 3 in absolute value:

col = data[0]
col[np.abs(col) > 3]

In [26]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
213,1.253911,3.116783,-0.616597,1.697944
214,-3.150729,-0.365523,1.059369,-0.491388
304,-3.141238,-1.485498,-0.819967,-0.141626
390,-3.509042,0.447582,1.504048,0.286908
802,-0.692735,3.008239,0.448644,0.399787
925,3.153429,-0.071249,-1.276056,0.003962


In [27]:
# The statement np.sign(data) produces 1 and –1 values 
# based on whether the values in data are positive or negative:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,1.0,1.0,1.0
2,-1.0,1.0,-1.0,-1.0
3,1.0,-1.0,-1.0,-1.0
4,-1.0,1.0,1.0,-1.0


### Permutation and Random Sampling

In [29]:
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [30]:
sampler = np.random.permutation(5)
sampler

array([0, 1, 3, 2, 4])

In [31]:
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


In [32]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15


In [33]:
# To generate a sample with replacement (to allow repeat choices), 
# pass replace=True to sample :
choices = pd.Series([5, 7, -1, 6, 4])

draws = choices.sample(n=10, replace=True)

draws

4    4
4    4
2   -1
3    6
3    6
1    7
4    4
4    4
2   -1
0    5
dtype: int64

### Computing Indicator/Dummy Variables

In [2]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [4]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [5]:
# In some cases, you may want to add a prefix to the columns
# in the indicator DataFrame

dummies = pd.get_dummies(df['key'], prefix='key')

df_with_dummy = df[['data1']].join(dummies)

df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


* If a row in a DataFrame belongs to multiple categories, things are a bit more compli‐
cated. Let’s look at the MovieLens 1M dataset, which is investigated in more detail in
Chapter 14:

In [11]:
mnames = ['movie_id', 'title', 'genres']

movies = pd.read_table('examples/datasets/movielens/movies.dat',
                       sep='::', header=None, names=mnames, engine='python')

movies.head(10)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [12]:
all_genres = []

for x in movies.genres:
    all_genres.extend(x.split('|'))

genres = pd.unique(all_genres)

genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [13]:
# One way to construct the indicator DataFrame is to start with
# a DataFrame of all zeros
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [18]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [19]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

In [20]:
# Then, we can use .iloc to set values based on these indices:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [22]:
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Then, as before, you can combine this with movies :
movies_windic = movies.join(dummies.add_prefix('Genre_'))

movies_windic.head()

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

* For much larger data, this method of constructing indicator vari‐
ables with multiple membership is not especially speedy. It would
be better to write a lower-level function that writes directly to a
NumPy array, and then wrap the result in a DataFrame.

In [27]:
# A useful recipe for statistical applications is to combine get_dummies
# with a discretization function like cut:

np.random.seed(12345)
values = np.random.rand(10)

values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [28]:
bins = [ 0, 0.2, .4, .6, .8, 1]

pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


## 7.3 String Manipulation

In [5]:
#  a comma-separated string can be broken into pieces with split:
val = 'a, b,    guido'

val.split(',')

['a', ' b', '    guido']

In [6]:
# combined with strip to trim whitespace
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [7]:
# these could be concatenated together
first, second, third = pieces

first + '::' + second + '::' + third

'a::b::guido'

In [8]:
# more pythonic
'::'.join(pieces)

'a::b::guido'

In [12]:
#  Using Python’s in keyword is
# the best way to detect a substring, 
# though index and find can also be used:

print('guido' in val, '\n')
print(val.index(','), '\n')
print(val.find(':'), '\n')

# Note the difference between find and index 
# is that index raises an exception if the
# string isn’t found (versus returning –1)

True 

1 

-1 



In [13]:
# Relatedly, count returns the number of occurrences of a particular substring:
val.count(',')

2

In [14]:
# replace substitutes one pattern for another
val.replace(',', '::')

'a:: b::    guido'

### Regular Expressions

In [15]:
# The re module functions fall into three categories:
# pattern matching, substitution, and splitting

import re

In [16]:
text = "foo bar\t baz \tqux"

re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [17]:
# You can compile the regex yourself
# with re.compile, forming a reusable regex object:
regex = re.compile('\s+')

regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [18]:
# If, instead, you wanted to get a list of all patterns 
# matching the regex, you can use the findall method:
regex.findall(text)

[' ', '\t ', ' \t']

In [19]:
# While findall returns all matches in a string, 
# search returns only the first match. More rigidly, 
# match only matches at the beginning of the string.

text = \
"""Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

pattern =  r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [20]:
# search returns a special match object for
# the first email address in the text. 

m = regex.search(text)

m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [21]:
text[m.start():m.end()]

'dave@google.com'

In [22]:
# regex.match returns None, as it only will match 
# if the pattern occurs at the start of the string

print(regex.match(text))

None


In [23]:
# Relatedly, sub will return a new string with 
# occurrences of the pattern replaced by the a new string

print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



* Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: username, domain name, and domain suffix. To
do this, put parentheses around the parts of the pattern to segment:

In [24]:
pattern =  r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')

m.groups()

('wesm', 'bright', 'net')

In [25]:
# findall returns a list of tuples when 
# the pattern has groups

regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [27]:
# sub also has access to groups in each match 
# using special symbols like \1 and \2. 

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



### Vectorized String Functions in pandas

In [16]:
import numpy as np
import pandas as pd
import re

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}

data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

* You can apply string and regular expression methods can be applied (passing a
lambda or other function) to each value using data.map, but it will fail on the NA
(null) values. To cope with this, Series has array-oriented methods for string opera‐
tions that skip NA values. These are accessed through Series’s str attribute; for exam‐
ple, we could check whether each email address has 'gmail' in it with str.contains:

In [17]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [21]:
# Regular expressions can be used, too, along with 
# any re options like IGNORECASE:
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [22]:
# There are a couple of ways to do vectorized element retrieval. 
# Either use str.get or index into the str attribute:

matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [28]:
data.str.get(1)

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [29]:
data.str[0]

Dave       d
Steve      s
Rob        r
Wes      NaN
dtype: object

In [30]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object