# Pandas - Cleaning

### Contents

 - [Map](#map)
 - [Apply](#apply)
 - [Apply Map](#applymap)
 - [Duplicates](#duplicates)
 - [Replace](#replace)
 - [Transform](#transform)

In [1]:
from IPython.display import HTML

# Load the two notebook stylesheets. Each file is read inside a `with` block so
# its handle is closed right away instead of being left to the garbage collector.
css_parts = []
for css_path in ('styles/style-table.css', 'styles/style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Last expression of the cell: inject the combined CSS into the notebook page.
HTML('<style>{}</style>'.format(css))

### Map

In [2]:
import pandas as pd
import numpy as np

In [3]:
def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

# Inputs 0..9. A vectorised alternative would be:
#   np.multiply(list(values), list(values))
values = range(10)

# Square every input with an explicit loop, collecting the answers in order.
results = []
for value in values:
    results += [square(value)]

results

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [4]:
# Same computation as the explicit loop, written with the built-in map(func, iterable).
# The map object is wrapped in list() so the squared values are shown as the cell output.
results = map(square, values)
list(results)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [5]:
# 4x4 demo frame holding 10, 20, ..., 160, with rows y1..y4 and columns x1..x4.
row_labels = ['y1', 'y2', 'y3', 'y4']
col_labels = ['x1', 'x2', 'x3', 'x4']
cell_values = 10 * np.arange(1, 17).reshape(4, 4)
data = pd.DataFrame(cell_values, index=row_labels, columns=col_labels)
data

Unnamed: 0,x1,x2,x3,x4
y1,10,20,30,40
y2,50,60,70,80
y3,90,100,110,120
y4,130,140,150,160


In [6]:
# Element-wise map over column x1: values below 60 are kept, everything else becomes -1.
data['x1'].map(lambda value: value if value < 60 else -1)

y1    10
y2    50
y3    -1
y4    -1
Name: x1, dtype: int64

In [7]:
# Apply transformations to row y1: keep the value 30, replace every other entry with 10.
# The row is selected by label with .loc; the older .ix indexer used here before
# has been removed from pandas and raises AttributeError on current releases.
data.loc['y1'].map(lambda x: x if x == 30 else 10)

x1    10
x2    10
x3    30
x4    10
Name: y1, dtype: int64

### Apply

In [8]:
# Rebuild the 4x4 demo frame (10, 20, ..., 160) so this section starts from clean data.
row_labels = ['y1', 'y2', 'y3', 'y4']
col_labels = ['x1', 'x2', 'x3', 'x4']
cell_values = 10 * np.arange(1, 17).reshape(4, 4)
data = pd.DataFrame(cell_values, index=row_labels, columns=col_labels)
data

Unnamed: 0,x1,x2,x3,x4
y1,10,20,30,40
y2,50,60,70,80
y3,90,100,110,120
y4,130,140,150,160


In [9]:
def normalize(x):
    """Min-max scale ``x``: subtract its minimum and divide by its range (max - min).

    ``x`` must provide ``min()`` and ``max()`` and support element-wise
    arithmetic (the notebook passes DataFrame rows and columns).
    """
    lowest = x.min()
    # The range is converted to float before dividing, as in the original formula.
    span = float(x.max() - lowest)
    return (x - lowest) / span

In [10]:
# Normalize each row: axis=1 (the same as "columns") hands normalize one row at a time.
norm_rows = data.apply(normalize, axis=1)
norm_rows

Unnamed: 0,x1,x2,x3,x4
y1,0.0,0.333333,0.666667,1.0
y2,0.0,0.333333,0.666667,1.0
y3,0.0,0.333333,0.666667,1.0
y4,0.0,0.333333,0.666667,1.0


In [11]:
# Normalize each column: axis=0 (the default) hands normalize one column at a time.
norm_cols = data.apply(normalize, axis=0)
norm_cols

Unnamed: 0,x1,x2,x3,x4
y1,0.0,0.0,0.0,0.0
y2,0.333333,0.333333,0.333333,0.333333
y3,0.666667,0.666667,0.666667,0.666667
y4,1.0,1.0,1.0,1.0


In [12]:
def add_prop(group):
    """Return ``group`` with an extra ``prop`` column.

    ``prop`` is each row's share of the group's ``x1`` total, i.e.
    ``x1 / x1.sum()`` computed in floating point.

    The input frame is left untouched: a new frame is built with ``assign``
    instead of writing the column into ``group``, because mutating the object
    that ``groupby(...).apply`` passes in is not supported by pandas.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one group) that contains an ``x1`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with ``prop`` appended as the last column.
    """
    x1 = group['x1'].astype(float)
    return group.assign(prop=x1 / x1.sum())

In [13]:
# Run add_prop once per x1 group. Every x1 value in `data` is distinct, so each
# group holds a single row and its share of the group total is 1.0.
grp = data.groupby("x1").apply(add_prop)
grp.head()

Unnamed: 0,x1,x2,x3,x4,prop
y1,10,20,30,40,1.0
y2,50,60,70,80,1.0
y3,90,100,110,120,1.0
y4,130,140,150,160,1.0


### Apply Map

In [14]:
def format(x):
    """Render ``x`` as a string with exactly two decimal places.

    Note: defining this name shadows Python's built-in ``format`` for the
    rest of the notebook.
    """
    two_decimals = '%.2f'
    return two_decimals % x

In [15]:
# Rebuild the 4x4 demo frame (10, 20, ..., 160) for the element-wise examples.
row_labels = ['y1', 'y2', 'y3', 'y4']
col_labels = ['x1', 'x2', 'x3', 'x4']
cell_values = 10 * np.arange(1, 17).reshape(4, 4)
data = pd.DataFrame(cell_values, index=row_labels, columns=col_labels)
data

Unnamed: 0,x1,x2,x3,x4
y1,10,20,30,40
y2,50,60,70,80
y3,90,100,110,120
y4,130,140,150,160


In [16]:
# Apply the notebook's format() helper to every cell individually, turning each
# value into a two-decimal string.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm the target pandas version before switching.
data.applymap(format)

Unnamed: 0,x1,x2,x3,x4
y1,10.0,20.0,30.0,40.0
y2,50.0,60.0,70.0,80.0
y3,90.0,100.0,110.0,120.0
y4,130.0,140.0,150.0,160.0


In [17]:
# Same element-wise formatting as the previous cell, with the formatter written inline.
data.applymap(lambda cell: "%.2f" % cell)

Unnamed: 0,x1,x2,x3,x4
y1,10.0,20.0,30.0,40.0
y2,50.0,60.0,70.0,80.0
y3,90.0,100.0,110.0,120.0
y4,130.0,140.0,150.0,160.0


### Duplicates

In [18]:
# Five sample users described by parallel lists: position i in every list
# belongs to the same person.
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [23, 34, 83, 86, 12]
balance = [10.2, 84.3, 72.9, 27.1, 223.1]
department = ['A', 'A', 'B', 'B', 'B']

# One column per list.
users = pd.DataFrame(dict(id=ids, name=names, bal=balance, dept=department))

In [19]:
# Stack the users frame on top of itself so every row appears twice;
# ignore_index=True renumbers the rows 0..9.
dups = pd.concat([users] * 2, ignore_index=True)
dups

Unnamed: 0,bal,dept,id,name
0,10.2,A,23,John
1,84.3,A,34,Matt
2,72.9,B,83,Sara
3,27.1,B,86,Jim
4,223.1,B,12,Ashley
5,10.2,A,23,John
6,84.3,A,34,Matt
7,72.9,B,83,Sara
8,27.1,B,86,Jim
9,223.1,B,12,Ashley


In [20]:
# Flag every row that exactly repeats an earlier row; the first occurrence of
# each row stays False (see the output: rows 0-4 False, rows 5-9 True).
dups.duplicated()
# Variant to try: sort by department first, then flag the repeats.
#dups.sort_values('dept').duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [21]:
# Keep only the first occurrence of each row. The result is rebound to `dups`
# instead of using inplace=True, so the cell does not mutate a frame in place
# and gives the same result when re-run.
dups = dups.drop_duplicates()
dups

Unnamed: 0,bal,dept,id,name
0,10.2,A,23,John
1,84.3,A,34,Matt
2,72.9,B,83,Sara
3,27.1,B,86,Jim
4,223.1,B,12,Ashley


### Replace

In [22]:
# Rebuild the five sample users from parallel lists so this section starts
# from unmodified data.
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [23, 34, 83, 86, 12]
balance = [10.2, 84.3, 72.9, 27.1, 223.1]
department = ['A', 'A', 'B', 'B', 'B']

# One column per list.
users = pd.DataFrame(dict(id=ids, name=names, bal=balance, dept=department))

In [23]:
# Lookup table from department code to its full name.
department_name = dict(A='Finance', B='Marketing')

In [24]:
# Swap each department code for its full name; codes missing from
# department_name come back as None.
users['dept'] = users['dept'].map(lambda code: department_name.get(code))
users

Unnamed: 0,bal,dept,id,name
0,10.2,Finance,23,John
1,84.3,Finance,34,Matt
2,72.9,Marketing,83,Sara
3,27.1,Marketing,86,Jim
4,223.1,Marketing,12,Ashley


In [25]:
# Replace the full department names with short labels.
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on the users['dept'] selection is a chained
# in-place update of an intermediate Series: pandas warns about it, and with
# Copy-on-Write enabled it leaves `users` unchanged.
users['dept'] = users['dept'].replace(['Finance', 'Marketing'], ['a', 'b'])
users

Unnamed: 0,bal,dept,id,name
0,10.2,a,23,John
1,84.3,a,34,Matt
2,72.9,b,83,Sara
3,27.1,b,86,Jim
4,223.1,b,12,Ashley


<a id="transform" />

### Transform