#Data Wrangling: Clean, Transform, Merge, Reshape

In [None]:
import numpy as np
from numpy import nan
import pandas as pd
from pandas import Series, DataFrame

##Combining and Merging Data Sets

###Database-style Data Frame Merges

In [None]:
# Left-hand frame for the merge examples: a repeating 'key' column plus a payload column.
d1 = pd.DataFrame({'key': ['b', 'a', 'c', 'a', 'b'], 'data1': range(5)})
d1

In [None]:
# Right-hand frame for the merge examples: one row per key, including 'd' which d1 lacks.
d2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
d2

In [None]:
pd.merge(d1, d2)

In [None]:
pd.merge(d1, d2, on='key')

In [None]:
# Two frames whose join columns carry different names on each side.
d3 = pd.DataFrame({'key1': ['b', 'a', 'c', 'a', 'b'], 'data1': range(5)})
d4 = pd.DataFrame({'key2': ['a', 'b', 'd'], 'data2': range(3)})
# Name the join column separately for the left and right frame.
d3.merge(d4, left_on='key1', right_on='key2')

In [None]:
pd.merge(d3, d4, left_on='key1', right_on='key2', how='outer')

In [None]:
pd.merge(d3, d4, left_on='key1', right_on='key2', how='left')

In [None]:
pd.merge(d3, d4, left_on='key1', right_on='key2', how='right')

In [None]:
pd.merge(d3, d4, left_on='key1', right_on='key2', how='inner')

In [None]:
# Frames that must be matched on a pair of columns rather than a single key.
d5 = pd.DataFrame({
    'key1': ['f', 'f', 'b'],
    'key2': ['o', 't', 'o'],
    'data1': range(3),
})
d6 = pd.DataFrame({
    'key1': ['f', 'f', 'b', 'b'],
    'key2': ['o', 'o', 't', 'o'],
    'data2': range(10, 14),
})
# Outer join on the (key1, key2) combination.
d5.merge(d6, how='outer', on=['key1', 'key2'])

###Merging on Index

In [None]:
# Frame whose 'key' column will be matched against another frame's index.
d7 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
d7

In [None]:
# Lookup frame keyed by its row labels ('a', 'b') rather than by a column.
d8 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
d8

In [None]:
# Join d7's 'key' column against d8's row labels instead of one of its columns.
d7.merge(d8, left_on='key', right_index=True)

###Concatenating Along an Axis

In [None]:
# 3x4 integer array holding 0..11 in row-major order.
a1 = np.arange(12).reshape(3, 4)
a1

In [None]:
# Glue two copies of a1 side by side (along the column axis).
np.concatenate((a1, a1), axis=1)

In [None]:
# Three series with disjoint labels, used by the concat examples.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
# Default concat stacks the pieces end to end along the index.
pd.concat(objs=[s1, s2, s3])

In [None]:
pd.concat([s1, s2, s3], axis=1)

In [None]:
s4 = pd.concat([s1 * 5, s3])
pd.concat([s1, s4], axis=1)

In [None]:
pd.concat([s1, s4], axis=1, join='inner')

In [None]:
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated result selects the same row labels ('a', 'c', 'b', 'e').
pd.concat([s1, s4], axis=1).reindex(list('acbe'))

In [None]:
d1 = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
d1

In [None]:
d1.unstack()

In [None]:
pd.concat([d3, d4], axis=1, keys=['level1', 'level2'])

###Combining Data with Overlap

In [None]:
# Series with gaps (NaN) at labels 'f', 'd' and 'a'.
s1 = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=['f', 'e', 'd', 'c', 'b', 'a'],
)
s1

In [None]:
# A float series on the same labels as s1, counting up from 0.
s2 = Series(np.arange(len(s1)), dtype=np.float64, index=list('fedcba'))
# Blank out the last entry by position. `.iloc` is explicit about positional
# access; `s2[-1] = ...` on a string index relies on the deprecated
# integer-position fallback.
s2.iloc[-1] = nan
s2

In [None]:
# Keep s1's values and fall back to s2 wherever s1 is missing.
s1.combine_first(s2)

In [None]:
# d1 has a hole on each row; d2 (the 2x2 identity) supplies the fallback values.
d1 = pd.DataFrame([[np.nan, 4.0], [5.0, np.nan]])
d2 = pd.DataFrame(np.eye(2))
# Same patching operation as for series, applied cell by cell.
d1.combine_first(other=d2)

##Reshaping and Pivoting

In [None]:
# Small frame with named row ('state') and column ('number') axes.
d1 = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
d1

In [None]:
result = d1.stack()
result

In [None]:
result.unstack()

In [None]:
result.unstack(0)

In [None]:
# Two series whose labels only partly overlap ('c' and 'd' are shared).
s1 = pd.Series(range(4), index=['a', 'b', 'c', 'd'])
s2 = pd.Series(range(4, 7), index=['c', 'd', 'e'])
d1 = pd.concat(objs=[s1, s2], keys=['one', 'two'])
# Unstacking introduces missing values where a label is absent from one piece.
d1.unstack(level=-1)

In [None]:
# Round trip: by default stack drops the missing values that unstack introduced.
d1.unstack().stack()

In [None]:
# dropna=False keeps those missing entries in the stacked result.
# NOTE(review): recent pandas deprecates `dropna` in favour of the new stack
# implementation (`future_stack=True`) — confirm against the target pandas version.
d1.unstack().stack(dropna=False)

###Pivoting "long" to "wide" Format

In [None]:
# Load the quarterly macro data and reshape three of its series into "long" format.
d1 = pd.read_csv('data/05/macrodata.csv')
# One quarterly Period per row. Passing year=/quarter= fields straight to the
# PeriodIndex constructor is deprecated in recent pandas, so build the periods
# individually; pd.Period accepts the same fields.
periods = pd.PeriodIndex(
    [pd.Period(year=int(y), quarter=int(q), freq='Q') for y, q in zip(d1.year, d1.quarter)],
    name='date',
)
# Keep only the three series of interest, indexed by the end-of-quarter timestamp.
d1 = DataFrame(d1.to_records(), columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'), index=periods.to_timestamp('D', 'end'))
# stack() yields one row per (date, item) pair; name the unnamed value column.
ld1 = d1.stack().reset_index().rename(columns={0: 'value'})
ld1[:10]

In [None]:
# Long -> wide: one row per date, one column per item, filled from 'value'.
# DataFrame.pivot takes keyword-only arguments in pandas 2.0+, so name them.
pivoted = ld1.pivot(index='date', columns='item', values='value')
pivoted[:10]

In [None]:
# Add a second, random value column so there are two columns to pivot at once.
n_rows = len(ld1)
ld1['value2'] = np.random.randn(n_rows)
ld1.iloc[:10]

In [None]:
# Omitting `values` pivots every remaining column, giving hierarchical columns.
# DataFrame.pivot takes keyword-only arguments in pandas 2.0+, so name them.
pivoted = ld1.pivot(index='date', columns='item')
pivoted[:10]

In [None]:
# The same reshape spelled out by hand: index by (date, item), then unstack 'item'.
by_date_and_item = ld1.set_index(['date', 'item'])
unstacked = by_date_and_item.unstack(level='item')
unstacked.iloc[:10]

##Data Transformation

###Removing Duplicates

In [None]:
# Frame containing repeated rows: (one, 1), (two, 3) and (two, 4) each appear twice.
d1 = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
d1

In [None]:
d1.duplicated()

In [None]:
d1.drop_duplicates()

In [None]:
d1.drop_duplicates(['k1'])

In [None]:
# Keep the last occurrence of each (k1, k2) pair instead of the first.
# `take_last=True` was removed from pandas; `keep='last'` is its replacement.
d1.drop_duplicates(['k1', 'k2'], keep='last')

###Transforming Data Using a Function or Mapping

In [None]:
# Food names in inconsistent capitalisation, with a weight for each entry.
d1 = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
d1

In [None]:
# Lookup table from lower-case meat name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
# Normalise the case first so 'Pastrami' and 'Bacon' match the lower-case keys.
lowercased_food = d1['food'].map(str.lower)
d1['animal'] = lowercased_food.map(meat_to_animal)
d1

In [None]:
# Same lookup done element by element: lower-case each name, then index the table.
d1['animal2'] = [meat_to_animal[food.lower()] for food in d1['food']]
d1

###Replacing Values

In [None]:
# Series containing the values -999 and -1000 that the replace examples target.
s1 = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
s1

In [None]:
s1.replace(-999., nan)

In [None]:
s1.replace([-999., -1000.], nan)

In [None]:
s1.replace([-999., -1000.], [nan, 0])

In [None]:
s1.replace({-999.: nan, -1000.: 0})

###Renaming Axis Indexes

In [None]:
# 3x3 frame of 0..8 with lower-case column names.
d1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=['foo', 'bar', 'baz'])
d1

In [None]:
# Rename the columns in place by upper-casing every label.
d1.columns = [label.upper() for label in d1.columns]
d1

###Discretization and Binning

In [None]:
# Values to be grouped into the buckets bounded by `bins`.
s = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bucket edges: 18-25, 25-35, 35-60, 60-100.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(s, bins=bins)
cats

In [None]:
# Integer bin code assigned to each value in s.
cats.codes

In [None]:
# The interval labels those codes refer to.
cats.categories

In [None]:
# Number of values that fell into each bin.
# The top-level pd.value_counts is deprecated; wrapping the categorical in a
# Series and calling its value_counts method is the documented replacement.
pd.Series(cats).value_counts()

In [None]:
# 1000 standard-normal draws.
s = np.random.randn(1000)
# An integer bin count asks for that many equal-width bins; precision trims the labels.
pd.cut(s, bins=3, precision=2)

In [None]:
pd.qcut(s, 4)

In [None]:
pd.qcut(s, [0.0, 0.1, 0.5, 0.9, 1])

###Detecting and Filtering Outliers

In [None]:
# 1000 rows x 4 columns of standard-normal noise.
noise = np.random.randn(1000, 4)
d1 = pd.DataFrame(noise)
d1.describe()

In [None]:
# Cap every value whose magnitude exceeds 3 at +/-3, preserving its sign.
too_large = abs(d1) > 3
d1[too_large] = 3 * np.sign(d1)
d1.describe()

###Permutation and Random Sampling

In [None]:
# 5x4 frame holding 0..19 in row-major order.
d1 = pd.DataFrame(np.arange(20).reshape(5, 4))
d1

In [None]:
# Random ordering of the integers 0..4, used below as row positions.
sampler = np.random.permutation(5)
sampler

In [None]:
# Reorder the rows of d1 by position according to the permutation.
d1.take(sampler)

In [None]:
# First three positions of the permutation: a sample without replacement.
d1.take(sampler[:3])

In [None]:
# randint may repeat positions, so this draws rows with replacement.
d1.take(np.random.randint(0, len(d1), size=5))

###Computing Indicator and Dummy Variables

In [None]:
# Categorical 'key' column to be expanded into indicator columns.
d1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
# One indicator column per distinct key, each named with a 'key_' prefix.
key_column = d1['key']
dummies = pd.get_dummies(key_column, prefix='key')
dummies

In [None]:
# Double brackets keep 'data1' as a one-column frame, which is then joined
# to the indicator columns on the shared row index.
d1[['data1']].join(dummies)

In [None]:
# Ten uniform random draws from [0, 1).
values = np.random.rand(10)
values

In [None]:
# Six evenly spaced edges from 0.0 to 1.0, i.e. five bins of width 0.2.
bins = [fifth / 5.0 for fifth in range(6)]
bins

In [None]:
# Bucket the draws first, then expand the bucket membership into indicator columns.
binned_values = pd.cut(values, bins)
pd.get_dummies(binned_values)

##String Manipulation

###String Object Methods

In [None]:
# Sample string used by the string-method examples in this section.
s1 = 'fooafoofoobarbazfooo'
# Number of non-overlapping occurrences of 'foo'.
s1.count('foo')

In [None]:
# Position of the first 'af'; str.index raises ValueError when the substring is absent.
s1.index('af')

In [None]:
# Upper-cased copy; s1 itself is unchanged.
s1.upper()

In [None]:
# Right-align 'foo' in a 10-character field padded with underscores.
s2 = 'foo'
'{:_>10}'.format(s2)