# 8 Data wrangling: join, combine, and reshape

## 8.1 Hierarchical indexing

In [4]:
import pandas as pd
import numpy as np

In [5]:
# build a Series with a two-level index: the first list supplies the outer
# labels and the second list supplies the inner labels
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_labels, inner_labels])
data

a  1   -1.422412
   2   -0.841441
   3    1.064353
b  1    0.442994
   3   -0.763362
c  1   -0.682379
   2    0.053266
d  2   -0.637368
   3    1.135165
dtype: float64

In [7]:
# select the subset of the data under the outer-level label 'b';
# the result is a Series indexed by the remaining inner level
data['b']

1    0.442994
3   -0.763362
dtype: float64

In [6]:
# selection from an "inner" level: keep every outer label (:) and pick the
# entries whose inner label is 2; the result is indexed by the outer level
data.loc[:,2]

a   -0.841441
c    0.053266
d   -0.637368
dtype: float64

In [8]:
# rearrange the data into a DataFrame: the outer index level labels the rows
# and the inner level becomes the columns; (outer, inner) pairs that do not
# exist in the Series appear as missing values
data.unstack()

Unnamed: 0,1,2,3
a,-1.422412,-0.841441,1.064353
b,0.442994,,-0.763362
c,-0.682379,0.053266,
d,,-0.637368,1.135165


In [9]:
# stack() is the inverse operation of unstack(): it moves the columns back
# into an inner index level, so the result shown matches the original Series
data.unstack().stack()

a  1   -1.422412
   2   -0.841441
   3    1.064353
b  1    0.442994
   3   -0.763362
c  1   -0.682379
   2    0.053266
d  2   -0.637368
   3    1.135165
dtype: float64

In [10]:
# hierarchical index on both axes: rows and columns each get two levels
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_levels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
values = np.arange(12).reshape((4, 3))
frame = pd.DataFrame(values, index=row_levels, columns=column_levels)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [11]:
# set names for the levels of the row index and of the columns;
# the names appear in the displayed frame
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [12]:
# selection by partial column indexing: choosing the outer column label
# 'Ohio' returns a DataFrame whose columns are the remaining 'color' level
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### 8.1.1 Reordering and sorting levels

In [14]:
# rearrange the order of the row levels (key2 becomes the outer level, key1
# the inner one), then sort the rows by the new outermost level (level=0)
frame.swaplevel('key1','key2').sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### 8.1.2 Summary statistics by level

In [15]:
# aggregate by level on the rows: sum all rows that share a 'key2' label.
# The `level` argument of DataFrame.sum was deprecated in pandas 1.3 and
# removed in pandas 2.0, so group by the index level and sum each group.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [16]:
# aggregate by level on the columns: sum all columns that share a 'color'.
# DataFrame.sum(level=...) was removed in pandas 2.0 and groupby(axis=1) is
# deprecated as well, so transpose, group the (now row) level, sum, and
# transpose back to keep the original row index.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 8.1.3 Indexing with a DataFrame's columns

In [17]:
# flat DataFrame: two numeric columns ('a', 'b') plus two columns ('c', 'd')
# that together identify each row
columns_data = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame = pd.DataFrame(columns_data)
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [18]:
# use columns 'c' and 'd' as a two-level row index; set_index returns a new
# DataFrame (bound to frame2) in which those columns no longer appear as data
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [19]:
# set columns 'c' and 'd' as the index and, with drop=False, also keep them
# as regular columns
frame.set_index(['c','d'],drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [20]:
# move the hierarchical index levels back into ordinary columns; the rows
# get a default integer index again
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## 8.2 Combining and merging datasets

### 8.2.1 Database-style dataframe joins

In [25]:
# combine the two frames by linking rows that hold the same value in the
# shared 'key' column
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
df1.merge(df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [28]:
# combine frames whose key columns have different names: name the key column
# of each side explicitly with left_on / right_on
df3 = pd.DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df4 = pd.DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': range(3),
})
df3.merge(df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


### 8.2.3 Merging on index