_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 8 - Data Wrangling: Join, Combine, and Reshape

In [1]:
import numpy as np
import pandas as pd

### Part 1 - Hierarchical Indexing

In [2]:
# Two-level (hierarchical) index: passing a list of two equal-length arrays as
# `index` builds a MultiIndex with an outer letter key and an inner integer key.
# The values come from a seeded generator so the cell yields the same numbers on
# every Restart & Run All (and the global NumPy random state is left untouched).
rng = np.random.RandomState(12345)
data = pd.Series(
    rng.randn(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)

print(data)
print(data.index)

a  1    0.088328
   2    1.009396
   3    0.850139
b  1   -0.793838
   3   -1.807616
c  1   -0.824256
   2   -0.082491
d  2    1.097085
   3   -0.067977
dtype: float64
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])


In [3]:
# Partial indexing: select on the outer level of the MultiIndex only.

# Plain [] indexing -- one outer label, then a label slice (both ends included).
for picked in (data['b'], data['b':'c']):
    print(picked)

# The same kind of selection spelled out with .loc and a list of outer labels.
for outer_labels in (['b'], ['b', 'd']):
    print(data.loc[outer_labels])

# Selection on the inner level: every entry whose second-level key is 2.
print('"2" elements from all the first index components')
print(data.loc[:, 2])

1   -0.793838
3   -1.807616
dtype: float64
b  1   -0.793838
   3   -1.807616
c  1   -0.824256
   2   -0.082491
dtype: float64
b  1   -0.793838
   3   -1.807616
dtype: float64
b  1   -0.793838
   3   -1.807616
d  2    1.097085
   3   -0.067977
dtype: float64
"2" elements from all the first index components
a    1.009396
c   -0.082491
d    1.097085
dtype: float64


In [4]:
# unstack() pivots the inner index level into columns: Series -> DataFrame.
# Key pairs that are absent from `data` show up as NaN.
df = data.unstack()
print(df, '--', sep='\n')

# stack() is the reverse move: the columns go back into an inner index level,
# giving a Series with a two-level index again.
restacked = df.stack()
print(restacked)

          1         2         3
a  0.088328  1.009396  0.850139
b -0.793838       NaN -1.807616
c -0.824256 -0.082491       NaN
d       NaN  1.097085 -0.067977
--
a  1    0.088328
   2    1.009396
   3    0.850139
b  1   -0.793838
   3   -1.807616
c  1   -0.824256
   2   -0.082491
d  2    1.097085
   3   -0.067977
dtype: float64


In [14]:
# Hierarchical indexes work on either axis of a DataFrame: here both the rows
# and the columns carry two levels.
row_keys = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
col_keys = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=row_keys,
    columns=col_keys,
)
print(frame, '--', sep='\n')

# Each level can be given a name; the names then appear in the printed header.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame, '--', sep='\n')

# Partial column indexing: an outer column label selects its whole group.
print(frame['Ohio'])

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
--
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
--
color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10


#### Reordering and Sorting Levels

In [25]:
# swaplevel() exchanges two index levels; the rows keep their original order.
swapped = frame.swaplevel('key1', 'key2')
print(swapped, '--', sep='\n')

# sort_index(level=1) orders the rows by the second index level (key2).
print(frame.sort_index(level=1), '--', sep='\n')

# Swap first, then sort: with no `level` argument sort_index() orders by all
# levels, outermost first.
print(frame.swaplevel(0, 1).sort_index())

state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
--
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
--
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11


#### Summary Statistics by Level

In [32]:
print(frame)

# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level and summing is the supported equivalent.
print('-- aggregating on key2')
print(frame.groupby(level='key2').sum())  # groups on the row index

# Column-wise aggregation: groupby(axis=1) is deprecated as well, so transpose,
# group on the 'color' column level, sum, and transpose back.
print('-- aggregating on axis 1 by color')
print(frame.T.groupby(level='color').sum().T)

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
-- aggregating on key2
state  Ohio     Colorado
color Green Red    Green
key2                    
1         6   8       10
2        12  14       16
-- aggregating on axis 1 by color
color      Green  Red
key1 key2            
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10


#### Indexing with a DataFrame’s columns

In [45]:
# A flat frame: two numeric columns plus two columns ('c', 'd') that will be
# used as index keys below.
raw_columns = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame = pd.DataFrame(raw_columns)
print(frame, '--', sep='\n')

# set_index() returns a new frame whose index is built from the given columns.
key_columns = ['c', 'd']
frame2 = frame.set_index(key_columns)
print(frame2, '--', sep='\n')

# The key columns are removed from the data by default; drop=False keeps them.
print(frame.set_index(key_columns, drop=False), '--', sep='\n')

# reset_index() is the inverse: the index levels become ordinary columns again.
print(frame2.reset_index())

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
--
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
--
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
--
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1
