In [1]:
import numpy as np
import pandas as pd

## Hierarchical indexing

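Hierarchical indexing puts two or more index levels on one axis, which lets higher-dimensional data be represented in a lower-dimensional form (a 1-D Series or a 2-D DataFrame).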

### Series

In [2]:
# Seed the generator so the values are reproducible under Restart & Run All.
rng = np.random.default_rng(seed=12345)

# Two parallel label lists define a two-level (hierarchical) index:
# the outer level is 'a' x4, 'b' x2, 'c' x6; the inner level holds the integer labels.
ser = pd.Series(
    rng.standard_normal(12),
    index=[['a'] * 4 + ['b'] * 2 + ['c'] * 6,
           [1, 2, 3, 4, 1, 2, 1, 3, 5, 7, 9, 11]],
)
ser

a  1    -1.467952
   2    -0.239462
   3     0.359938
   4    -0.034652
b  1     0.144800
   2     0.846866
c  1    -0.674324
   3     0.928318
   5    -0.213066
   7    -0.430199
   9     0.406780
   11   -0.640326
dtype: float64

Internal representation of index:

In [3]:
# Show the MultiIndex pandas built from the two label lists: one (outer, inner) tuple per row.
ser.index

MultiIndex([('a',  1),
            ('a',  2),
            ('a',  3),
            ('a',  4),
            ('b',  1),
            ('b',  2),
            ('c',  1),
            ('c',  3),
            ('c',  5),
            ('c',  7),
            ('c',  9),
            ('c', 11)],
           )

In [4]:
# Partial indexing on the outer level: the rows under 'b', indexed by the inner level only.
ser.loc['b']

1    0.144800
2    0.846866
dtype: float64

In [5]:
# Label-based slice on the outer level; both endpoints ('a' and 'b') are included.
ser.loc['a':'b']

a  1   -1.467952
   2   -0.239462
   3    0.359938
   4   -0.034652
b  1    0.144800
   2    0.846866
dtype: float64

In [6]:
# Select inner-level label 3 across all outer labels; only 'a' and 'c' contain it.
ser.loc[:, 3]

a    0.359938
c    0.928318
dtype: float64

A hierarchically indexed Series can be turned into a DataFrame with `unstack()`; `stack()` is the inverse operation.

In [7]:
# Pivot the inner index level into columns; (outer, inner) combinations absent from `ser` become NaN.
ser.unstack()

Unnamed: 0,1,2,3,4,5,7,9,11
a,-1.467952,-0.239462,0.359938,-0.034652,,,,
b,0.1448,0.846866,,,,,,
c,-0.674324,,0.928318,,-0.213066,-0.430199,0.40678,-0.640326


In [8]:
# Round-trip back to the original Series. `unstack()` filled the missing
# (outer, inner) combinations with NaN; drop them explicitly, because the
# reimplemented `stack()` (the default from pandas 3.0) no longer removes
# NaN rows on its own. On older pandas the `dropna()` is a no-op.
ser.unstack().stack().dropna()

a  1    -1.467952
   2    -0.239462
   3     0.359938
   4    -0.034652
b  1     0.144800
   2     0.846866
c  1    -0.674324
   3     0.928318
   5    -0.213066
   7    -0.430199
   9     0.406780
   11   -0.640326
dtype: float64

### DataFrame

In [9]:
# 4x4 frame holding the integers 0..15, with two-level labels on both axes.
# Each axis receives one list per level; rename_axis then names the levels.
df = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=[['a', 'a', 'b', 'b'],
           [0, 1, 1, 3]],
    columns=[['Bacon', 'Bacon', 'Eggs', 'Eggs'],
             ['Piggy', 'Puppy', 'Piggy', 'Kitty']],
).rename_axis(index=['1st', '2nd'], columns=['Food', 'Customer'])
df

Unnamed: 0_level_0,Food,Bacon,Bacon,Eggs,Eggs
Unnamed: 0_level_1,Customer,Piggy,Puppy,Piggy,Kitty
1st,2nd,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,0,0,1,2,3
a,1,4,5,6,7
b,1,8,9,10,11
b,3,12,13,14,15


In [10]:
# Number of levels in the column index and in the row index, respectively.
df.columns.nlevels, df.index.nlevels

(2, 2)

In [11]:
# Selecting an outer column label returns the sub-frame of its inner-level ('Customer') columns.
df['Bacon']

Unnamed: 0_level_0,Customer,Piggy,Puppy
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0,1
a,1,4,5
b,1,8,9
b,3,12,13


In [12]:
# Partial row selection on the outer index level ('1st'); the result is indexed by '2nd' only.
df.loc['a']

Food,Bacon,Bacon,Eggs,Eggs
Customer,Piggy,Puppy,Piggy,Kitty
2nd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,1,2,3
1,4,5,6,7


A `MultiIndex` can also be created explicitly, for example with `pd.MultiIndex.from_arrays`:

In [13]:
# Build the MultiIndex directly: one array per level, with the level names supplied up front.
pd.MultiIndex.from_arrays([['Bacon', 'Bacon', 'Eggs', 'Eggs'],
                          ['Piggy', 'Puppy', 'Piggy', 'Kitty']],
                          names=['Food', 'Customer'])

MultiIndex([('Bacon', 'Piggy'),
            ('Bacon', 'Puppy'),
            ( 'Eggs', 'Piggy'),
            ( 'Eggs', 'Kitty')],
           names=['Food', 'Customer'])

In [14]:
# The column index of `df` is a MultiIndex with the levels 'Food' and 'Customer'.
df.columns

MultiIndex([('Bacon', 'Piggy'),
            ('Bacon', 'Puppy'),
            ( 'Eggs', 'Piggy'),
            ( 'Eggs', 'Kitty')],
           names=['Food', 'Customer'])

### Reordering and Sorting levels

In [15]:
# With no arguments, swaplevel() exchanges the two row-index levels ('1st' <-> '2nd');
# the row order and the values stay as they were.
df.swaplevel()

Unnamed: 0_level_0,Food,Bacon,Bacon,Eggs,Eggs
Unnamed: 0_level_1,Customer,Piggy,Puppy,Piggy,Kitty
2nd,1st,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,a,0,1,2,3
1,a,4,5,6,7
1,b,8,9,10,11
3,b,12,13,14,15


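`level` selects which level of the hierarchical index to sort by; `axis` selects whether the row index (`axis=0`) or the columns (`axis=1`) are sorted.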

In [16]:
# Sort the columns (axis=1) by their second level (level=1, i.e. 'Customer') in descending order.
df.sort_index(level=1, ascending=False, axis=1)

Unnamed: 0_level_0,Food,Bacon,Eggs,Bacon,Eggs
Unnamed: 0_level_1,Customer,Puppy,Piggy,Piggy,Kitty
1st,2nd,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,0,1,2,0,3
a,1,5,6,4,7
b,1,9,10,8,11
b,3,13,14,12,15


Data selection performs better on DataFrames whose hierarchical index has been sorted with `sort_index()`.

### Index to Column to Index 

A column can be turned into the index with `set_index`.

In [17]:
# Take the 'Bacon' sub-frame (single-level columns 'Piggy' and 'Puppy') as the
# working example for set_index / reset_index below.
df2 = df['Bacon']
df2

Unnamed: 0_level_0,Customer,Piggy,Puppy
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0,1
a,1,4,5
b,1,8,9
b,3,12,13


In [18]:
# Use the 'Piggy' column as the new index; it replaces the ('1st', '2nd') index
# and is removed from the columns.
df2.set_index(['Piggy'])

Customer,Puppy
Piggy,Unnamed: 1_level_1
0,1
4,5
8,9
12,13


In [19]:
# drop=False keeps 'Piggy' as a regular column in addition to using it as the index.
df2.set_index(['Piggy'], drop=False)

Customer,Piggy,Puppy
Piggy,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1
4,4,5
8,8,9
12,12,13


Conversely, `reset_index` moves all index levels back into ordinary columns.

In [20]:
# Move the index levels '1st' and '2nd' back into columns, leaving a default integer index.
df2.reset_index()

Customer,1st,2nd,Piggy,Puppy
0,a,0,0,1
1,a,1,4,5
2,b,1,8,9
3,b,3,12,13
