# 13.1 Hierarchical indexing (MultiIndex)
## 13.1.1 Creating a MultiIndex (hierarchical index) object

In [1]:
%matplotlib inline
from __future__ import division
from numpy.random import randn
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import pandas as pd

  return f(*args, **kwds)


In [2]:
# Two parallel label lists: outer-level labels first, inner-level labels second.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]
# Pair the labels position by position to get one (outer, inner) tuple per entry.
tuples = [pair for pair in zip(*arrays)]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [3]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [4]:
s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one       0.451192
       two      -1.039329
baz    one       2.122429
       two      -0.239152
foo    one       1.375938
       two      -0.816652
qux    one       0.733294
       two       1.781976
dtype: float64

In [5]:
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]
pd.MultiIndex.from_product(iterables, names=['first', 'second'])

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [6]:
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
          np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
s = pd.Series(np.random.randn(8), index=arrays)
s

bar  one    0.638664
     two    0.195827
baz  one    2.279088
     two   -0.641272
foo  one   -0.359445
     two   -0.746817
qux  one   -1.770047
     two    0.497112
dtype: float64

In [12]:
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-1.056779,-1.040125,-0.15174,-0.515528
bar,two,-0.368087,1.185506,-0.417869,-1.058281
baz,one,-1.532904,0.13888,0.349308,0.620886
baz,two,0.046689,0.842409,-0.44798,1.783645
foo,one,0.361065,1.036775,1.020216,-0.05163
foo,two,-2.102155,-1.989624,-0.97902,-0.421649
qux,one,0.537921,-0.855494,-1.631132,-0.191199
qux,two,-0.206648,-0.211115,0.399097,0.19427


In [13]:
df.index.names

FrozenList([None, None])

In [14]:
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.074241,-0.482127,0.129298,-0.32198,0.734013,1.498712,-0.415488,0.370363
B,0.649399,1.228761,-0.028103,-0.498397,1.532541,0.924552,0.345741,-0.739153
C,-0.43256,0.368315,-0.162692,2.166071,-0.47019,-1.302963,1.130997,1.160428


In [15]:
 pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,1.841936,2.324082,0.456609,1.479648,0.223268,-0.925408
bar,two,-0.067001,0.424622,-0.815133,-0.92503,1.745802,-1.913478
baz,one,0.094427,-0.996544,0.313434,-2.217662,0.244596,-0.646073
baz,two,0.752656,-1.549422,-1.003636,-0.315211,-1.671138,-0.875241
foo,one,-1.128865,-0.91306,-0.057344,0.186962,1.019944,-0.025936
foo,two,-1.32132,-1.464029,-0.093285,0.121781,0.653368,0.157146


In [16]:
pd.Series(np.random.randn(8), index=tuples)

(bar, one)    0.788477
(bar, two)    0.762617
(baz, one)    1.378453
(baz, two)    0.273679
(foo, one)    0.925107
(foo, two)    1.098801
(qux, one)    0.930256
(qux, two)    0.319794
dtype: float64

## 13.1.2 Reconstructing the level labels

In [17]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [18]:
index.get_level_values('second')

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

## 13.1.3 Basic indexing on axis with MultiIndex
>这段话的意思是沿着MultiIndex所在的轴进行索引

In [19]:
df['bar']  # the result is a DataFrame

second,one,two
A,0.074006,-0.372976
B,1.780504,-0.530403
C,0.43703,-0.729604


In [20]:
 df['bar', 'one']  # the result is a Series

A    0.074006
B    1.780504
C    0.437030
Name: (bar, one), dtype: float64

In [21]:
 df['bar']['one']  # same values as df['bar', 'one'], but the Series is named 'one' rather than ('bar', 'one')

A    0.074006
B    1.780504
C    0.437030
Name: one, dtype: float64

使用`df[]`如果要选中多列的话，则写法如下:

In [22]:
df[['bar', 'baz']]

first,bar,bar,baz,baz
second,one,two,one,two
A,0.074006,-0.372976,-0.370736,-0.909506
B,1.780504,-0.530403,-0.384049,2.324615
C,0.43703,-0.729604,-0.288674,2.109959


## 13.1.4 Defined Levels

In [23]:
df.columns

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [24]:
df[['foo','qux']].columns

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[2, 2, 3, 3], [0, 1, 0, 1]],
           names=['first', 'second'])

In [25]:
df[['foo','qux']].columns.values

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [9]:
df[['foo','qux']].columns.get_level_values(0)

KeyError: "['foo' 'qux'] not in index"

In [27]:
df[['foo','qux']].columns.remove_unused_levels()  # older pandas versions may not provide this method

AttributeError: 'MultiIndex' object has no attribute 'remove_unused_levels'

## 13.1.5 Data alignment and using reindex

In [28]:
s[:-2]

bar  one   -1.725771
     two   -1.004470
baz  one    0.138889
     two   -0.840849
foo  one   -1.271550
     two   -1.450136
dtype: float64

In [29]:
s + s[:-2]

bar  one   -3.451542
     two   -2.008940
baz  one    0.277778
     two   -1.681698
foo  one   -2.543101
     two   -2.900272
qux  one         NaN
     two         NaN
dtype: float64

In [30]:
s + s[::2]

bar  one   -3.451542
     two         NaN
baz  one    0.277778
     two         NaN
foo  one   -2.543101
     two         NaN
qux  one    0.951645
     two         NaN
dtype: float64

In [31]:
 s.reindex(index[:3])

first  second
bar    one      -1.725771
       two      -1.004470
baz    one       0.138889
dtype: float64

In [32]:
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])

foo  two   -1.450136
bar  one   -1.725771
qux  one    0.475822
baz  one    0.138889
dtype: float64

# 13.2 Advanced indexing with hierarchical index

In [16]:
 df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.074241,-0.482127,0.129298,-0.32198,0.734013,1.498712,-0.415488,0.370363
B,0.649399,1.228761,-0.028103,-0.498397,1.532541,0.924552,0.345741,-0.739153
C,-0.43256,0.368315,-0.162692,2.166071,-0.47019,-1.302963,1.130997,1.160428


In [17]:
df = df.T
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.074241,0.649399,-0.43256
bar,two,-0.482127,1.228761,0.368315
baz,one,0.129298,-0.028103,-0.162692
baz,two,-0.32198,-0.498397,2.166071
foo,one,0.734013,1.532541,-0.47019
foo,two,1.498712,0.924552,-1.302963
qux,one,-0.415488,0.345741,1.130997
qux,two,0.370363,-0.739153,1.160428


In [18]:
df.loc['bar']

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.074241,0.649399,-0.43256
two,-0.482127,1.228761,0.368315


In [19]:
 df.loc['bar', 'two']

A   -0.482127
B    1.228761
C    0.368315
Name: (bar, two), dtype: float64

In [20]:
df.loc['baz':'foo']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,one,0.129298,-0.028103,-0.162692
baz,two,-0.32198,-0.498397,2.166071
foo,one,0.734013,1.532541,-0.47019
foo,two,1.498712,0.924552,-1.302963


In [21]:
df.loc[('baz', 'two'):('qux', 'one')]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,-0.32198,-0.498397,2.166071
foo,one,0.734013,1.532541,-0.47019
foo,two,1.498712,0.924552,-1.302963
qux,one,-0.415488,0.345741,1.130997


In [22]:
df.loc[('baz', 'two'):'foo']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,-0.32198,-0.498397,2.166071
foo,one,0.734013,1.532541,-0.47019
foo,two,1.498712,0.924552,-1.302963


In [23]:
df.loc[[('bar', 'two'), ('qux', 'one')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,two,-0.482127,1.228761,0.368315
qux,one,-0.415488,0.345741,1.130997


## 13.2.1 Using slicers

In [24]:
def mklbl(prefix, n):
    """Return the list of ``n`` labels ``prefix0``, ``prefix1``, ... ``prefix<n-1>``.

    Each label is the string form of ``prefix`` followed by the string form
    of the running number, starting at 0. ``n == 0`` gives an empty list.
    """
    labels = []
    for number in range(n):
        labels.append("{}{}".format(prefix, number))
    return labels

In [25]:
miindex = pd.MultiIndex.from_product([mklbl('A',4),mklbl('B',2),mklbl('C',4),mklbl('D',2)])

In [26]:
micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')],names=['lvl0', 'lvl1'])

In [28]:
# One distinct integer per cell, laid out row by row over the (rows x columns) grid.
dfmi = pd.DataFrame(
    np.arange(len(miindex) * len(micolumns)).reshape(len(miindex), len(micolumns)),
    index=miindex,
    columns=micolumns,
)
# Sort the row index first, then the column index.
dfmi = dfmi.sort_index().sort_index(axis=1)
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
A0,B0,C2,D1,21,20,23,22
A0,B0,C3,D0,25,24,27,26
A0,B0,C3,D1,29,28,31,30
A0,B1,C0,D0,33,32,35,34
A0,B1,C0,D1,37,36,39,38


In [51]:
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [52]:
idx = pd.IndexSlice
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [53]:
dfmi.loc['A1', (slice(None), 'foo')]

Unnamed: 0_level_0,Unnamed: 1_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,lvl1,foo,foo
B0,C0,D0,64,66
B0,C0,D1,68,70
B0,C1,D0,72,74
B0,C1,D1,76,78
B0,C2,D0,80,82
B0,C2,D1,84,86
B0,C3,D0,88,90
B0,C3,D1,92,94
B1,C0,D0,96,98
B1,C0,D1,100,102


In [54]:
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [56]:
mask = dfmi[('a', 'foo')] > 200
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A3,B0,C1,D1,204,206
A3,B0,C3,D0,216,218
A3,B0,C3,D1,220,222
A3,B1,C1,D0,232,234
A3,B1,C1,D1,236,238
A3,B1,C3,D0,248,250
A3,B1,C3,D1,252,254


In [57]:
# axis=0 tells .loc that the slicers below apply to the row index only:
# keep every value of the first two levels and only 'C1' / 'C3' on the third.
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C3,D0,25,24,27,26
A0,B0,C3,D1,29,28,31,30
A0,B1,C1,D0,41,40,43,42
A0,B1,C1,D1,45,44,47,46
A0,B1,C3,D0,57,56,59,58
A0,B1,C3,D1,61,60,63,62
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78


In [58]:
df2 = dfmi.copy()
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,-10,-10,-10,-10
A0,B0,C1,D1,-10,-10,-10,-10
A0,B0,C2,D0,17,16,19,18
A0,B0,C2,D1,21,20,23,22
A0,B0,C3,D0,-10,-10,-10,-10
A0,B0,C3,D1,-10,-10,-10,-10
A0,B1,C0,D0,33,32,35,34
A0,B1,C0,D1,37,36,39,38


In [59]:
df2 = dfmi.copy()
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2*1000
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9000,8000,11000,10000
A0,B0,C1,D1,13000,12000,15000,14000
A0,B0,C2,D0,17,16,19,18
A0,B0,C2,D1,21,20,23,22
A0,B0,C3,D0,25000,24000,27000,26000
A0,B0,C3,D1,29000,28000,31000,30000
A0,B1,C0,D0,33,32,35,34
A0,B1,C0,D1,37,36,39,38


## 13.2.2 Cross-section

In [60]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.074006,1.780504,0.43703
bar,two,-0.372976,-0.530403,-0.729604
baz,one,-0.370736,-0.384049,-0.288674
baz,two,-0.909506,2.324615,2.109959
foo,one,0.999811,0.074127,0.444223
foo,two,1.173751,2.154924,-0.100172
qux,one,-1.680412,-1.455452,-1.473432
qux,two,-0.126603,-0.036794,-0.554355


In [61]:
df.xs('one', level='second')

Unnamed: 0_level_0,A,B,C
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.074006,1.780504,0.43703
baz,-0.370736,-0.384049,-0.288674
foo,0.999811,0.074127,0.444223
qux,-1.680412,-1.455452,-1.473432


In [62]:
df.loc[(slice(None),'one'),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.074006,1.780504,0.43703
baz,one,-0.370736,-0.384049,-0.288674
foo,one,0.999811,0.074127,0.444223
qux,one,-1.680412,-1.455452,-1.473432


从上面两种写法的输出结果来看，虽然选中的数据相同，但是输出结果的结构是并不相同的，第一种结构是普通的index，而第二种是multiindex

In [63]:
df = df.T
df.xs('one', level='second', axis=1)

first,bar,baz,foo,qux
A,0.074006,-0.370736,0.999811,-1.680412
B,1.780504,-0.384049,0.074127,-1.455452
C,0.43703,-0.288674,0.444223,-1.473432


In [64]:
df.loc[:,(slice(None),'one')]

first,bar,baz,foo,qux
second,one,one,one,one
A,0.074006,-0.370736,0.999811,-1.680412
B,1.780504,-0.384049,0.074127,-1.455452
C,0.43703,-0.288674,0.444223,-1.473432


In [65]:
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)

first,bar
second,one
A,0.074006
B,1.780504
C,0.43703


In [66]:
type(df.xs(('one', 'bar'), level=('second', 'first'), axis=1))

pandas.core.frame.DataFrame

需要注意这里函数`xs`的入参，第一个入参`('one', 'bar')`其实是和第二个入参`('second', 'first')`相匹配的。

You can pass `drop_level=False` to `xs()` to retain the level that was selected.

In [67]:
df.xs('one', level='second', axis=1, drop_level=False)

first,bar,baz,foo,qux
second,one,one,one,one
A,0.074006,-0.370736,0.999811,-1.680412
B,1.780504,-0.384049,0.074127,-1.455452
C,0.43703,-0.288674,0.444223,-1.473432


In [68]:
df.xs('one', level='second', axis=1, drop_level=True)

first,bar,baz,foo,qux
A,0.074006,-0.370736,0.999811,-1.680412
B,1.780504,-0.384049,0.074127,-1.455452
C,0.43703,-0.288674,0.444223,-1.473432


## 13.2.3 Advanced reindexing and alignment

In [69]:
# Build the index from explicit level values plus, for each row, the integer
# position of its value inside each level:
#   level 0 positions [1, 1, 0, 0] -> one, one, zero, zero
#   level 1 positions [1, 0, 1, 0] -> y, x, y, x
# NOTE(review): `labels` is the keyword accepted by the pandas version this
# notebook was run on; newer releases are believed to name it `codes` —
# confirm against the installed version before upgrading.
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],labels=[[1,1,0,0],[1,0,1,0]])

In [71]:
df = pd.DataFrame(np.random.randn(4,2), index=midx)
df

Unnamed: 0,Unnamed: 1,0,1
one,y,1.673751,0.570796
one,x,1.176949,-0.10921
zero,y,-0.608239,0.349932
zero,x,-0.407398,0.108706


In [73]:
# Average the rows that share the same value on the outer index level.
# Grouping on the level explicitly avoids relying on the `level=` keyword
# of DataFrame.mean.
df2 = df.groupby(level=0).mean()
df2

Unnamed: 0,0,1
zero,-0.507819,0.229319
one,1.42535,0.230793


In [74]:
df2.reindex(df.index, level=0)

Unnamed: 0,Unnamed: 1,0,1
one,y,1.42535,0.230793
one,x,1.42535,0.230793
zero,y,-0.507819,0.229319
zero,x,-0.507819,0.229319


In [77]:
df_aligned, df2_aligned = df.align(df2, level=0)
df_aligned

Unnamed: 0,Unnamed: 1,0,1
one,y,1.673751,0.570796
one,x,1.176949,-0.10921
zero,y,-0.608239,0.349932
zero,x,-0.407398,0.108706


In [78]:
df2_aligned

Unnamed: 0,Unnamed: 1,0,1
one,y,1.42535,0.230793
one,x,1.42535,0.230793
zero,y,-0.507819,0.229319
zero,x,-0.507819,0.229319


## 13.2.4 Swapping levels with swaplevel()

In [79]:
df[:5]

Unnamed: 0,Unnamed: 1,0,1
one,y,1.673751,0.570796
one,x,1.176949,-0.10921
zero,y,-0.608239,0.349932
zero,x,-0.407398,0.108706


In [80]:
 df[:5].swaplevel(0, 1, axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,1.673751,0.570796
x,one,1.176949,-0.10921
y,zero,-0.608239,0.349932
x,zero,-0.407398,0.108706


## 13.2.5 Reordering levels with reorder_levels()

In [81]:
df[:5].reorder_levels([1,0], axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,1.673751,0.570796
x,one,1.176949,-0.10921
y,zero,-0.608239,0.349932
x,zero,-0.407398,0.108706


# 13.3 Sorting a MultiIndex

In [82]:
 import random; random.shuffle(tuples)

In [83]:
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s

baz  one    0.818091
foo  two    0.356114
     one    0.497309
qux  one    0.661342
baz  two   -0.374351
bar  two    0.412466
     one   -0.300108
qux  two   -0.470887
dtype: float64

In [84]:
 s.sort_index()

bar  one   -0.300108
     two    0.412466
baz  one    0.818091
     two   -0.374351
foo  one    0.497309
     two    0.356114
qux  one    0.661342
     two   -0.470887
dtype: float64

In [85]:
s.sort_index(level=0)

bar  one   -0.300108
     two    0.412466
baz  one    0.818091
     two   -0.374351
foo  one    0.497309
     two    0.356114
qux  one    0.661342
     two   -0.470887
dtype: float64

In [86]:
 s.sort_index(level=1)

bar  one   -0.300108
baz  one    0.818091
foo  one    0.497309
qux  one    0.661342
bar  two    0.412466
baz  two   -0.374351
foo  two    0.356114
qux  two   -0.470887
dtype: float64

In [88]:
s.index.set_names(['L1', 'L2'], inplace=True)
s

L1   L2 
baz  one    0.818091
foo  two    0.356114
     one    0.497309
qux  one    0.661342
baz  two   -0.374351
bar  two    0.412466
     one   -0.300108
qux  two   -0.470887
dtype: float64

In [89]:
 s.sort_index(level='L1')

L1   L2 
bar  one   -0.300108
     two    0.412466
baz  one    0.818091
     two   -0.374351
foo  one    0.497309
     two    0.356114
qux  one    0.661342
     two   -0.470887
dtype: float64

In [90]:
 s.sort_index(level='L2')

L1   L2 
bar  one   -0.300108
baz  one    0.818091
foo  one    0.497309
qux  one    0.661342
bar  two    0.412466
baz  two   -0.374351
foo  two    0.356114
qux  two   -0.470887
dtype: float64

In [91]:
 df.T.sort_index(level=1, axis=1)

Unnamed: 0_level_0,zero,one,zero,one
Unnamed: 0_level_1,x,x,y,y
0,-0.407398,1.176949,-0.608239,1.673751
1,0.108706,-0.10921,0.349932,0.570796


In [93]:
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
'joe': ['x', 'x', 'z', 'y'],
'jolie': np.random.rand(4)})
dfm = dfm.set_index(['jim', 'joe'])
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.761517
0,x,0.110394
1,z,0.896214
1,y,0.680846


In [94]:
dfm.loc[(1, 'z')]

  if __name__ == '__main__':


Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,z,0.896214


In [95]:
 dfm.index.is_lexsorted()

False

In [96]:
 dfm.index.lexsort_depth

1

In [97]:
dfm = dfm.sort_index()
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.761517
0,x,0.110394
1,y,0.680846
1,z,0.896214


In [99]:
dfm.index.is_lexsorted()

True

In [98]:
dfm.index.lexsort_depth

2

In [100]:
dfm.loc[(0,'y'):(1, 'z')]

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,y,0.680846
1,z,0.896214


# 13.4 Take Methods

In [102]:
index = pd.Index(np.random.randint(0, 1000, 10))
index

Int64Index([595, 897, 608, 353, 929, 395, 372, 872, 751, 815], dtype='int64')

In [103]:
positions = [0, 9, 3]

In [104]:
index[positions]

Int64Index([595, 815, 353], dtype='int64')

In [105]:
index.take(positions)

Int64Index([595, 815, 353], dtype='int64')

In [106]:
ser = pd.Series(np.random.randn(10))

In [107]:
ser.iloc[positions]

0    0.661064
9   -0.014501
3    1.234987
dtype: float64

In [108]:
ser.take(positions)

0    0.661064
9   -0.014501
3    1.234987
dtype: float64

In [109]:
frm = pd.DataFrame(np.random.randn(5, 3))

In [110]:
frm.take([1, 4, 3])

Unnamed: 0,0,1,2
1,-0.832012,0.799553,0.848477
4,1.927942,-0.144121,1.015594
3,-0.652196,0.601819,-2.059418


In [111]:
 frm.take([0, 2], axis=1)

Unnamed: 0,0,2
0,-0.464378,0.08922
1,-0.832012,0.848477
2,0.804328,1.267339
3,-0.652196,-2.059418
4,1.927942,1.015594


In [114]:
arr = np.random.randn(10)
arr

array([-0.7923, -0.3155,  1.157 , -0.7945,  1.8619,  0.9373, -1.4501,
        1.8907, -0.5277, -0.5532])

In [115]:
arr.take([False, False, True, True])

array([-0.7923, -0.7923, -0.3155, -0.3155])

In [116]:
arr[[0, 1]]

array([-0.7923, -0.3155])

In [119]:
ser = pd.Series(np.random.randn(10))
ser

0    0.397248
1   -2.082079
2   -0.234860
3   -1.605896
4    0.995013
5    0.536290
6    0.172954
7   -2.712020
8    0.274945
9    1.258128
dtype: float64

In [118]:
ser.take([False, False, True, True])

0   -0.152240
0   -0.152240
1   -0.565602
1   -0.565602
dtype: float64

In [120]:
ser.iloc[[0, 1]]

0    0.397248
1   -2.082079
dtype: float64

在这种情况下，False表示的是0，True表示的是1

# 13.5 Index Types

pandas的index类型如下：
- Int64Index
- Float64Index
- RangeIndex
- IntervalIndex
- MultiIndex
- DatetimeIndex
- PeriodIndex
- TimedeltaIndex
- CategoricalIndex

## 13.5.1 CategoricalIndex

In [121]:
#from pandas.api.types import CategoricalDtype


ImportError: No module named 'pandas.api'

In [125]:
df = pd.DataFrame({'A': np.arange(6),'B': list('aabbca')})
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [126]:
# Convert column B to a categorical with the explicit category order c, a, b.
# pd.Categorical accepts `categories` directly, so this does not depend on
# Series.astype forwarding extra keyword arguments or on pandas.api being
# importable.
df['B'] = pd.Categorical(df['B'], categories=list('cab'))
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [127]:
df.dtypes

A       int32
B    category
dtype: object

In [132]:
df.B.cat.categories

Index(['c', 'a', 'b'], dtype='object')

In [131]:
df2 = df.set_index('B')
df2

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [133]:
df2.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [135]:
df2.sort_index()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,0
a,1
a,5
b,2
b,3


In [136]:
df2.groupby(level=0).sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,6
b,5


In [137]:
df2.groupby(level=0).sum().index

CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

Reindexing operations will return a resulting index based on the type of the passed indexer: passing
a list returns a plain-old `Index`, while indexing with a `Categorical` returns a `CategoricalIndex`, indexed
according to the categories of the PASSED `Categorical` dtype. This allows one to arbitrarily index these even
with values NOT in the categories, similarly to how you can reindex ANY pandas index.

In [138]:
 df2.reindex(['a','e'])

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
a,1.0
a,5.0
e,


In [139]:
 df2.reindex(['a','e']).index

Index(['a', 'a', 'a', 'e'], dtype='object', name='B')

In [140]:
 df2.reindex(pd.Categorical(['a','e'],categories=list('abcde')))

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
a,1.0
a,5.0
e,


In [141]:
 df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index

CategoricalIndex(['a', 'a', 'a', 'e'], categories=['a', 'b', 'c', 'd', 'e'], ordered=False, name='B', dtype='category')

## 13.5.2 Int64Index and RangeIndex