In [2]:
import numpy as np
import pandas as pd

In [2]:
# Two parallel label lists become the outer and inner levels of a MultiIndex.
# NOTE: the values come from the unseeded global NumPy RNG, so they differ on every run.
outer_labels = list('aaabbbccdd')
inner_labels = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]
ser = pd.Series(np.random.randn(10), index=[outer_labels, inner_labels])
ser

a  1    0.333672
   2   -0.170583
   3    0.534783
b  1    0.721344
   2   -0.815547
   3    1.901678
c  1   -0.251056
   2    0.445848
d  2   -0.069375
   3   -0.681292
dtype: float64

In [3]:
# Inspect the index: a two-level MultiIndex built from the two label lists.
ser.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [4]:
# Partial indexing: an outer-level label selects its whole inner sub-Series.
ser['b']

1    0.721344
2   -0.815547
3    1.901678
dtype: float64

In [5]:
# Positional access: the third element (0-based position 2), unrelated to the
# integer labels of the inner level. Use .iloc explicitly; an integer key passed
# to [] on a non-integer index relies on a positional fallback that pandas has deprecated.
ser.iloc[2]

0.5347830326057521

In [6]:
# A label slice that runs backwards through the index order selects nothing (empty Series).
ser['d': 'b']

Series([], dtype: float64)

In [7]:
# Label slice on the outer level; both endpoints ('b' and 'd') are included.
ser['b': 'd']

b  1    0.721344
   2   -0.815547
   3    1.901678
c  1   -0.251056
   2    0.445848
d  2   -0.069375
   3   -0.681292
dtype: float64

In [8]:
# A list of outer-level labels selects exactly those groups.
ser[['b', 'd']]

b  1    0.721344
   2   -0.815547
   3    1.901678
d  2   -0.069375
   3   -0.681292
dtype: float64

In [9]:
# Re-display the full Series for reference.
ser

a  1    0.333672
   2   -0.170583
   3    0.534783
b  1    0.721344
   2   -0.815547
   3    1.901678
c  1   -0.251056
   2    0.445848
d  2   -0.069375
   3   -0.681292
dtype: float64

### 对 “内层” 选取

In [10]:
# Inner-level selection: every outer label, inner label 2.
ser.loc[:, 2]

a   -0.170583
b   -0.815547
c    0.445848
d   -0.069375
dtype: float64

## stack() 与unstack()后面介绍 相当重要，生成透视表

# DataFrame的分层索引

In [11]:
# 4x3 frame holding 0..11, with two-level labels on both the rows and the columns.
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
col_levels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
values = np.arange(12).reshape(4, 3)
df = pd.DataFrame(values, index=row_levels, columns=col_levels)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [12]:
# Name the levels of both axes (assigned in place on df).
df.index.names = ['key1', 'key2']
df.columns.names = ['state', 'color']
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [13]:
# Selecting a top-level column label returns the sub-frame under it.
df['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### 重排分级顺序 返回新数据

In [14]:
# swaplevel exchanges the two index levels; equivalent to df.swaplevel(0, 1).
df2 = df.swaplevel('key1', 'key2')
df2

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [15]:
df  # the original frame is unchanged

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
df.sort_index(level=0)  # axis defaults to 0 (the index); sorts by key1

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [17]:
df.sort_index(level=1)  # axis defaults to 0 (the index); sorts by key2

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [18]:
# Re-display df for reference.
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 根据级别汇总

In [19]:
# Sum the rows that share the same key2 value (index level 1).
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level and summing is the supported equivalent.
df.groupby(level=1).sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [20]:
# Sum the rows that share the same key1 value (index level 0).
# Uses groupby because DataFrame.sum(level=...) no longer exists in pandas >= 2.0.
df.groupby(level=0).sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [21]:
# Sum the columns that share the same 'color' label.
# DataFrame.sum(level=..., axis=1) no longer exists in pandas >= 2.0, and
# column-wise groupby is deprecated too, so transpose, group the rows by the
# 'color' level, sum, and transpose back.
df.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## 使用DataFrame的列

In [22]:
# Flat frame whose 'c' and 'd' columns are later promoted to a row MultiIndex.
df = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [23]:
# set_index returns a new frame: columns 'c' and 'd' become the row index
# and are removed from the columns.
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [24]:
df.set_index(['c', 'd'], drop=False)  # keep the original columns as well

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [25]:
df2.reset_index()  # reset_index() moves the index levels back into columns

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


# 5.6 pandas其他话题
## 5.6.1 整数索引

In [15]:
# Float Series 4.0..7.0 on the default integer index 0..3.
ser = pd.Series(np.arange(4.0, 8.0))
ser

0    4.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [16]:
ser[1]  # integer index: 1 is matched as a label

5.0

In [17]:
ser.loc[:3]  # label-based slice: the end label 3 is included

0    4.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [18]:
ser.iloc[:3]  # position-based slice: the end position 3 is excluded

0    4.0
1    5.0
2    6.0
dtype: float64

In [52]:
# ser[-1]  # ERROR: on an integer index, [] looks up -1 as a label, and no such label exists

看到了，pandas在整数索引上可能会出错。这里我们有一个index包括0，1，2，3，但是猜测用户想要什么（按标签还是按位置）是很困难的。

如果用非整数来做index，就没有歧义了：

In [19]:
# Same kind of float data, but labelled with strings so integer keys are unambiguous.
ser2 = pd.Series(np.arange(3.0), index=list('abc'))
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [34]:
# Positional access on a string-labelled Series. Use .iloc explicitly: integer
# keys passed to [] on a non-integer index rely on a positional fallback that
# pandas has deprecated.
print(ser2.iloc[1])
print(ser2.iloc[-1])

1.0
2.0


In [39]:
# String values on a non-contiguous integer index (-1, 4, 6).
ser3_values = np.array(['first', 'second', 'third'])
ser3 = pd.Series(ser3_values, index=[-1, 4, 6])
ser3

-1     first
 4    second
 6     third
dtype: object

In [40]:
# With an integer index, [] is label-based: each key below is an index label.
print(ser3[-1])
print(ser3[4])
print(ser3[6])

first
second
third


loc (for labels) 或 iloc (for integer positions)

In [51]:
# ser3's labels are -1, 4 and 6, so 0 is not a label: [] and .loc both fail.
# ser3[0]  # KeyError: 0
# ser3.loc[0]   # KeyError: 'the label [0] is not in the [index]'
ser3.iloc[0]  # position 0 -> 'first'

'first'

In [47]:
# -2 is not one of ser3's labels (-1, 4, 6); only positional access accepts it.
ser3.iloc[-2]  # second-to-last position -> 'second'
# ser3.loc[-2]  # KeyError: 'the label [-2] is not in the [index]'
# ser3[-2]  # KeyError: -2

'second'