In [1]:
import pandas as pd
import numpy as np

# 数据规整：连接、联合、重塑

## 1. 分层索引

#### Series分层索引

In [2]:
# Nine random normal values labelled by three parallel label lists, one per
# index level (outermost first); a list of lists passed as `index` yields a
# three-level MultiIndex.
ser = pd.Series(
    np.random.randn(9),
    index=[
        ['good'] * 5 + ['bad'] * 4,
        ['a'] * 3 + ['b'] * 4 + ['c'] * 2,
        [1, 2, 3] * 3,
    ],
)
ser

good  a  1    0.389121
         2    0.864123
         3   -1.005931
      b  1    0.136535
         2    0.061069
bad   b  3   -0.340082
         1    1.624100
      c  2    0.369799
         3   -0.044377
dtype: float64

In [4]:
# Chained selection: first take the 'good' group on the outer level, then the
# 'a' group on the next level; what remains is indexed by the innermost level.
ser['good']['a']

1    0.389121
2    0.864123
3   -1.005931
dtype: float64

In [9]:
# Keep every outer-level label (`:`) and select label 'c' on the second level.
ser.loc[:,'c']

bad  2    0.369799
     3   -0.044377
dtype: float64

In [10]:
# Pivot the innermost index level (1/2/3) into columns; label combinations
# that do not occur in the Series are left as missing values.
ser.unstack()

Unnamed: 0,Unnamed: 1,1,2,3
bad,b,1.6241,,-0.340082
bad,c,,0.369799,-0.044377
good,a,0.389121,0.864123,-1.005931
good,b,0.136535,0.061069,


In [5]:
# Inspect the MultiIndex object that backs the hierarchical labels.
ser.index

MultiIndex(levels=[['bad', 'good'], ['a', 'b', 'c'], [1, 2, 3]],
           labels=[[1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]])

#### DataFrame分层索引

In [11]:
# 4x3 frame holding the integers 0..11, with two-level labels on both the
# rows and the columns.
# NOTE(review): 'Goole' and 'Bule' look like typos for 'Google' and 'Blue';
# they are kept as-is because they are runtime labels — confirm before renaming.
df = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 2, 1, 2],
    ],
    columns=[
        ['APP', 'APP', 'Goole'],
        ['Red', 'Bule', 'Red'],
    ],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,APP,APP,Goole
Unnamed: 0_level_1,Unnamed: 1_level_1,Red,Bule,Red
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
# Give names to the two row-index levels and the two column-index levels.
# These assignments modify df in place; re-running the cell sets the same names.
df.index.names = ['k1','k2']
df.columns.names = ['Company','Color']
df

Unnamed: 0_level_0,Company,APP,APP,Goole
Unnamed: 0_level_1,Color,Red,Bule,Red
k1,k2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [21]:
# Partial selection on both axes: rows under outer row label 'a' and columns
# under outer column label 'APP'; the selected outer levels are dropped.
df.loc['a','APP']

Color,Red,Bule
k2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,3,4


In [22]:
# Inspect the row MultiIndex, including the level names assigned earlier.
df.index

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['k1', 'k2'])

In [23]:
# Inspect the column MultiIndex, including the level names assigned earlier.
df.columns

MultiIndex(levels=[['APP', 'Goole'], ['Bule', 'Red']],
           labels=[[0, 0, 1], [1, 0, 1]],
           names=['Company', 'Color'])

## 2. 重排序和层级排序

sort_index 通过 level 参数指定按哪个层级排序（可以是单个层级，也可以是层级列表）；默认 sort_remaining=True，按指定层级排序后，其余层级也会依次参与排序。

In [25]:
# Show the frame in its current row order as a baseline for the sorts below.
df

Unnamed: 0_level_0,Company,APP,APP,Goole
Unnamed: 0_level_1,Color,Red,Bule,Red
k1,k2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [29]:
# Sort the rows by the second index level (k2). The result is only displayed,
# not assigned back to df.
df.sort_index(level = 1)

Unnamed: 0_level_0,Company,APP,APP,Goole
Unnamed: 0_level_1,Color,Red,Bule,Red
k1,k2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [30]:
# Swap the two row-index levels so that k2 becomes the outer level, then sort
# the rows on that new outer level. The result is displayed, not stored.
(
    df
    .swaplevel(0, 1)
    .sort_index(level=0)
)

Unnamed: 0_level_0,Company,APP,APP,Goole
Unnamed: 0_level_1,Color,Red,Bule,Red
k2,k1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


按层级进行汇总统计

In [33]:
# Aggregate per label of the outermost row-index level (k1): rows that share
# the same k1 value are summed column by column.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported replacement.
df.groupby(level=0).sum()

Company,APP,APP,Goole
Color,Red,Bule,Red
k1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


## 3. 联合与合并数据集