In [2]:
import pandas as pd
import numpy as np

### 다중 인덱스
데이터프레임에 여러 계층을 가지는 인덱스를 지정할 수 있음
데이터프레임 생성시 `columns` 인수로 다차원 리스트 형태를 지정하면 다중 인덱스로 지정할 수 있음 

In [7]:
# Build a 5x4 frame of rounded standard-normal values whose columns form a
# two-level MultiIndex (outer level: A/B, inner level: C1/C2).
# A seeded generator replaces the unseeded np.random.randn call so that the
# values shown below are the same on every "Restart Kernel -> Run All".
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((5, 4)).round(2),
    columns=[['A', 'A', 'B', 'B'], ['C1', 'C2', 'C1', 'C2']],
)
df

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,0.97,-0.19,-2.37,-1.22
1,-0.62,-0.25,-0.22,-1.16
2,1.11,0.03,-0.28,-1.4
3,-0.44,-0.17,0.28,0.66
4,-0.74,-1.12,-0.52,0.01


데이터 프레임의 `columns`속성의 `names`속성으로 각 열 인덱스에 대한 이름을
부여할 수 있음

In [55]:
# Name the two column levels through the `names` attribute of `df.columns`
# (outer level -> 'Cidx1', inner level -> 'Cidx2'), then display the frame.
df.columns.names = ['Cidx1','Cidx2']
df

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,-0.34,-0.47,1.54,0.28
1,0.02,-0.28,1.37,1.1
2,0.18,1.34,-0.16,-2.56
3,0.47,-0.17,-2.1,-0.72
4,0.58,-1.0,-1.13,0.6


데이터 프레임 생성 시 `index` 인수로 다차원 리스트를 지정하면  
다차원 형태의 행 인덱스를 지정할 수 있음

행 인덱스의 이름은 데이터프레임 인스턴스의 `index` 속성의 `names` 속성으로 지정할 수 있음


In [115]:
# Build a 6x4 frame with a two-level MultiIndex on both axes:
#   rows    -> outer level M/F, inner level id_1..id_3
#   columns -> outer level A/B, inner level C1/C2
# A seeded generator replaces the unseeded np.random.randn call so that the
# values are reproducible on every "Restart Kernel -> Run All".
df2 = pd.DataFrame(
    np.random.default_rng(seed=1).standard_normal((6, 4)).round(2),
    columns=[['A', 'A', 'B', 'B'], ['C1', 'C2', 'C1', 'C2']],
    index=[['M', 'M', 'M', 'F', 'F', 'F'],
           ['id_1', 'id_2', 'id_3', 'id_1', 'id_2', 'id_3']],
)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,C1,C2,C1,C2
M,id_1,0.43,-0.18,-1.24,-1.17
M,id_2,1.75,-0.41,1.11,0.65
M,id_3,-1.09,-1.79,-0.06,-0.19
F,id_1,-1.0,0.23,0.37,-0.37
F,id_2,-0.57,0.55,1.44,1.74
F,id_3,-0.94,-0.25,0.29,0.35


In [57]:
# Name both row-index levels ('Ridx1', 'Ridx2') and both column levels
# ('Cidx1', 'Cidx2') of df2, then display the frame.
df2.index.names = ['Ridx1','Ridx2']
df2.columns.names = ['Cidx1','Cidx2']
df2

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46


### 열 인덱스와 행 인덱스 교환
`stack`,`unstack` 메서드로 열 인덱스를 행 인덱스로  
또는 행 인덱스를 열 인덱스로 바꿀 수 있음
`stack()` 메서드 : 열 인덱스를 행 인덱스로 변경
`unstack()` 메서드 : 행 인덱스를 열 인덱스로 변경

In [58]:
# Move the 'Cidx1' column level into the row index; 'Cidx2' remains as the
# only column level.
# NOTE(review): the saved output echoes this line like a warning does —
# presumably a pandas deprecation notice for stack(); confirm on the installed version.
df2.stack("Cidx1")

  df2.stack("Cidx1")


Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx2,C1,C2
Ridx1,Ridx2,Cidx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,A,-0.34,-1.88
M,id_1,B,-0.48,-0.66
M,id_2,A,0.94,0.3
M,id_2,B,-0.31,-1.0
M,id_3,A,0.01,0.47
M,id_3,B,0.55,0.29
F,id_1,A,-0.2,-0.09
F,id_1,B,-1.49,0.63
F,id_2,A,0.47,-0.96
F,id_2,B,1.72,0.79


In [59]:
# Stack column level 1 ('Cidx2') into the row index and keep the result as df3;
# 'Cidx1' remains as the only column level.
df3 = df2.stack(1)
df3

  df3 = df2.stack(1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx1,A,B
Ridx1,Ridx2,Cidx2,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,C1,-0.34,-0.48
M,id_1,C2,-1.88,-0.66
M,id_2,C1,0.94,-0.31
M,id_2,C2,0.3,-1.0
M,id_3,C1,0.01,0.55
M,id_3,C2,0.47,0.29
F,id_1,C1,-0.2,-1.49
F,id_1,C2,-0.09,0.63
F,id_2,C1,0.47,1.72
F,id_2,C2,-0.96,0.79


In [60]:
# Pivot row-index level 1 ('Ridx2') into the columns; rows are then keyed by 'Ridx1' only.
df2.unstack(1)

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C1,C1,C1,C2,C2,C2,C1,C1,C1,C2,C2,C2
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,-0.2,0.47,-0.23,-0.09,-0.96,2.23,-1.49,1.72,0.89,0.63,0.79,-0.46
M,-0.34,0.94,0.01,-1.88,0.3,0.47,-0.48,-0.31,0.55,-0.66,-1.0,0.29


In [61]:
# Pivot row-index level 0 ('Ridx1') into the columns; rows are then keyed by 'Ridx2' only.
df2.unstack(0)

Cidx1,A,A,A,A,B,B,B,B
Cidx2,C1,C1,C2,C2,C1,C1,C2,C2
Ridx1,F,M,F,M,F,M,F,M
Ridx2,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
id_1,-0.2,-0.34,-0.09,-1.88,-1.49,-0.48,0.63,-0.66
id_2,0.47,0.94,-0.96,0.3,1.72,-0.31,0.79,-1.0
id_3,-0.23,0.01,2.23,0.47,0.89,0.55,-0.46,0.29


### 다중 인덱스의 인덱싱
다중 인덱스를 가지고 있는 데이터프레임에서 특정 행이나 열 하나를 지정하려면, 인덱스 값을 단일 값이 아니라 각 계층의 값을 `()`로 묶은 튜플로 지정해야 함

In [62]:
# Show df (two-level columns, plain row index) as the starting point for the indexing examples.
df

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,-0.34,-0.47,1.54,0.28
1,0.02,-0.28,1.37,1.1
2,0.18,1.34,-0.16,-2.56
3,0.47,-0.17,-2.1,-0.72
4,0.58,-1.0,-1.13,0.6


In [63]:
# A full (outer, inner) tuple selects exactly one column and returns it as a Series.
df[('A','C2')]

0   -0.47
1   -0.28
2    1.34
3   -0.17
4   -1.00
Name: (A, C2), dtype: float64

In [64]:
# Row label 0 combined with a full column tuple selects a single scalar value.
df.loc[0,('A','C1')]

-0.34

만약 튜플로 지정하지 않고 단일 값으로 지정하면 제일 최상단의 인덱스를 지정한 것으로 봄

In [65]:
# A single label is matched against the outermost column level:
# every column under 'A' is returned, keyed by the remaining 'Cidx2' level.
df['A']

Cidx2,C1,C2
0,-0.34,-0.47
1,0.02,-0.28
2,0.18,1.34
3,0.47,-0.17
4,0.58,-1.0


단, `iloc` 인덱서를 사용할 때는 다중인덱스로 접근할 수 없음

In [66]:
# Show df2 (two-level rows and two-level columns) before the .loc examples.
df2

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46


In [67]:
# A full row tuple selects one row; the result is a Series indexed by the column MultiIndex.
df2.loc[('M','id_2')]

Cidx1  Cidx2
A      C1       0.94
       C2       0.30
B      C1      -0.31
       C2      -1.00
Name: (M, id_2), dtype: float64

In [68]:
# A full row tuple plus a full column tuple selects a single scalar value.
df2.loc[('M','id_2'),('B','C1')]

-0.31

In [69]:
# All rows, one fully specified column -> a Series indexed by the row MultiIndex.
df2.loc[:,('A','C2')]

Ridx1  Ridx2
M      id_1    -1.88
       id_2     0.30
       id_3     0.47
F      id_1    -0.09
       id_2    -0.96
       id_3     2.23
Name: (A, C2), dtype: float64

In [70]:
# Append (or refresh) an ('All', 'All') row holding the column-wise totals.
# The sum is taken over the data rows only: without this filter, re-running the
# cell would add the existing 'All' row into the total and double every value.
is_data_row = df2.index.get_level_values(0) != 'All'
df2.loc[('All', 'All'), :] = df2.loc[is_data_row].sum()
df2

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46
All,All,0.65,0.07,0.88,-0.41


In [71]:
# A single label is matched against the outermost row level; the rows under 'M'
# are returned and the 'Ridx1' level is dropped from the result.
df2.loc['M']

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
Ridx2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
id_1,-0.34,-1.88,-0.48,-0.66
id_2,0.94,0.3,-0.31,-1.0
id_3,0.01,0.47,0.55,0.29


다중인덱스 인덱싱의 튜플 내에서 슬라이싱을 하고 싶다면 `:` 대신 파이썬 내장 함수 `slice()`를 사용해야 함 (`slice(None)`은 `:`와 같이 해당 계층 전체를 선택함)  
`slice(마지막인덱스)`, `slice(시작인덱스,마지막인덱스)`,`slice(시작인덱스,마지막인덱스,스텝)`

In [72]:
# slice(None) stands in for ':' inside the tuple: every 'Ridx2' value under 'M'.
# Unlike df2.loc['M'], both row-index levels are kept in the result.
df2.loc[('M',slice(None)), :]

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29


In [73]:
# All rows, and every 'Cidx2' column under the outer column label 'A'.
df2.loc[:,('A',slice(None))]

Unnamed: 0_level_0,Cidx1,A,A
Unnamed: 0_level_1,Cidx2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2
M,id_1,-0.34,-1.88
M,id_2,0.94,0.3
M,id_3,0.01,0.47
F,id_1,-0.2,-0.09
F,id_2,0.47,-0.96
F,id_3,-0.23,2.23
All,All,0.65,0.07


### 다중 인덱스의 인덱스 순서 변경
다중 인덱스의 순서를 변경하고 싶으면 `swaplevel(i,j,axis)` 메서드를 사용함  
`i`,`j` 인자 : 순서를 변경할 인덱스의 이름 혹은 번호
`axis` 인자 : 0일 경우 행 인덱스, 1일 경우 열 인덱스

In [74]:
# Show df2 (now including the ('All', 'All') totals row) before swapping index levels.
df2

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46
All,All,0.65,0.07,0.88,-0.41


In [75]:
df2.swaplevel('Ridx1','Ridx2',0)  # axis=0: swap the two row-index levels; row order is unchanged

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,-0.34,-1.88,-0.48,-0.66
id_2,M,0.94,0.3,-0.31,-1.0
id_3,M,0.01,0.47,0.55,0.29
id_1,F,-0.2,-0.09,-1.49,0.63
id_2,F,0.47,-0.96,1.72,0.79
id_3,F,-0.23,2.23,0.89,-0.46
All,All,0.65,0.07,0.88,-0.41


In [76]:
df2.swaplevel('Cidx1','Cidx2',1) # axis=1: swap the two column-index levels; column order is unchanged

Unnamed: 0_level_0,Cidx2,C1,C2,C1,C2
Unnamed: 0_level_1,Cidx1,A,A,B,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46
All,All,0.65,0.07,0.88,-0.41


### 다중 인덱스의 정렬
다중 인덱스를 가지고 있는 데이터프레임에서 `sort_index`로 정렬할 때 `level` 인수를 사용하여  
어떤 계층의 인덱스를 기준으로 정렬할지 지정할 수 있음 (`axis=1`을 함께 지정하면 열 인덱스를 정렬함)

In [77]:
# Sort the rows by row-index level 0 ('Ridx1').
df2.sort_index(level=0)

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
All,All,0.65,0.07,0.88,-0.41
F,id_1,-0.2,-0.09,-1.49,0.63
F,id_2,0.47,-0.96,1.72,0.79
F,id_3,-0.23,2.23,0.89,-0.46
M,id_1,-0.34,-1.88,-0.48,-0.66
M,id_2,0.94,0.3,-0.31,-1.0
M,id_3,0.01,0.47,0.55,0.29


In [78]:
# Sort the rows by level 1 ('Ridx2') first, then by level 0 ('Ridx1') within ties.
df2.sort_index(level=(1,0))

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C1,C2,C1,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
All,All,0.65,0.07,0.88,-0.41
F,id_1,-0.2,-0.09,-1.49,0.63
M,id_1,-0.34,-1.88,-0.48,-0.66
F,id_2,0.47,-0.96,1.72,0.79
M,id_2,0.94,0.3,-0.31,-1.0
F,id_3,-0.23,2.23,0.89,-0.46
M,id_3,0.01,0.47,0.55,0.29


In [79]:
# axis=1 sorts the columns instead of the rows; level=1 orders them by 'Cidx2'.
df2.sort_index(level=1, axis=1)

Unnamed: 0_level_0,Cidx1,A,B,A,B
Unnamed: 0_level_1,Cidx2,C1,C1,C2,C2
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,-0.34,-0.48,-1.88,-0.66
M,id_2,0.94,-0.31,0.3,-1.0
M,id_3,0.01,0.55,0.47,0.29
F,id_1,-0.2,-1.49,-0.09,0.63
F,id_2,0.47,1.72,-0.96,0.79
F,id_3,-0.23,0.89,2.23,-0.46
All,All,0.65,0.88,0.07,-0.41


 **파이썬으로 다음 연산을 수행한다.**

A 반 학생 5명과 B반 학생 5명의 국어, 영어, 수학 점수를 나타내는 데이터프레임을 다음과 같이 만든다.

1. “반”, “번호”, “국어”, “영어”, “수학” 을 열로 가지는 데이터프레임 `df_score3`을 만든다.
2. `df_score3`을 변형하여 1차 행 인덱스로 “반”을, 2차 행 인덱스로 “번호”를 가지는 데이터프레임 `df_score4`를 만든다.
3. 데이터 프레임 `df_score4`에 각 학생의 평균을 나타내는 열을 오른쪽에 추가한다.
4. `df_score3`을 변형하여 행 인덱스로 “번호”를, 1차 열 인덱스로 “국어”, “영어”, “수학”을, 2차 열 인덱스로 “반”을 가지는 데이터프레임 `df_score5`을 만든다.
5. 데이터 프레임 `df_score5`에 각 반별 각 과목의 평균을 나타내는 행을 아래에 추가한다.


In [None]:
# Task 1: score table with one row per student.
# Column order: class (반), student number (번호), then the Korean (국어),
# English (영어) and math (수학) scores.
columns = ["반", "번호", "국어", "영어", "수학"]

# One tuple per student, in the same order as `columns`.
score_rows = [
    (1, 1, 60, 70, 80),
    (1, 2, 80, 50, 70),
    (1, 3, 70, 90, 50),
    (2, 1, 90, 60, 90),
    (2, 2, 100, 100, 60),
    (2, 3, 50, 80, 100),
]

# Transpose the rows into the column-name -> list-of-values mapping.
data = {name: list(values) for name, values in zip(columns, zip(*score_rows))}

df_score3 = pd.DataFrame(data, columns=columns)
df_score3



In [None]:
# Same score table as above, built directly from a column-name -> values mapping
# (the column order follows the insertion order of the mapping).
scores_by_column = {
    "반": [1, 1, 1, 2, 2, 2],
    "번호": [1, 2, 3, 1, 2, 3],
    "국어": [60, 80, 70, 90, 100, 50],
    "영어": [70, 50, 90, 60, 100, 80],
    "수학": [80, 70, 50, 90, 60, 100],
}
df_score3 = pd.DataFrame(scores_by_column)

df_score3

In [None]:
# Task 2: derive df_score4 from df_score3 with the class ('반') as the first
# row-index level and the student number ('번호') as the second.
row_index_levels = ['반', '번호']
df_score4 = df_score3.set_index(keys=row_index_levels)
df_score4

In [None]:
# Task 3: add a '평균' (mean) COLUMN on the right of df_score4 holding each
# student's average score, rounded to two decimals.
# The mean is restricted to the three subject columns so that re-running the
# cell does not fold a previously added '평균' column into the new average.
subject_cols = ['국어', '영어', '수학']
df_score4['평균'] = df_score4[subject_cols].mean(axis=1).round(2)
df_score4

In [126]:
# Task 4: reshape df_score3 into df_score5 with the student number ('번호') as
# the row index, the subjects (국어 / 영어 / 수학) as the first column level and
# the class ('반') as the second column level.
scores_by_class_and_number = df_score3.set_index(['반', '번호'])
df_score5 = scores_by_class_and_number.unstack(level='반')

In [None]:
# Task 5: append a '평균' (mean) row at the bottom of df_score5 holding the
# average of every (subject, class) column, rounded to two decimals.
# An existing '평균' row is excluded first so that re-running the cell does not
# fold the previous average into the new one.
student_rows = df_score5.drop(index='평균', errors='ignore')
df_score5.loc['평균', :] = student_rows.mean().round(2)
df_score5