#### 데이터프레임 인덱스 조작

데이터프레임 인덱스 설정 및 제거
set_index: 기존의 행 인덱스를 제거하고 데이터 열 중 하나를 인덱스로 설정
reset_index: 기존의 행 인덱스를 제거하고 인덱스를 데이터 열로 추가

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(0)
df1 = pd.DataFrame(np.vstack([list('ABCDE'),
                            np.round(np.random.rand(3, 5),2)]).T,
                            columns=["C1","C2","C3","C4"])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [3]:
df2 = df1.set_index("C1") #기존의 인덱스는 사라짐
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [4]:
df2.set_index("C2")

Unnamed: 0_level_0,C3,C4
C2,Unnamed: 1_level_1,Unnamed: 2_level_1
0.55,0.65,0.79
0.72,0.44,0.53
0.6,0.89,0.57
0.54,0.96,0.93
0.42,0.38,0.07


In [5]:
df2.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [6]:
df2.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07


#### 연습 문제 1

In [7]:
#df_score1
df_score1=pd.DataFrame({"A":[100,100,100], "B":[100,75,75], "C":[50,100,50],
                       "D":[75,75,75], "E":[50,50,50]}, index=["국어","영어","수학"]).T

In [8]:
df_score1

Unnamed: 0,국어,영어,수학
A,100,100,100
B,100,75,75
C,50,100,50
D,75,75,75
E,50,50,50


In [9]:
df_score2 = df_score1.reset_index()
df_score2.columns = ["이름","국어","영어","수학"]

In [10]:
df_score2

Unnamed: 0,이름,국어,영어,수학
0,A,100,100,100
1,B,100,75,75
2,C,50,100,50
3,D,75,75,75
4,E,50,50,50


In [11]:
df_score2.set_index("이름")

Unnamed: 0_level_0,국어,영어,수학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100,100,100
B,100,75,75
C,50,100,50
D,75,75,75
E,50,50,50


#### 다중 인덱스

In [12]:
np.random.seed(0)
df3 = pd.DataFrame(np.round(np.random.rand(5,4),2),
                  columns=[["A","A","B","B"],
                      ["C1","C2","C1","C2"]])

In [13]:
df3

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,0.55,0.72,0.6,0.54
1,0.42,0.65,0.44,0.89
2,0.96,0.38,0.79,0.53
3,0.57,0.93,0.07,0.09
4,0.02,0.83,0.78,0.87


In [14]:
df3.columns.names = ["Cidx1","Cidx2"]
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,0.55,0.72,0.6,0.54
1,0.42,0.65,0.44,0.89
2,0.96,0.38,0.79,0.53
3,0.57,0.93,0.07,0.09
4,0.02,0.83,0.78,0.87


In [15]:
np.random.seed(0)
df4 = pd.DataFrame(np.round(np.random.randn(6,4),2),
                  columns=[["A","A","B","B"],
                          ["C","D","C","D"]],
                  index=[["M","M","M","F","F","F"],
                         ["id_" + str(i+1) for i in range(3)]*2])
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,C,D,C,D
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [16]:
df4.columns.names = ["Cidx1","Cidx2"]
df4.index.names = ["Ridx1","Ridx2"]
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


#### 행 인덱스와 열 인데스 교환

In [20]:
stack() 열 인덱스 -> 행 인덱스로 변환
unstack() 행 인덱스 -> 열 인덱스로 변환

SyntaxError: invalid syntax (<ipython-input-20-5cd6428e2fef>, line 1)

In [17]:
df4.stack("Cidx1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx2,C,D
Ridx1,Ridx2,Cidx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,A,1.76,0.4
M,id_1,B,0.98,2.24
M,id_2,A,1.87,-0.98
M,id_2,B,0.95,-0.15
M,id_3,A,-0.1,0.41
M,id_3,B,0.14,1.45
F,id_1,A,0.76,0.12
F,id_1,B,0.44,0.33
F,id_2,A,1.49,-0.21
F,id_2,B,0.31,-0.85


In [18]:
df4.stack(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx1,A,B
Ridx1,Ridx2,Cidx2,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,C,1.76,0.98
M,id_1,D,0.4,2.24
M,id_2,C,1.87,0.95
M,id_2,D,-0.98,-0.15
M,id_3,C,-0.1,0.14
M,id_3,D,0.41,1.45
F,id_1,C,0.76,0.44
F,id_1,D,0.12,0.33
F,id_2,C,1.49,0.31
F,id_2,D,-0.21,-0.85


In [19]:
df4.unstack()

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C,C,C,D,D,D,C,C,C,D,D,D
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,0.76,1.49,-2.55,0.12,-0.21,0.65,0.44,0.31,0.86,0.33,-0.85,-0.74
M,1.76,1.87,-0.1,0.4,-0.98,0.41,0.98,0.95,0.14,2.24,-0.15,1.45


In [20]:
df4.unstack("Ridx2")

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C,C,C,D,D,D,C,C,C,D,D,D
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,0.76,1.49,-2.55,0.12,-0.21,0.65,0.44,0.31,0.86,0.33,-0.85,-0.74
M,1.76,1.87,-0.1,0.4,-0.98,0.41,0.98,0.95,0.14,2.24,-0.15,1.45


In [21]:
df4.unstack(0)

Cidx1,A,A,A,A,B,B,B,B
Cidx2,C,C,D,D,C,C,D,D
Ridx1,F,M,F,M,F,M,F,M
Ridx2,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
id_1,0.76,1.76,0.12,0.4,0.44,0.98,0.33,2.24
id_2,1.49,1.87,-0.21,-0.98,0.31,0.95,-0.85,-0.15
id_3,-2.55,-0.1,0.65,0.41,0.86,0.14,-0.74,1.45


#### 다중 인덱스가 있는 경우의 인덱싱

In [22]:
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,0.55,0.72,0.6,0.54
1,0.42,0.65,0.44,0.89
2,0.96,0.38,0.79,0.53
3,0.57,0.93,0.07,0.09
4,0.02,0.83,0.78,0.87


In [23]:
df3[("B","C1")]

0    0.60
1    0.44
2    0.79
3    0.07
4    0.78
Name: (B, C1), dtype: float64

In [24]:
df3.loc[0, ("B", "C1")]

0.6

In [25]:
df3.loc[0, ("B", "C1")] = 100
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,0.55,0.72,100.0,0.54
1,0.42,0.65,0.44,0.89
2,0.96,0.38,0.79,0.53
3,0.57,0.93,0.07,0.09
4,0.02,0.83,0.78,0.87


In [26]:
df3.iloc[0, 2]

100.0

In [27]:
df3['A']

Cidx2,C1,C2
0,0.55,0.72
1,0.42,0.65
2,0.96,0.38
3,0.57,0.93
4,0.02,0.83


In [28]:
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [29]:
df4.loc[('M','id_1'),('A','C')]

1.76

In [30]:
df4.loc[:, ("A", "C")]

Ridx1  Ridx2
M      id_1     1.76
       id_2     1.87
       id_3    -0.10
F      id_1     0.76
       id_2     1.49
       id_3    -2.55
Name: (A, C), dtype: float64

In [31]:
df4.loc[("M", "id_1"), :]

Cidx1  Cidx2
A      C        1.76
       D        0.40
B      C        0.98
       D        2.24
Name: (M, id_1), dtype: float64

In [32]:
df4.loc[("All","All"),:] = df4.sum()

In [33]:
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


#### 다중 인덱서의 인덱스 순서 교환

In [34]:
df5 = df4.swaplevel("Ridx1","Ridx2")

In [35]:
df5

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,1.76,0.4,0.98,2.24
id_2,M,1.87,-0.98,0.95,-0.15
id_3,M,-0.1,0.41,0.14,1.45
id_1,F,0.76,0.12,0.44,0.33
id_2,F,1.49,-0.21,0.31,-0.85
id_3,F,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


In [36]:
df6 = df4.swaplevel("Cidx1", "Cidx2", 1)
df6

Unnamed: 0_level_0,Cidx2,C,D,C,D
Unnamed: 0_level_1,Cidx1,A,A,B,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


#### 다중 인덱스가 있는 경우의 정렬

In [37]:
df5

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,1.76,0.4,0.98,2.24
id_2,M,1.87,-0.98,0.95,-0.15
id_3,M,-0.1,0.41,0.14,1.45
id_1,F,0.76,0.12,0.44,0.33
id_2,F,1.49,-0.21,0.31,-0.85
id_3,F,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


In [38]:
df5.sort_index(level=0)

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
All,All,3.23,0.39,3.68,2.28
id_1,F,0.76,0.12,0.44,0.33
id_1,M,1.76,0.4,0.98,2.24
id_2,F,1.49,-0.21,0.31,-0.85
id_2,M,1.87,-0.98,0.95,-0.15
id_3,F,-2.55,0.65,0.86,-0.74
id_3,M,-0.1,0.41,0.14,1.45


In [39]:
df6.sort_index(axis=1, level=0)

Unnamed: 0_level_0,Cidx2,C,C,D,D
Unnamed: 0_level_1,Cidx1,A,B,A,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.98,0.4,2.24
M,id_2,1.87,0.95,-0.98,-0.15
M,id_3,-0.1,0.14,0.41,1.45
F,id_1,0.76,0.44,0.12,0.33
F,id_2,1.49,0.31,-0.21,-0.85
F,id_3,-2.55,0.86,0.65,-0.74
All,All,3.23,3.68,0.39,2.28


#### 연습 문제 2

In [64]:
df_score3=pd.DataFrame({"반":["A"]*5 +["B"]*5,
                        "번호":["ID0"+ str(i) for i in range(5)]*2,
                        "국어":np.random.randint(70,100,10),
                        "영어":np.random.randint(70,100,10),
                        "수학":np.random.randint(70,100,10)},
                      columns=['반', '번호', '국어', '영어', '수학'])

In [65]:
df_score3

Unnamed: 0,반,번호,국어,영어,수학
0,A,ID00,72,97,81
1,A,ID01,73,84,72
2,A,ID02,97,79,89
3,A,ID03,73,96,86
4,A,ID04,88,71,70
5,B,ID00,84,74,92
6,B,ID01,73,80,70
7,B,ID02,90,92,76
8,B,ID03,87,81,89
9,B,ID04,88,78,84


In [66]:
df_score4=df_score3.set_index(["반","번호"])

In [67]:
df_score4

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,ID00,72,97,81
A,ID01,73,84,72
A,ID02,97,79,89
A,ID03,73,96,86
A,ID04,88,71,70
B,ID00,84,74,92
B,ID01,73,80,70
B,ID02,90,92,76
B,ID03,87,81,89
B,ID04,88,78,84


In [68]:
df_score4["평균"] = df_score4.mean(axis=1).round(2)

In [69]:
df_score4

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학,평균
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,ID00,72,97,81,83.33
A,ID01,73,84,72,76.33
A,ID02,97,79,89,88.33
A,ID03,73,96,86,85.0
A,ID04,88,71,70,76.33
B,ID00,84,74,92,83.33
B,ID01,73,80,70,74.33
B,ID02,90,92,76,86.0
B,ID03,87,81,89,85.67
B,ID04,88,78,84,83.33


In [70]:
df_score5=df_score3.set_index(["번호","반"]).unstack()

In [71]:
df_score5

Unnamed: 0_level_0,국어,국어,영어,영어,수학,수학
반,A,B,A,B,A,B
번호,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ID00,72,84,97,74,81,92
ID01,73,73,84,80,72,70
ID02,97,90,79,92,89,76
ID03,73,87,96,81,86,89
ID04,88,88,71,78,70,84


In [72]:
df_score5.loc["평균",:] = df_score5.mean()

In [73]:
df_score5

Unnamed: 0_level_0,국어,국어,영어,영어,수학,수학
반,A,B,A,B,A,B
번호,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ID00,72.0,84.0,97.0,74.0,81.0,92.0
ID01,73.0,73.0,84.0,80.0,72.0,70.0
ID02,97.0,90.0,79.0,92.0,89.0,76.0
ID03,73.0,87.0,96.0,81.0,86.0,89.0
ID04,88.0,88.0,71.0,78.0,70.0,84.0
평균,80.6,84.4,85.4,81.0,79.6,82.2
