# Day_08

# Pandas 인덱스 조작

In [1]:
import numpy as np
import pandas as pd

## 인덱스 설정 및 제거

* ``set_index``: 기존의 행 인덱스를 제거하고 데이터 열 중 하나를 인덱스로 설정
* ``reset_index``: 기존의 행 인덱스를 제거하고 덱스를 마지막 데이터 열로 추가

In [2]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(1, 10, (10, 4)), 
                  columns=["C1", "C2", "C3", "C4"])
df

Unnamed: 0,C1,C2,C3,C4
0,6,1,4,4
1,8,4,6,3
2,5,8,7,9
3,9,2,7,8
4,8,9,2,6
5,9,5,4,1
6,4,6,1,3
7,4,9,2,4
8,4,4,8,1
9,2,1,5,8


``set_index``명령으로 C1을 인ㄷ게스로 설정할 수 있다. 이 때 기존 인덱스는 없어진다.

In [3]:
df1 = df.set_index("C1")
df1

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,1,4,4
8,4,6,3
5,8,7,9
9,2,7,8
8,9,2,6
9,5,4,1
4,6,1,3
4,9,2,4
4,4,8,1
2,1,5,8


In [4]:
df2 = df1.set_index("C2")
df2

Unnamed: 0_level_0,C3,C4
C2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,4
4,6,3
8,7,9
2,7,8
9,2,6
5,4,1
6,1,3
9,2,4
4,8,1
1,5,8


``reset_index`` 명령으로 인덱스 열을 보통의 자료열로 넣을 수 있다. 이 때 인덱스 열은 자료열의 가장 선두로 삽입된다. 인덱스는 숫자로 된 디폴트 인덱스가 된다.

``reset_index`` 명령 사용시에 ``drop=True``로 설정하면 인덱스 열을 보통의 자료열로 올리는 것이 아니라 그냥 버리게 된다.

In [5]:
df1.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,6,1,4,4
1,8,4,6,3
2,5,8,7,9
3,9,2,7,8
4,8,9,2,6
5,9,5,4,1
6,4,6,1,3
7,4,9,2,4
8,4,4,8,1
9,2,1,5,8


In [6]:
df1.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,1,4,4
1,4,6,3
2,8,7,9
3,2,7,8
4,9,2,6
5,5,4,1
6,6,1,3
7,9,2,4
8,4,8,1
9,1,5,8


**연습문제 1**

In [7]:
df_score = pd.DataFrame({
    'name': ['alpha', 'bravo', 'charlie', 'delta', 'eco'],
    'kor': [24, 25, 21, 22, 24],   
    'math': [25, 25, 24, 25, 15], 
    'eng': [10, 23, 23, 25, 10],
})
df_score

Unnamed: 0,eng,kor,math,name
0,10,24,25,alpha
1,23,25,25,bravo
2,23,21,24,charlie
3,25,22,25,delta
4,10,24,15,eco


In [8]:
df_score.set_index('name')

Unnamed: 0_level_0,eng,kor,math
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alpha,10,24,25
bravo,23,25,25
charlie,23,21,24
delta,25,22,25
eco,10,24,15


In [9]:
df_score = pd.DataFrame({
    'kor': [24, 25, 21, 22, 24],   
    'math': [25, 25, 24, 25, 15], 
    'eng': [10, 23, 23, 25, 10],
})
df_score

Unnamed: 0,eng,kor,math
0,10,24,25
1,23,25,25
2,23,21,24
3,25,22,25
4,10,24,15


In [10]:
df_score.index = ['alpha', 'bravo', 'charlie', 'delta', 'eco']
df_score.index

Index(['alpha', 'bravo', 'charlie', 'delta', 'eco'], dtype='object')

In [11]:
df_score

Unnamed: 0,eng,kor,math
alpha,10,24,25
bravo,23,25,25
charlie,23,21,24
delta,25,22,25
eco,10,24,15


In [12]:
df_score.reset_index()

Unnamed: 0,index,eng,kor,math
0,alpha,10,24,25
1,bravo,23,25,25
2,charlie,23,21,24
3,delta,25,22,25
4,eco,10,24,15


# 계층적 인덱스

인덱스는 여러 계층을 가질 수 있다.
데이터프레임을 생성할 때 ``columns`` 인수에 리스트의 리스트(행렬) 형태로 인덱스를 넣으면 계층적인 열 인덱스를 가진다.

In [13]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(1, 10, (10, 4)), 
                  columns=[["A", "A", "B", "B"], ["C1", "C2", "C3", "C4"]])
df

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C3,C4
0,6,1,4,4
1,8,4,6,3
2,5,8,7,9
3,9,2,7,8
4,8,9,2,6
5,9,5,4,1
6,4,6,1,3
7,4,9,2,4
8,4,4,8,1
9,2,1,5,8


계층적인 인덱스는 이름을 지정하면 더 편리하게 사용할 수 있다. 열 인덱스들의 이름 지정은 ``columns`` 객체의 ``names`` 속성에 리스트를 넣어서 지정한다.

In [14]:
df.columns.names = ["Cdx1", "Cdx2"]
df

Cdx1,A,A,B,B
Cdx2,C1,C2,C3,C4
0,6,1,4,4
1,8,4,6,3
2,5,8,7,9
3,9,2,7,8
4,8,9,2,6
5,9,5,4,1
6,4,6,1,3
7,4,9,2,4
8,4,4,8,1
9,2,1,5,8


Naming columns is very useful in the case of hierarchial columns.

마찬가지로 데이터프레임을 생성할 때 ``index`` 인수에 리스트의 리스트(행렬) 형태로 인덱스를 넣으면 계층적인 (행) 인덱스를 가진다. 행 인덱스들의 이름 지정은 ``index`` 객체의 ``names`` 속성에 리스트를 넣어서 지정한다.

In [15]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(1, 10, (8, 4)),
                  columns=[["A", "A", "B", "B"], ["C", "D", "C", "D"]],            
                  index = [["M", "M", "M", "M", "F", "F", "F", "F"], ["ID" + str(i) for i in range(4)]*2])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,C,D,C,D
M,ID0,6,1,4,4
M,ID1,8,4,6,3
M,ID2,5,8,7,9
M,ID3,9,2,7,8
F,ID0,8,9,2,6
F,ID1,9,5,4,1
F,ID2,4,6,1,3
F,ID3,4,9,2,4


In [16]:
df.columns.names = ["Cdx1", "Cdx2"]
df.index.names = ["Rdx1", "Rdx2"]
df

Unnamed: 0_level_0,Cdx1,A,A,B,B
Unnamed: 0_level_1,Cdx2,C,D,C,D
Rdx1,Rdx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,ID0,6,1,4,4
M,ID1,8,4,6,3
M,ID2,5,8,7,9
M,ID3,9,2,7,8
F,ID0,8,9,2,6
F,ID1,9,5,4,1
F,ID2,4,6,1,3
F,ID3,4,9,2,4


# 행 인덱스와 열 인덱스 교환 [중요]

행 인덱스와 열 인덱스는 ``stack`` 명령이나 ``unstack`` 명령으로 교환할 수 있다.

* stack()
    * 열 인덱스 -> (최하위) 행 인덱스로 변환
* unstack()
    * 행 인덱스 -> (최하위) 열 인덱스로 변환
``stack`` 명령을 실행하면 열 인덱스가 시계 방향으로 90도 회전한 것과 비슷한 모양이 된다. 마찬가지로 ``unstack`` 명령을 실행하면 행 인덱스가 반시계 방향으로 90도 회전한 것과 비슷하다.

인덱스 조작시에는 이름이나 숫자 인덱스를 사용한다.

In [17]:
df

Unnamed: 0_level_0,Cdx1,A,A,B,B
Unnamed: 0_level_1,Cdx2,C,D,C,D
Rdx1,Rdx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,ID0,6,1,4,4
M,ID1,8,4,6,3
M,ID2,5,8,7,9
M,ID3,9,2,7,8
F,ID0,8,9,2,6
F,ID1,9,5,4,1
F,ID2,4,6,1,3
F,ID3,4,9,2,4


In [18]:
df.stack("Cdx1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Cdx2,C,D
Rdx1,Rdx2,Cdx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,ID0,A,6,1
M,ID0,B,4,4
M,ID1,A,8,4
M,ID1,B,6,3
M,ID2,A,5,8
M,ID2,B,7,9
M,ID3,A,9,2
M,ID3,B,7,8
F,ID0,A,8,9
F,ID0,B,2,6


In [19]:
df.stack(["Cdx1","Cdx2"])

Rdx1  Rdx2  Cdx1  Cdx2
M     ID0   A     C       6
                  D       1
            B     C       4
                  D       4
      ID1   A     C       8
                  D       4
            B     C       6
                  D       3
      ID2   A     C       5
                  D       8
            B     C       7
                  D       9
      ID3   A     C       9
                  D       2
            B     C       7
                  D       8
F     ID0   A     C       8
                  D       9
            B     C       2
                  D       6
      ID1   A     C       9
                  D       5
            B     C       4
                  D       1
      ID2   A     C       4
                  D       6
            B     C       1
                  D       3
      ID3   A     C       4
                  D       9
            B     C       2
                  D       4
dtype: int32

In [20]:
df.stack(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Cdx2,C,D
Rdx1,Rdx2,Cdx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,ID0,A,6,1
M,ID0,B,4,4
M,ID1,A,8,4
M,ID1,B,6,3
M,ID2,A,5,8
M,ID2,B,7,9
M,ID3,A,9,2
M,ID3,B,7,8
F,ID0,A,8,9
F,ID0,B,2,6


In [21]:
df

Unnamed: 0_level_0,Cdx1,A,A,B,B
Unnamed: 0_level_1,Cdx2,C,D,C,D
Rdx1,Rdx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,ID0,6,1,4,4
M,ID1,8,4,6,3
M,ID2,5,8,7,9
M,ID3,9,2,7,8
F,ID0,8,9,2,6
F,ID1,9,5,4,1
F,ID2,4,6,1,3
F,ID3,4,9,2,4


In [22]:
df.unstack(["Rdx2","Rdx1"])

Cdx1  Cdx2  Rdx2  Rdx1
A     C     ID0   F       8
                  M       6
            ID1   F       9
                  M       8
            ID2   F       4
                  M       5
            ID3   F       4
                  M       9
      D     ID0   F       9
                  M       1
            ID1   F       5
                  M       4
            ID2   F       6
                  M       8
            ID3   F       9
                  M       2
B     C     ID0   F       2
                  M       4
            ID1   F       4
                  M       6
            ID2   F       1
                  M       7
            ID3   F       2
                  M       7
      D     ID0   F       6
                  M       4
            ID1   F       1
                  M       3
            ID2   F       3
                  M       9
            ID3   F       4
                  M       8
dtype: int32

In [23]:
df.unstack(["Rdx1","Rdx2"])

Cdx1  Cdx2  Rdx1  Rdx2
A     C     F     ID0     8
                  ID1     9
                  ID2     4
                  ID3     4
            M     ID0     6
                  ID1     8
                  ID2     5
                  ID3     9
      D     F     ID0     9
                  ID1     5
                  ID2     6
                  ID3     9
            M     ID0     1
                  ID1     4
                  ID2     8
                  ID3     2
B     C     F     ID0     2
                  ID1     4
                  ID2     1
                  ID3     2
            M     ID0     4
                  ID1     6
                  ID2     7
                  ID3     7
      D     F     ID0     6
                  ID1     1
                  ID2     3
                  ID3     4
            M     ID0     4
                  ID1     3
                  ID2     9
                  ID3     8
dtype: int32

In [24]:
df.unstack(1)

Cdx1,A,A,A,A,A,A,A,A,B,B,B,B,B,B,B,B
Cdx2,C,C,C,C,D,D,D,D,C,C,C,C,D,D,D,D
Rdx2,ID0,ID1,ID2,ID3,ID0,ID1,ID2,ID3,ID0,ID1,ID2,ID3,ID0,ID1,ID2,ID3
Rdx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
F,8,9,4,4,9,5,6,9,2,4,1,2,6,1,3,4
M,6,8,5,9,1,4,8,2,4,6,7,7,4,3,9,8


If all rows or all columns are stacked or unstacked, then DataFrame is changed to Series.

#### TIP

데이터베이스에서 데이터를 추가하는 일은 간단한 일이나 Column을 추가하는 일은 상당한 일이다. 즉, 최초의 Schema를 고치는 것은 매우 어렵다. (Wide Form)


|차량|배기량|차고|$\cdots$|
|---|---|---|---|
|ava|1500|40|$\cdots$|

따라서 이는 다음과 같이 표현하는 것이 좋다. (Stacked Form, Long Form), 엑셀의 Pivot Table로 생각하면 된다.

|차량|항목|값|
|---|---|---|
|ava|배기량|1600|
||차고|40|
||$\cdots$|$\cdots$|

In [25]:
import itertools

x1, x2 = zip(*[x for x in itertools.product(["A", "B"], [str(i) for i in range(1, 6)])])
x3, x4, x5 = np.random.randint(1, 100, size=(3, 10)).tolist()
df = pd.DataFrame({"반": x1, "번호": x2, "국어": x3, "영어": x4, "수학": x5},
                  columns=["반", "번호", "국어", "영어", "수학"])

In [26]:
df2 = df.set_index(["반", "번호"])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1,30,32,37
A,2,20,75,54
A,3,20,24,6
A,4,15,36,39
A,5,40,76,18
B,1,33,56,80
B,2,66,29,5
B,3,10,35,43
B,4,58,1,59
B,5,33,1,32


In [27]:
df2.loc[("과목", "평균"), :] = df2.mean()
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1,30.0,32.0,37.0
A,2,20.0,75.0,54.0
A,3,20.0,24.0,6.0
A,4,15.0,36.0,39.0
A,5,40.0,76.0,18.0
B,1,33.0,56.0,80.0
B,2,66.0,29.0,5.0
B,3,10.0,35.0,43.0
B,4,58.0,1.0,59.0
B,5,33.0,1.0,32.0


In [28]:
df2["평균"] = df2.mean(axis=1)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학,평균
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,30.0,32.0,37.0,33.0
A,2,20.0,75.0,54.0,49.666667
A,3,20.0,24.0,6.0,16.666667
A,4,15.0,36.0,39.0,30.0
A,5,40.0,76.0,18.0,44.666667
B,1,33.0,56.0,80.0,56.333333
B,2,66.0,29.0,5.0,33.333333
B,3,10.0,35.0,43.0,29.333333
B,4,58.0,1.0,59.0,39.333333
B,5,33.0,1.0,32.0,22.0


# Pandas 데이터 합성

# Merge
merge 명령은 두 데이터 프레임의 공통 열 혹은 인덱스를 기준으로 데이터베이스 테이블 조인(join)과 같이 두 개의 테이블을 합친다. 기준이 되는 데이터를 키(key)라고 한다.

## merge 예제 1

In [29]:
df1 = pd.DataFrame({'key': ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희'], 'data1': range(7)})
df1

Unnamed: 0,data1,key
0,0,둘리
1,1,도우너
2,2,또치
3,3,길동
4,4,희동
5,5,마이콜
6,6,영희


In [30]:
df2 = pd.DataFrame({'key': ['길동', '희동', '철수'], 'data2': range(3)})
df2

Unnamed: 0,data2,key
0,0,길동
1,1,희동
2,2,철수


``merge`` 명령으로 두 데이터프레임 df1, df2 를 합치면 공통 열인 ``key`` 열을 기준으로 데이터를 찾아서 합친다. 이 때 기본적으로는 양쪽 데이터프레임에 모두 키가 존재하는 데이터만 보여주는 inner join 방식을 사용한다.

In [31]:
pd.merge(df1, df2)

Unnamed: 0,data1,key,data2
0,3,길동,0
1,4,희동,1


Inner Join:

In [32]:
df1.merge(df2)

Unnamed: 0,data1,key,data2
0,3,길동,0
1,4,희동,1


outer join 방식은 키 값이 한쪽에만 있어도 양쪽 데이터를 모두 보여준다.

In [33]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,data1,key,data2
0,0.0,둘리,
1,1.0,도우너,
2,2.0,또치,
3,3.0,길동,0.0
4,4.0,희동,1.0
5,5.0,마이콜,
6,6.0,영희,
7,,철수,2.0


left, right 방식은 첫번째, 혹은 두번째 데이터프레임을 모두 보여준다.

In [34]:
pd.merge(df1, df2, how='left')

Unnamed: 0,data1,key,data2
0,0,둘리,
1,1,도우너,
2,2,또치,
3,3,길동,0.0
4,4,희동,1.0
5,5,마이콜,
6,6,영희,


In [35]:
pd.merge(df1, df2, how='right')

Unnamed: 0,data1,key,data2
0,3.0,길동,0
1,4.0,희동,1
2,,철수,2


## merge 예제 2

In [36]:
df1 = pd.DataFrame({'key': ['setosa', 'setosa', 'virginica', 'versicolor'], 
                    'petal length': [1.4, 1.3, 1.5, 1.3]})
df1

Unnamed: 0,key,petal length
0,setosa,1.4
1,setosa,1.3
2,virginica,1.5
3,versicolor,1.3


In [37]:
df2 = pd.DataFrame({'key': ['setosa', 'virginica', 'virginica', 'versicolor'], 
                    'petal width': [0.4, 0.3, 0.5, 0.3]})
df2

Unnamed: 0,key,petal width
0,setosa,0.4
1,virginica,0.3
2,virginica,0.5
3,versicolor,0.3


In [38]:
pd.merge(df1, df2)

Unnamed: 0,key,petal length,petal width
0,setosa,1.4,0.4
1,setosa,1.3,0.4
2,virginica,1.5,0.3
3,virginica,1.5,0.5
4,versicolor,1.3,0.3


# merge 예제 3

In [39]:
df1 = pd.DataFrame({'key1': ['foo', 'foo', 'bar'], 
                    'key2': ['one', 'two', 'one'], 
                    'lval': [1, 2, 3]})
df1

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [40]:
df2 = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                    'key2': ['one', 'one', 'one', 'two'],
                    'rval': [4, 5, 6, 7]})
df2

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


이름이 같은 열은 모두 기준 열로 사용된다.

In [41]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


기준 열은 ``on`` 인수로 명시적 설정이 가능하다.

In [42]:
pd.merge(df1, df2, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [43]:
pd.merge(df1, df2, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


``suffixes`` 인수로 접미사를 직접 설정할 수도 있다.

# merge 예제 4

기준 열을 각각의 데이터프레임에 대해 다르게 정하려면 ``left_on``, ``right_on`` 인수를 사용한다.

In [44]:
df1 = pd.DataFrame({'key1': ['foo', 'foo', 'bar'], 
                    'key2': ['one', 'two', 'one'], 
                    'lval': [1, 2, 3]})
df1

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [45]:
df2 = pd.DataFrame({'k1': ['foo', 'foo', 'bar', 'bar'],
                    'k2': ['one', 'one', 'one', 'two'],
                    'rval': [4, 5, 6, 7]})
df2

Unnamed: 0,k1,k2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [46]:
pd.merge(df1, df2, left_on='key1', right_on="k1")

Unnamed: 0,key1,key2,lval,k1,k2,rval
0,foo,one,1,foo,one,4
1,foo,one,1,foo,one,5
2,foo,two,2,foo,one,4
3,foo,two,2,foo,one,5
4,bar,one,3,bar,one,6
5,bar,one,3,bar,two,7


# merge 예제 5

일반 데이터 열이 아닌 인덱스를 기준열로 사용하려면 left_index 또는 right_index 인수를 True 로 설정한다.

In [47]:
df1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
df1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [48]:
df2 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
df2

Unnamed: 0,group_val
a,3.5
b,7.0


In [49]:
pd.merge(df1, df2, left_on='key', right_index=True, how = 'outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


# merge 예제 6

In [50]:
df1 = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                    'key2': [2000, 2001, 2002, 2001, 2002],
                    'data': np.arange(5.)})
df1

Unnamed: 0,data,key1,key2
0,0.0,Ohio,2000
1,1.0,Ohio,2001
2,2.0,Ohio,2002
3,3.0,Nevada,2001
4,4.0,Nevada,2002


In [51]:
df2 = pd.DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
df2

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [52]:
pd.merge(df1, df2, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4,5
0,0.0,Ohio,2000,6,7
1,1.0,Ohio,2001,8,9
2,2.0,Ohio,2002,10,11
3,3.0,Nevada,2001,0,1


# merge 예제 7

In [53]:
df1 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], 
                   index=['a', 'c', 'e'], 
                   columns=['Ohio', 'Nevada'])
df1

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [54]:
df2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], 
                   index=['b', 'c', 'd', 'e'], 
                   columns=['Missouri', 'Alabama'])
df2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [55]:
pd.merge(df1, df2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


# join 메서드

``merge`` 명령어 대신 ``join``메서드를 사용할 수도 있다

In [56]:
df1.join(df2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


# Concat

``concat`` 명령을 사용하면 기준 열(key column)을 사용하지 않고 단순히 데이터를 추가한다.

기본적으로는 아래에 데이터 행을 덧붙이지만 (인덱스가 중복됨) ``axis=1``로 인수를 설정하면 인덱스 기준으로 옆으로 데이터 열를 덧붙인다.

In [57]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [58]:
s1

a    0
b    1
dtype: int64

In [59]:
s2

c    2
d    3
e    4
dtype: int64

In [60]:
s3

f    5
g    6
dtype: int64

In [61]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [62]:
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [63]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [64]:
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [65]:
pd.concat([df1, df2])

Unnamed: 0,four,one,three,two
a,,0.0,,1.0
b,,2.0,,3.0
c,,4.0,,5.0
a,6.0,,5.0,
c,8.0,,7.0,


In [66]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


# Pandas 피봇과 그룹 연산

# 피봇 테이블

피봇 테이블(pivot table)이란 데이터 열(column) 중에서 두 개를 키(key)로 사용하여 데이터를 선택하는 방법을 말한다.

피봇 테이블을 사용하기 위해서는 키가 될 수 있는 두 개의 열(column) 혹은 필드(field)를 선택하여 이 두 열을

* 행 인덱스 (row index)
* 열 인덱스 (column index)

로 변경해야 한다.

<img src='./images/pivot_table.png'>

* `pivot` 메서드를 사용하면 행 인덱스, 열 인덱스, 자료가 될 3가지의 열(column)을 지정할 수 있다.

In [82]:
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 2.5, 3.0, 2.5, 3.5]
}
df = pd.DataFrame(data, columns=["state", "year", "pop"])
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,2.5
2,Ohio,2002,3.0
3,Nevada,2001,2.5
4,Nevada,2002,3.5


In [83]:
df.pivot("state", "year", "pop")

year,2000,2001,2002
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nevada,,2.5,3.5
Ohio,1.5,2.5,3.0


* 행 인덱스와, 열 인덱스가 될 자료는 키(key)의 역할을 해야 한다. 즉, 이 값으로 데이터가 유일하게(unique) 결정되어야 한다.

In [88]:
df.set_index(["state", "year"]).unstack()

Unnamed: 0_level_0,pop,pop,pop
year,2000,2001,2002
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Nevada,,2.5,3.5
Ohio,1.5,2.5,3.0


## 그룹 연산

그룹 연산은 피봇 테이블과 달리 키에 의해서 결정되는 데이터가 복수개가 있어도 괜찮다. 대신 연산을 통해 복수개의 그룹 데이터에 대한 대표값을 정한다. 이를 split-apply-combine 연산이라고도 한다.


* split 단계
 * 특정 Key 값에 따라 데이터 그룹을 만든다.


* apply 단계
 * 각각의 그룹에 대해 원하는 연산을 하여 대표값을 생성한다.
   * `count()`, `mean()`, `median()`, `min()`, `max()`
   * `sum()`, `prod()`, `std()`, `var()`, `quantile()`
   * `first()`, `last()`

* combine 단계
 * 그룹의 Key 값에 대해 원하는 연산의 결과를 Value로 지정한 dict를 생성한다.
 
<img src="https://datascienceschool.net/upfiles/5cf33c481e8041ebbf56a5af1f84d487.png" style="width:80%;">

In [90]:
np.random.seed(0)
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,1.764052,-0.977278,a,one
1,0.400157,0.950088,a,two
2,0.978738,-0.151357,b,one
3,2.240893,-0.103219,b,two
4,1.867558,0.410599,a,one


* 문제: key1 값에 따른 data1의 평균은?

In [91]:
df.data1.groupby(df.key1).mean()

key1
a    1.343923
b    1.609816
Name: data1, dtype: float64

In [92]:
gs = df.data1.groupby(df.key1)
gs

<pandas.core.groupby.SeriesGroupBy object at 0x000002513525A9E8>

In [93]:
gs.mean()

key1
a    1.343923
b    1.609816
Name: data1, dtype: float64

* 문제: 복합 key (key1, key2)  값에 따른 data1의 평균은?

In [94]:
means = df.data1.groupby([df.key1, df.key2]).mean()
means

key1  key2
a     one     1.815805
      two     0.400157
b     one     0.978738
      two     2.240893
Name: data1, dtype: float64

### groupby 명령의 인수

* groupby 명령에서  Key 인수로 입력할 수 있는 값은 다음과 같다.
 * 열 또는 열의 리스트 
 * 행 인덱스
 * 사전/함수: Column의 값을 사전에 매핑(mapping)하거나 함수 처리하여 나온 결괏값을 키로 인식


In [95]:
np.random.seed(0)
people = pd.DataFrame(np.random.randn(5, 5), 
                      columns=['a', 'b', 'c', 'd', 'e'], 
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.ix[2:3, ['b', 'c']] = np.nan
people

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


Unnamed: 0,a,b,c,d,e
Joe,1.764052,0.400157,0.978738,2.240893,1.867558
Steve,-0.977278,0.950088,-0.151357,-0.103219,0.410599
Wes,0.144044,,,0.121675,0.443863
Jim,0.333674,1.494079,-0.205158,0.313068,-0.854096
Travis,-2.55299,0.653619,0.864436,-0.742165,2.269755


In [101]:
print("="*80)
for n, g in people.groupby(people.index):
    print("[key]:", n)
    print("-"*80)
    print(g)
    print("="*80)

[key]: Jim
--------------------------------------------------------------------------------
            a         b         c         d         e
Jim  0.333674  1.494079 -0.205158  0.313068 -0.854096
[key]: Joe
--------------------------------------------------------------------------------
            a         b         c         d         e
Joe  1.764052  0.400157  0.978738  2.240893  1.867558
[key]: Steve
--------------------------------------------------------------------------------
              a         b         c         d         e
Steve -0.977278  0.950088 -0.151357 -0.103219  0.410599
[key]: Travis
--------------------------------------------------------------------------------
              a         b         c         d         e
Travis -2.55299  0.653619  0.864436 -0.742165  2.269755
[key]: Wes
--------------------------------------------------------------------------------
            a   b   c         d         e
Wes  0.144044 NaN NaN  0.121675  0.443863


In [102]:
mapping = {'Joe': 'J', 'Jim': 'J', 'Steve': 'S', 'Wes': 'S', 'Travis': 'S'}
print("="*80)
for n, g in people.groupby(mapping):
    print("[key]:", n)
    print("[group]:", type(g))
    print("-"*80)
    print(g)
    print("="*80)

[key]: J
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
            a         b         c         d         e
Joe  1.764052  0.400157  0.978738  2.240893  1.867558
Jim  0.333674  1.494079 -0.205158  0.313068 -0.854096
[key]: S
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
               a         b         c         d         e
Steve  -0.977278  0.950088 -0.151357 -0.103219  0.410599
Wes     0.144044       NaN       NaN  0.121675  0.443863
Travis -2.552990  0.653619  0.864436 -0.742165  2.269755


In [104]:
cap1 = lambda x: x[0].upper()
print("="*80)
for n, g in people.groupby(cap1):
    print("[key]:", n)
    print("[group]:", type(g))
    print("-"*80)
    print(g)
    print("="*80)

[key]: J
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
            a         b         c         d         e
Joe  1.764052  0.400157  0.978738  2.240893  1.867558
Jim  0.333674  1.494079 -0.205158  0.313068 -0.854096
[key]: S
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
              a         b         c         d         e
Steve -0.977278  0.950088 -0.151357 -0.103219  0.410599
[key]: T
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
              a         b         c         d         e
Travis -2.55299  0.653619  0.864436 -0.742165  2.269755
[key]: W
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
            a   b   c         d         e
Wes  0.144044 NaN NaN  0

In [105]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
for n, g in people.groupby(mapping, axis=1):
    print("[key]:", n)
    print("[group]:", type(g))
    print("-"*80)
    print(g)
    print("="*80)

[key]: blue
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
               c         d
Joe     0.978738  2.240893
Steve  -0.151357 -0.103219
Wes          NaN  0.121675
Jim    -0.205158  0.313068
Travis  0.864436 -0.742165
[key]: red
[group]: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------------------------------------
               a         b         e
Joe     1.764052  0.400157  1.867558
Steve  -0.977278  0.950088  0.410599
Wes     0.144044       NaN  0.443863
Jim     0.333674  1.494079 -0.854096
Travis -2.552990  0.653619  2.269755


## 특별한 group 별 연산
* 통계
   * `describe()`

* 그룹을 대표하는 하나의 값을 계산
   * `agg()`, `aggregate()`

* 대표값으로 필드를 교체
   * `transform()`

* 그룹 전체를 변형하는 계산
   * `apply()`

In [106]:
import seaborn as sns

In [107]:
tips = sns.load_dataset("tips")
tips.tail(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
234,15.53,3.0,Male,Yes,Sat,Dinner,2
235,10.07,1.25,Male,No,Sat,Dinner,2
236,12.6,1.0,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [108]:
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.0,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.0,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204
243,18.78,3.0,Female,No,Thur,Dinner,2,0.159744


In [109]:
tips.describe()

Unnamed: 0,total_bill,tip,size,tip_pct
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.160803
std,8.902412,1.383638,0.9511,0.061072
min,3.07,1.0,1.0,0.035638
25%,13.3475,2.0,2.0,0.129127
50%,17.795,2.9,2.0,0.15477
75%,24.1275,3.5625,3.0,0.191475
max,50.81,10.0,6.0,0.710345


### 그룹별 통계

In [113]:
tips.groupby(["sex", "smoker"])[["tip", "tip_pct"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip,tip,tip,tip,tip,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Male,Yes,60.0,3.051167,1.50012,1.0,2.0,3.0,3.82,10.0,60.0,0.152771,0.090588,0.035638,0.101845,0.141015,0.191697,0.710345
Male,No,97.0,3.113402,1.489559,1.25,2.0,2.74,3.71,9.0,97.0,0.160669,0.041849,0.071804,0.13181,0.157604,0.18622,0.29199
Female,Yes,33.0,2.931515,1.219916,1.0,2.0,2.88,3.5,6.5,33.0,0.18215,0.071595,0.056433,0.152439,0.173913,0.198216,0.416667
Female,No,54.0,2.773519,1.128425,1.0,2.0,2.68,3.4375,5.2,54.0,0.156921,0.036421,0.056797,0.139708,0.149691,0.18163,0.252672


In [116]:
gs = tips.groupby(["sex", "smoker"])
gs_pct = gs["tip_pct"]

In [117]:
gs_pct.mean()

sex     smoker
Male    Yes       0.152771
        No        0.160669
Female  Yes       0.182150
        No        0.156921
Name: tip_pct, dtype: float64

In [118]:
gs_pct.agg('mean')

sex     smoker
Male    Yes       0.152771
        No        0.160669
Female  Yes       0.182150
        No        0.156921
Name: tip_pct, dtype: float64

In [119]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

gs_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,Yes,0.152771,0.090588,0.674707
Male,No,0.160669,0.041849,0.220186
Female,Yes,0.18215,0.071595,0.360233
Female,No,0.156921,0.036421,0.195876


In [120]:
gs.agg({'tip_pct' : 'mean', 'total_bill' : peak_to_peak})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,0.152771,43.56
Male,No,0.160669,40.82
Female,Yes,0.18215,41.23
Female,No,0.156921,28.58


### 그룹의 값을 대표값으로 대체

In [122]:
gs = tips.groupby(["sex", "smoker"])

In [126]:
gs.agg("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,Yes,22.2845,3.051167,2.5,0.152771
Male,No,19.791237,3.113402,2.71134,0.160669
Female,Yes,17.977879,2.931515,2.242424,0.18215
Female,No,18.105185,2.773519,2.592593,0.156921


In [124]:
tips2 = tips.copy()
tips2["tip2"] = gs.transform("mean")["tip_pct"]
tips2.tail(15)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,tip2
229,22.12,2.88,Female,Yes,Sat,Dinner,2,0.130199,0.18215
230,24.01,2.0,Male,Yes,Sat,Dinner,4,0.083299,0.152771
231,15.69,3.0,Male,Yes,Sat,Dinner,3,0.191205,0.152771
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199,0.160669
233,10.77,1.47,Male,No,Sat,Dinner,2,0.13649,0.160669
234,15.53,3.0,Male,Yes,Sat,Dinner,2,0.193175,0.152771
235,10.07,1.25,Male,No,Sat,Dinner,2,0.124131,0.160669
236,12.6,1.0,Male,Yes,Sat,Dinner,2,0.079365,0.152771
237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638,0.152771
238,35.83,4.67,Female,No,Sat,Dinner,3,0.130338,0.156921


### 그룹 자체를 대체

* `apply` 메소드는 수치값이 아닌 Group을 출력
* 단순히 대표값을 계산하는 것 뿐 아니라
* 순서 정렬, 일부 삭제 등 그룹 내의 레코드 자체를 변형하는 것도 가능

In [127]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]

In [128]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [131]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [133]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [134]:
f = lambda x: x.describe()
tips.groupby(['smoker']).apply(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Yes,count,93.0,93.0,93.0,93.0
Yes,mean,20.756344,3.00871,2.408602,0.163196
Yes,std,9.832154,1.401468,0.810751,0.085119
Yes,min,3.07,1.0,1.0,0.035638
Yes,25%,13.42,2.0,2.0,0.106771
Yes,50%,17.92,3.0,2.0,0.153846
Yes,75%,26.86,3.68,3.0,0.195059
Yes,max,50.81,10.0,5.0,0.710345
No,count,151.0,151.0,151.0,151.0
No,mean,19.188278,2.991854,2.668874,0.159328


## pivot_table

* pivot 명령과 groupby 명령의 중간적 성격
* pivot을 수행하지만 데이터가 유니크하게 선택되지 않으면 aggfunc 인수로 정의된 함수를 수행하여 대표값 계산
* 디폴트 aggfunc 은 평균 계산

In [135]:
tips.pivot_table(index=['sex', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,Yes,2.5,3.051167,0.152771,22.2845
Male,No,2.71134,3.113402,0.160669,19.791237
Female,Yes,2.242424,2.931515,0.18215,17.977879
Female,No,2.592593,2.773519,0.156921,18.105185


In [136]:
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'], columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,Yes,No,Yes,No
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Male,Thur,2.3,2.5,0.164417,0.165706
Male,Fri,2.125,2.0,0.14473,0.138005
Male,Sat,2.62963,2.65625,0.139067,0.162132
Male,Sun,2.6,2.883721,0.173964,0.158291
Female,Thur,2.428571,2.48,0.163073,0.155971
Female,Fri,2.0,2.5,0.209129,0.165296
Female,Sat,2.2,2.307692,0.163817,0.147993
Female,Sun,2.5,3.071429,0.237075,0.16571


In [137]:
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker', margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,Yes,No,All,Yes,No,All
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Male,Thur,2.3,2.5,2.433333,0.164417,0.165706,0.165276
Male,Fri,2.125,2.0,2.1,0.14473,0.138005,0.143385
Male,Sat,2.62963,2.65625,2.644068,0.139067,0.162132,0.151577
Male,Sun,2.6,2.883721,2.810345,0.173964,0.158291,0.162344
Female,Thur,2.428571,2.48,2.46875,0.163073,0.155971,0.157525
Female,Fri,2.0,2.5,2.111111,0.209129,0.165296,0.199388
Female,Sat,2.2,2.307692,2.25,0.163817,0.147993,0.15647
Female,Sun,2.5,3.071429,2.944444,0.237075,0.16571,0.181569
All,,2.408602,2.668874,2.569672,0.163196,0.159328,0.160803


In [138]:
tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc=len, margins=True)

Unnamed: 0_level_0,day,Thur,Fri,Sat,Sun,All
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,Yes,10.0,8.0,27.0,15.0,60.0
Male,No,20.0,2.0,32.0,43.0,97.0
Female,Yes,7.0,7.0,15.0,4.0,33.0
Female,No,25.0,2.0,13.0,14.0,54.0
All,,62.0,19.0,87.0,76.0,244.0


In [139]:
tips.pivot_table('size', index=['time', 'sex', 'smoker'],
                 columns='day', aggfunc='sum', fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Thur,Fri,Sat,Sun
time,sex,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lunch,Male,Yes,23,5,0,0
Lunch,Male,No,50,0,0,0
Lunch,Female,Yes,17,6,0,0
Lunch,Female,No,60,3,0,0
Dinner,Male,Yes,0,12,71,39
Dinner,Male,No,0,4,85,124
Dinner,Female,Yes,0,8,33,10
Dinner,Female,No,2,2,30,43


### 연습 문제 1

타이타닉 승객 데이터를 이용하여 다음 분석을 실시하라. 데이터는 다음과 같이 받을 수 있다.

```
titanic = sns.load_dataset("titanic")
```

1. 남/여, 선실, 나이에 의한 생존율을 데이터프레임으로 계산한다. 
행에는 남/여 및 나이에 대한 계층적 인덱스를 사용하고 열에는 선실 인덱스를 사용한다.
2. 남/여 및  선실에 의한 생존율을 피봇 데이터 형태로 만든다.

