## Chapter 7
# 데이터 준비하기: 다듬기, 변형, 병합
---
## 재형성과 피벗

In [1]:
# IPython magic: draw matplotlib figures inline. It also populates the
# interactive namespace from numpy and matplotlib (see the message printed below).
%pylab inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


## stack(), unstack()
- stack : 칼럼(열) 레이블을 인덱스의 가장 안쪽 레벨로 옮긴다. (결과 인덱스는 MultiIndex)
- unstack : 인덱스의 한 레벨(기본값은 가장 안쪽 레벨)을 칼럼(열)으로 옮긴다.

In [2]:
# Small 2x3 example frame; both axes get a name ('state' for rows,
# 'number' for columns) so the levels can be told apart after stack()/unstack().
data = pd.DataFrame(
    np.arange(6).reshape((2, 3)),
    index=['Ohio', 'Colorado'],
    columns=['one', 'two', 'three'],
).rename_axis(index='state', columns='number')
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [3]:
# stack() moves the column labels ('number') into the innermost index level,
# giving a Series indexed by (state, number).
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [4]:
# With no argument, unstack() moves the innermost index level
# (here level 1, 'number') back into the columns, restoring the original layout.
result.unstack() # level=1

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [5]:
# Move the outer index level ('state') into the columns instead;
# 'number' stays as the row index.
result.unstack(level=0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [6]:
# Turn the stacked Series into a one-column frame 'a' and add a second column 'b'.
df = result.to_frame('a').assign(b=range(10, 16))
df
# Move the inner index level ('number') into the columns, below the a/b level.
df2 = df.unstack()
# Intermediate step (not rendered; only the last expression of the cell is shown):
# push column level 0 (a/b) down into the row index.
df2.stack(level=0)
# Same stack, then lift index level 0 ('state') up into the columns.
df2.stack(level=0).unstack(level=0)

number,one,one,three,three,two,two
state,Ohio,Colorado,Ohio,Colorado,Ohio,Colorado
a,0,3,2,5,1,4
b,10,13,12,15,11,14


## pivot()
- 인덱스/칼럼/값 세가지 속성(열) 을 지정한다.
- 주의 : (인덱스, 칼럼) 조합에 중복이 있으면 안 된다. 중복이 있으면 ValueError 가 발생한다.
- 9장에서 상세한 기능을 가진 pivot_table() 함수를 배운다.

In [7]:
from pandas.tseries.offsets import MonthEnd

# Load the macro data set; the first two columns (year, quarter) are read as integers.
data = pd.read_csv('macrodata.csv', dtype={0: int, 1: int})
# Index each row by the last day of its quarter: encode the quarter's final
# month as the integer YYYYMM, parse it, then roll forward to the month end.
# (A string-based alternative is to format each row as '%d-%d' % (year, quarter*3)
# before calling pd.to_datetime.)
data.index = pd.to_datetime(
    data['year'] * 100 + data['quarter'] * 3,
    format='%Y%m',
) + MonthEnd(1)
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959-03-31,1959,1,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1959-06-30,1959,2,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
1959-09-30,1959,3,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
1959-12-31,1959,4,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
1960-03-31,1960,1,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [12]:
# Print the index type, per-column dtypes / non-null counts and memory usage,
# to confirm the DatetimeIndex and the integer year/quarter columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 203 entries, 1959-03-31 to 2009-09-30
Data columns (total 14 columns):
year        203 non-null int32
quarter     203 non-null int32
realgdp     203 non-null float64
realcons    203 non-null float64
realinv     203 non-null float64
realgovt    203 non-null float64
realdpi     203 non-null float64
cpi         203 non-null float64
m1          203 non-null float64
tbilrate    203 non-null float64
unemp       203 non-null float64
pop         203 non-null float64
infl        203 non-null float64
realint     203 non-null float64
dtypes: float64(12), int32(2)
memory usage: 32.2 KB


In [8]:
# Reshape three indicators into long format: one row per (date, item) observation.
ldata = (
    data.loc[:, ['realgdp', 'infl', 'unemp']]
    .stack()
    .reset_index()
)
# Name the three resulting columns (ldata.rename(columns=...) would also work).
ldata.columns = ['date', 'item', 'value']
ldata.head()

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34


In [9]:
# Back to wide format: one row per date, one column per item, cells taken from 'value'.
# The arguments are passed by keyword because DataFrame.pivot no longer accepts
# positional arguments in pandas >= 2.0 (the positional form raises TypeError there).
ldata.pivot(index='date', columns='item', values='value').head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


In [10]:
# Add a second value column filled with random normal draws.
ldata['value2'] = np.random.randn(len(ldata))
# With `values` omitted, every remaining column is pivoted, so the result has
# hierarchical columns: (value | value2) x item.
# Keyword arguments are required by DataFrame.pivot in pandas >= 2.0.
ldata.pivot(index='date', columns='item').head()
# Equivalent: ldata.set_index(['date', 'item']).unstack()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,-0.735448,-0.876039,-0.694583
1959-06-30,2.34,2778.801,5.1,-0.506969,-0.043094,0.253357
1959-09-30,2.74,2775.488,5.3,-0.451042,1.859869,-0.665456
1959-12-31,0.27,2785.204,5.6,-0.213347,-1.034907,-0.215612
1960-03-31,2.31,2847.699,5.2,-0.064467,-1.448843,-0.752661


In [11]:
# 참고
ldata2 = ldata.pivot('date','item').rename(columns=str.upper, level=1)
display(ldata2.head())
ldata2.columns = [['a','a','a','b','b','b'],['a1','a2','a3','b1','b2','b3']]
ldata2.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,INFL,REALGDP,UNEMP,INFL,REALGDP,UNEMP
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,-0.735448,-0.876039,-0.694583
1959-06-30,2.34,2778.801,5.1,-0.506969,-0.043094,0.253357
1959-09-30,2.74,2775.488,5.3,-0.451042,1.859869,-0.665456
1959-12-31,0.27,2785.204,5.6,-0.213347,-1.034907,-0.215612
1960-03-31,2.31,2847.699,5.2,-0.064467,-1.448843,-0.752661


Unnamed: 0_level_0,a,a,a,b,b,b
Unnamed: 0_level_1,a1,a2,a3,b1,b2,b3
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,-0.735448,-0.876039,-0.694583
1959-06-30,2.34,2778.801,5.1,-0.506969,-0.043094,0.253357
1959-09-30,2.74,2775.488,5.3,-0.451042,1.859869,-0.665456
1959-12-31,0.27,2785.204,5.6,-0.213347,-1.034907,-0.215612
1960-03-31,2.31,2847.699,5.2,-0.064467,-1.448843,-0.752661
