In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# 5.2 핵심 기능
## 5.2.1 재색인

In [2]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [4]:
obj.reindex(['a','b','c','d','e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [5]:
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])

In [6]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [7]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [9]:
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'],
                  columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [11]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [12]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [24]:
frame.reindex(index=['a','b','c','d'], 
              method='ffill', 
              columns=states)

ValueError: index must be monotonic increasing or decreasing

In [46]:
frame.loc[['a','b','c','d'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## 5.2.2 하나의 로우 또는 칼럼 삭제하기

In [25]:
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [26]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])

In [28]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [34]:
data.drop(['two','four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## 5.2.3 색인하기, 선택하기, 거르기

In [36]:
obj = Series(np.arange(4.), index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [39]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [40]:
# 슬라이싱이랑 다르게 시작점과 끝점 포함
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [41]:
data = DataFrame(np.arange(16).reshape(4,4),
                index=['Ohio','Colorado','Utah','New York'],
                columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
data.ix['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32

In [48]:
data.ix[['Colorado','Utah'],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [50]:
data.ix[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [51]:
data.ix[data.three > 5, :3]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


## 5.2.4 산술연산과 데이터 정렬

In [55]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a','c','d','e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [56]:
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [57]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [58]:
df1 = DataFrame(np.arange(9.).reshape(3,3), columns=list('bdc'),
               index=['Ohio','Texas','Colorado'])
df1

Unnamed: 0,b,d,c
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [59]:
df2 = DataFrame(np.arange(12.).reshape(4,3), columns=list('bdc'),
               index=['Utah','Ohio','Texas','Oregon'])
df2

Unnamed: 0,b,d,c
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [60]:
# 겹치지 않는 부분은 NaN
df1 + df2

Unnamed: 0,b,d,c
Colorado,,,
Ohio,3.0,5.0,7.0
Oregon,,,
Texas,9.0,11.0,13.0
Utah,,,


In [61]:
# 겹치지 않는 부분은 NaN 없애기(0 넣음)
df1.add(df2, fill_value=0)

Unnamed: 0,b,d,c
Colorado,6.0,7.0,8.0
Ohio,3.0,5.0,7.0
Oregon,9.0,10.0,11.0
Texas,9.0,11.0,13.0
Utah,0.0,1.0,2.0


In [63]:
df3 = DataFrame(np.arange(20.).reshape(4,5), columns=list('abcde'))
df1.reindex(columns=df3.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
Ohio,0,0.0,2.0,1.0,0
Texas,0,3.0,5.0,4.0,0
Colorado,0,6.0,8.0,7.0,0


## 5.2.5 함수 적용과 매핑

In [65]:
frame= DataFrame(np.random.randn(4,3), columns=list('bde'),
                index=['Utan','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utan,-0.54396,1.668098,2.09008
Ohio,-1.038881,0.83059,1.222307
Texas,-1.341092,-0.710786,0.453553
Oregon,-0.366772,-1.714016,0.752834


In [66]:
np.abs(frame)

Unnamed: 0,b,d,e
Utan,0.54396,1.668098,2.09008
Ohio,1.038881,0.83059,1.222307
Texas,1.341092,0.710786,0.453553
Oregon,0.366772,1.714016,0.752834


In [67]:
# apply : 1차원 배열에 함수 적용
f = lambda x: x.max() - x.min()
frame.apply(f)

b    0.974320
d    3.382114
e    1.636527
dtype: float64

In [68]:
frame.apply(f, axis=1)

Utan      2.634040
Ohio      2.261188
Texas     1.794644
Oregon    2.466850
dtype: float64

In [70]:
def f(x):
    return Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.341092,-1.714016,0.453553
max,-0.366772,1.668098,2.09008


In [71]:
# 실수 값 문자열 포맷
format = lambda x: '%.2f'% x
frame.applymap(format)

Unnamed: 0,b,d,e
Utan,-0.54,1.67,2.09
Ohio,-1.04,0.83,1.22
Texas,-1.34,-0.71,0.45
Oregon,-0.37,-1.71,0.75


In [72]:
frame['e'].map(format)

Utan      2.09
Ohio      1.22
Texas     0.45
Oregon    0.75
Name: e, dtype: object

## 5.2.6 정렬과 순위

In [78]:
obj = Series(range(4), index=list('dabc')) #['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [80]:
frame =DataFrame(np.arange(8).reshape((2,4)), index=['three','one'],
                 columns=list('dabc'))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [81]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [82]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [83]:
# 내림차순 정렬
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [84]:
# 값에 따라 정렬
obj = Series([4,7,-3,2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [85]:
frame = DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [91]:
# 하나 이상의 컬럼에 있는 값으로 정렬
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [92]:
frame.sort_values(by=['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [94]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [95]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64