In [1]:
import pandas as pd
import numpy as np

# Reindex with Pandas 

This secion handle how to treat data in Series, and DataFrame. In the future, we will see a detail of analyzing, and modifying data using Pandas. This book focus on a core function rather than complete pandas library. 

## 재색인

새로운 색인에 맞도록 객체를 새로 생성하는 기능이다.


In [2]:
obj = pd.Series(
    [4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c']
)

obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

새로운 색인에 맞게 재배열한다.   
그리고 없는 값이 있는 경우 비어있는 값을 새로 추가한다.

In [3]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

fill_value 옵션을 통해 비어있는 값을 NaN 대신 다른 값으로 치환할 수 있다.

In [4]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [5]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0,2,4])

In [6]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [7]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [8]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])

In [9]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
frame.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [13]:
states = ['Texas', 'Utah', 'California']

frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [19]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill').reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [20]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [21]:
obj = pd.Series(np.arange(5.), index=['a', 'b','c','d','e'])

In [23]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [25]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [27]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])

In [29]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [32]:
# 일치하는 axis가 없는 경우 drop 할 경우에 KeyError가 발생함으로 주의한다.
data.drop(['Colorado', 'Ohio'], axis=0)

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
# axis=0 은 default 값이며 Row 를 의미한다. 
# axis=1 은 Column 을 의미한다.
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


색인하기, 선택하기, 거르기



In [43]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [44]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [40]:
obj['b']

1.0

In [41]:
obj[1]

1.0

In [42]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [45]:
obj[['a','b','d']]

a    0.0
b    1.0
d    3.0
dtype: float64

In [46]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [47]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [50]:
obj['a': 'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [51]:
obj['b':'c'] = 5

In [52]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [54]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [55]:
data[:2] 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [57]:
data[data['three'] > 3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [59]:
data < 4

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,False,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [61]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [67]:
data.loc[['Colorado', 'Utah'], ['four', 'one', 'two']]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [69]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [71]:
data.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [75]:
data.loc[data.three > 5, :'three']

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14
