## Chapter 7
# 데이터 준비하기: 다듬기, 변형, 병합
---
## 데이터 합치기 (pd.concat)

In [1]:
# IPython magic; per the cell output it populates the interactive namespace
# from numpy and matplotlib.
%pylab inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Select the 'ggplot' style sheet for matplotlib.
plt.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Three integer Series whose index labels do not overlap.
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))

# Glue the three Series end to end; every value keeps its own label.
pd.concat(
    [s1, s2, s3],
)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [3]:
# Tag each input with an outer key so the result carries a two-level index:
# 'one' / 'two' / 'three' on the outside, the original labels on the inside.
pd.concat(
    [s1, s2, s3],
    keys=['one', 'two', 'three'],
)

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [4]:
# Keyed concat followed by unstack(): the inner labels become columns, one
# row per key, and positions with no value show up as NaN.
# (The original cell carried a bare "stack()" note, presumably a pointer to
# the reverse operation.)
(
    pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
    .unstack()
)

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


In [5]:
# Two small frames: df1 is labelled a/b/c, df2 only a/c, and their column
# names do not overlap.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)

# Render both frames in the cell output.
display(df1, df2)

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


Unnamed: 0,three,four
a,5,6
c,7,8


In [6]:
# Stack df1 on top of df2. The frames share no column names, so each row is
# NaN in the columns that belong to the other frame.
# sort=True is passed explicitly: the recorded output lists the columns
# alphabetically (four, one, three, two), while pandas >= 1.0 keeps them in
# order of appearance unless sorting is requested.
pd.concat([df1, df2], sort=True)

Unnamed: 0,four,one,three,two
a,,0.0,,1.0
b,,2.0,,3.0
c,,4.0,,5.0
a,6.0,,5.0,
c,8.0,,7.0,


In [7]:
# Place the frames side by side instead: rows are matched on index label, so
# 'b' (missing from df2) ends up NaN in df2's columns.
pd.concat(
    [df1, df2],
    axis='columns',
)

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [8]:
# Two frames with default integer labels on both axes: 4x3 holding 0..11 and
# 3x3 holding 100..108.
df1 = pd.DataFrame(np.arange(12).reshape(4, -1))
df2 = pd.DataFrame(np.arange(100, 109).reshape(3, -1))

# Row-wise concat keeps each frame's own row labels, so 0, 1 and 2 appear
# twice in the result.
pd.concat(
    [df1, df2],
)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
0,100,101,102
1,103,104,105
2,106,107,108


In [9]:
# ignore_index=True discards the original row labels and renumbers the
# combined rows from 0 after concatenating.
pd.concat(
    [df1, df2],
    ignore_index=True,
)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,100,101,102
5,103,104,105
6,106,107,108


In [15]:
# Two float Series that share the same labels in the same order ('f' .. 'a').
# a has gaps (NaN) at 'f', 'd' and 'a'; b counts 0.0 .. 5.0.
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
# Blank out the last element BY POSITION. With string labels, `b[-1] = ...`
# depends on the deprecated positional fallback of Series.__setitem__; once
# integer keys are treated as labels it would append a new label -1 instead
# of overwriting the last value. .iloc is explicit about position.
b.iloc[-1] = np.nan

# Render both Series in the cell output (display is not defined in this file;
# presumably the helper IPython provides).
display(a,b)

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [16]:
# Element-wise choice on the values: take a where it is present, otherwise b.
# The result is a plain NumPy array, so the index labels are not carried over.
np.where(a.notnull(), a, b)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [17]:
# Series.where keeps a's value wherever the condition is True and substitutes
# b's value wherever it is False (here: wherever a is NaN).
a.where(
    a.notnull(),
    other=b,
)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [20]:
# Same a as in the previous example, but b is now labelled 'a' .. 'f', i.e.
# in the opposite order to a's labels ('f' .. 'a').
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=list('abcdef'))
# Blank out the last element BY POSITION (label 'f'). With string labels,
# `b[-1] = ...` depends on the deprecated positional fallback of
# Series.__setitem__; once integer keys are treated as labels it would append
# a new label -1 instead of overwriting the last value.
b.iloc[-1] = np.nan

# Render both inputs in the cell output; a is labelled 'f' .. 'a' while b is
# labelled 'a' .. 'f'.
display(a,b)

# The fill is matched on index label rather than position: in the recorded
# output the gaps at 'd' and 'a' receive 3.0 and 0.0 (b's values under those
# labels), and 'f' stays NaN because b is NaN at 'f' as well.
a.where(a.notnull(),b)

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    NaN
dtype: float64

f    NaN
e    2.5
d    3.0
c    3.5
b    4.5
a    0.0
dtype: float64