# 5장 데이터 구조에 익숙해지기

## 5-1 정렬 처리

In [1]:
import pandas as pd

### 5-1-1 시리즈 정렬

## 예제 5-1 시리즈의 값 정렬 

In [2]:
# Unsorted integer values labelled with Korean characters; input for the
# value-sorting examples.
obj1 = pd.Series(data=[40, 10, 20, 30], index=list('가다나라'))

In [3]:
obj1

가    40
다    10
나    20
라    30
dtype: int64

In [4]:
obj1.sort_values()

다    10
나    20
라    30
가    40
dtype: int64

In [5]:
obj1.sort_values(ascending=False)

가    40
라    30
나    20
다    10
dtype: int64

In [6]:
obj1

가    40
다    10
나    20
라    30
dtype: int64

In [7]:
obj1.argsort()

가    1
다    2
나    3
라    0
dtype: int64

In [8]:
obj1.idxmin()

'다'

In [9]:
obj1.idxmax()

'가'

## 예제 5-2 시리즈의 인덱스 정렬 

In [10]:
# Same values as obj1 but with out-of-order Latin labels, so sorting by index
# visibly reorders the rows.
obj2 = pd.Series(data=[40, 10, 20, 30], index=list('cabd'))

In [11]:
obj2

c    40
a    10
b    20
d    30
dtype: int64

In [12]:
obj2.sort_index()

a    10
b    20
c    40
d    30
dtype: int64

In [13]:
obj2.sort_index(ascending=False)

d    30
c    40
b    20
a    10
dtype: int64

### 5-1-2 데이터프레임 정렬

## 예제 5-3 데이터프레임의 값 정렬

In [14]:
import numpy as np

In [15]:
data = np.arange(8).reshape((2,4))

In [16]:
data

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [17]:
frame = pd.DataFrame(data, 
                     index=['three','one'], 
                     columns=['d','a','b','c'])

In [18]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [19]:
frame.sort_values(by='a')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [20]:
frame.sort_values(by='a', ascending=False)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [21]:
# np.float was only an alias for the builtin float and was removed in
# NumPy 1.24; the builtin yields the same float64 array on every version.
# A float dtype is needed so a NaN can be stored in the array afterwards.
data1 = data.astype(float)

In [22]:
data1[0,1] = np.nan

In [23]:
frame1 = pd.DataFrame(data1, 
                     index=['three','one'], 
                     columns=['d','a','b','c'])

In [24]:
frame1

Unnamed: 0,d,a,b,c
three,0.0,,2.0,3.0
one,4.0,5.0,6.0,7.0


In [25]:
frame1.sort_values(by='a')

Unnamed: 0,d,a,b,c
one,4.0,5.0,6.0,7.0
three,0.0,,2.0,3.0


In [26]:
frame1.sort_values(by='a',na_position='last')

Unnamed: 0,d,a,b,c
one,4.0,5.0,6.0,7.0
three,0.0,,2.0,3.0


In [27]:
frame1.sort_values(by='a',na_position='first')

Unnamed: 0,d,a,b,c
three,0.0,,2.0,3.0
one,4.0,5.0,6.0,7.0


In [28]:
# Five rows, written column by column; '가' contains ties so the
# multi-key sort has something to break with '나'.
frame2 = pd.DataFrame({
    '가': [3, 3, 1, 2, 2],
    '나': [15, 10, 20, 15, 100],
    '다': [3, 5, 5, 7, 9],
})

In [29]:
frame2

Unnamed: 0,가,나,다
0,3,15,3
1,3,10,5
2,1,20,5
3,2,15,7
4,2,100,9


In [30]:
frame2.sort_values(['가','나'], ascending=[False,True])

Unnamed: 0,가,나,다
1,3,10,5
0,3,15,3
3,2,15,7
4,2,100,9
2,1,20,5


## 예제 5-4 데이터프레임의 인덱스 정렬

In [31]:
# Same data as frame2, but the columns argument deliberately places the
# columns out of alphabetical order ('다', '가', '나').
frame3 = pd.DataFrame(
    {
        '가': [3, 3, 1, 2, 2],
        '나': [15, 10, 20, 15, 100],
        '다': [3, 5, 5, 7, 9],
    },
    columns=['다', '가', '나'],
)

In [32]:
frame3

Unnamed: 0,다,가,나
0,3,3,15
1,5,3,10
2,5,1,20
3,7,2,15
4,9,2,100


In [33]:
frame3.sort_index(axis=0)

Unnamed: 0,다,가,나
0,3,3,15
1,5,3,10
2,5,1,20
3,7,2,15
4,9,2,100


In [34]:
# Sort the column labels. frame3 (columns '다', '가', '나') is the frame built
# for this example; frame2's columns are already in order, so sorting them
# would demonstrate nothing.
frame3.sort_index(axis=1)

Unnamed: 0,가,나,다
0,3,15,3
1,3,10,5
2,1,20,5
3,2,15,7
4,2,100,9


In [35]:
# Sort the row labels first, then the column labels, on the frame whose columns
# are actually out of order (frame3), so the chained call has a visible effect.
frame3.sort_index().sort_index(axis=1)

Unnamed: 0,가,나,다
0,3,15,3
1,3,10,5
2,1,20,5
3,2,15,7
4,2,100,9


### 5-1-3 데이터의 순위 및 데이터 이동 처리

## 예제 5-5 데이터프레임의 순위 및 이동 처리

In [36]:
# Column-oriented table: each key is a column label and each list holds one
# value per row (five rows).
data1 = dict(
    이름=['길동', '옥주', '현웅', '주몽', '지원'],
    학번=[2012, 2012, 2013, 2014, 2014],
    과제건수=[1, 5, 2, 3, 4],
    점수=[25, 94, 57, 62, 70],
)

In [37]:
frame3 = pd.DataFrame(data1)

In [38]:
frame3

Unnamed: 0,이름,학번,과제건수,점수
0,길동,2012,1,25
1,옥주,2012,5,94
2,현웅,2013,2,57
3,주몽,2014,3,62
4,지원,2014,4,70


In [39]:
frame3['점수'].rank(ascending=False)

0    5.0
1    1.0
2    4.0
3    3.0
4    2.0
Name: 점수, dtype: float64

In [40]:
frame3['순위'] = frame3['점수'].rank(ascending=False)

In [41]:
frame3

Unnamed: 0,이름,학번,과제건수,점수,순위
0,길동,2012,1,25,5.0
1,옥주,2012,5,94,1.0
2,현웅,2013,2,57,4.0
3,주몽,2014,3,62,3.0
4,지원,2014,4,70,2.0


In [42]:
frame3.sort_values(by="순위")

Unnamed: 0,이름,학번,과제건수,점수,순위
1,옥주,2012,5,94,1.0
4,지원,2014,4,70,2.0
3,주몽,2014,3,62,3.0
2,현웅,2013,2,57,4.0
0,길동,2012,1,25,5.0


In [43]:
frame4 = frame3.sort_values(by="순위")

In [44]:
frame4.index = [0,1,2,3,4]

In [45]:
frame4

Unnamed: 0,이름,학번,과제건수,점수,순위
0,옥주,2012,5,94,1.0
1,지원,2014,4,70,2.0
2,주몽,2014,3,62,3.0
3,현웅,2013,2,57,4.0
4,길동,2012,1,25,5.0


In [46]:
# Three float Series of different lengths; building a DataFrame from them
# aligns on the index labels and leaves NaN where a Series has no value.
data = {
    '가': pd.Series([1.0], index=list('a')),
    '나': pd.Series([1.0, 2.0], index=list('ab')),
    '다': pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
}


In [47]:
df = pd.DataFrame(data)

In [48]:
df

Unnamed: 0,가,나,다
a,1.0,1.0,1.0
b,,2.0,2.0
c,,,3.0
d,,,4.0


In [49]:
df['나'] = pd.Series([3,4,5,6],index=list('abcd'),dtype='float')

In [50]:
df

Unnamed: 0,가,나,다
a,1.0,3.0,1.0
b,,4.0,2.0
c,,5.0,3.0
d,,6.0,4.0


In [51]:
df1 = df[["나", "다"]]


In [52]:
df1

Unnamed: 0,나,다
a,3.0,1.0
b,4.0,2.0
c,5.0,3.0
d,6.0,4.0


In [53]:
df1.at['a','나'] = 100

In [54]:
df1

Unnamed: 0,나,다
a,100.0,1.0
b,4.0,2.0
c,5.0,3.0
d,6.0,4.0


In [55]:
# .at addresses exactly one scalar cell; assigning to several rows selected
# by label lists is a job for .loc.
df1.loc[['b','c'],['나']] = 99

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc[index, col] = value


In [56]:
df1

Unnamed: 0,나,다
a,100.0,1.0
b,99.0,2.0
c,99.0,3.0
d,6.0,4.0


In [57]:
# Silence the chained-assignment (SettingWithCopy) warning. Use the full option
# key: the partial pattern 'chained' only works while exactly one option name
# matches it, and breaks if another matching option is ever added.
pd.set_option('mode.chained_assignment', None)

In [58]:
a = df1.나.shift(-1)

In [59]:
a

a    99.0
b    99.0
c     6.0
d     NaN
Name: 나, dtype: float64

In [60]:
df1['나'] = a

In [61]:
a = df1.나.shift(1) 

In [62]:
a

a     NaN
b    99.0
c    99.0
d     6.0
Name: 나, dtype: float64

In [63]:
df1['나'] = a

## 5-2 데이터 구조 변경

### 5-2-1 피봇을 통한 데이터 재구조화

In [64]:
import pandas as pd

## 예제 5-6 피봇을 이용한 재구조화 

In [65]:
df = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                           'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                           'baz': [1, 2, 3, 4, 5, 6]})

In [66]:
df

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
2,one,C,3
3,two,A,4
4,two,B,5
5,two,C,6


In [67]:
df.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [68]:
df1 = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                           'bar': ['A', 'B', 'A', 'A', 'B', 'C'],
                           'baz': [1, 2, 3, 4, 5, 6]})

In [69]:
# Rows 0 and 2 of df1 share the same (foo, bar) pair ('one', 'A'), so pivot
# cannot put both values into one cell and raises ValueError.
try:
    # Keyword arguments: pandas 2.0 no longer accepts these positionally.
    df1.pivot(index='foo', columns='bar', values='baz')
except ValueError as e:
    # Catch only the reshape error, so an unrelated failure is not mistaken
    # for the duplicate-entries message.
    print(e)

Index contains duplicate entries, cannot reshape


In [70]:
df1 = df1.drop_duplicates(['foo','bar'])

In [71]:
df1

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
3,two,A,4
4,two,B,5
5,two,C,6


In [72]:
# After dropping the duplicated (foo, bar) pair the reshape succeeds.
# Keyword arguments are required from pandas 2.0 on.
df1.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1.0,2.0,
two,4.0,5.0,6.0


In [73]:
import seaborn as sns

In [74]:
titanic = sns.load_dataset('titanic')

In [75]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [76]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [77]:
# Per-survival-group averages of the numeric/boolean columns only. Since
# pandas 2.0, mean() no longer silently drops text columns such as 'sex'
# and raises instead, so the restriction must be explicit.
titaic_s =  titanic.groupby('survived').mean(numeric_only=True)

In [78]:
titaic_s

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2.531876,30.626179,0.553734,0.32969,22.117887,0.817851,0.681239
1,1.950292,28.34369,0.473684,0.464912,48.395408,0.25731,0.476608


In [79]:
titaic_s.pivot(index='pclass', columns='age', values='fare')

age,28.343690,30.626179
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1.950292,48.395408,
2.531876,,22.117887


In [80]:
titanic_ = titanic.pivot_table(values='survived',
                               index='sex', 
                               columns='class',
                               aggfunc='sum')

In [81]:
titanic_

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,91,70,72
male,45,17,47


In [82]:
titanic_.sum().sum()

342

In [83]:
titanic['survived'].sum()

342

In [84]:
weight = pd.read_csv('../data/weight_loss.csv',encoding='cp949')

In [85]:
weight.head()

Unnamed: 0,이름,월,주,몸무게
0,지완,1월,1주,70
1,찬준,1월,1주,60
2,지완,1월,2주,69
3,찬준,1월,2주,59
4,지완,1월,3주,69


In [86]:
week4 = weight.query('주 == "4주"')

In [87]:
week4

Unnamed: 0,이름,월,주,몸무게
6,지완,1월,4주,69
7,찬준,1월,4주,59
14,지완,2월,4주,66
15,찬준,2월,4주,56
22,지완,3월,4주,64
23,찬준,3월,4주,54
30,지완,4월,4주,62
31,찬준,4월,4주,52


In [88]:
week4.pivot(index='월', columns='이름', values="몸무게")

이름,지완,찬준
월,Unnamed: 1_level_1,Unnamed: 2_level_1
1월,69,59
2월,66,56
3월,64,54
4월,62,52


In [89]:
week4.pivot_table(index='월',columns='이름',values="몸무게")

이름,지완,찬준
월,Unnamed: 1_level_1,Unnamed: 2_level_1
1월,69,59
2월,66,56
3월,64,54
4월,62,52


### 5-2-2 스택을 통한 데이터 재구조화

## 예제 5-7 스택을 이용한 재구조화

In [90]:
week4

Unnamed: 0,이름,월,주,몸무게
6,지완,1월,4주,69
7,찬준,1월,4주,59
14,지완,2월,4주,66
15,찬준,2월,4주,56
22,지완,3월,4주,64
23,찬준,3월,4주,54
30,지완,4월,4주,62
31,찬준,4월,4주,52


In [91]:
week5 = week4.set_index(['이름','월'])

In [92]:
week5.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,주,몸무게
이름,월,Unnamed: 2_level_1,Unnamed: 3_level_1
지완,1월,4주,69
찬준,1월,4주,59


In [93]:
week5_st = week5.stack()

In [94]:
type(week5_st)

pandas.core.series.Series

In [95]:
week5_st.index.get_level_values(0)

Index(['지완', '지완', '찬준', '찬준', '지완', '지완', '찬준', '찬준', '지완', '지완', '찬준', '찬준',
       '지완', '지완', '찬준', '찬준'],
      dtype='object', name='이름')

In [96]:
week5_st.index.get_level_values(1)

Index(['1월', '1월', '1월', '1월', '2월', '2월', '2월', '2월', '3월', '3월', '3월', '3월',
       '4월', '4월', '4월', '4월'],
      dtype='object', name='월')

In [97]:
week5_st.index.get_level_values(2)

Index(['주', '몸무게', '주', '몸무게', '주', '몸무게', '주', '몸무게', '주', '몸무게', '주', '몸무게',
       '주', '몸무게', '주', '몸무게'],
      dtype='object')

In [98]:
week5_st.values

array(['4주', 69, '4주', 59, '4주', 66, '4주', 56, '4주', 64, '4주', 54, '4주',
       62, '4주', 52], dtype=object)

In [99]:
week5_st.head(3)

이름  월      
지완  1월  주      4주
        몸무게    69
찬준  1월  주      4주
dtype: object

In [100]:
to_week5 = week5_st.to_frame()

In [101]:
to_week5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
이름,월,Unnamed: 2_level_1,Unnamed: 3_level_1
지완,1월,주,4주
지완,1월,몸무게,69
찬준,1월,주,4주
찬준,1월,몸무게,59
지완,2월,주,4주


In [102]:
to_week5.index.levels

FrozenList([['지완', '찬준'], ['1월', '2월', '3월', '4월'], ['주', '몸무게']])

In [103]:
to_week5.columns

RangeIndex(start=0, stop=1, step=1)

In [104]:
week5_un = week5.unstack()

In [105]:
week5_un.head()

Unnamed: 0_level_0,주,주,주,주,몸무게,몸무게,몸무게,몸무게
월,1월,2월,3월,4월,1월,2월,3월,4월
이름,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
지완,4주,4주,4주,4주,69,66,64,62
찬준,4주,4주,4주,4주,59,56,54,52


In [106]:
week5_un_un = week5.unstack().unstack()

In [107]:
type(week5_un_un)

pandas.core.series.Series

In [108]:
week5_un_un.values

array(['4주', '4주', '4주', '4주', '4주', '4주', '4주', '4주', 69, 59, 66, 56, 64,
       54, 62, 52], dtype=object)

In [109]:
week5_un_un.to_frame().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
Unnamed: 0_level_1,월,이름,Unnamed: 3_level_1
주,1월,지완,4주
주,1월,찬준,4주
주,2월,지완,4주
주,2월,찬준,4주
주,3월,지완,4주


In [110]:
# Take an independent copy: the Series' index is replaced right after this,
# and that must not alter the column object still held by week4.
week4_s = week4['몸무게'].copy()

In [111]:
week4_s.index = week5['몸무게'].index

In [112]:
week4_s.head()

이름  월 
지완  1월    69
찬준  1월    59
지완  2월    66
찬준  2월    56
지완  3월    64
Name: 몸무게, dtype: int64

In [113]:
week4_s.unstack()

월,1월,2월,3월,4월
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지완,69,66,64,62
찬준,59,56,54,52


In [114]:
type(week4_s.unstack())

pandas.core.frame.DataFrame

In [115]:
week4_s.unstack(level=0)

이름,지완,찬준
월,Unnamed: 1_level_1,Unnamed: 2_level_1
1월,69,59
2월,66,56
3월,64,54
4월,62,52


In [116]:
week4_s.unstack(level=1)

월,1월,2월,3월,4월
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지완,69,66,64,62
찬준,59,56,54,52


### 5-2-3 데이터 접합

## 예제 5-8  데이터를 합치기 

In [117]:
import numpy as np

In [118]:
columns = np.array(['봄','여름','가을','겨울'])

In [119]:
# 4x4 grid of labels: one row per letter A-D, suffixes 0-3.
data = np.array([[letter + str(n) for n in range(4)] for letter in 'ABCD'])

In [120]:
df1 = pd.DataFrame(data.T,columns=columns)

In [121]:
# Second 4x4 grid of labels: one row per letter A-D, suffixes 4-7.
data1 = np.array([[letter + str(n) for n in range(4, 8)] for letter in 'ABCD'])

In [122]:
df2 = pd.DataFrame(data1.T,columns=columns)

In [123]:
df_con = pd.concat([df1,df2])

In [124]:
df_con

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [125]:
# Fresh consecutive labels 0..7 for the eight concatenated rows.
index_r = list(range(8))

In [126]:
df_con.index = index_r

In [127]:
df_con

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [128]:
df_con1 = pd.concat([df1,df2],ignore_index=True)

In [129]:
df_con1

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [130]:
df_h = pd.concat([df1,df2],axis=1)

In [131]:
df_h

Unnamed: 0,봄,여름,가을,겨울,봄.1,여름.1,가을.1,겨울.1
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


In [132]:
# Unique column labels for the side-by-side frame: the four season names,
# then the same four names with a trailing underscore.
columns_r = [season + suffix
             for suffix in ('', '_')
             for season in ('봄', '여름', '가을', '겨울')]

In [133]:
df_h.columns = columns_r

In [134]:
df_h

Unnamed: 0,봄,여름,가을,겨울,봄_,여름_,가을_,겨울_
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


In [135]:
df_g = pd.concat([df1,df2],axis=1,ignore_index=True)

In [136]:
df_g

Unnamed: 0,0,1,2,3,4,5,6,7
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


## 예제 5-9  데이터를 추가하기 

In [137]:
df1

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [138]:
df11 = df1.copy()

In [139]:
df11.loc[4] = ['A4','B4','C4','D4']

In [140]:
df11

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4


In [141]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Wrap the new row in a one-row frame and concatenate it instead; as before,
# df11 itself is left unchanged and the result gets a fresh 0..n-1 index.
df11_a = pd.concat(
    [df11, pd.DataFrame([{'봄':'A5','여름' :'B5','가을':'C5','겨울':'D5'}])],
    ignore_index=True)

In [142]:
df11_a

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5


In [143]:
df11

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4


In [144]:
df2

Unnamed: 0,봄,여름,가을,겨울
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [145]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, without ignore_index, keeps each frame's original row labels.
pd.concat([df11, df2])

Unnamed: 0,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


## 예제 5-10  데이터를 join 해서 처리하기

In [146]:
# 4x4 grid of labels for the third frame: rows A, B, E, F with suffixes 8-11.
data3 = np.array([[letter + str(n) for n in range(8, 12)] for letter in 'ABEF'])

In [147]:
columns3 = np.array(['봄','여름','춘분','추분'])

In [148]:
df3 = pd.DataFrame(data3.T,columns=columns3)

In [149]:
df3

Unnamed: 0,봄,여름,춘분,추분
0,A8,B8,E8,F8
1,A9,B9,E9,F9
2,A10,B10,E10,F10
3,A11,B11,E11,F11


In [150]:
df4 = pd.concat([df1,df3],join='inner',ignore_index=True)

In [151]:
df4

Unnamed: 0,봄,여름
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,A8,B8
5,A9,B9
6,A10,B10
7,A11,B11


In [152]:
df5 = pd.concat([df1,df3],axis=1, join='inner')

In [153]:
df5

Unnamed: 0,봄,여름,가을,겨울,봄.1,여름.1,춘분,추분
0,A0,B0,C0,D0,A8,B8,E8,F8
1,A1,B1,C1,D1,A9,B9,E9,F9
2,A2,B2,C2,D2,A10,B10,E10,F10
3,A3,B3,C3,D3,A11,B11,E11,F11


In [154]:
df6 = pd.concat([df1,df3],axis=1, join='outer')

In [155]:
df6

Unnamed: 0,봄,여름,가을,겨울,봄.1,여름.1,춘분,추분
0,A0,B0,C0,D0,A8,B8,E8,F8
1,A1,B1,C1,D1,A9,B9,E9,F9
2,A2,B2,C2,D2,A10,B10,E10,F10
3,A3,B3,C3,D3,A11,B11,E11,F11


In [156]:
pd.concat([df1,df3],axis=1, keys=['사계절'],join='inner')

Unnamed: 0_level_0,사계절,사계절,사계절,사계절
Unnamed: 0_level_1,봄,여름,가을,겨울
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [157]:
pd.concat([df1,df3],axis=1, keys=['사계절','춘추분'],join='inner')

Unnamed: 0_level_0,사계절,사계절,사계절,사계절,춘추분,춘추분,춘추분,춘추분
Unnamed: 0_level_1,봄,여름,가을,겨울,봄,여름,춘분,추분
0,A0,B0,C0,D0,A8,B8,E8,F8
1,A1,B1,C1,D1,A9,B9,E9,F9
2,A2,B2,C2,D2,A10,B10,E10,F10
3,A3,B3,C3,D3,A11,B11,E11,F11


In [158]:
df1.join(df3,how='inner',lsuffix='_lx', rsuffix='_rx')

Unnamed: 0,봄_lx,여름_lx,가을,겨울,봄_rx,여름_rx,춘분,추분
0,A0,B0,C0,D0,A8,B8,E8,F8
1,A1,B1,C1,D1,A9,B9,E9,F9
2,A2,B2,C2,D2,A10,B10,E10,F10
3,A3,B3,C3,D3,A11,B11,E11,F11


In [159]:
df1.join(df3,how='outer',lsuffix='_lx', rsuffix='_rx')

Unnamed: 0,봄_lx,여름_lx,가을,겨울,봄_rx,여름_rx,춘분,추분
0,A0,B0,C0,D0,A8,B8,E8,F8
1,A1,B1,C1,D1,A9,B9,E9,F9
2,A2,B2,C2,D2,A10,B10,E10,F10
3,A3,B3,C3,D3,A11,B11,E11,F11


In [160]:
# join(on='봄') matches df1's '봄' column against df3's *index*, not against
# df3's '봄' column. df3 was built without an explicit index, so its index is
# integer while '봄' holds strings; this is the object-vs-int64 ValueError
# shown below.
df1.join(df3,on='봄',how='inner',lsuffix='_lx', rsuffix='_rx')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [161]:
# Same mismatch with how='outer': the string column '봄' of df1 is compared
# with df3's integer index, so the join fails before the join type matters.
df1.join(df3,on='봄',how='outer',lsuffix='_lx', rsuffix='_rx')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

### 5-2-4 데이터 변형

In [162]:
import pandas as pd

## 예제 5-11  데이터를 병합 처리하기

In [163]:
movie = pd.read_csv('../data/korea_movie_list.csv',encoding='cp949')

In [164]:
movie.columns

Index(['movie_code', 'title', 'title_Eng', 'show_time', 'produce_year',
       'open_date', 'produce_state', 'type', 'nation', 'genre', 'director',
       'actor', 'show_type', 'watch_grade'],
      dtype='object')

In [165]:
movie.head(2)

Unnamed: 0,movie_code,title,title_Eng,show_time,produce_year,open_date,produce_state,type,nation,genre,director,actor,show_type,watch_grade
0,20185801,할로우 차일드,The Hollow Child,88.0,2017,20180802,개봉예정,장편,캐나다,공포(호러)/판타지,제레미 루터,,,15세이상관람가
1,20187649,죽음의 리무진,Glass Coffin,75.0,2016,20180816,개봉예정,장편,스페인,스릴러/공포(호러),하리츠 쥬빌라가,파울라 본템피,,


In [166]:
df_ml = pd.melt(movie,id_vars=['title'], value_vars=['title_Eng'])

In [167]:
df_ml.head()

Unnamed: 0,title,variable,value
0,할로우 차일드,title_Eng,The Hollow Child
1,죽음의 리무진,title_Eng,Glass Coffin
2,극장판 도라에몽: 진구의 보물섬,title_Eng,
3,명탐정 코난 : 제로의 집행인,title_Eng,Detective Conan: Zero the Enforcer
4,살아남은 아이,title_Eng,Last Child


In [168]:
df_ml.tail()

Unnamed: 0,title,variable,value
2822,여간호사의 쾌감,title_Eng,Sexy nurse 3
2823,엑소시즘 2017,title_Eng,Evil Born (12/12/12)
2824,선배부인과 비밀공유,title_Eng,"The glossy and beautifl widow ""I'm sorry, hone..."
2825,불륜녀의 황홀한 입맞춤,title_Eng,Affair party
2826,대머리 오일마사지사,title_Eng,Lonely aunt Confessions of abstinence Miku Aoki


In [169]:
df_ml2 = pd.melt(movie,id_vars=['title'], value_vars=['title_Eng'],
               var_name='영어이름', value_name='영어명')

In [170]:
df_ml2.head()

Unnamed: 0,title,영어이름,영어명
0,할로우 차일드,title_Eng,The Hollow Child
1,죽음의 리무진,title_Eng,Glass Coffin
2,극장판 도라에몽: 진구의 보물섬,title_Eng,
3,명탐정 코난 : 제로의 집행인,title_Eng,Detective Conan: Zero the Enforcer
4,살아남은 아이,title_Eng,Last Child


In [171]:
df_ml2.isnull().sum()

title      0
영어이름       0
영어명      601
dtype: int64

In [172]:
df_ml3 = pd.melt(movie,id_vars=['title', 'title_Eng'], value_vars=['open_date'],
               var_name='변수이름', value_name='영어명과 개봉일자')

In [173]:
df_ml3.head()

Unnamed: 0,title,title_Eng,변수이름,영어명과 개봉일자
0,할로우 차일드,The Hollow Child,open_date,20180802
1,죽음의 리무진,Glass Coffin,open_date,20180816
2,극장판 도라에몽: 진구의 보물섬,,open_date,20180815
3,명탐정 코난 : 제로의 집행인,Detective Conan: Zero the Enforcer,open_date,20180808
4,살아남은 아이,Last Child,open_date,20180830


In [174]:
df_ml3.tail()

Unnamed: 0,title,title_Eng,변수이름,영어명과 개봉일자
2822,여간호사의 쾌감,Sexy nurse 3,open_date,20170102
2823,엑소시즘 2017,Evil Born (12/12/12),open_date,20170102
2824,선배부인과 비밀공유,"The glossy and beautifl widow ""I'm sorry, hone...",open_date,20170102
2825,불륜녀의 황홀한 입맞춤,Affair party,open_date,20170102
2826,대머리 오일마사지사,Lonely aunt Confessions of abstinence Miku Aoki,open_date,20170102


In [175]:
df_ml4 = pd.melt(movie,id_vars=['title', 'title_Eng'], value_vars=['produce_year','open_date'],
               var_name='변수이름', value_name='영어명과 개봉일자')

In [176]:
df_ml4.head()

Unnamed: 0,title,title_Eng,변수이름,영어명과 개봉일자
0,할로우 차일드,The Hollow Child,produce_year,2017
1,죽음의 리무진,Glass Coffin,produce_year,2016
2,극장판 도라에몽: 진구의 보물섬,,produce_year,2018
3,명탐정 코난 : 제로의 집행인,Detective Conan: Zero the Enforcer,produce_year,2018
4,살아남은 아이,Last Child,produce_year,2017


### 5-2-5 메소드로 열 추가

## 예제 5-12 열에 대한 추가

In [177]:
import numpy as np

In [178]:
df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})

In [179]:
df

Unnamed: 0,A,B
0,1,-0.428534
1,2,-1.007378
2,3,0.132298
3,4,0.331015
4,5,-1.499837
5,6,-0.089268
6,7,1.264256
7,8,1.18142
8,9,-0.150236
9,10,-1.061139


In [180]:
df_1 = df.copy()

In [181]:
df_1.assign(ln_A = lambda x: np.log(x.A))

Unnamed: 0,A,B,ln_A
0,1,-0.428534,0.0
1,2,-1.007378,0.693147
2,3,0.132298,1.098612
3,4,0.331015,1.386294
4,5,-1.499837,1.609438
5,6,-0.089268,1.791759
6,7,1.264256,1.94591
7,8,1.18142,2.079442
8,9,-0.150236,2.197225
9,10,-1.061139,2.302585


In [182]:
df_2 = df.copy()

In [183]:
df_2['ln_A'] = np.log(df_2['A'])

In [184]:
df_2

Unnamed: 0,A,B,ln_A
0,1,-0.428534,0.0
1,2,-1.007378,0.693147
2,3,0.132298,1.098612
3,4,0.331015,1.386294
4,5,-1.499837,1.609438
5,6,-0.089268,1.791759
6,7,1.264256,1.94591
7,8,1.18142,2.079442
8,9,-0.150236,2.197225
9,10,-1.061139,2.302585


## 예제 5-13 열에 대한 추가 활용 

In [185]:
movie = pd.read_csv('../data/korea_movie_list.csv',encoding='cp949')

In [186]:
movie.head(2)

Unnamed: 0,movie_code,title,title_Eng,show_time,produce_year,open_date,produce_state,type,nation,genre,director,actor,show_type,watch_grade
0,20185801,할로우 차일드,The Hollow Child,88.0,2017,20180802,개봉예정,장편,캐나다,공포(호러)/판타지,제레미 루터,,,15세이상관람가
1,20187649,죽음의 리무진,Glass Coffin,75.0,2016,20180816,개봉예정,장편,스페인,스릴러/공포(호러),하리츠 쥬빌라가,파울라 본템피,,


In [187]:
movie_ = pd.concat([movie.filter(like='prod'),
                    movie.filter(like='title')],
                   axis=1)

In [188]:
movie_.head()

Unnamed: 0,produce_year,produce_state,title,title_Eng
0,2017,개봉예정,할로우 차일드,The Hollow Child
1,2016,개봉예정,죽음의 리무진,Glass Coffin
2,2018,개봉예정,극장판 도라에몽: 진구의 보물섬,
3,2018,개봉예정,명탐정 코난 : 제로의 집행인,Detective Conan: Zero the Enforcer
4,2017,개봉예정,살아남은 아이,Last Child


In [189]:
movie_.assign(show_time= movie['show_time']).head()

Unnamed: 0,produce_year,produce_state,title,title_Eng,show_time
0,2017,개봉예정,할로우 차일드,The Hollow Child,88.0
1,2016,개봉예정,죽음의 리무진,Glass Coffin,75.0
2,2018,개봉예정,극장판 도라에몽: 진구의 보물섬,,107.0
3,2018,개봉예정,명탐정 코난 : 제로의 집행인,Detective Conan: Zero the Enforcer,110.0
4,2017,개봉예정,살아남은 아이,Last Child,123.0


### 5-2-6 여러 열에 대한 변형(wide_to_long)

## 예제 5-15 wide_to_long 처리 

In [190]:
# Wide-format table written row by row: two id columns ('구분', '년도') and two
# value columns that share the stub name '측정값' with numeric suffixes 1 and 2.
df_new = pd.DataFrame(
    data=[
        ['가', 2013, 100, 100],
        ['나', 2014, 245, 245],
        ['다', 2016, 200, 200],
        ['라', 2013, 200, 200],
        ['마', 2014, 300, 300],
    ],
    columns=['구분', '년도', '측정값1', '측정값2'],
)

In [191]:
df_new

Unnamed: 0,구분,년도,측정값1,측정값2
0,가,2013,100,100
1,나,2014,245,245
2,다,2016,200,200
3,라,2013,200,200
4,마,2014,300,300


In [192]:
wl = pd.wide_to_long(df_new, ['측정값'],i=['구분','년도'], j='측정값구분')

In [193]:
type(wl)

pandas.core.frame.DataFrame

In [194]:
wl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,측정값
구분,년도,측정값구분,Unnamed: 3_level_1
가,2013,1,100
가,2013,2,100
나,2014,1,245
나,2014,2,245
다,2016,1,200
다,2016,2,200
라,2013,1,200
라,2013,2,200
마,2014,1,300
마,2014,2,300


In [195]:
wl_t = wl.unstack()

In [196]:
wl_t

Unnamed: 0_level_0,Unnamed: 1_level_0,측정값,측정값
Unnamed: 0_level_1,측정값구분,1,2
구분,년도,Unnamed: 2_level_2,Unnamed: 3_level_2
가,2013,100,100
나,2014,245,245
다,2016,200,200
라,2013,200,200
마,2014,300,300


In [197]:
wl_t.index

MultiIndex([('가', 2013),
            ('나', 2014),
            ('다', 2016),
            ('라', 2013),
            ('마', 2014)],
           names=['구분', '년도'])

In [198]:
wl_t.columns

MultiIndex([('측정값', 1),
            ('측정값', 2)],
           names=[None, '측정값구분'])

In [199]:
wl_t.columns = wl_t.columns.map('{0[0]}{0[1]}'.format)

In [200]:
wl_t

Unnamed: 0_level_0,Unnamed: 1_level_0,측정값1,측정값2
구분,년도,Unnamed: 2_level_1,Unnamed: 3_level_1
가,2013,100,100
나,2014,245,245
다,2016,200,200
라,2013,200,200
마,2014,300,300


In [201]:
wl_t.reset_index()

Unnamed: 0,구분,년도,측정값1,측정값2
0,가,2013,100,100
1,나,2014,245,245
2,다,2016,200,200
3,라,2013,200,200
4,마,2014,300,300


## 예제 5-16 wide_to_long 으로 여러 중복열 처리

In [202]:
import numpy as np


In [203]:
np.random.seed(123)


In [204]:
# Wide-format table with two stub groups ('가수', '판매량'), each suffixed with a
# year, plus an unrelated random column 'X'. Every column is a {row label:
# value} mapping, so the row labels 0..2 come from the dict keys.
df = pd.DataFrame({
    "가수1970": dict(enumerate(["남진", "나훈아", "문주란"])),
    "가수1980": dict(enumerate(["전영록", "이용", "민혜경"])),
    "판매량1970": dict(enumerate([2.5, 1.2, 0.7])),
    "판매량1980": dict(enumerate([3.2, 1.3, 0.1])),
    "X": dict(enumerate(np.random.randn(3))),
})


In [205]:
df["id"] = df.index


In [206]:
df

Unnamed: 0,가수1970,가수1980,판매량1970,판매량1980,X,id
0,남진,전영록,2.5,3.2,-1.085631,0
1,나훈아,이용,1.2,1.3,0.997345,1
2,문주란,민혜경,0.7,0.1,0.282978,2


In [207]:
가수판매량 = pd.wide_to_long(df, ["가수", "판매량"], i="id", j="년도")

In [208]:
가수판매량

Unnamed: 0_level_0,Unnamed: 1_level_0,X,가수,판매량
id,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,-1.085631,남진,2.5
1,1970,0.997345,나훈아,1.2
2,1970,0.282978,문주란,0.7
0,1980,-1.085631,전영록,3.2
1,1980,0.997345,이용,1.3
2,1980,0.282978,민혜경,0.1


In [209]:
가수판매량.index

MultiIndex([(0, 1970),
            (1, 1970),
            (2, 1970),
            (0, 1980),
            (1, 1980),
            (2, 1980)],
           names=['id', '년도'])

In [210]:
가수판매량.columns

Index(['X', '가수', '판매량'], dtype='object')

In [211]:
가수판매량_un = 가수판매량.unstack()

In [212]:
가수판매량_un.columns

MultiIndex([(  'X', 1970),
            (  'X', 1980),
            ( '가수', 1970),
            ( '가수', 1980),
            ('판매량', 1970),
            ('판매량', 1980)],
           names=[None, '년도'])

In [213]:
가수판매량_un.columns = 가수판매량_un.columns.map('{0[0]}{0[1]}'.format)

In [214]:
가수판매량_un

Unnamed: 0_level_0,X1970,X1980,가수1970,가수1980,판매량1970,판매량1980
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-1.085631,-1.085631,남진,전영록,2.5,3.2
1,0.997345,0.997345,나훈아,이용,1.2,1.3
2,0.282978,0.282978,문주란,민혜경,0.7,0.1


In [215]:
가수판매량_un =가수판매량_un.reset_index()

In [216]:
가수판매량_un = 가수판매량_un.drop(['id','X1980'],axis=1)

In [217]:
가수판매량_un.rename(index=str, columns={"X1970": "X"},inplace=True)

In [218]:
가수판매량_un.sort_index(axis=1,ascending=False)

Unnamed: 0,판매량1980,판매량1970,가수1980,가수1970,X
0,3.2,2.5,전영록,남진,-1.085631
1,1.3,1.2,이용,나훈아,0.997345
2,0.1,0.7,민혜경,문주란,0.282978
