# DataFrame

In [None]:
# Jupyter Notebook 설정 : 모든 결과 출력
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [86]:
# DeprecationWarning / FutureWarning(차후 버전에서 없어질 기능이니 미리 경고하는 것) 끄기
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [73]:
import pandas as pd
import numpy as np

width = 77
line = "\n" + "-" * width
df = pd.DataFrame(np.arange(10,22).reshape(3,4), index=["a","b","c"], columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [3]:
for column_name, column_series in df.items():
    print(type(column_name), column_name)
    for index_key, cell_value in column_series.items():
        print(index_key, cell_value)

<class 'str'> A
a 10
b 14
c 18
<class 'str'> B
a 11
b 15
c 19
<class 'str'> C
a 12
b 16
c 20
<class 'str'> D
a 13
b 17
c 21


## indexing & Slicing

In [74]:
print("# df :", type(df), '\n', df)
print("\n# Column 기준으로 처리")
column_series = df["B"] # Series
print('\ndf["B"]\n', type(column_series), '\n', column_series)
column_df = df[["B","C"]] # DataFrame
print('\ndf[["B","C"]]\n', type(column_df), '\n', column_df)
#column_df = df["B","C"] # column slicing 처리 불가 (row slicing으로 처리되기 때문)
print("\n# Row 기준으로 처리 : DataFrame.index 리스트의 index를 기준으로 처리)")
row_df = df[1:2] 
print('\ndf[1:2]\n', type(row_df), '\n', row_df)

# df : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21

# Column 기준으로 처리

df["B"]
 <class 'pandas.core.series.Series'> 
 a    11
b    15
c    19
Name: B, dtype: int64

df[["B","C"]]
 <class 'pandas.core.frame.DataFrame'> 
     B   C
a  11  12
b  15  16
c  19  20

# Row 기준으로 처리 : DataFrame.index 리스트의 index를 기준으로 처리)

df[1:2]
 <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17


In [78]:
print("# df :", type(df), '\n', df)
print(line)
print("\n# Row 기준으로 slicing 처리")
temp = df[1:3] # DataFrame (DataFrame.index 리스트의 index 기준)
print('\ndf[1:3]', type(temp), '\n', temp)
temp = df.iloc[1:3] # DataFrame (DataFrame.index 리스트의 index 기준)
print('\ndf.iloc[1:3]', type(temp), '\n', temp)
temp = df["b":"c"] # DataFrame (DataFrame.index 리스트의 value 기준)
print('\ndf["b":"c"]', type(temp), '\n', temp)
temp = df.loc[['b','c']] # DataFrame (DataFrame.index 리스트의 value 기준)
print('\ndf.loc[["b","c"]]', type(temp), '\n', temp)

# df : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21

-----------------------------------------------------------------------------

# Row 기준으로 slicing 처리

df[1:3] <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21

df.iloc[1:3] <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21

df["b":"c"] <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21

df.loc[["b","c"]] <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21


### Column

In [6]:
temp = df["B"] # Series
print(type(temp), '\n', temp)
temp = df["B"][:] # Series
print(type(temp), '\n', temp)

<class 'pandas.core.series.Series'> 
 a    11
b    15
c    19
Name: B, dtype: int64
<class 'pandas.core.series.Series'> 
 a    11
b    15
c    19
Name: B, dtype: int64


In [76]:
column_series = df["B"][:'b'] # Series의 slicing (column 기준)
print(type(column_series), '\n', column_series)
print(line)
series1 = df['B']
print(type(series1), '\n', series1)
series2 = series1[:'b']
print(type(series2), '\n', series2)

<class 'pandas.core.series.Series'> 
 a    11
b    15
Name: B, dtype: int64

-----------------------------------------------------------------------------
<class 'pandas.core.series.Series'> 
 a    11
b    15
c    19
Name: B, dtype: int64
<class 'pandas.core.series.Series'> 
 a    11
b    15
Name: B, dtype: int64


In [8]:
print(type(df), '\n', df)
print(line)

temp = df[["B"]] # DataFrame
print(type(temp), '\n', temp)
temp = df[["B","C","D"]] # DataFrame
print(type(temp), '\n', temp)

temp = df[["B","C","D"]]["B"] # Series (column)
print(type(temp), '\n', temp)
temp = df[["B","C","D"]].loc["a"] # Series (row)
print(type(temp), '\n', temp)
#temp = df[["B","D"], "a"] # InvalidIndexError

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
-----------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'> 
     B
a  11
b  15
c  19
<class 'pandas.core.frame.DataFrame'> 
     B   C   D
a  11  12  13
b  15  16  17
c  19  20  21
<class 'pandas.core.series.Series'> 
 a    11
b    15
c    19
Name: B, dtype: int64
<class 'pandas.core.series.Series'> 
 B    11
C    12
D    13
Name: a, dtype: int64


### Row

In [9]:
print(type(df), '\n', df)
print(line)
temp = df.loc['a'] # Series
print(type(temp), '\n', temp)
temp = df.loc['a', :] # Series
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
-----------------------------------------------------------------------------
<class 'pandas.core.series.Series'> 
 A    10
B    11
C    12
D    13
Name: a, dtype: int64
<class 'pandas.core.series.Series'> 
 A    10
B    11
C    12
D    13
Name: a, dtype: int64


In [10]:
temp = df.loc['a', 'B':'C'] # Series
print(type(temp), '\n', temp)
temp = df.loc['a', ['B','C']] # Series
print(type(temp), '\n', temp)

<class 'pandas.core.series.Series'> 
 B    11
C    12
Name: a, dtype: int64
<class 'pandas.core.series.Series'> 
 B    11
C    12
Name: a, dtype: int64


In [11]:
temp = df.loc[['a']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc[['a'], :] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['a':'a'] # DataFrame
print(type(temp), '\n', temp)
temp = df.iloc[0:1] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13


In [12]:
temp = df.loc[['a'], 'B':'C'] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc[['a'], ['B','C']] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     B   C
a  11  12
<class 'pandas.core.frame.DataFrame'> 
     B   C
a  11  12


In [13]:
temp = df.loc[['a','c']] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
c  18  19  20  21


### Cell

In [14]:
print(type(df['A']['a']), df['A']['a']) # df['A','a'] : Error

<class 'numpy.int64'> 10


In [15]:
print(df.loc['a',"A"], df.loc['a']["A"])
print(df.iloc[0,0])
print(df.iloc[0][0]) # FutureWarning

10 10
10
10




### 중요

In [16]:
temp = df.loc[['a','b']]['A'] # Series
print(type(temp), '\n', temp)
temp = df.loc[['a','b'],'A'] # Series
print(type(temp), '\n', temp)
temp = df.loc[['a','b'],['A']] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.series.Series'> 
 a    10
b    14
Name: A, dtype: int64
<class 'pandas.core.series.Series'> 
 a    10
b    14
Name: A, dtype: int64
<class 'pandas.core.frame.DataFrame'> 
     A
a  10
b  14


In [17]:
temp = df.loc['a'] # Series
print(type(temp), '\n', temp)
temp = df.loc[['a']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['a', :] # Series
print(type(temp), '\n', temp)
temp = df.loc[['a'], :] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.series.Series'> 
 A    10
B    11
C    12
D    13
Name: a, dtype: int64
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
<class 'pandas.core.series.Series'> 
 A    10
B    11
C    12
D    13
Name: a, dtype: int64
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13


In [18]:
print(type(df), '\n', df)
print(line)
temp = df.loc['b':] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['b':,'A'] # Series
print(type(temp), '\n', temp)
temp = df.loc['b':]['A'] # Series
print(type(temp), '\n', temp)
temp = df.loc['b':][['A']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['b':,['A']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['b':,'A':'A'] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['b':]['b':'b'] # DataFrame ???????
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
-----------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 b    14
c    18
Name: A, dtype: int64
<class 'pandas.core.series.Series'> 
 b    14
c    18
Name: A, dtype: int64
<class 'pandas.core.frame.DataFrame'> 
     A
b  14
c  18
<class 'pandas.core.frame.DataFrame'> 
     A
b  14
c  18
<class 'pandas.core.frame.DataFrame'> 
     A
b  14
c  18
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17


In [19]:
temp = df['a':'b'] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc['a':'b'] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc[['a','b']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc[['a','b']][['B','D']] # DataFrame
print(type(temp), '\n', temp)
temp = df.loc[['a','b'],['B','D']] # DataFrame
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     B   D
a  11  13
b  15  17
<class 'pandas.core.frame.DataFrame'> 
     B   D
a  11  13
b  15  17


In [20]:
temp = df[1:2]
print(type(temp), '\n', temp)
temp = df.iloc[1:2]
print(type(temp), '\n', temp)
temp = df['b':'b']
print(type(temp), '\n', temp)
temp = df.loc['b':'b']
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17


In [21]:
temp = df.iloc[0,1]
print(type(temp), '\n', temp)
temp = df.iloc[0:2,1:2]
print(type(temp), '\n', temp)
temp = df.iloc[0:2][1:2]
print(type(temp), '\n', temp)

<class 'numpy.int64'> 
 11
<class 'pandas.core.frame.DataFrame'> 
     B
a  11
b  15
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17


#### index가 1부터 시작되는 정수인 경우 df[1:2] 결과는?

In [22]:
df1 = pd.DataFrame(np.arange(10,22).reshape(3,4), index=[1,3,2], columns=['A','B','C','D'])
print(type(df1.index), df1.index)
print(type(df1), '\n', df1)
print(line)
temp = df1[1:2] # 사용자 지정 index의 순서(zero based index) 기준으로 처리
print("# df1[1:2] : ", type(temp), '\n', temp)
temp = df1.iloc[1:2] # 사용자 지정 index의 순서(zero based index) 기준으로 처리
print("# df1.iloc[1:2] : ", type(temp), '\n', temp)
temp = df1.loc[1:2] # 생성시 주어진 index를 기준으로 처리
print("# df1.loc[1:2] : ", type(temp), '\n', temp)
print(type(df1.index), '\n', df1.index)

<class 'pandas.core.indexes.base.Index'> Index([1, 3, 2], dtype='int64')
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
1  10  11  12  13
3  14  15  16  17
2  18  19  20  21
-----------------------------------------------------------------------------
# df1[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
3  14  15  16  17
# df1.iloc[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
3  14  15  16  17
# df1.loc[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
1  10  11  12  13
3  14  15  16  17
2  18  19  20  21
<class 'pandas.core.indexes.base.Index'> 
 Index([1, 3, 2], dtype='int64')


In [23]:
df2 = pd.DataFrame(np.arange(10,22).reshape(3,4), columns=['A','B','C','D'])
print(type(df2.index), df2.index)
print(type(df2), '\n', df2)
print(line)
temp = df2[1:2]
print("# df2[1:2] : ", type(temp), '\n', temp)
temp = df2.iloc[1:2]
print("# df2.iloc[1:2] : ", type(temp), '\n', temp)
temp = df2.loc[1:2]
print("# df2.loc[1:2] : ", type(temp), '\n', temp)

<class 'pandas.core.indexes.range.RangeIndex'> RangeIndex(start=0, stop=3, step=1)
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
0  10  11  12  13
1  14  15  16  17
2  18  19  20  21
-----------------------------------------------------------------------------
# df2[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
1  14  15  16  17
# df2.iloc[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
1  14  15  16  17
# df2.loc[1:2] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
1  14  15  16  17
2  18  19  20  21


## Selection

### Column

In [24]:
print(type(df), '\n', df)
print(line)
temp = df[df["B"] > 11]
print("# df[df['B'] > 11] : ", type(temp), '\n', temp)
print(line)
s = df["B"] > 11
print("1st. df['B'] > 11 : ", type(s), '\n', s)
temp = df[s]
print("2nd. df[df['B'] > 11] : ", type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
-----------------------------------------------------------------------------
# df[df['B'] > 11] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21
-----------------------------------------------------------------------------
1st. df['B'] > 11 :  <class 'pandas.core.series.Series'> 
 a    False
b     True
c     True
Name: B, dtype: bool
2nd. df[df['B'] > 11] :  <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17
c  18  19  20  21


In [25]:
print("# df :", type(df), '\n', df)
s = pd.Series([False,True,False], index=['a','b','c'])
print("# s :", type(s), '\n', s)
print(line)
temp = df[s]
print("# df[s] :", type(temp), '\n', temp)

# df : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
# s : <class 'pandas.core.series.Series'> 
 a    False
b     True
c    False
dtype: bool
-----------------------------------------------------------------------------
# df[s] : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
b  14  15  16  17


### Row

In [26]:
print("# df :", type(df), '\n', df)
temp = df.loc[df.A > 15]
print("# df.loc[df.A > 15] :", type(temp), '\n', temp)
print(line)
s = df.A > 15
print("# df.A > 15 : ", type(s), '\n', s)
temp = df.loc[s]
print("# df.loc[df.A > 15] :", type(temp), '\n', temp)

# df : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
# df.loc[df.A > 15] : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
c  18  19  20  21
-----------------------------------------------------------------------------
# df.A > 15 :  <class 'pandas.core.series.Series'> 
 a    False
b    False
c     True
Name: A, dtype: bool
# df.loc[df.A > 15] : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
c  18  19  20  21


In [27]:
print("# df :", type(df), '\n', df)
s = pd.Series([True,True,False], index=['a','b','c'])
print("# s :", type(s), '\n', s)
temp = df.loc[s]
print("# df.loc[s] :", type(temp), '\n', temp)

# df : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
# s : <class 'pandas.core.series.Series'> 
 a     True
b     True
c    False
dtype: bool
# df.loc[s] : <class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17


## Function

### info()

In [28]:
temp = df.info()
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
 3   D       3 non-null      int64
dtypes: int64(4)
memory usage: 228.0+ bytes
<class 'NoneType'> 
 None


### describe()

In [29]:
temp = df.describe()
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
           A     B     C     D
count   3.0   3.0   3.0   3.0
mean   14.0  15.0  16.0  17.0
std     4.0   4.0   4.0   4.0
min    10.0  11.0  12.0  13.0
25%    12.0  13.0  14.0  15.0
50%    14.0  15.0  16.0  17.0
75%    16.0  17.0  18.0  19.0
max    18.0  19.0  20.0  21.0


### value_counts()

In [30]:
print(type(df), '\n', df)
df1 = df.copy()
print(id(df1) == id(df)) # False (깊은복사)
print(id(df1['A']) == id(df['A'])) # False (깊은복사)
print(id(df1.loc['a']) == id(df.loc['a'])) # True (얕은복사)
#
df1.loc['d'] = [10,11,12,13] # 행 추가
print(type(df1), '\n', df1)
vc = df1.value_counts()
print(type(vc), '\n', vc)
vc = df['A'].value_counts()
print(type(vc), '\n', vc)
vc = df['A'].value_counts(normalize=True)
print(type(vc), '\n', vc)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
False
False
True
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
d  10  11  12  13
<class 'pandas.core.series.Series'> 
 A   B   C   D 
10  11  12  13    2
14  15  16  17    1
18  19  20  21    1
Name: count, dtype: int64
<class 'pandas.core.series.Series'> 
 A
10    1
14    1
18    1
Name: count, dtype: int64
<class 'pandas.core.series.Series'> 
 A
10    0.333333
14    0.333333
18    0.333333
Name: proportion, dtype: float64


### count(axis=0) : NaN은 포함시키지 않음

In [31]:
print(type(df), '\n', df)
temp = df.count() # axis=0
print(type(temp), '\n', temp)
temp = df.count(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    3
B    3
C    3
D    3
dtype: int64
<class 'pandas.core.series.Series'> 
 a    4
b    4
c    4
dtype: int64


### sum(axis=0) : 합계

In [32]:
print(type(df), '\n', df)
temp = df.sum() # axis=0
print(type(temp), '\n', temp)
temp = df.sum(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    42
B    45
C    48
D    51
dtype: int64
<class 'pandas.core.series.Series'> 
 a    46
b    62
c    78
dtype: int64


### mean(axis=0) : 평균

In [33]:
print(type(df), '\n', df)
temp = df.mean() # axis=0
print(type(temp), '\n', temp)
temp = df.mean(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    14.0
B    15.0
C    16.0
D    17.0
dtype: float64
<class 'pandas.core.series.Series'> 
 a    11.5
b    15.5
c    19.5
dtype: float64


### min(axis=0) : 최소

In [34]:
print(type(df), '\n', df)
temp = df.min() # axis=0
print(type(temp), '\n', temp)
temp = df.min(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    10
B    11
C    12
D    13
dtype: int64
<class 'pandas.core.series.Series'> 
 a    10
b    14
c    18
dtype: int64


### max(axis=0) : 최대

In [35]:
print(type(df), '\n', df)
temp = df.max() # axis=0
print(type(temp), '\n', temp)
temp = df.max(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    18
B    19
C    20
D    21
dtype: int64
<class 'pandas.core.series.Series'> 
 a    13
b    17
c    21
dtype: int64


### var(axis=0) : 분산

In [36]:
print(type(df), '\n', df)
temp = df.var() # axis=0
print(type(temp), '\n', temp)
temp = df.var(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    16.0
B    16.0
C    16.0
D    16.0
dtype: float64
<class 'pandas.core.series.Series'> 
 a    1.666667
b    1.666667
c    1.666667
dtype: float64


### std(axis=0) : 표준편차

In [37]:
print(type(df), '\n', df)
temp = df.std() # axis=0
print(type(temp), '\n', temp)
temp = df.std(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    4.0
B    4.0
C    4.0
D    4.0
dtype: float64
<class 'pandas.core.series.Series'> 
 a    1.290994
b    1.290994
c    1.290994
dtype: float64


### median(axis=0) : 중앙값

In [38]:
print(type(df), '\n', df)
temp = df.median() # axis=0
print(type(temp), '\n', temp)
temp = df.median(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    14.0
B    15.0
C    16.0
D    17.0
dtype: float64
<class 'pandas.core.series.Series'> 
 a    11.5
b    15.5
c    19.5
dtype: float64


### quantile(q, ...) : 백분위수

In [39]:
print(type(df), '\n', df)
temp = df.quantile(0.5) # 중앙값
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.series.Series'> 
 A    14.0
B    15.0
C    16.0
D    17.0
Name: 0.5, dtype: float64


### sort_values(by=[], ascending=True)

In [40]:
print(type(df), '\n', df)
temp = df.sort_values(by='C')
print(type(temp), '\n', temp)
print(id(temp) == id(df)) # False
print(id(temp.iloc[0]) == id(df.iloc[0])) # True

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
False
True


In [41]:
print(type(temp), '\n', temp)
temp = df.sort_values(by=['C','B'], ascending=False)
print(type(temp), '\n', temp)
print(id(temp) == id(df)) # False
print(id(temp.iloc[0]) == id(df.iloc[0])) # True

<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
a  10  11  12  13
b  14  15  16  17
c  18  19  20  21
<class 'pandas.core.frame.DataFrame'> 
     A   B   C   D
c  18  19  20  21
b  14  15  16  17
a  10  11  12  13
False
True


### sort_index(ascending=True)

In [42]:
df1 = pd.DataFrame({'num_legs':[2,4,4,5], 'num_wings':[2,0,0,0]}, index=['falcon','cat','dog','ant'])
df1

Unnamed: 0,num_legs,num_wings
falcon,2,2
cat,4,0
dog,4,0
ant,5,0


In [43]:
temp = df1.sort_index()
print(type(temp), '\n', temp)
print(type(df1), '\n', df1)

<class 'pandas.core.frame.DataFrame'> 
         num_legs  num_wings
ant            5          0
cat            4          0
dog            4          0
falcon         2          2
<class 'pandas.core.frame.DataFrame'> 
         num_legs  num_wings
falcon         2          2
cat            4          0
dog            4          0
ant            5          0


In [44]:
temp = df1.sort_index(ascending=False)
print(type(temp), '\n', temp)
print(type(df1), '\n', df1)

<class 'pandas.core.frame.DataFrame'> 
         num_legs  num_wings
falcon         2          2
dog            4          0
cat            4          0
ant            5          0
<class 'pandas.core.frame.DataFrame'> 
         num_legs  num_wings
falcon         2          2
cat            4          0
dog            4          0
ant            5          0


### drop(index, columns, inplace=False ...)

In [45]:
np.random.seed(1)
df2 = pd.DataFrame(np.random.randint(10,size=(4,8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [46]:
df2.loc['max'] = df2.max()
df2.loc['min'] = df2.min(axis=0)
df2['sum'] = df2[[0,1,2,3,4,5,6,7]].sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,sum
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42
max,9,9,9,9,9,7,4,7,63
min,4,7,2,4,0,0,0,1,18


In [47]:
print(type(df2), '\n', df2)
temp = df2.drop(index=['max','min'])
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
max  9  9  9  9  9  7  4  7   63
min  4  7  2  4  0  0  0  1   18
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  3  4  5  6  7  sum
0  5  8  9  5  0  0  1  7   35
1  6  9  2  4  5  2  4  2   34
2  4  7  7  9  1  7  0  6   41
3  9  9  7  6  9  1  0  1   42
<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
max  9  9  9  9  9  7  4  7   63
min  4  7  2  4  0  0  0  1   18


In [48]:
print(type(df2), '\n', df2)
temp = df2.drop(index='max', inplace=True)
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
max  9  9  9  9  9  7  4  7   63
min  4  7  2  4  0  0  0  1   18
<class 'NoneType'> 
 None
<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
min  4  7  2  4  0  0  0  1   18


In [49]:
print(type(df2), '\n', df2)
temp = df2.drop(columns=['sum',0])
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
min  4  7  2  4  0  0  0  1   18
<class 'pandas.core.frame.DataFrame'> 
      1  2  3  4  5  6  7
0    8  9  5  0  0  1  7
1    9  2  4  5  2  4  2
2    7  7  9  1  7  0  6
3    9  7  6  9  1  0  1
min  7  2  4  0  0  0  1
<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
min  4  7  2  4  0  0  0  1   18


In [50]:
print(type(df2), '\n', df2)
temp = df2.drop(columns='sum', inplace=True)
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7  sum
0    5  8  9  5  0  0  1  7   35
1    6  9  2  4  5  2  4  2   34
2    4  7  7  9  1  7  0  6   41
3    9  9  7  6  9  1  0  1   42
min  4  7  2  4  0  0  0  1   18
<class 'NoneType'> 
 None
<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7
0    5  8  9  5  0  0  1  7
1    6  9  2  4  5  2  4  2
2    4  7  7  9  1  7  0  6
3    9  9  7  6  9  1  0  1
min  4  7  2  4  0  0  0  1


In [51]:
print(type(df2), '\n', df2)
temp = df2.drop(index=[1,2], columns=[1,3,5,7])
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7
0    5  8  9  5  0  0  1  7
1    6  9  2  4  5  2  4  2
2    4  7  7  9  1  7  0  6
3    9  9  7  6  9  1  0  1
min  4  7  2  4  0  0  0  1
<class 'pandas.core.frame.DataFrame'> 
      0  2  4  6
0    5  9  0  1
3    9  7  9  0
min  4  2  0  0
<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7
0    5  8  9  5  0  0  1  7
1    6  9  2  4  5  2  4  2
2    4  7  7  9  1  7  0  6
3    9  9  7  6  9  1  0  1
min  4  7  2  4  0  0  0  1


In [52]:
print(type(df2), '\n', df2)
temp = df2.drop(index=[1,2], columns=[1,3,5,7], inplace=True)
print(type(temp), '\n', temp)
print(type(df2), '\n', df2)

<class 'pandas.core.frame.DataFrame'> 
      0  1  2  3  4  5  6  7
0    5  8  9  5  0  0  1  7
1    6  9  2  4  5  2  4  2
2    4  7  7  9  1  7  0  6
3    9  9  7  6  9  1  0  1
min  4  7  2  4  0  0  0  1
<class 'NoneType'> 
 None
<class 'pandas.core.frame.DataFrame'> 
      0  2  4  6
0    5  9  0  1
3    9  7  9  0
min  4  2  0  0


### dropna(axis, inplace=False)

In [53]:
df3 = pd.DataFrame(np.random.randint(10,size=(4,8)))
df3.loc[2,3] = np.nan
df3.loc[1,5] = np.nan
print(type(df3), '\n', df3)

<class 'pandas.core.frame.DataFrame'> 
    0  1  2    3  4    5  6  7
0  8  8  3  9.0  8  7.0  3  6
1  5  1  9  3.0  4  NaN  1  4
2  0  3  9  NaN  0  4.0  9  2
3  7  7  9  8.0  6  9.0  3  7


In [54]:
print(type(df3), '\n', df3)
temp = df3.dropna(axis=0) # 행 제거
print(type(temp), '\n', temp)
temp = df3.dropna(axis=1, inplace=True) # 열 제거
print(type(temp), '\n', temp)
print(type(df3), '\n', df3)

<class 'pandas.core.frame.DataFrame'> 
    0  1  2    3  4    5  6  7
0  8  8  3  9.0  8  7.0  3  6
1  5  1  9  3.0  4  NaN  1  4
2  0  3  9  NaN  0  4.0  9  2
3  7  7  9  8.0  6  9.0  3  7
<class 'pandas.core.frame.DataFrame'> 
    0  1  2    3  4    5  6  7
0  8  8  3  9.0  8  7.0  3  6
3  7  7  9  8.0  6  9.0  3  7
<class 'NoneType'> 
 None
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7


### fillna(value, inplace=False)

In [55]:
print(type(df3), '\n', df3)
temp = df3.fillna(0)
print(type(temp), '\n', temp)
print(type(df3), '\n', df3)
temp = df3.fillna(-1, inplace=True)
print(type(temp), '\n', temp)
print(type(df3), '\n', df3)

<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
<class 'NoneType'> 
 None
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7


### astype(type) : 형 변환

In [56]:
print(type(df3), '\n', df3)
temp = df3.astype(int)
print(type(temp), '\n', temp)
print(type(df3), '\n', df3)
print(id(temp) == id(df3)) # False
print(id(temp.loc[0]) == id(df3.loc[0])) # True ???
print(temp.loc[0,3], df3.loc[0,3]) # 5 5.0
print(type(temp.loc[0,3]), type(df3.loc[0,3])) # <class 'numpy.int64'> <class 'numpy.float64'>
print(id(temp.loc[0,3]) == id(df3.loc[0,3])) # False

<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
<class 'pandas.core.frame.DataFrame'> 
    0  1  2  4  6  7
0  8  8  3  8  3  6
1  5  1  9  4  1  4
2  0  3  9  0  9  2
3  7  7  9  6  3  7
False
True


KeyError: 3

### apply(function, axis=0)

In [None]:
print(type(df3), '\n', df3)
temp = df3.apply(lambda x : x.max() - x.min()) # 열 방향
print(type(temp), '\n', temp) 
temp = df3.apply(lambda x : x.max() - x.min(), axis=1) # 행 방향
print(type(temp), '\n', temp)
temp = df3.max(axis=1) - df3.min(axis=1)
print(type(temp), '\n', temp)

<class 'pandas.core.frame.DataFrame'> 
    0  1  2    3  4    5  6  7
0  7  8  3  7.0  8  7.0  6  5
1  1  9  8  6.0  9 -1.0  8  2
2  3  1  2 -1.0  3  8.0  9  7
3  9  1  0  4.0  2  0.0  9  3
<class 'pandas.core.series.Series'> 
 0    8.0
1    8.0
2    8.0
3    8.0
4    7.0
5    9.0
6    3.0
7    5.0
dtype: float64
<class 'pandas.core.series.Series'> 
 0     5.0
1    10.0
2    10.0
3     9.0
dtype: float64
<class 'pandas.core.series.Series'> 
 0     5.0
1    10.0
2    10.0
3     9.0
dtype: float64


### cut(x, bins, lables=None, ...) : 데이터값을 카테고리로 변환
- x : 데이터값 리스트
- bins : 구간 경계값 리스트 (최소값 < 구간 <= 최대값)
- labels : 구간 이름 리스트, 순서는 bins의 순서와 동일해야 함 (len(bins) - 1)

In [None]:
ages = [0,0.5,4,6,4,5,2,10,21,23,37,15,38,31,61,20,41,31,100]
bins = [0,4,18,25,35,60,100]
labels = ['영유아', '미성년자', '청년', '중년', '장년', '노년']
cats = pd.cut(ages, bins, labels=labels)
print(type(cats), '\n', cats)
print(type(cats.codes), cats.codes)
print(type(cats.categories), cats.categories)
print(line)
df4 = pd.DataFrame({'나이':ages, '연령대':list(cats)})
temp = df4['연령대'].value_counts()
print(type(temp), '\n', temp)
df4

<class 'pandas.core.arrays.categorical.Categorical'> 
 [NaN, '영유아', '영유아', '미성년자', '영유아', ..., '노년', '청년', '장년', '중년', '노년']
Length: 19
Categories (6, object): ['영유아' < '미성년자' < '청년' < '중년' < '장년' < '노년']
<class 'numpy.ndarray'> [-1  0  0  1  0  1  0  1  2  2  4  1  4  3  5  2  4  3  5]
<class 'pandas.core.indexes.base.Index'> Index(['영유아', '미성년자', '청년', '중년', '장년', '노년'], dtype='object')
-----------------------------------------------------------------------------
<class 'pandas.core.series.Series'> 
 연령대
영유아     4
미성년자    4
청년      3
장년      3
중년      2
노년      2
Name: count, dtype: int64


Unnamed: 0,나이,연령대
0,0.0,
1,0.5,영유아
2,4.0,영유아
3,6.0,미성년자
4,4.0,영유아
5,5.0,미성년자
6,2.0,영유아
7,10.0,미성년자
8,21.0,청년
9,23.0,청년


### qcut(x, q, labels=None, ...) : 데이터 개수가 같도록 지정한 수의 구간으로 분할 (구간 경계를 미지정)
- x : 분할할 데이터
- q : 구간 수
- labels : 구간 이름 리스트

In [None]:
# random 정수 20개를 생성하고 4개의 구간(Q1,Q2,Q3,Q4)으로 나누기
np.random.seed(2) # 재실행해도 랜덤정수가 변하지 않도록 처리
data = np.random.randint(20, size=20)
qcat = pd.qcut(data, 4, labels=["Q1", "Q2", "Q3", "Q4"])
print(type(data), np.sort(data))
print(type(qcat), '\n', qcat)
print(type(qcat.codes), qcat.codes)
print(type(qcat.categories), qcat.categories)
temp = qcat.value_counts()
print(type(temp), '\n', temp)
print(line)
df5 = pd.DataFrame(data, columns=["관측수"])
df5["범주"] = qcat
df5

<class 'numpy.ndarray'> [ 2  3  4  5  6  7  7  8  8  8 10 11 11 11 11 13 15 15 17 18]
<class 'pandas.core.arrays.categorical.Categorical'> 
 ['Q2', 'Q4', 'Q4', 'Q2', 'Q3', ..., 'Q1', 'Q1', 'Q1', 'Q3', 'Q3']
Length: 20
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
<class 'numpy.ndarray'> [1 3 3 1 2 3 2 1 1 0 3 2 3 0 1 0 0 0 2 2]
<class 'pandas.core.indexes.base.Index'> Index(['Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
<class 'pandas.core.series.Series'> 
 Q1    5
Q2    5
Q3    5
Q4    5
Name: count, dtype: int64
-----------------------------------------------------------------------------


Unnamed: 0,관측수,범주
0,8,Q2
1,15,Q4
2,13,Q4
3,8,Q2
4,11,Q3
5,18,Q4
6,11,Q3
7,8,Q2
8,7,Q2
9,2,Q1


### set_index(keys, inplace=False, verify_integrity=False) : <br>기존 행 인덱스를 제거하고 열 중 하나를 인덱스로 설정
- keys : 인덱스로 설정할 열의 키값
- inplace : True로 설정하면 원본에 반영
- verify_integrity : True로 설정하면 중복된 키값이 있는지 확인

In [None]:
df6 = pd.DataFrame({
    'a': [1, 3, 4, 3, 4],
    'b': [2, 3, 1, 4, 5],
    'c': [1, 5, 2, 4, 4]
})
print(type(df6), "\n", df6)
print(type(df6.index), df6.index)
temp = df6.set_index("a")
print(type(temp), "\n", temp)
print(type(temp.index), temp.index)
temp = df6.set_index(["a", "b"])
print(type(temp), "\n", temp)
print(type(temp.index), temp.index)
df6.set_index(["c"], inplace=True, verify_integrity=True)
df6 # [중료] 인덱스로 설정할 때 중복된 키값이 존재해도 는데 정상적으로 인덱스로 설정되는 문제 (verify_integrity을 True로 설정하면 ValueError 발생)
df6.loc[4] = [np.nan, np.nan]
df6 # row index 4인 행이 모두 변경됨 

<class 'pandas.core.frame.DataFrame'> 
    a  b  c
0  1  2  1
1  3  3  5
2  4  1  2
3  3  4  4
4  4  5  4
<class 'pandas.core.indexes.range.RangeIndex'> RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.frame.DataFrame'> 
    b  c
a      
1  2  1
3  3  5
4  1  2
3  4  4
4  5  4
<class 'pandas.core.indexes.base.Index'> Index([1, 3, 4, 3, 4], dtype='int64', name='a')
<class 'pandas.core.frame.DataFrame'> 
      c
a b   
1 2  1
3 3  5
4 1  2
3 4  4
4 5  4
<class 'pandas.core.indexes.multi.MultiIndex'> MultiIndex([(1, 2),
            (3, 3),
            (4, 1),
            (3, 4),
            (4, 5)],
           names=['a', 'b'])


ValueError: Index has duplicate keys: Index([4], dtype='int64', name='c')

### reset_index() : 기존 행 인덱스를 제거하고 기본 인덱스로 변경 (0부터 1씩 증가하는 정수 인덱스)

In [None]:
print(type(df6), "\n", df6)
temp = df6.reset_index()
print(type(temp), "\n", temp)

<class 'pandas.core.frame.DataFrame'> 
    a  b
c      
1  1  2
5  3  3
2  4  1
4  3  4
4  4  5
<class 'pandas.core.frame.DataFrame'> 
    c  a  b
0  1  1  2
1  5  3  3
2  2  4  1
3  4  3  4
4  4  4  5


## 병합

### merge(right, on, how="inner", ...)
- right : DataFrame or Series
- on : 병합시 기준이 되는 컬럼의 키값 (생략시 자동으로 컬럼명이 같은 컬럼으로 병합)
- how : 병합방법
    1. inner : 교집합
    2. outer : 합집합
    3. right : 오른쪽 테이블 기준
    4. left : 왼쪽 테이블 기준

In [None]:
df1 = pd.DataFrame({
    '고객번호': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    '이름': ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희']
}, columns=['고객번호', '이름'])
df2 = pd.DataFrame({
    '고객번호': [1001, 1001, 1005, 1006, 1008, 1001],
    '금액' : [10000, 20000, 15000, 5000, 100000, 30000]
}, columns=['고객번호', '금액'])

In [None]:
temp = df1.merge(df2)
print(type(temp), "\n", temp)
temp = pd.merge(df1, df2, on="고객번호", how="inner")
print(type(temp), "\n", temp)
df1
df2

<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름     금액
0  1001   둘리  10000
1  1001   둘리  20000
2  1001   둘리  30000
3  1005   희동  15000
4  1006  마이콜   5000
<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름     금액
0  1001   둘리  10000
1  1001   둘리  20000
2  1001   둘리  30000
3  1005   희동  15000
4  1006  마이콜   5000


Unnamed: 0,고객번호,이름
0,1001,둘리
1,1002,도우너
2,1003,또치
3,1004,길동
4,1005,희동
5,1006,마이콜
6,1007,영희


Unnamed: 0,고객번호,금액
0,1001,10000
1,1001,20000
2,1005,15000
3,1006,5000
4,1008,100000
5,1001,30000


In [None]:
temp = df1.merge(df2, on="고객번호", how="outer")
print(type(temp), "\n", temp)
temp = pd.merge(df1, df2, on="고객번호", how="outer")
print(type(temp), "\n", temp)

<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름        금액
0  1001   둘리   10000.0
1  1001   둘리   20000.0
2  1001   둘리   30000.0
3  1002  도우너       NaN
4  1003   또치       NaN
5  1004   길동       NaN
6  1005   희동   15000.0
7  1006  마이콜    5000.0
8  1007   영희       NaN
9  1008  NaN  100000.0
<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름        금액
0  1001   둘리   10000.0
1  1001   둘리   20000.0
2  1001   둘리   30000.0
3  1002  도우너       NaN
4  1003   또치       NaN
5  1004   길동       NaN
6  1005   희동   15000.0
7  1006  마이콜    5000.0
8  1007   영희       NaN
9  1008  NaN  100000.0


In [None]:
temp = df1.merge(df2, on="고객번호", how="right")
print(type(temp), "\n", temp)
temp = pd.merge(df1, df2, on="고객번호", how="right")
print(type(temp), "\n", temp)

<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름      금액
0  1001   둘리   10000
1  1001   둘리   20000
2  1005   희동   15000
3  1006  마이콜    5000
4  1008  NaN  100000
5  1001   둘리   30000


In [None]:
temp = df1.merge(df2, on="고객번호", how="left")
print(type(temp), "\n", temp)
temp = pd.merge(df1, df2, on="고객번호", how="left")
print(type(temp), "\n", temp)

<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름       금액
0  1001   둘리  10000.0
1  1001   둘리  20000.0
2  1001   둘리  30000.0
3  1002  도우너      NaN
4  1003   또치      NaN
5  1004   길동      NaN
6  1005   희동  15000.0
7  1006  마이콜   5000.0
8  1007   영희      NaN


In [None]:
temp = pd.merge(df1, df2, on="고객번호", how="left")
print(type(temp), "\n", temp)

<class 'pandas.core.frame.DataFrame'> 
    고객번호   이름       금액
0  1001   둘리  10000.0
1  1001   둘리  20000.0
2  1001   둘리  30000.0
3  1002  도우너      NaN
4  1003   또치      NaN
5  1004   길동      NaN
6  1005   희동  15000.0
7  1006  마이콜   5000.0
8  1007   영희      NaN


In [None]:
# [setosa] df1:(1.4, 1.3), df2:(0.4)
# [versicolor] df1 2개 x df2 2개 = 4개로 생성
df1 = pd.DataFrame({
    '품종': ['setosa', 'setosa', 'virginica', 'virginica'],
    '꽃잎길이': [1.4, 1.3, 1.5, 1.3]
}, columns=['품종', '꽃잎길이'])
df2 = pd.DataFrame({
    '품종': ['setosa', 'virginica', 'virginica', 'ersicolor'],
    '꽃잎너비': [0.4, 0.3, 0.5, 0.3]
}, columns=['품종', '꽃잎너비'])
temp = pd.merge(df1, df2) # on='품종', how='inner'
temp

Unnamed: 0,품종,꽃잎길이,꽃잎너비
0,setosa,1.4,0.4
1,setosa,1.3,0.4
2,virginica,1.5,0.3
3,virginica,1.5,0.5
4,virginica,1.3,0.3
5,virginica,1.3,0.5


In [57]:
df1 = pd.DataFrame({
    '고객명': ['춘향', '춘향', '몽룡'],
    '날짜': ['2018-01-01', '2018-01-02', '2018-01-01'],
    '데이터': [20000, 30000, 100000]})
df2 = pd.DataFrame({
    '고객명': ['춘향', '몽룡'], 
    '데이터': ['여자', '남자']})
df1
df2
temp = pd.merge(df1, df2, on='고객명')
temp

Unnamed: 0,고객명,날짜,데이터
0,춘향,2018-01-01,20000
1,춘향,2018-01-02,30000
2,몽룡,2018-01-01,100000


Unnamed: 0,고객명,데이터
0,춘향,여자
1,몽룡,남자


Unnamed: 0,고객명,날짜,데이터_x,데이터_y
0,춘향,2018-01-01,20000,여자
1,춘향,2018-01-02,30000,여자
2,몽룡,2018-01-01,100000,남자


Unnamed: 0,고객명,날짜,데이터_x,데이터_y
0,춘향,2018-01-01,20000,여자
1,춘향,2018-01-02,30000,여자
2,몽룡,2018-01-01,100000,남자


In [None]:
df1 = pd.DataFrame({
    '이름': ['영희', '철수', '철수'],
    '성적': [90, 80, 80]
})
df2 = pd.DataFrame({
    '성명': ['영희', '영희', '철수'],
    '성적2': [100, 80, 90]
})
df1
df2
pd.merge(df1, df2, left_on='이름', right_on='성명') # how='inner'

Unnamed: 0,이름,성적
0,영희,90
1,철수,80
2,철수,80


Unnamed: 0,성명,성적2
0,영희,100
1,영희,80
2,철수,90


Unnamed: 0,이름,성적,성명,성적2
0,영희,90,영희,100
1,영희,90,영희,80
2,철수,80,철수,90
3,철수,80,철수,90


In [None]:
df1 = pd.DataFrame({
    '도시': ['서울', '서울', '서울', '부산', '부산'],
    '연도': [2000, 2005, 2010, 2000, 2005],
    '인구': [9853972, 9762546, 9631482, 3655437, 3512547]
})
df2 = pd.DataFrame(
    np.arange(12).reshape((6, 2)),
    index=[
        ['부산', '부산', '서울', '서울', '서울', '서울'],
        [2000, 2005, 2000, 2005, 2010, 2015]
    ], columns=['데이터1', '데이터2'])
df1
df2
temp = df2.loc['부산'].loc[2000]
print(type(temp), '\n', temp)

Unnamed: 0,도시,연도,인구
0,서울,2000,9853972
1,서울,2005,9762546
2,서울,2010,9631482
3,부산,2000,3655437
4,부산,2005,3512547


Unnamed: 0,Unnamed: 1,데이터1,데이터2
부산,2000,0,1
부산,2005,2,3
서울,2000,4,5
서울,2005,6,7
서울,2010,8,9
서울,2015,10,11


<class 'pandas.core.series.Series'> 
 데이터1    0
데이터2    1
Name: 2000, dtype: int64


### join(...) : 사용법은 merge와 동일

### concat

## Pivot Table

### pivot_table(data,values=None,index=None,columns=None,aggfunc='mean',margins=False,margins_name='All')
- data : 분석할 DataFrame, 메서드 형식일때는 필요하지 않음 (ex : df1.pivot_table())
- values : 분석할 DataFrame에서 분석할 열
- index : 행 인덱스로 들어갈 키열 또는 키열의 리스트
- columns : 열 인덱스로 들어갈 키열 또는 키열의 리스트
- aggfunc : 분석할 데이터의 집계 함수
- fill_value : NaN이 표출될 때 대체값 지정
- margins : 모든 데이터를 분석한 결과를 행으로 표출할 지 여부
- margins_name : margins가 표출될 때 그 열(행)의 이름
#### 피봇테이블을 작성할 때 반드시 설정해야 되는 인수
- data : 사용 데이터 프레임
- index : 행 인덱스로 사용할 필드(기준 필드로 작용됨)
- 인덱스 명을 제외한 나머지 값(data)은 수치 data 만 사용함
- 기본 함수가 평균(mean)함수 이기 때문에 각 데이터의 평균값이 반환

In [79]:
data_dic = {
"도시": ["서울", "서울", "서울", "부산", "부산", "부산", "인천", "인천"],
"연도": ["2015", "2010", "2005", "2015", "2010", "2005", "2015", "2010"],
"인구": [9904312, 9631482, 9762546, 3448737, 3393191, 3512547, 2890451, 263203],
"지역": ["수도권", "수도권", "수도권", "경상권", "경상권", "경상권", "수도권", "수도권"]
}
columns_list = ["도시", "연도", "인구", "지역"]
df = pd.DataFrame(data_dic, columns=columns_list)
df

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,263203,수도권


In [81]:
# 각 도시에 대한 연도별 인구
df.pivot_table(index="도시", columns='연도',values='인구')

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,263203.0,2890451.0


In [83]:
# 각 지역별 도시에 대한 연도별 인구
df.pivot_table(index=["지역","도시"],columns="연도", values="인구")

Unnamed: 0_level_0,연도,2005,2010,2015
지역,도시,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
경상권,부산,3512547.0,3393191.0,3448737.0
수도권,서울,9762546.0,9631482.0,9904312.0
수도권,인천,,263203.0,2890451.0


In [88]:
import seaborn as sns

df = sns.load_dataset('titanic')[['age','sex','class','fare','survived']]
# 각 선실 등급별 숙박객의 성별에따른 생존자 수와 생존율
pdf = pd.pivot_table(df,# 피벗할 데이터 프레임
index = 'class', # 행 인덱스로 사용
columns = 'sex', # 컬럼으로 사용
values='survived', # 계산데이터로 사용할 열
aggfunc=['mean','sum'] # 데이터 집계함수
)
pdf

Unnamed: 0_level_0,mean,mean,sum,sum
sex,female,male,female,male
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
First,0.968085,0.368852,91,45
Second,0.921053,0.157407,70,17
Third,0.5,0.135447,72,47


## Group 분석

### groupby(...)

In [97]:
np.random.seed(0)
df2 = pd.DataFrame({
'key1': ['A', 'A', 'B', 'B', 'A'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': [1, 2, 3, 4, 5],
'data2': [10, 20, 30, 40, 50]
})
df2
groups = df2.groupby(df2.key1)
print(type(groups.groups), '\n', groups.groups)
gdf = pd.DataFrame(groups)
gdf
gdf.loc[0].values

Unnamed: 0,key1,key2,data1,data2
0,A,one,1,10
1,A,two,2,20
2,B,one,3,30
3,B,two,4,40
4,A,one,5,50


<class 'pandas.io.formats.printing.PrettyDict'> 
 {'A': [0, 1, 4], 'B': [2, 3]}


Unnamed: 0,0,1
0,A,key1 key2 data1 data2 0 A one 1 ...
1,B,key1 key2 data1 data2 2 B one 3 ...


array(['A',   key1 key2  data1  data2
            0    A  one      1     10
            1    A  two      2     20
            4    A  one      5     50], dtype=object)

# 끝