# pandas data type (DataFrame) 판다스 자료형 (데이터프레임)

## 1. DataFrame
- 2차원 배열과 유사한 자료형
- 다차원 리스트, 딕셔너리 자료형으로 데이터 구성 가능
- 관계형 데이터베이스의 테이블 구조, excel/csv 데이터 구조와 유사
- 하나의 컬럼은 하나의 Series이며, 하나의 DataFrame은 여러 개의 Series 묶음으로 구성됨
- index 특징
    - row index(행 인덱스) : 기본 숫자형 인덱스가 아닌 새롭게 지정한 로우명(라벨) 인덱스를 사용해도 기본 숫자형 인덱스를 함께 사용할 수 있음
    - column index(열 인덱스) : 새롭게 컬럼명(라벨) 인덱스를 사용하면 기본 숫자형 인덱스는 사용할 수 없음
<img src="img/df_example.png" width="500" align="center">

In [1]:
import pandas as pd

### 1.1. DataFrame 생성

- 다차원 자료형, dict 자료형 사용

- 주의점: 입력 data type에 따라 item length가 서로 다를 때의 동작이 달라짐

- 다차원 list: item length가 달라도 생성 가능(모자란 cell은 NaN으로 채워짐), 서로 다른 item dtype 허용

- dict: 모든 value의 item length가 동일해야 함(다르면 ValueError), 서로 다른 dtype 허용

In [2]:
# A DataFrame cell can hold a value of any dtype, and different dtypes may be
# mixed freely across cells.

# Nested list: 3 inner lists at the outer level, each holding 4 items.
my_list1 = [
    [1, 2, 3, 4],
    ["a", "b", "c", "d"],
    [0.1, 0.2, 0.5, 0.8],
]

In [3]:
# The 3 inner lists become the 3 rows of the frame,
# and the 4 items inside each inner list become the 4 columns.

df_1 = pd.DataFrame(data=my_list1)
df_1

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,a,b,c,d
2,0.1,0.2,0.5,0.8


Unnamed: 0,0,1,2,3
0,1,2,3,4
1,a,b,c,d
2,0.1,0.2,0.5,0.8


In [4]:
# Ragged nested list: the inner lists have different lengths (5, 2 and 3).
# The longest row fixes the number of columns of the DataFrame;
# cells that the shorter rows cannot supply are filled with NaN.

my_list2 = [
    [1, 2, 3, 4, 5],
    ["a", "b"],
    [0.1, 0.2, 0.5],
]

df_2 = pd.DataFrame(data=my_list2)
df_2

Unnamed: 0,0,1,2,3,4
0,1,2,3.0,4.0,5.0
1,a,b,,,
2,0.1,0.2,0.5,,


Unnamed: 0,0,1,2,3,4
0,1,2,3.0,4.0,5.0
1,a,b,,,
2,0.1,0.2,0.5,,


In [5]:
# dict -> DataFrame: each key becomes a column label and each value list
# supplies the data of that column.
my_dict1 = {
    "a": [10, 20, 30, 40],
    "b": [1, 2, 3, 4],
    "c": [5, 6, 7, 8],
}

df_3 = pd.DataFrame(data=my_dict1)

print(type(df_3))
df_3

# NOTE: every value list in the dict must hold the same number of items.

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a,b,c
0,10,1,5
1,20,2,6
2,30,3,7
3,40,4,8


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a,b,c
0,10,1,5
1,20,2,6
2,30,3,7
3,40,4,8


In [6]:
# dict -> DataFrame
# Three keys whose value lists have different lengths (1, 4 and 3).
# Missing cells are not filled in for dict input: the construction raises
# ValueError, so the value lists of all keys must have the same length.

my_dict2 = {'a':[10],
        'b':[1, 2, 3, 4],
        'c':[5, 6, 7]}


# Left commented out on purpose because it raises ValueError:
#df_3 = pd.DataFrame(my_dict2) # ValueError
df_3  # still the frame built from my_dict1, since the assignment above never runs

Unnamed: 0,a,b,c
0,10,1,5
1,20,2,6
2,30,3,7
3,40,4,8


Unnamed: 0,a,b,c
0,10,1,5
1,20,2,6
2,30,3,7
3,40,4,8


In [7]:
# Row and column labels can also be given for non-dict input, through
# keyword arguments of the DataFrame constructor:
#   `columns`: column names, a list as long as the number of columns
#   `index`:   row names, a list as long as the number of rows
df_5 = pd.DataFrame(
    data=my_list1,
    columns=["c1", "c2", "c3", "c4"],
    index=["r1", "r2", "r3"],
)

df_5

Unnamed: 0,c1,c2,c3,c4
r1,1,2,3,4
r2,a,b,c,d
r3,0.1,0.2,0.5,0.8


Unnamed: 0,c1,c2,c3,c4
r1,1,2,3,4
r2,a,b,c,d
r3,0.1,0.2,0.5,0.8


In [8]:
# With dict input, the `columns` argument selects dict keys and fixes their
# order, so the frame can be created with its columns rearranged
# (c, b, a here instead of the dict's own a, b, c order).
df_6 = pd.DataFrame(my_dict1, columns=["c", "b", "a"])

df_6

In [9]:
# A column name listed in `columns` that does not exist in the data ('d' here)
# still produces a column, filled entirely with NaN.
df_7 = pd.DataFrame(data=my_dict1, columns=["a", "b", "d"])

df_7

Unnamed: 0,a,b,d
0,10,1,
1,20,2,
2,30,3,
3,40,4,


Unnamed: 0,a,b,d
0,10,1,
1,20,2,
2,30,3,
3,40,4,


In [10]:
# Passing an `index` whose length differs from the number of rows in the dict
# data raises ValueError. Here the 4 labels match the 4 rows, so this succeeds.
df_8 = pd.DataFrame(my_dict1, index=[10, 20, 30, 40])

# Five labels for four rows -> ValueError (kept commented out on purpose):
# df_8 = pd.DataFrame(my_dict1, index=[10, 20, 30, 40, 50]) # > ValueError

### 1.2. DataFrame 속성
- 속성은 소괄호를 붙이지 않음
- index : df 객체의 행 인덱스 배열을 반환
- columns : df 객체의 열 인덱스 배열을 반환
- axes : df 객체의 행, 열 인덱스를 아이템으로 가지는 배열을 반환
- values : df 객체의 데이터(값)를 아이템으로 가지는 2차원 배열을 반환
- dtypes : df 객체의 데이터 타입을 열 기준으로 반환
- size : df 객체의 데이터 개수(길이)를 반환
- shape : df 객체의 구조를 (행 개수, 열 개수) 튜플로 반환
- T : 행과 열을 전환시킴

In [11]:
# dict -> DataFrame
# Population inflow by region (columns), one row per year.

pop_data = {
    "서울": [150, 180, 300],
    "경기": [200, 240, 450],
    "충청": [-10, 3, -13],
    "경상": [10, 20, 30],
    "전라": [5, 6, 7],
}

pop_sample = pd.DataFrame(data=pop_data)
pop_sample

Unnamed: 0,서울,경기,충청,경상,전라
0,150,200,-10,10,5
1,180,240,3,20,6
2,300,450,-13,30,7


Unnamed: 0,서울,경기,충청,경상,전라
0,150,200,-10,10,5
1,180,240,3,20,6
2,300,450,-13,30,7


In [12]:
# Replace the default row index with year labels and give that index a name.
year = [2016, 2017, 2018]
pop_sample.index = pd.Index(year, name="year")

pop_sample

Unnamed: 0_level_0,서울,경기,충청,경상,전라
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,150,200,-10,10,5
2017,180,240,3,20,6
2018,300,450,-13,30,7


Unnamed: 0_level_0,서울,경기,충청,경상,전라
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,150,200,-10,10,5
2017,180,240,3,20,6
2018,300,450,-13,30,7


In [13]:
# column index

pop_sample.columns

Index(['서울', '경기', '충청', '경상', '전라'], dtype='object')

Index(['서울', '경기', '충청', '경상', '전라'], dtype='object')

In [14]:
# column index identifier 지정
pop_sample.columns.name = 'location'

pop_sample

location,서울,경기,충청,경상,전라
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,150,200,-10,10,5
2017,180,240,3,20,6
2018,300,450,-13,30,7


location,서울,경기,충청,경상,전라
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,150,200,-10,10,5
2017,180,240,3,20,6
2018,300,450,-13,30,7


In [15]:
# Modifying the row index through the `index` attribute
# 1. Assign a list that has as many items as there are rows.
# 2. The Index object behind the attribute does not allow changing a single item.

# pop_sample.index[0] = 1998 > TypeError
pop_sample.index = [1998, 1999, 2000]

pop_sample

location,서울,경기,충청,경상,전라
1998,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1998,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


- **index modification**
- DataFrame method: `df.rename(data, axis=0)`
- `axis`: default == `0` (== row index == `index`)
- column index에 대한 수정: `axis=1` or `axis='columns'`
- data: dict type, {'기존 인덱스명':'바꿀 인덱스명'}
- `inplace`: default == `inplace=False` > 원본은 그대로 두고, 변경된 결과를 새 객체로 반환
- `inplace=True` > 바뀐 결과 바로 적용

In [16]:
pop_sample.rename({1998:1990}, inplace=True)

pop_sample

location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


<br>

- 열 인덱스(column) 변경: `axis=1` or `axis='columns'`

- `inplace=False` (default): 원본 변경 X

In [17]:
pop_sample.rename({"전라":"제주"}, axis=1)

location,서울,경기,충청,경상,제주
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,제주
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


<br>

- 행, 열 index: `df.axes`

- return > list: 첫 번째 item == row index, 두 번째 item == columns index

In [18]:
pop_sample.axes

[Int64Index([1990, 1999, 2000], dtype='int64'),
 Index(['서울', '경기', '충청', '경상', '전라'], dtype='object', name='location')]

[Int64Index([1990, 1999, 2000], dtype='int64'),
 Index(['서울', '경기', '충청', '경상', '전라'], dtype='object', name='location')]

- `df.reset_index(drop=True)`: row index를 일괄 초기화(삭제)

- `drop=False` default > 이전 index data > "index" column으로 넘김, "기존 column명"에 row "int index" 추가

In [19]:
pop_sample.reset_index(drop=True)

location,서울,경기,충청,경상,전라
0,150,200,-10,10,5
1,180,240,3,20,6
2,300,450,-13,30,7


location,서울,경기,충청,경상,전라
0,150,200,-10,10,5
1,180,240,3,20,6
2,300,450,-13,30,7


In [20]:
pop_sample.reset_index(drop=False)

location,index,서울,경기,충청,경상,전라
0,1990,150,200,-10,10,5
1,1999,180,240,3,20,6
2,2000,300,450,-13,30,7


location,index,서울,경기,충청,경상,전라
0,1990,150,200,-10,10,5
1,1999,180,240,3,20,6
2,2000,300,450,-13,30,7


`df.values` > 2d array return

In [21]:
pop_sample.values

array([[150, 200, -10,  10,   5],
       [180, 240,   3,  20,   6],
       [300, 450, -13,  30,   7]], dtype=int64)

array([[150, 200, -10,  10,   5],
       [180, 240,   3,  20,   6],
       [300, 450, -13,  30,   7]], dtype=int64)

In [22]:
pop_sample

location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


<br>

- 각 열의 데이터 타입: `df.dtypes`

In [23]:
# NOTE: `columns.dtype` is the dtype of the column-label Index itself
# (object here, because the labels are strings) -- not of the data stored in
# each column. Per-column data types are given by `pop_sample.dtypes`.
pop_sample.columns.dtype

dtype('O')

dtype('O')

In [24]:
# 전체 셀(데이터) 개수
pop_sample.size

15

15

In [25]:
# len(df) counts along the first (row) axis only, so it returns the number of rows.
len(pop_sample)

3

3

In [26]:
# row, columns transpose
sample_T = pop_sample.T

sample_T

Unnamed: 0_level_0,1990,1999,2000
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
서울,150,180,300
경기,200,240,450
충청,-10,3,-13
경상,10,20,30
전라,5,6,7


Unnamed: 0_level_0,1990,1999,2000
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
서울,150,180,300
경기,200,240,450
충청,-10,3,-13
경상,10,20,30
전라,5,6,7


In [27]:
# transposed df row index
sample_T.index

Index(['서울', '경기', '충청', '경상', '전라'], dtype='object', name='location')

Index(['서울', '경기', '충청', '경상', '전라'], dtype='object', name='location')

In [28]:
# transposed df columns index
sample_T.columns

Int64Index([1990, 1999, 2000], dtype='int64')

Int64Index([1990, 1999, 2000], dtype='int64')

In [29]:
# Transposing did not modify the original: `.T` produced a separate object
# (sample_T), and pop_sample still has its original row/column layout.
pop_sample

location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


### 1.3. 인덱싱(indexing)
- columns 조회: default > return Series type

    - `df[col]`

    - `df.col`

    - `df.get(col)` 
    
- `iloc`, `loc` 메서드로 row 조회
    
    - `df.iloc[idx]` : 기본 숫자형 인덱스
    
    - `df.loc[label]` : 새롭게 지정한 인덱스(숫자형이어도 기본 인덱스가 아니면 모두 loc 메서드로 조회)

In [30]:
# 기본적인 indexing > DataFrame의 columns에서 value 조회
# obj[idx_n]

pop_sample["서울"]

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

- "서울" col 조회 3가지 방법

1. 기본적인 indexing 기호: `df[colname]`
    
2. `df.컬럼명` > 컬럼명이 변수명으로 사용할 수 있을 때만 가능

3. `df.get(colname)`

In [31]:
pop_sample['서울']

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

In [32]:
pop_sample.서울

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

In [33]:
pop_sample.get("서울")

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

1990    150
1999    180
2000    300
Name: 서울, dtype: int64

In [34]:
pop_sample

location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6
2000,300,450,-13,30,7


<br>

**row or columns index를 새 값으로 교체 > 해당 index에 지정했던 identifier(name)는 사라짐**

<br>

In [35]:
# 첫 번째 row 조회 > index 0 reference
# return > Series
# label index: 기존 DataFrame의 columns
# Series name: 기존 DataFrame에서 reference한 1번 row의 label index

pop_sample.iloc[0]

location
서울    150
경기    200
충청    -10
경상     10
전라      5
Name: 1990, dtype: int64

location
서울    150
경기    200
충청    -10
경상     10
전라      5
Name: 1990, dtype: int64

In [36]:
pop_sample.loc[1999]

location
서울    180
경기    240
충청      3
경상     20
전라      6
Name: 1999, dtype: int64

location
서울    180
경기    240
충청      3
경상     20
전라      6
Name: 1999, dtype: int64

In [37]:
# 여러개의 columns 조회 > list in list로 colname 나열해서 전달 > `df[[colname1, colname2, ...]]`: return DataFrame

pop_sample[["서울", "경기"]]

location,서울,경기
1990,150,200
1999,180,240
2000,300,450


location,서울,경기
1990,150,200
1999,180,240
2000,300,450


In [38]:
# "서울", "경기" 지역 1999 data만 조회

pop_sample[["서울", "경기"]].loc[1999]

location
서울    180
경기    240
Name: 1999, dtype: int64

location
서울    180
경기    240
Name: 1999, dtype: int64

In [39]:
pop_sample.loc[1999][["서울", "경기"]]

location
서울    180
경기    240
Name: 1999, dtype: int64

location
서울    180
경기    240
Name: 1999, dtype: int64

In [40]:
# 여러 row 조회 > 1990, 1999

pop_sample.loc[[1990, 1999]]
# 두 개 이상 조회 > 이중 list 활용 > DataFrame 형태로 출력!

location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6


location,서울,경기,충청,경상,전라
1990,150,200,-10,10,5
1999,180,240,3,20,6


In [41]:
# 1990, 1999, 충청지역

pop_sample.loc[[1990, 1999]][["충청"]]

location,충청
1990,-10
1999,3


location,충청
1990,-10
1999,3


In [42]:
print(type(pop_sample[["충청"]].loc[[1990, 1999]]))

pop_sample[["충청"]].loc[[1990, 1999]]

<class 'pandas.core.frame.DataFrame'>


location,충청
1990,-10
1999,3


<class 'pandas.core.frame.DataFrame'>


location,충청
1990,-10
1999,3


In [43]:
print(type(pop_sample.get("충청").loc[[1990, 1999]]))

pop_sample.get("충청").loc[[1990, 1999]]

<class 'pandas.core.series.Series'>


1990   -10
1999     3
Name: 충청, dtype: int64

<class 'pandas.core.series.Series'>


1990   -10
1999     3
Name: 충청, dtype: int64

### 1.4. 슬라이싱 slicing
- row(행) 슬라이싱
    - 순서가 있으며 로우 단독으로 슬라이싱 가능
    - 기본 슬라이싱 문법은 기본 숫자형 인덱스를 기준으로 적용
    - 기본 숫자형 인덱스로 슬라이싱할 때는 마지막 인덱스는 포함하지 않고 라벨 인덱스로 슬라이싱할 때는 마지막 인덱스를 포함
- col(열) 슬라이싱
    - 기본 슬라이싱 문법(`df[start:stop]`)은 row에만 적용되므로, 이 문법으로는 컬럼을 단독으로 슬라이싱할 수 없음
    - 컬럼 슬라이싱은 `df.loc[row, col_start:col_stop]`처럼 로우 선택과 함께 라벨 기준으로 수행(숫자 위치 기준은 `df.iloc` 사용)
    - 라벨 기준 슬라이싱은 마지막 인덱스를 포함

In [44]:
pop_sample.index = [1991, 1992, 1993]

# row slicing: df[start:stop:step]

pop_sample[0:2]

location,서울,경기,충청,경상,전라
1991,150,200,-10,10,5
1992,180,240,3,20,6


location,서울,경기,충청,경상,전라
1991,150,200,-10,10,5
1992,180,240,3,20,6


In [45]:
# index 0 to 2, step = 2 row

pop_sample[0:3:2]

location,서울,경기,충청,경상,전라
1991,150,200,-10,10,5
1993,300,450,-13,30,7


location,서울,경기,충청,경상,전라
1991,150,200,-10,10,5
1993,300,450,-13,30,7


In [46]:
# 전체 row, step = -1: 행 역순 나열

pop_sample[::-1]

location,서울,경기,충청,경상,전라
1993,300,450,-13,30,7
1992,180,240,3,20,6
1991,150,200,-10,10,5


location,서울,경기,충청,경상,전라
1993,300,450,-13,30,7
1992,180,240,3,20,6
1991,150,200,-10,10,5


<br>

- columns slicing

In [47]:
# Column slicing is applied on top of a row selection.
# NumPy-style `df[:, start:end:step]` is not accepted by `df[...]`
# (the statement below is left commented out for that reason).
# Slice columns by column name instead of by integer position,
# e.g. `pop_sample.loc[:, "경기":"충청"]`.

# pop_sample[:, 1:2]

In [48]:
# Build a 4 x 4 DataFrame in which every value is 0. No labels are given,
# so both the row index and the column index are the default integers.
import numpy as np

data = np.zeros(shape=(4, 4))
print(data)

df_zr = pd.DataFrame(data=data)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [49]:
df_zr

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


In [50]:
# Chained plain slices: `df_zr[:3]` is a row slice, and the second `[:2]` is
# again a row slice applied to that result (rows 0-1, all four columns kept).
# Plain `df[a:b]` slicing does not select columns.

# row, col == default int type
df_zr[:3][:2]

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0


In [51]:
#df_zr[:3, :2]

### > 연습문제

- 아래와 같은 데이터프레임을 생성하고 출력화면과 동일한 결과를 생성하세요.

<img src="img/df_practice1.png" width="250" align="left">

In [52]:
Col1 = pd.Series([0, 3, 'ks01', 2, 5])
Col2 = pd.Series(["big", "data", "is", "very", "good"])

In [53]:
Col3 = pd.Series([2.7, -5.0, 2.12, 8.31, -1.34])
Col4 = pd.Series([True, True, False, False, True])

col_list = [Col1, Col2, Col3, Col4]

df_test = pd.DataFrame(col_list)
df_test

Unnamed: 0,0,1,2,3,4
0,0,3,ks01,2,5
1,big,data,is,very,good
2,2.7,-5.0,2.12,8.31,-1.34
3,True,True,False,False,True


Unnamed: 0,0,1,2,3,4
0,0,3,ks01,2,5
1,big,data,is,very,good
2,2.7,-5.0,2.12,8.31,-1.34
3,True,True,False,False,True


In [54]:
df_test = df_test.T
df_test

Unnamed: 0,0,1,2,3
0,0,big,2.7,True
1,3,data,-5.0,True
2,ks01,is,2.12,False
3,2,very,8.31,False
4,5,good,-1.34,True


Unnamed: 0,0,1,2,3
0,0,big,2.7,True
1,3,data,-5.0,True
2,ks01,is,2.12,False
3,2,very,8.31,False
4,5,good,-1.34,True


In [55]:
df_test.index = ["A", "B", "C", "D", "E"]

df_test

Unnamed: 0,0,1,2,3
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


Unnamed: 0,0,1,2,3
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


In [56]:
df_test.columns = ["Col1", "Col2", "Col3", "Col4"]
df_test

Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


- 정답

In [57]:
# Reference answer: build the frame directly from a dict whose keys are the
# column names and whose values are the column contents.
data = {
    "Col1": [0, 3, "ks01", 2, 5],
    "Col2": ["big", "data", "is", "very", "good"],
    "Col3": [2.70, -5.00, 2.12, 8.31, -1.34],
    "Col4": [True, True, False, False, True],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,big,2.7,True
1,3,data,-5.0,True
2,ks01,is,2.12,False
3,2,very,8.31,False
4,5,good,-1.34,True


Unnamed: 0,Col1,Col2,Col3,Col4
0,0,big,2.7,True
1,3,data,-5.0,True
2,ks01,is,2.12,False
3,2,very,8.31,False
4,5,good,-1.34,True


In [58]:
df.index = list("ABCDE")
df

Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
B,3,data,-5.0,True
C,ks01,is,2.12,False
D,2,very,8.31,False
E,5,good,-1.34,True


### > 연습문제

In [59]:
# 1. Col1, Col3 조회
df_test[["Col1", "Col3"]]

Unnamed: 0,Col1,Col3
A,0,2.7
B,3,-5.0
C,ks01,2.12
D,2,8.31
E,5,-1.34


Unnamed: 0,Col1,Col3
A,0,2.7
B,3,-5.0
C,ks01,2.12
D,2,8.31
E,5,-1.34


In [60]:
# 2. A C D row 조회
df_test.iloc[[0, 2, 3]]

Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
C,ks01,is,2.12,False
D,2,very,8.31,False


Unnamed: 0,Col1,Col2,Col3,Col4
A,0,big,2.7,True
C,ks01,is,2.12,False
D,2,very,8.31,False


In [61]:
# 3. B, D row > Col1, Col2 

df_test[["Col1", "Col2"]].loc[["B", "D"]]

Unnamed: 0,Col1,Col2
B,3,data
D,2,very


Unnamed: 0,Col1,Col2
B,3,data
D,2,very


In [62]:
df_test[["Col1", "Col2"]].iloc[[1, 3]]

Unnamed: 0,Col1,Col2
B,3,data
D,2,very


Unnamed: 0,Col1,Col2
B,3,data
D,2,very


### 1.5. columns, row 추가
- 컬럼 추가 / 변경
    - 컬럼 인덱싱 = 스칼라 값
    - 컬럼 인덱싱 = 배열, 리스트(로우 개수와 아이템 개수 일치)
    - 컬럼 인덱싱 = 컬럼 간의 연산 (a col $\pm$ b col = c col)
    - 컬럼 인덱싱 = series
- 로우 추가
    - 로우 인덱싱 = 스칼라 값
    - 로우 인덱싱 = 로우 간의 연산
- 데이터 분석에서 컬럼과 로우의 의미
    - 컬럼 : 변수(특성)
    - 로우 : 개체별 데이터(레코드)

> 전체 데이터를 구성하는 변수(columns)를 추가/삭제하는 일은 빈번하게 발생하지만,   
특정 인덱스를 기준으로 전체 row data(record)를 추가/삭제하는 일은 자주 발생하지 않으며,   
데이터 처리를 하는 과정에서 권장하지 않는 작업

### columns 추가

In [63]:
# columns 추가
# 1. 모든 row에 대해서 동일한 value를 가지는 col > scalar value(single value)
# `df["colname"] = scalar value

pop_sample["제주"] = 1

pop_sample

location,서울,경기,충청,경상,전라,제주
1991,150,200,-10,10,5,1
1992,180,240,3,20,6,1
1993,300,450,-13,30,7,1


location,서울,경기,충청,경상,전라,제주
1991,150,200,-10,10,5,1
1992,180,240,3,20,6,1
1993,300,450,-13,30,7,1


In [64]:
# 2. 서로 다른 value를 가지는 data로 구성되는 colo
# 조건: 전달하는 자료형(arr, list)의 length == row length

pop_sample["부산"] = np.arange(5, 8)

pop_sample

location,서울,경기,충청,경상,전라,제주,부산
1991,150,200,-10,10,5,1,5
1992,180,240,3,20,6,1,6
1993,300,450,-13,30,7,1,7


location,서울,경기,충청,경상,전라,제주,부산
1991,150,200,-10,10,5,1,5
1992,180,240,3,20,6,1,6
1993,300,450,-13,30,7,1,7


In [65]:
# 3. col간의 연산: 파생변수

pop_sample["수도권"] = pop_sample["서울"] + pop_sample["경기"]

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권
1991,150,200,-10,10,5,1,5,350
1992,180,240,3,20,6,1,6,420
1993,300,450,-13,30,7,1,7,750


location,서울,경기,충청,경상,전라,제주,부산,수도권
1991,150,200,-10,10,5,1,5,350
1992,180,240,3,20,6,1,6,420
1993,300,450,-13,30,7,1,7,750


In [66]:
# 4. Passing a Series object as a new column
# Know the structure of both the target DataFrame and the Series first:
# Series values are matched to DataFrame rows by index label.
# Row labels that are missing from the Series become NaN,
# so the lengths do not have to be equal.

sr_1 = pd.Series(data=[9, -99], index=[1990, 1992])
sr_1

1990     9
1992   -99
dtype: int64

1990     9
1992   -99
dtype: int64

In [67]:
pop_sample["강원"] = sr_1

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원
1991,150,200,-10,10,5,1,5,350,
1992,180,240,3,20,6,1,6,420,-99.0
1993,300,450,-13,30,7,1,7,750,


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원
1991,150,200,-10,10,5,1,5,350,
1992,180,240,3,20,6,1,6,420,-99.0
1993,300,450,-13,30,7,1,7,750,


In [68]:
# Even when the number of items matches, values are mapped by label:
# frame labels that do not exist in the Series index end up as NaN.
# This Series only carries the default integer index 0..2.

sr_2 = pd.Series(data=[100] * 3)
sr_2

0    100
1    100
2    100
dtype: int64

0    100
1    100
2    100
dtype: int64

In [69]:
pop_sample['test'] = sr_2

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,


### row 추가

- 로우 추가
    - 로우 인덱싱 = 스칼라 값
    - 로우 인덱싱 = 로우 간의 연산
    - 로우 인덱싱 = 자료형(배열, 리스트 / 컬럼 개수와 아이템 개수 일치)

In [70]:
# row 추가 > df.loc 활용 > col추가와 동일

pop_sample.loc[1994] = 0

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0


In [71]:
pop_sample.shape

(4, 10)

(4, 10)

In [72]:
# row 추가 가능 data type: arr, list, dict
# col 개수와 item 개수 일치

pop_sample.loc[1995] = np.arange(0, 10)

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0
1995,0,1,2,3,4,5,6,7,8.0,9.0


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0
1995,0,1,2,3,4,5,6,7,8.0,9.0


In [73]:
# dict > {"key":"value"} > "col":"value" > col별 value 지정 가능

pop_sample.loc[1996] = {'서울':10, '경기':20, '충청':40, '경상':21, '전라':37,
                   '제주':103, '부산':28, '수도권':30, '강원':15, 'test':0}
pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0
1995,0,1,2,3,4,5,6,7,8.0,9.0
1996,10,20,40,21,37,103,28,30,15.0,0.0


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150,200,-10,10,5,1,5,350,,
1992,180,240,3,20,6,1,6,420,-99.0,
1993,300,450,-13,30,7,1,7,750,,
1994,0,0,0,0,0,0,0,0,0.0,0.0
1995,0,1,2,3,4,5,6,7,8.0,9.0
1996,10,20,40,21,37,103,28,30,15.0,0.0


In [74]:
# row간 연산

pop_sample.loc[1997] = pop_sample.loc[1995] + pop_sample.loc[1996]

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150.0,200.0,-10.0,10.0,5.0,1.0,5.0,350.0,,
1992,180.0,240.0,3.0,20.0,6.0,1.0,6.0,420.0,-99.0,
1993,300.0,450.0,-13.0,30.0,7.0,1.0,7.0,750.0,,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
1996,10.0,20.0,40.0,21.0,37.0,103.0,28.0,30.0,15.0,0.0
1997,10.0,21.0,42.0,24.0,41.0,108.0,34.0,37.0,23.0,9.0


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원,test
1991,150.0,200.0,-10.0,10.0,5.0,1.0,5.0,350.0,,
1992,180.0,240.0,3.0,20.0,6.0,1.0,6.0,420.0,-99.0,
1993,300.0,450.0,-13.0,30.0,7.0,1.0,7.0,750.0,,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
1996,10.0,20.0,40.0,21.0,37.0,103.0,28.0,30.0,15.0,0.0
1997,10.0,21.0,42.0,24.0,41.0,108.0,34.0,37.0,23.0,9.0


### 1.6. row, columns 삭제
- columns 삭제
    - del 키워드 + 컬럼 인덱싱
    - df.drop(col, axis=1)
    - df.drop(columns=col)
- row 삭제
    - df.drop(idx) : axis = 0 (기본값)

<br>

- 컬럼 삭제

In [75]:
# 1. `del df[col name]` -> removes the column from the original frame in place

del pop_sample["test"]

pop_sample

location,서울,경기,충청,경상,전라,제주,부산,수도권,강원
1991,150.0,200.0,-10.0,10.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,3.0,20.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,-13.0,30.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
1996,10.0,20.0,40.0,21.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,42.0,24.0,41.0,108.0,34.0,37.0,23.0


location,서울,경기,충청,경상,전라,제주,부산,수도권,강원
1991,150.0,200.0,-10.0,10.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,3.0,20.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,-13.0,30.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
1996,10.0,20.0,40.0,21.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,42.0,24.0,41.0,108.0,34.0,37.0,23.0


In [76]:
# columns 삭제 2: `df.drop(columns=column name)` > 원본 영향X, `inplace=True` opt설정 시 반영

pop_sample.drop(columns="경상", inplace=True)

In [77]:
# 컬럼 삭제3: `df.drop(column name, axis=1) > 원본 반영 X, `inplace=True` opt 필요

pop_sample.drop('충청', axis=1, inplace=True)

pop_sample

location,서울,경기,전라,제주,부산,수도권,강원
1991,150.0,200.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,4.0,5.0,6.0,7.0,8.0
1996,10.0,20.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,41.0,108.0,34.0,37.0,23.0


location,서울,경기,전라,제주,부산,수도권,강원
1991,150.0,200.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,1.0,4.0,5.0,6.0,7.0,8.0
1996,10.0,20.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,41.0,108.0,34.0,37.0,23.0


<br>

- row 삭제

In [78]:
# 1. `df.drop(row, axis=0)` -> axis=0 is the default, so it can be omitted

pop_sample.drop(1995, inplace=True)

pop_sample

location,서울,경기,전라,제주,부산,수도권,강원
1991,150.0,200.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,10.0,20.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,41.0,108.0,34.0,37.0,23.0


location,서울,경기,전라,제주,부산,수도권,강원
1991,150.0,200.0,5.0,1.0,5.0,350.0,
1992,180.0,240.0,6.0,1.0,6.0,420.0,-99.0
1993,300.0,450.0,7.0,1.0,7.0,750.0,
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,10.0,20.0,37.0,103.0,28.0,30.0,15.0
1997,10.0,21.0,41.0,108.0,34.0,37.0,23.0


<br>

- 두 개 이상 col이나 row 삭제 > list로 묶어서 전달


In [80]:
# '제주', '강원' columns 삭제

pop_sample.drop(['제주', '강원'], axis=1, inplace=True)

pop_sample

location,서울,경기,전라,부산,수도권
1991,150.0,200.0,5.0,5.0,350.0
1992,180.0,240.0,6.0,6.0,420.0
1993,300.0,450.0,7.0,7.0,750.0
1994,0.0,0.0,0.0,0.0,0.0
1996,10.0,20.0,37.0,28.0,30.0
1997,10.0,21.0,41.0,34.0,37.0


location,서울,경기,전라,부산,수도권
1991,150.0,200.0,5.0,5.0,350.0
1992,180.0,240.0,6.0,6.0,420.0
1993,300.0,450.0,7.0,7.0,750.0
1994,0.0,0.0,0.0,0.0,0.0
1996,10.0,20.0,37.0,28.0,30.0
1997,10.0,21.0,41.0,34.0,37.0


In [82]:
# row > list로 전달 > 동시에 두 개 이상 삭제 가능
# 1994, 1997 데이터 삭제

pop_sample.drop([1994, 1997], inplace=True)

pop_sample

location,서울,경기,전라,부산,수도권
1991,150.0,200.0,5.0,5.0,350.0
1992,180.0,240.0,6.0,6.0,420.0
1993,300.0,450.0,7.0,7.0,750.0
1996,10.0,20.0,37.0,28.0,30.0


location,서울,경기,전라,부산,수도권
1991,150.0,200.0,5.0,5.0,350.0
1992,180.0,240.0,6.0,6.0,420.0
1993,300.0,450.0,7.0,7.0,750.0
1996,10.0,20.0,37.0,28.0,30.0


### 1.7. 산술연산
- dataframe 과 스칼라 값 산술연산
- dataframe 과 series 간의 산술연산
- dataframe 간의 산술연산
    - row, columns sorted됨
    - 컬럼, 로우 인덱스를 기준으로 연산 수행
    - 공통으로 존재하지 않는 경우 NaN 반환
    - fill_value 인자 값을 통해 NaN이 아닌 값으로 대체 가능
- 연산의 종류
    - 더하기 : +, add() 메서드
    - 빼기 : -, sub() 메서드
    - 곱하기 : *, mul() 메서드
    - 나누기 : /, div() 메서드
    - 나머지만 반환 : %, mod() 메서드
    - 몫만 반환 : //, floordiv() 메서드

In [86]:
# 3 x 3 frame of the integers 0..8 (increasing by 1),
# with rows labelled a, b, c and columns labelled 서울, 경기, 인천.

op_df1 = pd.DataFrame(
    data=np.arange(0, 9).reshape((3, 3)),
    columns=["서울", "경기", "인천"],
    index=["a", "b", "c"],
)

op_df1

Unnamed: 0,서울,경기,인천
a,0,1,2
b,3,4,5
c,6,7,8


Unnamed: 0,서울,경기,인천
a,0,1,2
b,3,4,5
c,6,7,8


In [88]:
# 4 x 5 frame of the integers 0..19,
# with rows labelled a, b, c, d and columns 서울, 경기, 인천, 세종, 강원.

data_arr = np.arange(0, 20).reshape((4, 5))

op_df2 = pd.DataFrame(
    data=data_arr,
    index=["a", "b", "c", "d"],
    columns=["서울", "경기", "인천", "세종", "강원"],
)

op_df2

Unnamed: 0,서울,경기,인천,세종,강원
a,0,1,2,3,4
b,5,6,7,8,9
c,10,11,12,13,14
d,15,16,17,18,19


Unnamed: 0,서울,경기,인천,세종,강원
a,0,1,2,3,4
b,5,6,7,8,9
c,10,11,12,13,14
d,15,16,17,18,19


In [89]:
# df간의 더하기 연산
# result > 공통 컬럼, 로우인 data만 정상 더하기 연산됨, 아닌 부분은 NaN처리

op_df1 + op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,2.0,0.0,,4.0
b,,10.0,8.0,,12.0
c,,18.0,16.0,,20.0
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,2.0,0.0,,4.0
b,,10.0,8.0,,12.0
c,,18.0,16.0,,20.0
d,,,,,


In [90]:
# fill_value: add method의 parameter

op_df1.add(op_df2, fill_value=0)

Unnamed: 0,강원,경기,서울,세종,인천
a,4.0,2.0,0.0,3.0,4.0
b,9.0,10.0,8.0,8.0,12.0
c,14.0,18.0,16.0,13.0,20.0
d,19.0,16.0,15.0,18.0,17.0


Unnamed: 0,강원,경기,서울,세종,인천
a,4.0,2.0,0.0,3.0,4.0
b,9.0,10.0,8.0,8.0,12.0
c,14.0,18.0,16.0,13.0,20.0
d,19.0,16.0,15.0,18.0,17.0


In [91]:
# 빼기 연산

op_df1 - op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,0.0,0.0,,0.0
b,,-2.0,-2.0,,-2.0
c,,-4.0,-4.0,,-4.0
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,0.0,0.0,,0.0
b,,-2.0,-2.0,,-2.0
c,,-4.0,-4.0,,-4.0
d,,,,,


In [92]:
op_df1.sub(op_df2, fill_value=0)

Unnamed: 0,강원,경기,서울,세종,인천
a,-4.0,0.0,0.0,-3.0,0.0
b,-9.0,-2.0,-2.0,-8.0,-2.0
c,-14.0,-4.0,-4.0,-13.0,-4.0
d,-19.0,-16.0,-15.0,-18.0,-17.0


Unnamed: 0,강원,경기,서울,세종,인천
a,-4.0,0.0,0.0,-3.0,0.0
b,-9.0,-2.0,-2.0,-8.0,-2.0
c,-14.0,-4.0,-4.0,-13.0,-4.0
d,-19.0,-16.0,-15.0,-18.0,-17.0


<br>

- df간의 곱하기 연산

In [93]:
op_df1 * op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,0.0,,4.0
b,,24.0,15.0,,35.0
c,,77.0,60.0,,96.0
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,0.0,,4.0
b,,24.0,15.0,,35.0
c,,77.0,60.0,,96.0
d,,,,,


In [94]:
# df1.mul(df2, fill_value=value): a cell missing from only one of the frames
# is replaced by `value` (1 here) before multiplying

op_df1.mul(op_df2, fill_value=1)

Unnamed: 0,강원,경기,서울,세종,인천
a,4.0,1.0,0.0,3.0,4.0
b,9.0,24.0,15.0,8.0,35.0
c,14.0,77.0,60.0,13.0,96.0
d,19.0,16.0,15.0,18.0,17.0


Unnamed: 0,강원,경기,서울,세종,인천
a,4.0,1.0,0.0,3.0,4.0
b,9.0,24.0,15.0,8.0,35.0
c,14.0,77.0,60.0,13.0,96.0
d,19.0,16.0,15.0,18.0,17.0


<br>

- 나누기

In [95]:
op_df1 / op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,,,1.0
b,,0.666667,0.6,,0.714286
c,,0.636364,0.6,,0.666667
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,,,1.0
b,,0.666667,0.6,,0.714286
c,,0.636364,0.6,,0.666667
d,,,,,


In [99]:
op_df1 // op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,,,1.0
b,,0.0,0.0,,0.0
c,,0.0,0.0,,0.0
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,1.0,,,1.0
b,,0.0,0.0,,0.0
c,,0.0,0.0,,0.0
d,,,,,


In [100]:
op_df1 % op_df2

Unnamed: 0,강원,경기,서울,세종,인천
a,,0.0,,,0.0
b,,4.0,3.0,,5.0
c,,7.0,6.0,,8.0
d,,,,,


Unnamed: 0,강원,경기,서울,세종,인천
a,,0.0,,,0.0
b,,4.0,3.0,,5.0
c,,7.0,6.0,,8.0
d,,,,,


In [97]:
# `div` is the method form of `/`. With fill_value=1, a cell that is missing
# from only one of the two frames is treated as 1 before dividing.
# The call is kept as the last expression of the cell so that its result is
# what gets displayed; `div` returns a new frame and leaves op_df1 unchanged.
op_df1.div(op_df2, fill_value=1)

Unnamed: 0,강원,경기,서울,세종,인천
a,0.25,1.0,,0.333333,1.0
b,0.111111,0.666667,0.6,0.125,0.714286
c,0.071429,0.636364,0.6,0.076923,0.666667
d,0.052632,0.0625,0.066667,0.055556,0.058824


Unnamed: 0,강원,경기,서울,세종,인천
a,0.25,1.0,,0.333333,1.0
b,0.111111,0.666667,0.6,0.125,0.714286
c,0.071429,0.636364,0.6,0.076923,0.666667
d,0.052632,0.0625,0.066667,0.055556,0.058824


In [98]:
# `divide` is an alias of `div`; the result of this call is neither assigned
# nor displayed.
op_df1.divide(op_df2, fill_value=1)

# The displayed value is op_df1 itself, which the call above did not modify.
op_df1

Unnamed: 0,서울,경기,인천
a,0,1,2
b,3,4,5
c,6,7,8


Unnamed: 0,서울,경기,인천
a,0,1,2
b,3,4,5
c,6,7,8


In [101]:
op_df1.mod(op_df2, fill_value=1)

Unnamed: 0,강원,경기,서울,세종,인천
a,1.0,0.0,,1.0,0.0
b,1.0,4.0,3.0,1.0,5.0
c,1.0,7.0,6.0,1.0,8.0
d,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,강원,경기,서울,세종,인천
a,1.0,0.0,,1.0,0.0
b,1.0,4.0,3.0,1.0,5.0
c,1.0,7.0,6.0,1.0,8.0
d,1.0,1.0,1.0,1.0,1.0


### 1.8. DataFrame과 Series 간의 연산(Operation between DataFrame and Series)
- 기본적인 동작은 Series 객체의 인덱스를 DataFrame 객체의 컬럼 인덱스와 매핑하여 브로드캐스팅과 유사하게 연산 수행
- 두 객체 간의 공통된 인덱스가 아닌 대상은 NaN 값으로 대입
- 메서드를 사용하여 연산을 수행할 때는 axis 파라미터를 통해 연산을 적용할 축 지정(0:행, 1:열)
- 연산의 종류
    - 더하기 : +, add() 메서드
    - 빼기 : -,  sub() 메서드
    - 곱하기 : *, mul() 메서드

In [103]:
# 3 x 4 frame of the integers 0..11 (increasing by 1),
# with columns a, b, c, d and rows labelled 2010, 2011, 2012.

data_arr = np.arange(0, 12).reshape((3, 4))

ds_df = pd.DataFrame(
    data=data_arr,
    columns=["a", "b", "c", "d"],
    index=[2010, 2011, 2012],
)

ds_df

Unnamed: 0,a,b,c,d
2010,0,1,2,3
2011,4,5,6,7
2012,8,9,10,11


Unnamed: 0,a,b,c,d
2010,0,1,2,3
2011,4,5,6,7
2012,8,9,10,11


In [110]:
# 첫번째 행 추출
# sr_1: label index가 적용된 Series

sr_1 = ds_df.iloc[0]

sr_1

a    0
b    1
c    2
d    3
Name: 2010, dtype: int32

a    0
b    1
c    2
d    3
Name: 2010, dtype: int32

In [105]:
# Series(label index a, b, c, d)와 DataFrame의 결합 기준: 이름이 일치할 때만 연산

ds_df + sr_1

Unnamed: 0,a,b,c,d
2010,0,2,4,6
2011,4,6,8,10
2012,8,10,12,14


Unnamed: 0,a,b,c,d
2010,0,2,4,6
2011,4,6,8,10
2012,8,10,12,14


In [116]:
# Frame with columns a, b, c, d, e holding twenty integer zeros (4 rows x 5 columns).

zero_arr = np.zeros(shape=(4, 5), dtype=int)

sd_df = pd.DataFrame(data=zero_arr, columns=["a", "b", "c", "d", "e"])

sd_df

Unnamed: 0,a,b,c,d,e
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0


Unnamed: 0,a,b,c,d,e
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0


In [115]:
# Series with the default integer index
# values: 0 through 4

sr_2 = pd.Series(data=np.arange(0, 5))

sr_2

0    0
1    1
2    2
3    3
4    4
dtype: int32

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [117]:

# Column-wise arithmetic (default behaviour):
# the Series index is mapped onto the DataFrame's column labels.
# No label is shared here (a-e vs. 0-4), so the result simply carries both
# sets of labels as columns and every cell is NaN.

sd_df.sub(sr_2)

Unnamed: 0,a,b,c,d,e,0,1,2,3,4
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,


Unnamed: 0,a,b,c,d,e,0,1,2,3,4
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,


In [118]:
# axis=0: DataFrame row vs. Series label index > mapping, 일치시 연산

sd_df.sub(sr_2, axis=0)

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,0.0
1,-1.0,-1.0,-1.0,-1.0,-1.0
2,-2.0,-2.0,-2.0,-2.0,-2.0
3,-3.0,-3.0,-3.0,-3.0,-3.0
4,,,,,


Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,0.0
1,-1.0,-1.0,-1.0,-1.0,-1.0
2,-2.0,-2.0,-2.0,-2.0,-2.0
3,-3.0,-3.0,-3.0,-3.0,-3.0
4,,,,,


In [119]:
# Series whose index contains a label ('e') that ds_df's columns do not have
# index:  a, c, e
# values: 3, 3, 3

sr_3 = pd.Series(data=[3] * 3, index=["a", "c", "e"])

sr_3

a    3
c    3
e    3
dtype: int64

a    3
c    3
e    3
dtype: int64

In [122]:
# Labels that exist on only one side (b, d in the frame's columns; e in the
# Series index) cannot be mapped, so those columns are NaN.
# The fill_value parameter is not available for DataFrame-Series operations.

ds_df - sr_3

Unnamed: 0,a,b,c,d,e
2010,-3.0,,-1.0,,
2011,1.0,,3.0,,
2012,5.0,,7.0,,


Unnamed: 0,a,b,c,d,e
2010,-3.0,,-1.0,,
2011,1.0,,3.0,,
2012,5.0,,7.0,,


In [123]:
ds_df.sub(sr_3)

Unnamed: 0,a,b,c,d,e
2010,-3.0,,-1.0,,
2011,1.0,,3.0,,
2012,5.0,,7.0,,


Unnamed: 0,a,b,c,d,e
2010,-3.0,,-1.0,,
2011,1.0,,3.0,,
2012,5.0,,7.0,,
