In [None]:
import numpy as np
import pandas as pd

In [None]:
# Sample exam scores: outer keys are subjects (they become DataFrame columns),
# inner keys are student ids (they become the DataFrame index).
my_dict = {
    'kor': dict(a=10, b=30, c=50, d=70, e=30),
    'eng': dict(a=20, b=40, c=60, d=80, e=100),
    'math': dict(a=33, b=44, c=55, d=66, e=77),
}

In [None]:
# Build the score table: one column per subject, one row per student.
my_df = pd.DataFrame(data=my_dict)
my_df

Unnamed: 0,kor,eng,math
a,10,20,33
b,30,40,44
c,50,60,55
d,70,80,66
e,30,100,77


# Data Manipulation
- assign
- where & mask
- apply, map, and applymap

## assign
- dict 표현식과 동일

In [None]:
# assign() returns a new frame with an extra "total" column (row-wise sum);
# my_df itself is not modified.
my_df.assign(total=lambda frame: frame.sum(axis=1))

Unnamed: 0,kor,eng,math,total
a,10,20,33,63
b,30,40,44,114
c,50,60,55,165
d,70,80,66,216
e,30,100,77,207


In [None]:
# Store the per-student total on my_df itself.
# Only the subject columns are summed so the cell is idempotent: re-running it
# no longer folds a previously stored "total" back into the new sum.
my_df['total'] = my_df[['kor', 'eng', 'math']].sum(axis='columns')
my_df

Unnamed: 0,kor,eng,math,total
a,10,20,33,63
b,30,40,44,114
c,50,60,55,165
d,70,80,66,216
e,30,100,77,207


In [None]:
# assign() evaluates its keyword arguments in order, and each lambda receives
# the frame built so far. my_df already carries a "total" column, so summing
# or averaging the whole frame would count that column as if it were a score
# (total came out doubled, and avg averaged kor/eng/math plus the new total).
# Restrict both aggregates to the subject columns.
# Expected after re-running: total = 63, 114, 165, 216, 207
#                            avg   = 21.0, 38.0, 55.0, 72.0, 69.0
my_df.assign(
    total=lambda df: df[['kor', 'eng', 'math']].sum(axis='columns'),
    avg=lambda df: df[['kor', 'eng', 'math']].mean(axis='columns'),
)

Unnamed: 0,kor,eng,math,total,avg
a,10,20,33,126,47.25
b,30,40,44,228,85.5
c,50,60,55,330,123.75
d,70,80,66,432,162.0
e,30,100,77,414,155.25


## where & mask
- 값 선택시:
  - where: 조건에 맞는 값 선택
  - mask: 조건에 맞지 않는 값 선택
- 값을 변경:
  - where: 조건에 맞지 않는 값을 변경
  - **mask**: 조건에 맞는 값을 변경
    - nan
    - minus, '', ...

In [None]:
# where(): keep cells that satisfy the condition (> 50); the rest become NaN.
my_df.where(my_df.gt(50))

Unnamed: 0,kor,eng,math,total
a,,,,63
b,,,,114
c,,60.0,55.0,165
d,70.0,80.0,66.0,216
e,,100.0,77.0,207


In [None]:
# mask(): the inverse of where() -- cells that satisfy the condition (> 50)
# become NaN, the rest are kept.
my_df.mask(my_df.gt(50))

Unnamed: 0,kor,eng,math,total
a,10.0,20.0,33.0,
b,30.0,40.0,44.0,
c,50.0,,,
d,,,,
e,30.0,,,


In [None]:
# where() with a replacement: cells that do NOT satisfy > 50 are set to -100.
my_df.where(my_df.gt(50), other=-100)

Unnamed: 0,kor,eng,math,total
a,-100,-100,-100,63
b,-100,-100,-100,114
c,-100,60,55,165
d,70,80,66,216
e,-100,100,77,207


In [None]:
# mask() with a replacement: cells that DO satisfy > 50 are set to -100.
my_df.mask(my_df.gt(50), other=-100)

Unnamed: 0,kor,eng,math,total
a,10,20,33,-100
b,30,40,44,-100
c,50,-100,-100,-100
d,-100,-100,-100,-100
e,30,-100,-100,-100


In [None]:
# Replace every cell above 50 with the mean of its own column.
# my_df.mean() is indexed by column name, so it is aligned along the columns.
my_df.mask(my_df.gt(50), other=my_df.mean(), axis=1)

Unnamed: 0,kor,eng,math,total
a,10,20,33,153
b,30,40,44,153
c,50,60,55,153
d,38,60,55,153
e,30,60,55,153


In [None]:
# Replace every cell above 40 with the mean of its own row.
# The row means are indexed by student id, so they are aligned along the index.
# Note: the row mean is taken over all columns of my_df, "total" included.
my_df.mask(my_df.gt(40), other=my_df.mean(axis=1), axis=0)

Unnamed: 0,kor,eng,math,total
a,10.0,20.0,33.0,31.5
b,30.0,40.0,57.0,57.0
c,82.5,82.5,82.5,82.5
d,108.0,108.0,108.0,108.0
e,30.0,103.5,103.5,103.5


## apply, map, applymap
- map: python의 map을 구현
  - Series에만 구현
- apply: 
  - Series: 각 값에 적용
  - DataFrame: 각 열에 적용
- applymap:
  - DataFrame에만 구현
  - 각 원소에 대해서 적용
  - pandas 2.1부터 deprecated (대체: `DataFrame.map`)


In [None]:
# Plain-Python reference: the builtin map() applies a function to each item.
list(map(lambda value: value + 1, [1, 2, 3]))

[2, 3, 4]

In [None]:
# Series.map: apply the function to every value of the "kor" column.
my_df['kor'].map(lambda score: score + 1)

a    11
b    31
c    51
d    71
e    31
Name: kor, dtype: int64

In [None]:
# Series.apply: on a Series the function is likewise called once per value.
my_df['kor'].apply(lambda score: score + 1)

a    11
b    31
c    51
d    71
e    31
Name: kor, dtype: int64

In [None]:
# DataFrame.apply: the function receives one whole column (a Series) at a
# time; here each column is centred on its own mean.
my_df.apply(lambda column: column - column.mean())

Unnamed: 0,kor,eng,math,total
a,-28.0,-40.0,-22.0,-90.0
b,-8.0,-20.0,-11.0,-39.0
c,12.0,0.0,0.0,12.0
d,32.0,20.0,11.0,63.0
e,-8.0,40.0,22.0,54.0


In [None]:
# DataFrame.applymap: the function is called once per cell.
# NOTE: applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# it is kept here because the installed pandas version is not known.
my_df.applymap(lambda cell: cell + 10)

Unnamed: 0,kor,eng,math,total
a,20,30,43,73
b,40,50,54,124
c,60,70,65,175
d,80,90,76,226
e,40,110,87,217


## 집계 (aggregation)
- reshape
  - (5, 3) => (10, 5)
- 행과 열, 그룹 특징(통계량)

In [None]:
def my_mean(ss):
    """Return the mean of ``ss`` shifted up by 10.

    ``ss`` only needs to provide a ``.mean()`` method; the aggregation cells
    that use this function pass it to ``DataFrame.agg``.
    """
    shifted_mean = ss.mean() + 10
    return shifted_mean

In [None]:
# Aggregate every column with several functions at once: one result row per
# function. The built-in reduction is named by string ('mean') rather than
# passed as np.mean: pandas >= 2.1 emits a FutureWarning for NumPy callables
# in agg() and will change how they are dispatched, while the string form
# produces the same "mean" row on every version.
my_df.agg([my_mean, 'mean'])

Unnamed: 0,kor,eng,math,total
my_mean,48.0,70.0,65.0,163.0
mean,38.0,60.0,55.0,153.0


In [None]:
# Mean of each column (reduces along the index).
my_df.mean(axis='index')

kor       38.0
eng       60.0
math      55.0
total    153.0
dtype: float64

In [None]:
# Median of each column (reduces along the index).
my_df.median(axis='index')

kor       30.0
eng       60.0
math      55.0
total    165.0
dtype: float64

In [None]:
# describe() yields a summary table; keep only its "mean" and "count" rows,
# in that order, across all columns.
my_df.describe().loc[['mean', 'count'], :]

Unnamed: 0,kor,eng,math,total
mean,38.0,60.0,55.0,153.0
count,5.0,5.0,5.0,5.0


In [None]:
# A single aggregation given by name; agg is the short alias of aggregate.
my_df.agg('mean')

kor       38.0
eng       60.0
math      55.0
total    153.0
dtype: float64

In [None]:
# Per-column aggregation: the dict maps a column name to the reduction to run
# on it. Reductions are named by string instead of np.mean / np.median, which
# pandas >= 2.1 flags with a FutureWarning when passed to agg(); the results
# are identical.
my_df.agg({'kor': 'mean', 'eng': 'median'})

kor    38.0
eng    60.0
dtype: float64