# 그룹화(groupby) 실습 ---------
- DataFrame/Series.groupby() 메서드로 생성
- 그룹단위로 평균, 최대, 최소 등과 같은 집계 메서드 적용

In [24]:
# module import
import pandas as pd
import numpy as np 

In [25]:
# Sample animal table built column by column: one row per animal (the index
# label) with its class, its order, and its max speed (NaN marks a missing value).
df = pd.DataFrame(
    {
        "class": ["bird", "bird", "mammal", "mammal", "mammal"],
        "order": [
            "Falconiformes",
            "Psittaciformes",
            "Carnivora",
            "Primates",
            "Carnivora",
        ],
        "max_speed": [389.0, 24.0, 80.2, np.nan, 58],
    },
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
)

In [26]:
# Preview the first rows of df; the frame has only five rows, so all of them are shown.
df.head()

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [27]:
# Show the column labels and then the row (index) labels, one per line.
print(f'df.columns = {df.columns}')
print(f'df.index = {df.index}')

df.columns = Index(['class', 'order', 'max_speed'], dtype='object')
df.index = Index(['falcon', 'parrot', 'lion', 'monkey', 'leopard'], dtype='object')


In [28]:
# Syntax: <object>.groupby(<column name>)
# Split df into groups keyed by the values of the 'class' column; the result
# is a DataFrameGroupBy object, not a DataFrame.
classGroup = df.groupby(by='class')
classGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D87BDDFCD0>

In [29]:
# A GroupBy object implements __iter__, so it can be used in a for ... in loop.
# Each item is a (group key, sub-DataFrame) tuple.
for group in classGroup:
    print(group)

('bird',        class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0)
('mammal',           class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0)


In [30]:
# Unpack each (group key, sub-DataFrame) pair while iterating and print them separately.
for key, group in classGroup:
    print(f'key = {key}')
    print(f'group = \n{group}\n')

key = bird
group = 
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0

key = mammal
group = 
          class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0



In [31]:
# Inspect the attributes of the GroupBy object.
# .groups maps each group key to the index labels of the rows in that group.
classGroup.groups

{'bird': ['falcon', 'parrot'], 'mammal': ['lion', 'monkey', 'leopard']}

In [32]:
# Extract only the rows that belong to a given group key => get_group(key)
classGroup.get_group('bird')

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0


In [33]:
# Apply a method to every group at once.
# Without arguments, describe() summarizes only the numeric column (max_speed) per group.
classGroup.describe()

Unnamed: 0_level_0,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
bird,2.0,206.5,258.093975,24.0,115.25,206.5,297.75,389.0
mammal,2.0,69.1,15.697771,58.0,63.55,69.1,74.65,80.2


In [34]:
# include='all' also summarizes the non-numeric 'order' column
# (count / unique / top / freq); statistics that do not apply show up as NaN.
classGroup.describe(include='all')

Unnamed: 0_level_0,order,order,order,order,order,order,order,order,order,order,...,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed,max_speed
Unnamed: 0_level_1,count,unique,top,freq,mean,std,min,25%,50%,75%,...,unique,top,freq,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
bird,2,2,Falconiformes,1,,,,,,,...,,,,206.5,258.093975,24.0,115.25,206.5,297.75,389.0
mammal,3,2,Carnivora,2,,,,,,,...,,,,69.1,15.697771,58.0,63.55,69.1,74.65,80.2


In [35]:
# Per-group mean of the numeric columns.
# numeric_only=True is passed explicitly: pandas 2.x no longer drops
# non-numeric columns silently, so the string column 'order' would otherwise
# make mean() raise a TypeError. The result is the max_speed mean per class.
classGroup.mean(numeric_only=True)

Unnamed: 0_level_0,max_speed
class,Unnamed: 1_level_1
bird,206.5
mammal,69.1


In [36]:
# Work on a single group only -> get_group(key)
# birdGroup is a plain DataFrame holding the rows whose class is 'bird'.
birdGroup = classGroup.get_group('bird')

In [37]:
# Summary statistics for the bird rows only (numeric column max_speed).
birdGroup.describe()

Unnamed: 0,max_speed
count,2.0
mean,206.5
std,258.093975
min,24.0
25%,115.25
50%,206.5
75%,297.75
max,389.0


In [38]:
# Apply several aggregation functions at once => agg(['func', 'func', ...], axis=0)
# On the string columns, min/max follow string ordering and sum concatenates the values.
birdGroup.agg(['min', 'max', 'sum'])

Unnamed: 0,class,order,max_speed
min,bird,Falconiformes,24.0
max,bird,Psittaciformes,389.0
sum,birdbird,FalconiformesPsittaciformes,413.0


In [39]:
# A dict selects the aggregations per column: 'max' for order, 'min' and 'max' for max_speed.
# A cell is NaN where an aggregation was not requested for that column (e.g. 'min' of order).
birdGroup.agg({'order': 'max', 'max_speed': ['min', 'max']})

Unnamed: 0,order,max_speed
max,Psittaciformes,389.0
min,,24.0


In [40]:
# User-defined aggregation function
def getMaxMin(x):
    """Print the received value, then return its range (max minus min).

    ``x`` must provide ``max()`` and ``min()`` methods (for example a pandas
    Series). The print exists to make visible what object is handed in.
    """
    print(f'x ----------- \n{x}\n')
    largest = x.max()
    smallest = x.min()
    return largest - smallest

# Use the custom function as the aggregation for the max_speed column.
# NOTE(review): the recorded output shows getMaxMin being called first with a
# scalar (389.0) and then with the whole column - presumably pandas tries an
# element-wise call before falling back to the Series; confirm for the pandas
# version in use.
birdGroup.agg({'max_speed': getMaxMin})

x ----------- 
389.0

x ----------- 
falcon    389.0
parrot     24.0
Name: max_speed, dtype: float64



max_speed    365.0
dtype: float64

In [41]:
# 그룹에서 필터링하기 ==> filter(func) : func는 그룹마다 True/False 값 하나를 반환해야 함

In [42]:
# Display the bird rows again before trying to filter them.
birdGroup

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0


In [43]:
# NOTE: birdGroup is a plain DataFrame (the result of get_group), and
# DataFrame.filter selects labels via items/like/regex - it does not take a
# predicate function, hence the TypeError recorded below. To keep rows by
# value use boolean indexing, e.g. birdGroup[birdGroup['max_speed'] >= 300].
birdGroup.filter(lambda x: x['max_speed'] >= 300)

TypeError: 'function' object is not iterable

In [44]:
# NOTE: GroupBy.filter keeps or drops whole groups, so the function must return
# a single bool per group. This lambda returns a boolean Series (one value per
# row of the group), hence the TypeError recorded below. A per-group condition
# such as (x['max_speed'] >= 300).any() would satisfy the scalar requirement.
classGroup.filter(lambda x: x['max_speed'] >= 300)

TypeError: filter function returned a Series, but expected a scalar bool

In [47]:
# Name the index 'id' and group by it. The index labels are unique, so every
# group holds exactly one row.
idGroup = df.rename_axis('id').groupby('id')

In [48]:
# With one row per group the lambda yields a one-element boolean result, which
# filter accepts here (see the recorded output): only rows with
# max_speed >= 300 are kept. The NaN speed of 'monkey' compares as False.
idGroup.filter(lambda x: x['max_speed'] >= 300)

Unnamed: 0_level_0,class,order,max_speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
falcon,bird,Falconiformes,389.0
