In [1]:
import pandas as pd
import numpy as np

In [2]:
# 샘플 데이터프레임 생성
idx = ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C','D', 'D', 'D', 'D', 'E', 'E', 'E']
col = ['col1', 'col2', 'col3']
data = np.random.randint(0,9, (15,3))
df1 = pd.DataFrame(data=data, index=idx, columns = col).reset_index()
df1

Unnamed: 0,index,col1,col2,col3
0,A,7,8,0
1,A,3,5,8
2,B,7,7,8
3,B,1,1,2
4,B,3,8,7
5,C,3,2,3
6,C,3,5,5
7,C,0,4,5
8,D,0,5,6
9,D,0,6,6


In [3]:
# 특정 컬럼별 통계치 구하기
df1.groupby('index').mean()  # sum() 합계 | mean() 평균 | count() 개수 | val() 분산 | std() 표준편차 | max() 최대값 | min() 최소값

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,5.0,6.5,4.0
B,3.666667,5.333333,5.666667
C,2.0,3.666667,4.333333
D,1.0,5.75,4.75
E,3.666667,3.0,2.666667


In [4]:
# 둘 이상의 통게치 동시에 구하기
df1.groupby('index').agg(['sum', 'mean'])

Unnamed: 0_level_0,col1,col1,col2,col2,col3,col3
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,5.0,13,6.5,8,4.0
B,11,3.666667,16,5.333333,17,5.666667
C,6,2.0,11,3.666667,13,4.333333
D,4,1.0,23,5.75,19,4.75
E,11,3.666667,9,3.0,8,2.666667


In [18]:
# 둘 이상의 통게치 동시에 구할 때 컬럼의 멀티인덱스 정리, 소수점도 정리
def flatten_cols(df):
    df.columns = [' / '.join(x) for x in df.columns.to_flat_index()]
    return df

df1.groupby('index').agg(['sum', 'mean']).pipe(flatten_cols).round(2)

Unnamed: 0_level_0,col1 / sum,col1 / mean,col2 / sum,col2 / mean,col3 / sum,col3 / mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,10,5.0,13,6.5,8,4.0
B,11,3.67,16,5.33,17,5.67
C,3,1.5,6,3.0,8,4.0
D,4,1.0,23,5.75,19,4.75
E,11,3.67,9,3.0,8,2.67


In [5]:
# apply 메서드 활용, 각 그룹별 col1 상위 2위만 추출
def top(df1, n=2, col='col1'):
    return df1.sort_values(by=col)[-n:]
df1.groupby('index', group_keys=False).apply(top)

Unnamed: 0,index,col1,col2,col3
1,A,3,5,8
0,A,7,8,0
4,B,3,8,7
2,B,7,7,8
5,C,3,2,3
6,C,3,5,5
11,D,1,6,0
10,D,3,6,7
14,E,3,1,2
13,E,8,8,0


In [23]:
# 조금더 쉽게
def get_top2(x):
    return x.sort_values('col1').head(2)
df1.groupby('index').apply(get_top2)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,A,3,5,8
A,0,A,7,8,0
B,3,B,1,1,2
B,4,B,3,8,7
C,7,C,0,4,5
C,5,C,3,2,3
D,8,D,0,5,6
D,9,D,0,6,6
E,12,E,0,0,6
E,14,E,3,1,2


In [25]:
# lambda 로도 가능
df1.groupby('index').apply(lambda x:x.sort_values('col1').head(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,index,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,A,3,5,8
A,0,A,7,8,0
B,3,B,1,1,2
B,4,B,3,8,7
C,7,C,0,4,5
C,5,C,3,2,3
D,8,D,0,5,6
D,9,D,0,6,6
E,12,E,0,0,6
E,14,E,3,1,2


In [6]:
# 기존 인덱스 유지, 아래의 reset_index() 붙인 거와 동일 (그래서 별 의미 없을수도)
df1.groupby('index', as_index=False).sum()

Unnamed: 0,index,col1,col2,col3
0,A,10,13,8
1,B,11,16,17
2,C,6,11,13
3,D,4,23,19
4,E,11,9,8


In [7]:
df1.groupby('index').sum().reset_index()

Unnamed: 0,index,col1,col2,col3
0,A,10,13,8
1,B,11,16,17
2,C,6,11,13
3,D,4,23,19
4,E,11,9,8


In [8]:
# Nan 값은 계산에서 제외
df1.loc[6, 'index'] = np.NaN
df1.groupby('index').sum()

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,10,13,8
B,11,16,17
C,3,6,8
D,4,23,19
E,11,9,8


In [9]:
# 하지만 dropna=False 인 경우 인덱스에 Nan 포함되어 계산
df1.groupby('index', dropna=False).sum()

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,10,13,8
B,11,16,17
C,3,6,8
D,4,23,19
E,11,9,8
,3,5,5


In [15]:
df1.groupby('index').agg({
    'col1' : 'sum',
    'col2' : ['sum', 'var']})


Unnamed: 0_level_0,col1,col2,col2
Unnamed: 0_level_1,sum,sum,var
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,10,13,4.5
B,11,16,14.333333
C,3,6,2.0
D,4,23,0.25
E,11,9,19.0


In [20]:
# 그룹별로 각각 출력
for idx, group in df1.groupby('index'):
    display(group.head())

Unnamed: 0,index,col1,col2,col3
0,A,7,8,0
1,A,3,5,8


Unnamed: 0,index,col1,col2,col3
2,B,7,7,8
3,B,1,1,2
4,B,3,8,7


Unnamed: 0,index,col1,col2,col3
5,C,3,2,3
7,C,0,4,5


Unnamed: 0,index,col1,col2,col3
8,D,0,5,6
9,D,0,6,6
10,D,3,6,7
11,D,1,6,0


Unnamed: 0,index,col1,col2,col3
12,E,0,0,6
13,E,8,8,0
14,E,3,1,2
