In [1]:
import pandas as pd
import numpy as np

In [2]:
# 샘플 데이터프레임 생성
idx = ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C','D', 'D', 'D', 'D', 'E', 'E', 'E']
col = ['col1', 'col2', 'col3']
data = np.random.randint(0,9, (15,3))
df1 = pd.DataFrame(data=data, index=idx, columns = col).reset_index()
df1

Unnamed: 0,index,col1,col2,col3
0,A,4,3,3
1,A,5,7,8
2,B,3,6,8
3,B,0,0,3
4,B,4,8,2
5,C,7,0,7
6,C,8,6,4
7,C,1,2,3
8,D,8,6,7
9,D,0,6,1


In [3]:
# groupby() : 특정 컬럼별 통계치 구하기
df1.groupby('index').mean()  # sum() 합계 | mean() 평균 | count() 개수 | val() 분산 | std() 표준편차 | max() 최대값 | min() 최소값

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,4.5,5.0,5.5
B,2.333333,4.666667,4.333333
C,5.333333,2.666667,4.666667
D,4.0,4.75,5.5
E,5.333333,4.0,4.0


In [4]:
# 둘 이상의 통게치 동시에 구하기
df1.groupby('index').agg(['sum', 'mean'])

Unnamed: 0_level_0,col1,col1,col2,col2,col3,col3
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,9,4.5,10,5.0,11,5.5
B,7,2.333333,14,4.666667,13,4.333333
C,16,5.333333,8,2.666667,14,4.666667
D,16,4.0,19,4.75,22,5.5
E,16,5.333333,12,4.0,12,4.0


In [5]:
# 둘 이상의 통게치 동시에 구할 때 컬럼의 멀티인덱스 정리, 소수점도 정리
def flatten_cols(df):
    df.columns = [' / '.join(x) for x in df.columns.to_flat_index()]
    return df

df1.groupby('index').agg(['sum', 'mean']).pipe(flatten_cols).round(2)

Unnamed: 0_level_0,col1 / sum,col1 / mean,col2 / sum,col2 / mean,col3 / sum,col3 / mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,9,4.5,10,5.0,11,5.5
B,7,2.33,14,4.67,13,4.33
C,16,5.33,8,2.67,14,4.67
D,16,4.0,19,4.75,22,5.5
E,16,5.33,12,4.0,12,4.0


In [7]:
df1.groupby('index').describe().T

Unnamed: 0,index,A,B,C,D,E
col1,count,2.0,3.0,3.0,4.0,3.0
col1,mean,4.5,2.333333,5.333333,4.0,5.333333
col1,std,0.707107,2.081666,3.785939,3.265986,1.154701
col1,min,4.0,0.0,1.0,0.0,4.0
col1,25%,4.25,1.5,4.0,3.0,5.0
col1,50%,4.5,3.0,7.0,4.0,6.0
col1,75%,4.75,3.5,7.5,5.0,6.0
col1,max,5.0,4.0,8.0,8.0,6.0
col2,count,2.0,3.0,3.0,4.0,3.0
col2,mean,5.0,4.666667,2.666667,4.75,4.0


In [10]:
# apply 메서드 활용, 각 그룹별 col1 상위 2위만 추출
def top(df1, n=2, col='col1'):
    return df1.sort_values(by=col)[-n:]
df1.groupby('index', group_keys=False).apply(top)

Unnamed: 0,index,col1,col2,col3
0,A,4,3,3
1,A,5,7,8
2,B,3,6,8
4,B,4,8,2
5,C,7,0,7
6,C,8,6,4
11,D,4,0,6
8,D,8,6,7
13,E,6,4,1
14,E,6,1,8


In [7]:
# 조금더 쉽게
def get_top2(x):
    return x.sort_values('col1').head(2)
df1.groupby('index').apply(get_top2)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,A,6,8,1
A,0,A,7,8,3
B,2,B,2,6,1
B,4,B,3,0,3
C,6,C,1,1,4
C,7,C,3,6,0
D,10,D,1,4,4
D,9,D,3,8,0
E,14,E,2,1,5
E,12,E,6,8,6


In [8]:
# lambda 로도 가능
df1.groupby('index').apply(lambda x:x.sort_values('col1').head(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,index,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,A,6,8,1
A,0,A,7,8,3
B,2,B,2,6,1
B,4,B,3,0,3
C,6,C,1,1,4
C,7,C,3,6,0
D,10,D,1,4,4
D,9,D,3,8,0
E,14,E,2,1,5
E,12,E,6,8,6


In [14]:
# transform() : 그룹별 대표값만 남기지 않고, 기존 크기 유지
df2 = df1.copy()
df2['col1_mean'] = df2.groupby('index').col1.transform('mean')
df2

Unnamed: 0,index,col1,col2,col3,col1_mean
0,A,4,3,3,4.5
1,A,5,7,8,4.5
2,B,3,6,8,2.333333
3,B,0,0,3,2.333333
4,B,4,8,2,2.333333
5,C,7,0,7,5.333333
6,C,8,6,4,5.333333
7,C,1,2,3,5.333333
8,D,8,6,7,4.0
9,D,0,6,1,4.0


In [9]:
# 기존 인덱스 유지, 아래의 reset_index() 붙인 거와 동일 (그래서 별 의미 없을수도)
df1.groupby('index', as_index=False).sum()

Unnamed: 0,index,col1,col2,col3
0,A,13,16,4
1,B,11,11,6
2,C,10,12,5
3,D,15,21,14
4,E,15,13,15


In [10]:
df1.groupby('index').sum().reset_index()

Unnamed: 0,index,col1,col2,col3
0,A,13,16,4
1,B,11,11,6
2,C,10,12,5
3,D,15,21,14
4,E,15,13,15


In [11]:
# Nan 값은 계산에서 제외
df1.loc[6, 'index'] = np.NaN
df1.groupby('index').sum()

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,13,16,4
B,11,11,6
C,9,11,1
D,15,21,14
E,15,13,15


In [12]:
# 하지만 dropna=False 인 경우 인덱스에 Nan 포함되어 계산
df1.groupby('index', dropna=False).sum()

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,13,16,4
B,11,11,6
C,9,11,1
D,15,21,14
E,15,13,15
,1,1,4


In [13]:
df1.groupby('index').agg({
    'col1' : 'sum',
    'col2' : ['sum', 'var']})


Unnamed: 0_level_0,col1,col2,col2
Unnamed: 0_level_1,sum,sum,var
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,13,16,0.0
B,11,11,10.333333
C,9,11,0.5
D,15,21,7.583333
E,15,13,12.333333


In [14]:
# 그룹별로 각각 출력
for idx, group in df1.groupby('index'):
    display(group.head())

Unnamed: 0,index,col1,col2,col3
0,A,7,8,3
1,A,6,8,1


Unnamed: 0,index,col1,col2,col3
2,B,2,6,1
3,B,6,5,2
4,B,3,0,3


Unnamed: 0,index,col1,col2,col3
5,C,6,5,1
7,C,3,6,0


Unnamed: 0,index,col1,col2,col3
8,D,6,2,2
9,D,3,8,0
10,D,1,4,4
11,D,5,7,8


Unnamed: 0,index,col1,col2,col3
12,E,6,8,6
13,E,7,4,4
14,E,2,1,5
