# sw 소모임 1107

1. 데이터 처리 inplace, replace
2. 결측치 제어
3. 데이터 정렬
4. 데이터 추가-삭제
5. 데이터 그룹화(그룹 별 통계내기)


**샘플 데이터**
- 프로그래밍 언어 점유율 (https://www.tiobe.com/tiobe-index/)

In [4]:
import numpy as np
import pandas as pd

# Load the sample data (TIOBE programming-language share) from lang.csv.
df = pd.read_csv('lang.csv')
df

Unnamed: 0,Oct-24,Oct-23,증감,언어,점유율,변화량
0,1,1,,Python,21.90%,7.08%
1,2,3,+,C++,11.60%,0.93%
2,3,4,+,Java,10.51%,1.59%
3,4,2,-,C,8.38%,-3.70%
4,5,5,,C#,5.62%,-2.09%
5,6,6,,JavaScript,3.54%,0.64%
6,7,7,,Visual Basic,2.35%,0.22%
7,8,11,+,Go,2.02%,0.65%
8,9,16,++,Fortran,1.80%,0.78%
9,10,13,+,Delphi/Object Pascal,1.68%,0.38%


## 데이터 처리

`inplace=True`
- 함수가 원본 데이터를 변경하도록 하기
- 기본적으로 함수들은 원본을 바꾸지 않고 새 데이터를 반환하기 때문에, 결과를 유지하려면 `inplace=True`를 사용하거나 반환값을 변수에 대입해야 함

`replace()`
- 특정 값을 다른 값으로 바꾸는 함수
- `df.replace(교체할 값, 새 값)`


In [5]:
# Relabel the index: every entry becomes the string "<old label>번".
df.index = [f'{label}번' for label in df.index]

In [6]:
# Rename the month-labelled columns to plain years.
# inplace=True changes df itself instead of returning a renamed copy.
df.rename(columns={"Oct-24":"2024", "Oct-23":"2023"}, inplace=True)

In [7]:
# Reorder columns: take the column names as a list and swap the first two
# entries ("2024" and "2023").
col = [*df.columns]
col[:2] = col[1], col[0]
print(col)

['2023', '2024', '증감', '언어', '점유율', '변화량']


In [8]:
# Select the columns in the order held by `col` and rebind df to the result.
df = df[col]

In [9]:
# Display df to confirm that the change was applied to it.
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0번,1,1,,Python,21.90%,7.08%
1번,3,2,+,C++,11.60%,0.93%
2번,4,3,+,Java,10.51%,1.59%
3번,2,4,-,C,8.38%,-3.70%
4번,5,5,,C#,5.62%,-2.09%
5번,6,6,,JavaScript,3.54%,0.64%
6번,7,7,,Visual Basic,2.35%,0.22%
7번,11,8,+,Go,2.02%,0.65%
8번,16,9,++,Fortran,1.80%,0.78%
9번,13,10,+,Delphi/Object Pascal,1.68%,0.38%


In [10]:
# Reset the index back to default integers.
# drop=True discards the old labels instead of keeping them as a new column.
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,1,1,,Python,21.90%,7.08%
1,3,2,+,C++,11.60%,0.93%
2,4,3,+,Java,10.51%,1.59%
3,2,4,-,C,8.38%,-3.70%
4,5,5,,C#,5.62%,-2.09%
5,6,6,,JavaScript,3.54%,0.64%
6,7,7,,Visual Basic,2.35%,0.22%
7,11,8,+,Go,2.02%,0.65%
8,16,9,++,Fortran,1.80%,0.78%
9,13,10,+,Delphi/Object Pascal,1.68%,0.38%


In [11]:
# Print a summary of df: entry count, per-column non-null counts and dtypes.
df.info()
# The "object" dtype holds generic Python objects such as strings, lists and dicts.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   2023    20 non-null     int64 
 1   2024    20 non-null     int64 
 2   증감      15 non-null     object
 3   언어      20 non-null     object
 4   점유율     20 non-null     object
 5   변화량     20 non-null     object
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


In [12]:
# dtypes shows only the data type of each column.
df.dtypes

2023     int64
2024     int64
증감      object
언어      object
점유율     object
변화량     object
dtype: object

### str

pandas에서 문자열 포함 열에 문자열 메소드 사용할 수 있게 하는 속성

str.upper(): 문자열 대문자 변환

str.lower(): 문자열 소문자로 변환

str.contains(): 문자열에 특정 패턴 포함 여부 확인

str.split(): 문자열 구분자로 분리


In [13]:
# Convert the share ("점유율") and change ("변화량") columns to floats:
# remove the '%' sign from each string, then cast the column to float.
for percent_column in ('점유율', '변화량'):
    without_sign = df[percent_column].str.replace('%','')
    df[percent_column] = without_sign.astype(float)
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,1,1,,Python,21.9,7.08
1,3,2,+,C++,11.6,0.93
2,4,3,+,Java,10.51,1.59
3,2,4,-,C,8.38,-3.7
4,5,5,,C#,5.62,-2.09
5,6,6,,JavaScript,3.54,0.64
6,7,7,,Visual Basic,2.35,0.22
7,11,8,+,Go,2.02,0.65
8,16,9,++,Fortran,1.8,0.78
9,13,10,+,Delphi/Object Pascal,1.68,0.38


## 결측치 제어
`isnull() / notnull()`
- 각 값이 결측치인지 아닌지 확인해 True/False로 반환하기

`fillna()`
- 결측치를 특정 값으로 채우기
- 매개변수 **inplace=True**로 원본에 적용 가능
- `df.ffill()` / `df.bfill()` 로 바로 앞, 뒤 값으로 채우기 가능

`dropna()`
- 결측치가 포함된 행/열 삭제하기
- 매개변수 **axis**로 행, 열 축 설정 가능 
- **axis=0(index), axis=1(columns)**, 매개변수를 작성하지 않았을 때의 기본값은 0
- 매개변수 **how**로 어떻게 지울지 설정 가능
    - **how = any** 는 결측치 하나라도 있는 데이터 삭제
    - **how = all** 은 데이터가 전부 결측치일 때 삭제

In [14]:
# Check for missing values: each cell becomes True if missing, False otherwise.
df.isnull()

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,False,False,True,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,True,False,False,False
5,False,False,True,False,False,False
6,False,False,True,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [15]:
# Fill missing values with an empty string.
# No inplace=True here, so df itself is left unchanged.
df.fillna('')

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,1,1,,Python,21.9,7.08
1,3,2,+,C++,11.6,0.93
2,4,3,+,Java,10.51,1.59
3,2,4,-,C,8.38,-3.7
4,5,5,,C#,5.62,-2.09
5,6,6,,JavaScript,3.54,0.64
6,7,7,,Visual Basic,2.35,0.22
7,11,8,+,Go,2.02,0.65
8,16,9,++,Fortran,1.8,0.78
9,13,10,+,Delphi/Object Pascal,1.68,0.38


In [16]:
# Fill each missing value with the value immediately before it (forward fill).
df.ffill()

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,1,1,,Python,21.9,7.08
1,3,2,+,C++,11.6,0.93
2,4,3,+,Java,10.51,1.59
3,2,4,-,C,8.38,-3.7
4,5,5,-,C#,5.62,-2.09
5,6,6,-,JavaScript,3.54,0.64
6,7,7,-,Visual Basic,2.35,0.22
7,11,8,+,Go,2.02,0.65
8,16,9,++,Fortran,1.8,0.78
9,13,10,+,Delphi/Object Pascal,1.68,0.38


In [17]:
# Drop the rows that contain a missing value.
df.dropna()

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
1,3,2,+,C++,11.6,0.93
2,4,3,+,Java,10.51,1.59
3,2,4,-,C,8.38,-3.7
7,11,8,+,Go,2.02,0.65
8,16,9,++,Fortran,1.8,0.78
9,13,10,+,Delphi/Object Pascal,1.68,0.38
10,9,11,-,SQL,1.64,-0.15
11,14,12,+,MATLAB,1.48,0.22
12,20,13,++,Rust,1.45,0.53
13,12,14,-,Scratch,1.41,0.05


In [18]:
# Choose the axis for dropna with the axis parameter.
# axis=0 (the default) drops rows; axis=1, used here, drops columns.
df.dropna(axis=1)

Unnamed: 0,2023,2024,언어,점유율,변화량
0,1,1,Python,21.9,7.08
1,3,2,C++,11.6,0.93
2,4,3,Java,10.51,1.59
3,2,4,C,8.38,-3.7
4,5,5,C#,5.62,-2.09
5,6,6,JavaScript,3.54,0.64
6,7,7,Visual Basic,2.35,0.22
7,11,8,Go,2.02,0.65
8,16,9,Fortran,1.8,0.78
9,13,10,Delphi/Object Pascal,1.68,0.38


In [19]:
# Use inplace=True so the fill is applied to df itself.
df.fillna('No change', inplace=True)
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
0,1,1,No change,Python,21.9,7.08
1,3,2,+,C++,11.6,0.93
2,4,3,+,Java,10.51,1.59
3,2,4,-,C,8.38,-3.7
4,5,5,No change,C#,5.62,-2.09
5,6,6,No change,JavaScript,3.54,0.64
6,7,7,No change,Visual Basic,2.35,0.22
7,11,8,+,Go,2.02,0.65
8,16,9,++,Fortran,1.8,0.78
9,13,10,+,Delphi/Object Pascal,1.68,0.38


## 데이터 정렬

`sort_values()`
- 값에 따라 데이터 정렬
- df.sort_values(by, axis=0, ascending=True)
- **by**: 정렬할 레이블 이름
- **axis**: 축
- **ascending**: **True**는 오름차순 정렬 **False**는 내림차순 정렬

`sort_index()`
- 인덱스를 기준으로 정렬

In [20]:
# Sort the rows by share ("점유율").
df.sort_values('점유율')
# Ruby and COBOL have the same share, so they come out in a different order
# than their 2024 ranking.

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
19,15,20,--,Swift,0.98,-0.09
17,19,18,+,Ruby,0.99,0.07
18,24,19,++,COBOL,0.99,0.23
16,17,17,No change,R,1.09,0.12
15,10,16,--,Assembly language,1.13,-0.51
14,8,15,--,PHP,1.21,-0.69
13,12,14,-,Scratch,1.41,0.05
12,20,13,++,Rust,1.45,0.53
11,14,12,+,MATLAB,1.48,0.22
10,9,11,-,SQL,1.64,-0.15


In [21]:
# Sort by share ("점유율") ascending; rows with the same share are ordered by
# their 2024 rank, descending. The tie-breaker is the '2024' column so the
# code does what this cell is meant to demonstrate.
df.sort_values(['점유율', '2024'], ascending=[True, False])

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
19,15,20,--,Swift,0.98,-0.09
18,24,19,++,COBOL,0.99,0.23
17,19,18,+,Ruby,0.99,0.07
16,17,17,No change,R,1.09,0.12
15,10,16,--,Assembly language,1.13,-0.51
14,8,15,--,PHP,1.21,-0.69
13,12,14,-,Scratch,1.41,0.05
12,20,13,++,Rust,1.45,0.53
11,14,12,+,MATLAB,1.48,0.22
10,9,11,-,SQL,1.64,-0.15


In [22]:
# Sort by index, in descending order.
df.sort_index(ascending=False)

Unnamed: 0,2023,2024,증감,언어,점유율,변화량
19,15,20,--,Swift,0.98,-0.09
18,24,19,++,COBOL,0.99,0.23
17,19,18,+,Ruby,0.99,0.07
16,17,17,No change,R,1.09,0.12
15,10,16,--,Assembly language,1.13,-0.51
14,8,15,--,PHP,1.21,-0.69
13,12,14,-,Scratch,1.41,0.05
12,20,13,++,Rust,1.45,0.53
11,14,12,+,MATLAB,1.48,0.22
10,9,11,-,SQL,1.64,-0.15


## 데이터 추가, 삭제

In [23]:
# Collect the language names from the "언어" column into a plain list and
# print them space-separated.
languages = df['언어'].tolist()
print(*languages)

Python C++ Java C C# JavaScript Visual Basic Go Fortran Delphi/Object Pascal SQL MATLAB Rust Scratch PHP Assembly language R Ruby COBOL Swift


In [24]:
# Release year of each language, keyed by the language name.
language_release_years = {
    "Python": 1991,
    "C++": 1983,
    "Java": 1995,
    "C": 1972,
    "C#": 2000,
    "JavaScript": 1995,
    "Visual Basic": 1991,
    "Go": 2009,
    "Fortran": 1957,
    "Delphi/Object Pascal": 1995,
    "SQL": 1974,
    "MATLAB": 1984,
    "Rust": 2010,
    "Scratch": 2003,
    "PHP": 1995,
    "Assembly language": 1949,
    "R": 1993,
    "Ruby": 1995,
    "COBOL": 1959,
    "Swift": 2014
}

In [25]:
# Extract only the years from the release-year dict, in the dict's own order.
years = list(language_release_years.values())
print(years)

[1991, 1983, 1995, 1972, 2000, 1995, 1991, 2009, 1957, 1995, 1974, 1984, 2010, 2003, 1995, 1949, 1993, 1995, 1959, 2014]


In [26]:
# Add a "발표년도" (release year) column, set to NaN in every row.
df["발표년도"] = np.nan
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,
1,3,2,+,C++,11.6,0.93,
2,4,3,+,Java,10.51,1.59,
3,2,4,-,C,8.38,-3.7,
4,5,5,No change,C#,5.62,-2.09,
5,6,6,No change,JavaScript,3.54,0.64,
6,7,7,No change,Visual Basic,2.35,0.22,
7,11,8,+,Go,2.02,0.65,
8,16,9,++,Fortran,1.8,0.78,
9,13,10,+,Delphi/Object Pascal,1.68,0.38,


In [27]:
# Fill "발표년도" by looking up each row's language name in the release-year
# dict. Matching by name keeps every year on the right row even if the row
# order of df differs from the order in which the dict was written.
df["발표년도"] = df["언어"].map(language_release_years)
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
1,3,2,+,C++,11.6,0.93,1983
2,4,3,+,Java,10.51,1.59,1995
3,2,4,-,C,8.38,-3.7,1972
4,5,5,No change,C#,5.62,-2.09,2000
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
7,11,8,+,Go,2.02,0.65,2009
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995


### df.drop()

`drop(labels=None, axis=0, index=None, columns=None, inplace=False)`

- labels 이름과 axis 지정해서 삭제하거나

- index, columns로 지정해서 삭제

In [28]:
# Drop a column with df.drop() by passing its label together with axis=1.
df.drop(labels='증감', axis=1)


Unnamed: 0,2023,2024,언어,점유율,변화량,발표년도
0,1,1,Python,21.9,7.08,1991
1,3,2,C++,11.6,0.93,1983
2,4,3,Java,10.51,1.59,1995
3,2,4,C,8.38,-3.7,1972
4,5,5,C#,5.62,-2.09,2000
5,6,6,JavaScript,3.54,0.64,1995
6,7,7,Visual Basic,2.35,0.22,1991
7,11,8,Go,2.02,0.65,2009
8,16,9,Fortran,1.8,0.78,1957
9,13,10,Delphi/Object Pascal,1.68,0.38,1995


In [29]:
# Drop rows with df.drop() by passing their index labels through index=.
df.drop(index=[1, 2, 3, 4])

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
7,11,8,+,Go,2.02,0.65,2009
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995
10,9,11,-,SQL,1.64,-0.15,1974
11,14,12,+,MATLAB,1.48,0.22,1984
12,20,13,++,Rust,1.45,0.53,2010
13,12,14,-,Scratch,1.41,0.05,2003


In [30]:
# Keep only the languages released in 2000 or later: build a boolean mask on
# the release year, then select the rows where it is True.
filt = df['발표년도'].ge(2000)
fillterd_df = df.loc[filt]
fillterd_df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
4,5,5,No change,C#,5.62,-2.09,2000
7,11,8,+,Go,2.02,0.65,2009
12,20,13,++,Rust,1.45,0.53,2010
13,12,14,-,Scratch,1.41,0.05,2003
19,15,20,--,Swift,0.98,-0.09,2014


In [31]:
# Drop from df the rows whose index labels appear in fillterd_df.
df.drop(index=fillterd_df.index)

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
1,3,2,+,C++,11.6,0.93,1983
2,4,3,+,Java,10.51,1.59,1995
3,2,4,-,C,8.38,-3.7,1972
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995
10,9,11,-,SQL,1.64,-0.15,1974
11,14,12,+,MATLAB,1.48,0.22,1984


### DataFrame 셀 수정

In [32]:
# Add a row with loc by assigning a list of values to the index label 20.
df.loc[20] = [18, 21, np.nan, "Kotlin", 0.97, np.nan, 2011]
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
1,3,2,+,C++,11.6,0.93,1983
2,4,3,+,Java,10.51,1.59,1995
3,2,4,-,C,8.38,-3.7,1972
4,5,5,No change,C#,5.62,-2.09,2000
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
7,11,8,+,Go,2.02,0.65,2009
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995


In [33]:
# Modify a single cell with loc: index label 20, column "변화량".
df.loc[20, "변화량"] = 0.01
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
1,3,2,+,C++,11.6,0.93,1983
2,4,3,+,Java,10.51,1.59,1995
3,2,4,-,C,8.38,-3.7,1972
4,5,5,No change,C#,5.62,-2.09,2000
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
7,11,8,+,Go,2.02,0.65,2009
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995


In [34]:
# Modify a single cell with iloc: row position 20, column position 2.
df.iloc[20, 2] = "-"
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
0,1,1,No change,Python,21.9,7.08,1991
1,3,2,+,C++,11.6,0.93,1983
2,4,3,+,Java,10.51,1.59,1995
3,2,4,-,C,8.38,-3.7,1972
4,5,5,No change,C#,5.62,-2.09,2000
5,6,6,No change,JavaScript,3.54,0.64,1995
6,7,7,No change,Visual Basic,2.35,0.22,1991
7,11,8,+,Go,2.02,0.65,2009
8,16,9,++,Fortran,1.8,0.78,1957
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995


## 데이터 그룹화

데이터를 기준에 따라 그룹으로 나누고 그룹들에 대해 통계작업 수행할 수 있는 기능

`df.groupby(by='column_name')`사용

- column 기준으로 데이터 분할

In [35]:
# Split the data by "증감" and look at the group whose value is "++".
df.groupby("증감").get_group("++")

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도
8,16,9,++,Fortran,1.8,0.78,1957
12,20,13,++,Rust,1.45,0.53,2010
18,24,19,++,COBOL,0.99,0.23,1959


In [36]:
# Build the labels for a grouping column: languages released before 2000 are
# labelled 'before', languages released in 2000 or later are labelled 'after'.
k = ['before' if year < 2000 else 'after' for year in df['발표년도']]

print(k)

['before', 'before', 'before', 'before', 'after', 'before', 'before', 'after', 'before', 'before', 'before', 'before', 'after', 'after', 'before', 'before', 'before', 'before', 'before', 'after', 'after']


In [37]:
# Store the before/after labels in a new "2k" column.
df['2k'] = k
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도,2k
0,1,1,No change,Python,21.9,7.08,1991,before
1,3,2,+,C++,11.6,0.93,1983,before
2,4,3,+,Java,10.51,1.59,1995,before
3,2,4,-,C,8.38,-3.7,1972,before
4,5,5,No change,C#,5.62,-2.09,2000,after
5,6,6,No change,JavaScript,3.54,0.64,1995,before
6,7,7,No change,Visual Basic,2.35,0.22,1991,before
7,11,8,+,Go,2.02,0.65,2009,after
8,16,9,++,Fortran,1.8,0.78,1957,before
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995,before


### groupby() 후 사용할 수 있는 함수들

`sum()`: 그룹 합계

`mean()`: 그룹별 평균

`count()`: 그룹별 개수 (NaN 값 무시)

`size()`: 그룹별 행 개수 반환 (NaN 포함)

`min()`, `max()`: 그룹별 최소값, 최대값

`agg()`: 여러 함수 적용

In [38]:
# Set the cell at row position 20, column position 5 to NaN.
# NOTE(review): presumably this puts a NaN back into "변화량" so the
# count()/size() comparison in the next cell has a missing value — confirm.
df.iloc[20, 5] = np.nan
df

Unnamed: 0,2023,2024,증감,언어,점유율,변화량,발표년도,2k
0,1,1,No change,Python,21.9,7.08,1991,before
1,3,2,+,C++,11.6,0.93,1983,before
2,4,3,+,Java,10.51,1.59,1995,before
3,2,4,-,C,8.38,-3.7,1972,before
4,5,5,No change,C#,5.62,-2.09,2000,after
5,6,6,No change,JavaScript,3.54,0.64,1995,before
6,7,7,No change,Visual Basic,2.35,0.22,1991,before
7,11,8,+,Go,2.02,0.65,2009,after
8,16,9,++,Fortran,1.8,0.78,1957,before
9,13,10,+,Delphi/Object Pascal,1.68,0.38,1995,before


In [39]:
# Difference between count and size: whether NaN values are included.
# Both results are printed with one blank line between them.
print(
    df.groupby('2k')['변화량'].count(),
    df.groupby('2k').size(),
    sep='\n\n',
)

2k
after      5
before    15
Name: 변화량, dtype: int64

2k
after      6
before    15
dtype: int64


In [40]:
# Sum of the share ("점유율") within each "2k" group.
df.groupby('2k')['점유율'].sum()

2k
after     12.45
before    70.29
Name: 점유율, dtype: float64

In [41]:
# Maximum share ("점유율") within each "2k" group.
df.groupby('2k')['점유율'].max()

2k
after      5.62
before    21.90
Name: 점유율, dtype: float64

In [42]:
# Group by several keys at once ("2k" and "증감") and sum two columns per group.
df.groupby(['2k', '증감'])[['점유율','변화량']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,점유율,변화량
2k,증감,Unnamed: 2_level_1,Unnamed: 3_level_1
after,+,2.02,0.65
after,++,1.45,0.53
after,-,2.38,0.05
after,--,0.98,-0.09
after,No change,5.62,-2.09
before,+,26.26,3.19
before,++,2.79,1.01
before,-,10.02,-3.85
before,--,2.34,-1.2
before,No change,28.88,8.06


In [43]:
# Apply several aggregation functions at once with .agg.
df.groupby('2k')['점유율'].agg(['sum','mean','count'])

Unnamed: 0_level_0,sum,mean,count
2k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
after,12.45,2.075,6
before,70.29,4.686,15
