In [1]:
import pandas as pd
import numpy as np
import re

from pydataset import data
# Cap DataFrame/Series previews at 10 rows so cell outputs stay compact.
pd.options.display.max_rows = 10

In [2]:
# Load the 'esoph' dataset through pydataset's data() helper and bind it to df1,
# the frame that the cells below filter and extend with new columns.
df1 = data('esoph')
# Bare last expression: the notebook renders a preview of the frame.
df1

Unnamed: 0,agegp,alcgp,tobgp,ncases,ncontrols
1,25-34,0-39g/day,0-9g/day,0,40
2,25-34,0-39g/day,10-19,0,10
3,25-34,0-39g/day,20-29,0,6
4,25-34,0-39g/day,30+,0,5
5,25-34,40-79,0-9g/day,0,27
...,...,...,...,...,...
84,75+,40-79,30+,1,1
85,75+,80-119,0-9g/day,1,1
86,75+,80-119,10-19,1,1
87,75+,120+,0-9g/day,2,2


In [3]:
# Count how many rows carry each distinct label in the alcgp column.
df1['alcgp'].value_counts()

0-39g/day    23
40-79        23
80-119       21
120+         21
Name: alcgp, dtype: int64

In [4]:
# Select the rows whose alcgp label contains the text 'g/day'.
df1.loc[lambda frame: frame['alcgp'].str.contains('g/day')]

Unnamed: 0,agegp,alcgp,tobgp,ncases,ncontrols
1,25-34,0-39g/day,0-9g/day,0,40
2,25-34,0-39g/day,10-19,0,10
3,25-34,0-39g/day,20-29,0,6
4,25-34,0-39g/day,30+,0,5
16,35-44,0-39g/day,0-9g/day,0,60
...,...,...,...,...,...
65,65-74,0-39g/day,20-29,2,7
66,65-74,0-39g/day,30+,0,2
78,75+,0-39g/day,0-9g/day,1,18
79,75+,0-39g/day,10-19,2,6


In [5]:
# Select the rows whose alcgp label contains at least one ASCII letter.
df1.loc[lambda frame: frame['alcgp'].str.contains(r'[A-Za-z]+')]

Unnamed: 0,agegp,alcgp,tobgp,ncases,ncontrols
1,25-34,0-39g/day,0-9g/day,0,40
2,25-34,0-39g/day,10-19,0,10
3,25-34,0-39g/day,20-29,0,6
4,25-34,0-39g/day,30+,0,5
16,35-44,0-39g/day,0-9g/day,0,60
...,...,...,...,...,...
65,65-74,0-39g/day,20-29,2,7
66,65-74,0-39g/day,30+,0,2
78,75+,0-39g/day,0-9g/day,1,18
79,75+,0-39g/day,10-19,2,6


In [6]:
# Select the rows whose alcgp label is purely numeric: a closed range such as
# 40-79 or 80-119, or an open-ended value such as 120+.
# The pattern is anchored with ^...$ and the open-ended form requires a literal
# '+', so a label that merely contains such digits (for example one with a
# trailing unit) is not selected. (?:...) keeps the group non-capturing, which
# avoids the "pattern has match groups" warning that str.contains emits for
# capturing groups.
df1[df1.alcgp.str.contains(r'^(?:\d{2}-\d{2,3}|\d{3}\+)$')]

Unnamed: 0,agegp,alcgp,tobgp,ncases,ncontrols
5,25-34,40-79,0-9g/day,0,27
6,25-34,40-79,10-19,0,7
7,25-34,40-79,20-29,0,4
8,25-34,40-79,30+,0,7
9,25-34,80-119,0-9g/day,0,2
...,...,...,...,...,...
84,75+,40-79,30+,1,1
85,75+,80-119,0-9g/day,1,1
86,75+,80-119,10-19,1,1
87,75+,120+,0-9g/day,2,2


In [7]:
# Keep only the numeric-range part of each alcgp label
# (e.g. '0-39g/day' -> '0-39'; '120+' stays '120+') in a new column.
# expand=False makes str.extract return a Series for the single capture group,
# so the column assignment does not rely on coercing a one-column DataFrame.
df1['alcgp_new'] = df1.alcgp.str.extract(r'(\d+-\d\d+|\d{3}\+)', expand=False)
# Check the distribution of the cleaned labels.
df1.alcgp_new.value_counts()

0-39      23
40-79     23
80-119    21
120+      21
Name: alcgp_new, dtype: int64

In [8]:
# Count how many tobgp labels contain an ASCII letter.
# Series.sum() adds up the boolean mask in vectorised form instead of iterating
# it element by element with the built-in sum(); int() keeps the displayed
# result a plain Python integer.
int(df1.tobgp.str.contains(r'[A-Za-z]').sum())

24

In [9]:
# Replace using map + lambda + re.sub: strip every run of letters and slashes
# from each tobgp label (e.g. '0-9g/day' -> '0-9') and store it in a new column.
df1['tobgp_new'] = df1['tobgp'].map(
    lambda label: re.sub('[A-Za-z/]+', '', label)
)
# Check the distribution of the cleaned labels.
df1['tobgp_new'].value_counts()

0-9      24
10-19    24
20-29    20
30+      20
Name: tobgp_new, dtype: int64