# Pandas - 문자열 조작

In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('법정동코드 전체자료.txt', sep='\t', encoding='EUC-KR')
df.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46328 entries, 0 to 46327
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   법정동코드	법정동명	폐지여부  46328 non-null  object
dtypes: object(1)
memory usage: 362.1+ KB


In [5]:
len(df), df.shape

(46328, (46328, 3))

In [6]:
# 폐지된 주소는 삭제되도록 필터링
df = df[df.폐지여부 == '존재']
df.shape

(20551, 3)

- 인덱싱

In [7]:
df.법정동명.str[:5].head()

0    서울특별시
1    서울특별시
2    서울특별시
3    서울특별시
4    서울특별시
Name: 법정동명, dtype: object

- 찾기:find

In [10]:
df.법정동명.str.find('서울').head()         # 0 이상인 값이면 문자열 포함, 아니면 아님

0    0
1    0
2    0
3    0
4    0
Name: 법정동명, dtype: int64

In [11]:
df.법정동명.str.find('서울').tail()  

46323   -1
46324   -1
46325   -1
46326   -1
46327   -1
Name: 법정동명, dtype: int64

- 분할: split

In [12]:
df.법정동명.str.split().head()  

0              [서울특별시]
1         [서울특별시, 종로구]
2    [서울특별시, 종로구, 청운동]
3    [서울특별시, 종로구, 신교동]
4    [서울특별시, 종로구, 궁정동]
Name: 법정동명, dtype: object

- 포함글자 인식

In [13]:
df.법정동명.str.contains('서울').head()

0    True
1    True
2    True
3    True
4    True
Name: 법정동명, dtype: bool

In [14]:
# numpy bool indexing 처럼 사용
seocho = df[df.법정동명.str.contains('서초')]
seocho

Unnamed: 0,법정동코드,법정동명,폐지여부
975,1165000000,서울특별시 서초구,존재
977,1165010100,서울특별시 서초구 방배동,존재
978,1165010200,서울특별시 서초구 양재동,존재
979,1165010300,서울특별시 서초구 우면동,존재
980,1165010400,서울특별시 서초구 원지동,존재
982,1165010600,서울특별시 서초구 잠원동,존재
983,1165010700,서울특별시 서초구 반포동,존재
984,1165010800,서울특별시 서초구 서초동,존재
985,1165010900,서울특별시 서초구 내곡동,존재
986,1165011000,서울특별시 서초구 염곡동,존재


- 문자 대체: replace

In [15]:
df.법정동명.str.replace(' ','_').head()

0            서울특별시
1        서울특별시_종로구
2    서울특별시_종로구_청운동
3    서울특별시_종로구_신교동
4    서울특별시_종로구_궁정동
Name: 법정동명, dtype: object

In [16]:
# .str.replace()에서는 정규표현식 사용가능
seocho.법정동명 = seocho.법정동명.str.replace('[^가-힣]', ' ')
seocho.head()

  seocho.법정동명 = seocho.법정동명.str.replace('[^가-힣]', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,법정동코드,법정동명,폐지여부
975,1165000000,서울특별시 서초구,존재
977,1165010100,서울특별시 서초구 방배동,존재
978,1165010200,서울특별시 서초구 양재동,존재
979,1165010300,서울특별시 서초구 우면동,존재
980,1165010400,서울특별시 서초구 원지동,존재


In [17]:
filename = '서울시CCTV설치운영현황(자치구)_년도별_211231기준.csv'
cctv = pd.read_csv(filename, skiprows=1, encoding='EUC-KR')
cctv['총계'].apply(lambda x: int(x.replace(',',''))).head()

0    83557
1     1715
2     2447
3     2611
4     3829
Name: 총계, dtype: int64

In [18]:
cctv['총계'].str.replace(',','').astype(int).head()

0    83557
1     1715
2     2447
3     2611
4     3829
Name: 총계, dtype: int32

In [20]:
np.random.seed(2022)
data = np.vstack([list('ABCDE'), np.random.rand(3,5).round(2)])
data

array([['A', 'B', 'C', 'D', 'E'],
       ['0.01', '0.5', '0.11', '0.05', '0.69'],
       ['0.49', '0.9', '0.65', '0.9', '0.72'],
       ['0.83', '0.83', '0.83', '0.96', '0.37']], dtype='<U32')

In [21]:
data

array([['A', 'B', 'C', 'D', 'E'],
       ['0.01', '0.5', '0.11', '0.05', '0.69'],
       ['0.49', '0.9', '0.65', '0.9', '0.72'],
       ['0.83', '0.83', '0.83', '0.96', '0.37']], dtype='<U32')

In [22]:
data.T

array([['A', '0.01', '0.49', '0.83'],
       ['B', '0.5', '0.9', '0.83'],
       ['C', '0.11', '0.65', '0.83'],
       ['D', '0.05', '0.9', '0.96'],
       ['E', '0.69', '0.72', '0.37']], dtype='<U32')

In [24]:
df = pd.DataFrame(data.T, columns='C1 C2 C3 C4'.split())
df

Unnamed: 0,C1,C2,C3,C4
0,A,0.01,0.49,0.83
1,B,0.5,0.9,0.83
2,C,0.11,0.65,0.83
3,D,0.05,0.9,0.96
4,E,0.69,0.72,0.37


In [25]:
df2 = df.set_index('C1')
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.01,0.49,0.83
B,0.5,0.9,0.83
C,0.11,0.65,0.83
D,0.05,0.9,0.96
E,0.69,0.72,0.37


In [26]:
df.set_index('C1', inplace=True)
df

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.01,0.49,0.83
B,0.5,0.9,0.83
C,0.11,0.65,0.83
D,0.05,0.9,0.96
E,0.69,0.72,0.37


In [27]:
df.index.name = 'Index'
df

Unnamed: 0_level_0,C2,C3,C4
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.01,0.49,0.83
B,0.5,0.9,0.83
C,0.11,0.65,0.83
D,0.05,0.9,0.96
E,0.69,0.72,0.37


In [28]:
df.reset_index(inplace=True)
df

Unnamed: 0,Index,C2,C3,C4
0,A,0.01,0.49,0.83
1,B,0.5,0.9,0.83
2,C,0.11,0.65,0.83
3,D,0.05,0.9,0.96
4,E,0.69,0.72,0.37


- cctv 사례

In [29]:
filename = '서울시CCTV설치운영현황(자치구)_년도별_211231기준.csv'
cctv = pd.read_csv(filename, skiprows=1, encoding='EUC-KR')
cctv.head()

Unnamed: 0,구분,총계,2012년 이전,2012년,2013년,2014년,2015년,2016년,2017년,2018년,2019년,2020년,2021년
0,계,83557,4812,1851.0,3434.0,4295,6840,8708,11572,10627,12267,11247,7904
1,종로구,1715,815,,,195,150,0,261,85,9,200,0
2,중 구,2447,16,114.0,87.0,77,236,240,372,386,155,361,403
3,용산구,2611,34,71.0,234.0,125,221,298,351,125,307,617,228
4,성동구,3829,163,144.0,208.0,107,325,255,967,415,490,472,283


In [30]:
# 계 행을 삭제
# 구분 열을 인덱스로 변경하고 이름도 자치구로 변경
cctv.drop([0], inplace=True)
cctv.set_index('구분', inplace=True)
cctv.index.name='자치구'
cctv.head()

Unnamed: 0_level_0,총계,2012년 이전,2012년,2013년,2014년,2015년,2016년,2017년,2018년,2019년,2020년,2021년
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
종로구,1715,815,,,195,150,0,261,85,9,200,0
중 구,2447,16,114.0,87.0,77,236,240,372,386,155,361,403
용산구,2611,34,71.0,234.0,125,221,298,351,125,307,617,228
성동구,3829,163,144.0,208.0,107,325,255,967,415,490,472,283
광진구,3211,35,57.0,100.0,187,98,52,675,465,712,175,655


In [33]:
# '중 구'를 '중구'로 바꾸기
cctv.index = cctv.index.str.replace(' ','')
cctv.index[1]

'중구'