## 파이썬 데이터분석
# Pandas 심화

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터를 코드숫자로 변환하기

In [3]:
# Read the subway card-usage CSV (path relative to the working directory)
# into a DataFrame, then display it.
subway = pd.read_csv(filepath_or_buffer='CARD_SUBWAY_MONTH_201905.csv')
subway

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
0,20190501,일산선,1955,마두,8528,8796,20190504
1,20190501,일산선,1954,백석,10118,10399,20190504
2,20190501,일산선,1953,대곡,1655,1406,20190504
3,20190501,일산선,1952,화정,20102,20951,20190504
4,20190501,일산선,1951,원당,11419,11064,20190504
...,...,...,...,...,...,...,...
17741,20190531,수인선,1885,연수,6405,6455,20190603
17742,20190531,수인선,1884,원인재,5465,5571,20190603
17743,20190531,수인선,1883,남동인더스파크,2582,2910,20190603
17744,20190531,8호선,2822,산성,7161,6841,20190603


In [4]:
# Pull out the 노선명 (line name) column as a Series.
subway['노선명']

0        일산선
1        일산선
2        일산선
3        일산선
4        일산선
        ... 
17741    수인선
17742    수인선
17743    수인선
17744    8호선
17745    중앙선
Name: 노선명, Length: 17746, dtype: object

In [5]:
# Distinct line names, in order of first appearance.
subway['노선명'].unique()

array(['일산선', '우이신설선', '안산선', '수인선', '분당선', '과천선', '공항철도 1호선', '경춘선',
       '경인선', '경의선', '장항선', '중앙선', '1호선', '2호선', '3호선', '4호선', '5호선',
       '6호선', '7호선', '8호선', '9호선', '9호선2~3단계', '경강선', '경부선', '경원선'],
      dtype=object)

In [6]:
# Collect the distinct line names and sort them, then display the array.
lanes = np.sort(subway['노선명'].unique())
lanes

array(['1호선', '2호선', '3호선', '4호선', '5호선', '6호선', '7호선', '8호선', '9호선',
       '9호선2~3단계', '경강선', '경부선', '경원선', '경의선', '경인선', '경춘선', '공항철도 1호선',
       '과천선', '분당선', '수인선', '안산선', '우이신설선', '일산선', '장항선', '중앙선'],
      dtype=object)

In [7]:
# Map each line name to its position in the sorted `lanes` array, giving
# every line an integer code (0, 1, 2, ...).
lanes_dict = dict(zip(lanes, range(len(lanes))))
lanes_dict

{'1호선': 0,
 '2호선': 1,
 '3호선': 2,
 '4호선': 3,
 '5호선': 4,
 '6호선': 5,
 '7호선': 6,
 '8호선': 7,
 '9호선': 8,
 '9호선2~3단계': 9,
 '경강선': 10,
 '경부선': 11,
 '경원선': 12,
 '경의선': 13,
 '경인선': 14,
 '경춘선': 15,
 '공항철도 1호선': 16,
 '과천선': 17,
 '분당선': 18,
 '수인선': 19,
 '안산선': 20,
 '우이신설선': 21,
 '일산선': 22,
 '장항선': 23,
 '중앙선': 24}

In [8]:
# Preview: translate each row's line name into its integer code.
subway['노선명'].map(lanes_dict)

0        22
1        22
2        22
3        22
4        22
         ..
17741    19
17742    19
17743    19
17744     7
17745    24
Name: 노선명, Length: 17746, dtype: int64

In [9]:
# Store the integer code of each row's line name in a new 노선코드 column,
# then display the updated frame.
subway['노선코드'] = subway['노선명'].map(lanes_dict)
subway

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자,노선코드
0,20190501,일산선,1955,마두,8528,8796,20190504,22
1,20190501,일산선,1954,백석,10118,10399,20190504,22
2,20190501,일산선,1953,대곡,1655,1406,20190504,22
3,20190501,일산선,1952,화정,20102,20951,20190504,22
4,20190501,일산선,1951,원당,11419,11064,20190504,22
...,...,...,...,...,...,...,...,...
17741,20190531,수인선,1885,연수,6405,6455,20190603,19
17742,20190531,수인선,1884,원인재,5465,5571,20190603,19
17743,20190531,수인선,1883,남동인더스파크,2582,2910,20190603,19
17744,20190531,8호선,2822,산성,7161,6841,20190603,7


### 문자열 처리하기
- str 접근자(accessor) 사용
- 파이썬 문자열 메서드와 같은 이름의 함수들을 Series 전체에 적용할 수 있다

In [12]:
# Drop the final character of every line name (e.g. '일산선' -> '일산').
subway['노선명'].str.slice(stop=-1)

0        일산
1        일산
2        일산
3        일산
4        일산
         ..
17741    수인
17742    수인
17743    수인
17744    8호
17745    중앙
Name: 노선명, Length: 17746, dtype: object

In [13]:
# List the attributes and methods available on a Python string.
dir(str())

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isascii',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']


In [16]:
# Replace every '마' in the station names with '마늘'.
# regex=False pins this to a literal substring replacement regardless of the
# installed pandas version (the default for `regex` changed in pandas 2.0).
subway.역명.str.replace('마', '마늘', regex=False)

0            마늘두
1             백석
2             대곡
3             화정
4             원당
          ...   
17741         연수
17742        원인재
17743    남동인더스파크
17744         산성
17745         지평
Name: 역명, Length: 17746, dtype: object

In [18]:
# Append the '역' (station) suffix to each station name, but leave names that
# already end in it (e.g. '서울역') untouched so they do not become '서울역역'.
# The original full slice `.str[:]` was a no-op and has been dropped.
# na=False sends missing names to the `other` branch, where NaN + '역' stays NaN.
subway.역명.where(subway.역명.str.endswith('역', na=False), subway.역명 + '역')

0             마두역
1             백석역
2             대곡역
3             화정역
4             원당역
           ...   
17741         연수역
17742        원인재역
17743    남동인더스파크역
17744         산성역
17745         지평역
Name: 역명, Length: 17746, dtype: object

### 날짜 처리하기

In [23]:
# Parse slash-separated date strings; no explicit format is given, so pandas
# infers it.
pd.to_datetime([
    '2024/1/1',
    '2024/1/2',
    '2024/1/3',
])

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq=None)

In [24]:
# Parse yyyymmdd integers; the explicit format tells pandas how to read the
# eight digits.
pd.to_datetime(
    [20240101, 20240102, 20240103],
    format='%Y%m%d',
)

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq=None)

In [26]:
# Preview: parse the yyyymmdd values of the 사용일자 column into datetimes.
pd.to_datetime(subway['사용일자'], format='%Y%m%d')

0       2019-05-01
1       2019-05-01
2       2019-05-01
3       2019-05-01
4       2019-05-01
           ...    
17741   2019-05-31
17742   2019-05-31
17743   2019-05-31
17744   2019-05-31
17745   2019-05-31
Name: 사용일자, Length: 17746, dtype: datetime64[ns]

In [27]:
# Keep the parsed 사용일자 values in a new datetime column, 일자_dt, and
# display the updated frame.
subway['일자_dt'] = pd.to_datetime(subway['사용일자'], format='%Y%m%d')
subway

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자,노선코드,일자_dt
0,20190501,일산선,1955,마두,8528,8796,20190504,22,2019-05-01
1,20190501,일산선,1954,백석,10118,10399,20190504,22,2019-05-01
2,20190501,일산선,1953,대곡,1655,1406,20190504,22,2019-05-01
3,20190501,일산선,1952,화정,20102,20951,20190504,22,2019-05-01
4,20190501,일산선,1951,원당,11419,11064,20190504,22,2019-05-01
...,...,...,...,...,...,...,...,...,...
17741,20190531,수인선,1885,연수,6405,6455,20190603,19,2019-05-31
17742,20190531,수인선,1884,원인재,5465,5571,20190603,19,2019-05-31
17743,20190531,수인선,1883,남동인더스파크,2582,2910,20190603,19,2019-05-31
17744,20190531,8호선,2822,산성,7161,6841,20190603,7,2019-05-31


- dt 접근자(accessor) 사용법

In [28]:
# Day of the week for each date: 0 = Monday, 6 = Sunday.
subway['일자_dt'].dt.dayofweek

0        2
1        2
2        2
3        2
4        2
        ..
17741    4
17742    4
17743    4
17744    4
17745    4
Name: 일자_dt, Length: 17746, dtype: int64

In [31]:
# Year component of each date.
subway['일자_dt'].dt.year

0        2019
1        2019
2        2019
3        2019
4        2019
         ... 
17741    2019
17742    2019
17743    2019
17744    2019
17745    2019
Name: 일자_dt, Length: 17746, dtype: int64

In [32]:
# Month component of each date.
subway['일자_dt'].dt.month

0        5
1        5
2        5
3        5
4        5
        ..
17741    5
17742    5
17743    5
17744    5
17745    5
Name: 일자_dt, Length: 17746, dtype: int64

In [33]:
# Day-of-month component of each date.
subway['일자_dt'].dt.day

0         1
1         1
2         1
3         1
4         1
         ..
17741    31
17742    31
17743    31
17744    31
17745    31
Name: 일자_dt, Length: 17746, dtype: int64

In [34]:
# Break the datetime column into separate calendar columns.
subway['year'] = subway['일자_dt'].dt.year
subway['month'] = subway['일자_dt'].dt.month
subway['day'] = subway['일자_dt'].dt.day
# 'yoil' holds the day of the week (0 = Monday, 6 = Sunday).
subway['yoil'] = subway['일자_dt'].dt.dayofweek
subway

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자,노선코드,일자_dt,year,month,day,yoil
0,20190501,일산선,1955,마두,8528,8796,20190504,22,2019-05-01,2019,5,1,2
1,20190501,일산선,1954,백석,10118,10399,20190504,22,2019-05-01,2019,5,1,2
2,20190501,일산선,1953,대곡,1655,1406,20190504,22,2019-05-01,2019,5,1,2
3,20190501,일산선,1952,화정,20102,20951,20190504,22,2019-05-01,2019,5,1,2
4,20190501,일산선,1951,원당,11419,11064,20190504,22,2019-05-01,2019,5,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17741,20190531,수인선,1885,연수,6405,6455,20190603,19,2019-05-31,2019,5,31,4
17742,20190531,수인선,1884,원인재,5465,5571,20190603,19,2019-05-31,2019,5,31,4
17743,20190531,수인선,1883,남동인더스파크,2582,2910,20190603,19,2019-05-31,2019,5,31,4
17744,20190531,8호선,2822,산성,7161,6841,20190603,7,2019-05-31,2019,5,31,4


In [35]:
# Format each date back into a yyyymmdd string.
subway['일자_dt'].dt.strftime('%Y%m%d')

0        20190501
1        20190501
2        20190501
3        20190501
4        20190501
           ...   
17741    20190531
17742    20190531
17743    20190531
17744    20190531
17745    20190531
Name: 일자_dt, Length: 17746, dtype: object

In [36]:
# Format each date as a yyyymmdd string, then convert it to an integer.
# NOTE: the plain `int` dtype resolves to the platform's default integer
# width (the recorded run shows int32); use 'int64' if a fixed width matters.
subway['일자_dt'].dt.strftime('%Y%m%d').astype(int)

0        20190501
1        20190501
2        20190501
3        20190501
4        20190501
           ...   
17741    20190531
17742    20190531
17743    20190531
17744    20190531
17745    20190531
Name: 일자_dt, Length: 17746, dtype: int32

In [38]:
# Build a daily DatetimeIndex covering 2024-01-01 through 2024-01-31,
# both endpoints included.
pd.date_range(start='2024/1/1', end='2024/1/31')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12',
               '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16',
               '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20',
               '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24',
               '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28',
               '2024-01-29', '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', freq='D')

### 조건 검색

In [39]:
# Re-read the CSV so `subway` is rebound to a freshly loaded frame without
# the columns added in earlier cells, then display it.
subway = pd.read_csv(filepath_or_buffer='CARD_SUBWAY_MONTH_201905.csv')
subway

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
0,20190501,일산선,1955,마두,8528,8796,20190504
1,20190501,일산선,1954,백석,10118,10399,20190504
2,20190501,일산선,1953,대곡,1655,1406,20190504
3,20190501,일산선,1952,화정,20102,20951,20190504
4,20190501,일산선,1951,원당,11419,11064,20190504
...,...,...,...,...,...,...,...
17741,20190531,수인선,1885,연수,6405,6455,20190603
17742,20190531,수인선,1884,원인재,5465,5571,20190603
17743,20190531,수인선,1883,남동인더스파크,2582,2910,20190603
17744,20190531,8호선,2822,산성,7161,6841,20190603


In [40]:
# Keep only the rows whose line name is exactly '1호선'.
subway[subway['노선명'].eq('1호선')]

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
181,20190501,1호선,150,서울역,46080,43047,20190504
182,20190501,1호선,151,시청,24025,27974,20190504
183,20190501,1호선,152,종각,33436,28875,20190504
184,20190501,1호선,153,종로3가,39000,34964,20190504
185,20190501,1호선,154,종로5가,29937,29147,20190504
...,...,...,...,...,...,...,...
17721,20190531,1호선,154,종로5가,31108,31544,20190603
17722,20190531,1호선,153,종로3가,37748,35488,20190603
17723,20190531,1호선,152,종각,50960,50645,20190603
17724,20190531,1호선,151,시청,31325,32669,20190603


In [41]:
# Keep rows that satisfy both conditions: line '1호선' AND station '서울역'.
# The element-wise `&` combines the two boolean Series.
subway[subway['노선명'].eq('1호선') & subway['역명'].eq('서울역')]

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
181,20190501,1호선,150,서울역,46080,43047,20190504
747,20190502,1호선,150,서울역,63425,61511,20190505
1754,20190503,1호선,150,서울역,72225,71936,20190506
1977,20190504,1호선,150,서울역,59503,63654,20190507
2947,20190505,1호선,150,서울역,45856,42422,20190508
3539,20190506,1호선,150,서울역,47378,42597,20190509
3978,20190507,1호선,150,서울역,62052,56163,20190510
4548,20190508,1호선,150,서울역,58497,55197,20190511
4789,20190509,1호선,150,서울역,60877,57579,20190512
5560,20190510,1호선,150,서울역,69643,66277,20190513


In [44]:
# Keep stations whose first character sorts strictly after '하'.
# NOTE(review): the strict '>' excludes names whose first syllable is exactly
# '하' -- confirm whether '>=' was intended.
subway[subway['역명'].str.get(0) > '하']

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
3,20190501,일산선,1952,화정,20102,20951,20190504
15,20190501,우이신설선,4705,화계,3178,2879,20190504
28,20190501,안산선,1755,한대앞,9517,9155,20190504
41,20190501,수인선,1882,호구포,2907,2779,20190504
78,20190501,분당선,1024,한티,12404,13204,20190504
...,...,...,...,...,...,...,...
17661,20190531,3호선,314,홍제,23386,21844,20190603
17678,20190531,2호선,239,홍대입구,85289,97293,20190603
17679,20190531,2호선,238,합정,38989,43256,20190603
17707,20190531,2호선,209,한양대,15030,17281,20190603


In [47]:
# Keep stations whose name is longer than 10 characters.
subway[subway['역명'].str.len().gt(10)]

Unnamed: 0,사용일자,노선명,역ID,역명,승차총승객수,하차총승객수,등록일자
161,20190501,중앙선,1202,상봉(시외버스터미널),5872,5683,20190504
189,20190501,1호선,158,청량리(서울시립대입구),25512,26381,20190504
218,20190501,2호선,228,서울대입구(관악구청),51093,49771,20190504
249,20190501,3호선,317,경복궁(정부서울청사),32266,32454,20190504
263,20190501,3호선,331,남부터미널(예술의전당),24362,24965,20190504
...,...,...,...,...,...,...,...
17626,20190531,4호선,415,미아(서울사이버대학),21283,19928,20190603
17644,20190531,3호선,331,남부터미널(예술의전당),41891,41809,20190603
17658,20190531,3호선,317,경복궁(정부서울청사),30539,31269,20190603
17689,20190531,2호선,228,서울대입구(관악구청),63803,60207,20190603
