In [1]:
%matplotlib inline

import geopandas as gpd

import pandas as pd
import os
import requests
import folium
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
import matplotlib.font_manager as fm

for font in fm.fontManager.ttflist:
    if font.name in ['AppleGothic', 'Malgun Gothic']:
        plt.rcParams['font.family'] = font.name
        break

plt.rcParams['font.family']

['Malgun Gothic']

In [336]:
#2018년 7월 승차일 기준 1~4일의 버스 카드태깅 정보를 담고있습니다.
#미터 / 분
TripChain = pd.read_csv('../Data/PJT001_TripChain.csv')

#17~18년 기준, 경기도 버스 정류장에 대한 정보를 담고있습니다.
# 성보님과 은솔님이 채워주신 데이터 바로 이용
StationTable = pd.read_csv('../Data/station_null.csv')

#2018년 7월 1~4일의 행정동별 이동 인구수 정보를 담고 있습니다.
AreaPeople = pd.read_csv('../Data/PJT001_sk_emd_od.csv')

#버스노선-정류장 매핑 테이블 정보 입니다.
RouteStationInfo = pd.read_csv('../Data/PJT001_routestationinfo.csv')

#
RouteStationMapping = pd.read_csv('../Data/PJT001_routestationmapping.csv')

In [337]:
TripChain.columns=['암호화카드번호', '트랜잭션ID', '환승횟수', '교통카드발행사ID',
       '총이용객수', '사용자구분', '교통수단CD1', '교통수단CD2',
       '교통수단CD3', '교통수단CD4', '교통수단CD5', '버스노선ID1',
       '버스노선ID2', '버스노선ID3', '버스노선ID4', '버스노선ID5',
       '차량ID1', '차량ID2', '차량ID3', '차량ID4',
       '차량ID5', '총통행거리', '총탑승시간', '총소요시간',
       '승차일시1', '승차일시2', '승차일시3', '승차일시4',
       '승차일시5', '하차일시1', '하차일시2', '하차일시3',
       '하차일시4', '하차일시5', '최초승차일시', '최종하차일시',
       '승차역ID1', '승차역ID2', '승차역ID3', '승차역ID4',
       '승차역ID5', '하차역ID1', '하차역ID2', '하차역ID3',
       '하차역ID4', '하차역ID5', '최초승차역ID', '최종하차역ID',
       '총이용금액', '수집건수', '트립체인완료코드']

## 다회환승자 (3회이상) 분석

In [6]:
Over3Transfer = pd.read_csv('Over3Transfer.csv')

In [7]:
Over3Transfer.head()

Unnamed: 0.1,Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
0,4,900508818160,7,202,5-1,,,10930,24,24,...,,,20180701204621,20180701213623,4108196.0,4199077.0,1350,3,1,1
1,5,900351733805,13-5,400-4,22,,,16150,49,49,...,,,20180701145032,20180701161318,4108106.0,4102942.0,1450,3,1,1
2,6,900351733805,11,13-5,13-4,,,11350,24,24,...,,,20180701212521,20180701221059,4105311.0,4117053.0,1350,3,1,1
3,8,900028085150,62-1,88-1,11-1,,,7140,26,26,...,,,20180701182945,20180701191500,4116853.0,4170093.0,870,3,1,4
4,9,900028088225,720-2,32,30-1,,,13690,46,46,...,,,20180701141014,20180701150226,4170102.0,4130061.0,1350,3,1,1


```
1일반 23678
2어린이 30
4청소년 1037
8경로 0명
```

In [9]:
Over3Transfer.shape

(24745, 28)

### Q1. 어떤 버스를 많이 타는지 알아보자

In [14]:
arr1 = Over3Transfer['버스노선ID1'].unique()
arr2 = Over3Transfer['버스노선ID2'].unique()
arr3 = Over3Transfer['버스노선ID3'].unique()
arr4 = Over3Transfer['버스노선ID4'].unique()
arr5 = Over3Transfer['버스노선ID5'].unique()

In [15]:
bus_num = list(set().union(arr1,arr2,arr3,arr4,arr5))

In [24]:
busline_hash = {}
for num in bus_num:
    busline_hash[num] = 0

In [26]:
del busline_hash[np.nan]

In [47]:
tmp1 = Over3Transfer['버스노선ID1']
tmp2 = Over3Transfer['버스노선ID2']
tmp3 = Over3Transfer['버스노선ID3']
tmp4 = Over3Transfer['버스노선ID4']
tmp5 = Over3Transfer['버스노선ID5']

tm = np.concatenate([tmp1, tmp2])
tm = np.concatenate([tm, tmp3])
tm = np.concatenate([tm, tmp4])
tm = np.concatenate([tm, tmp5])

for t_num in tm:
    if t_num is np.nan:
        continue
    busline_hash[t_num] += 1

In [48]:
busline_hash

{'6002': 21,
 '92-1': 1889,
 '4000': 47,
 '1-2': 161,
 '5-2': 221,
 '1570': 4,
 '138-5': 1,
 '9302': 10,
 '11-2': 42,
 '501B': 1,
 '9300': 8,
 '200': 11,
 '17-1A': 8,
 'M4102': 9,
 '30-3': 6,
 '54': 27,
 '98': 1620,
 '1005-1': 5,
 '111': 173,
 '22': 474,
 '77': 98,
 '5': 1455,
 '4-1': 454,
 '810': 1,
 '88-1': 351,
 '8906': 1,
 '4403': 5,
 '330': 109,
 '1330-44': 1,
 '59-1': 3,
 '5604': 2,
 '63': 246,
 '1500-2': 37,
 '31-2': 4,
 '240': 36,
 '2-2': 349,
 '23-1': 1,
 '17': 127,
 '82': 1,
 '5500-1': 11,
 '3600': 3,
 '16-2': 34,
 '720-1': 1943,
 '3007': 46,
 '71-2': 2,
 '9007': 4,
 '165': 16,
 '76-9': 1,
 'M5121': 20,
 '81': 585,
 '37-1': 3,
 '92': 1329,
 '24': 37,
 '1008': 22,
 '400-4': 284,
 '909': 992,
 '7007': 5,
 '3003': 59,
 '310': 1119,
 '733': 2,
 '66-4': 587,
 '101': 182,
 '75': 9,
 '5-1': 211,
 '24-1': 16,
 '21': 42,
 '6-3': 9,
 '79': 15,
 '340': 48,
 '31-7': 21,
 '16-1': 32,
 '5006': 19,
 '50-5': 26,
 '3101': 4,
 '82-2': 184,
 'M5107': 84,
 '510': 1,
 '1200': 14,
 '34-1': 597,
 '

In [70]:
len(busline_hash)

445

In [142]:
import operator
sorted_busline = sorted(busline_hash.items(), key=operator.itemgetter(1), reverse=True)
sorted_busline

[('62-1', 2884),
 ('720-2', 2625),
 ('301', 2131),
 ('720-1', 1943),
 ('13-1', 1894),
 ('92-1', 1889),
 ('3', 1768),
 ('98', 1620),
 ('700-2', 1507),
 ('5', 1455),
 ('900', 1451),
 ('65', 1424),
 ('92', 1329),
 ('2-1', 1277),
 ('13', 1184),
 ('112', 1178),
 ('7-1', 1174),
 ('310', 1119),
 ('20', 1083),
 ('13-5', 1073),
 ('35', 1069),
 ('300', 1058),
 ('88', 1025),
 ('202', 999),
 ('909', 992),
 ('82-1', 963),
 ('99', 959),
 ('13-4', 884),
 ('11', 878),
 ('25', 864),
 ('37', 812),
 ('64', 804),
 ('83-1', 767),
 ('80', 759),
 ('777', 758),
 ('30', 750),
 ('8', 730),
 ('66', 713),
 ('30-1', 698),
 ('11-1', 669),
 ('10', 666),
 ('9', 666),
 ('99-2', 599),
 ('34-1', 597),
 ('34', 591),
 ('66-4', 587),
 ('81', 585),
 ('707', 555),
 ('51', 549),
 ('720-3', 534),
 ('38', 513),
 ('7-2', 512),
 ('61', 508),
 ('15-1', 493),
 ('22', 474),
 ('400', 461),
 ('27', 458),
 ('4-1', 454),
 ('46', 419),
 ('9-1', 409),
 ('60', 395),
 ('2007', 389),
 ('116-1', 377),
 ('708', 369),
 ('9-2', 365),
 ('55', 362

In [232]:
RouteStationInfo_latlon = pd.read_csv('RouteStationInfo_latlon.csv')

In [125]:
busline = RouteStationInfo_latlon['bus_line_no'].unique()

In [126]:
busline

array(['24', '1004', '330', '33-1', '2', '4', '5-2', '5-4', '15', '14',
       '9', '19', '3-1', '3', '2-4', '2-3', '16', '2-2', '18', '4-1', '6',
       '11', '6-2', '17', '6-3', '8', '8-3', '1', '8-2', '1-1', '2-1',
       '7-1', '8-1', '7', '11-5', '11-2', '11-4', '10', '1004-1', '340',
       '1000', '20', '21', '330-1', '23', '38', '25', '26', '27', '340-1',
       '1008', '33-2', '712', '73-1', '6001', '116-3', '1001', '8501',
       '6002', '8155', '1002', '201', '6002-1', '202', '710', '150',
       '203', '8156', '2000A', '2000B', '4403', '6003', '28', '200',
       '8156(급행)', '4108', '9802', '8472', '8471', '39', 'H6005', '116-5',
       'H6007', 'H6006', '205', '720-3', '6004', '721', '116-2', '9-1',
       '31-3', '3-2', '50', '76', '77', '78', '79', '80', '81', '333',
       '709', '709-1', 'H4', 'H5', 'H6(A)', 'H6(B)', '19-1', '19-2', '29',
       '19-3', 'H1', 'H3', 'H2', '20(마산포)', '20(어도동내)', '20(어도펜션)',
       '공영1(사곳.궁평)', '공영1(매화.궁평)', '공영1(궁평.사곳)', '공영1(지화2리)',
  

#### 화성시 내에서만 다니는 버스들에 대해서만 걸러내기

In [67]:
# busline과 sorted_busline 인 데이터만 찾기
in_busline_num = []

for elem in range(len(sorted_busline)):
    if sorted_busline[elem][0] in busline:
        in_busline_num.append(sorted_busline[elem])

In [69]:
len(in_busline_num)

117

In [71]:
in_busline_num

[('13-1', 1894),
 ('3', 1768),
 ('2-1', 1277),
 ('13', 1184),
 ('7-1', 1174),
 ('20', 1083),
 ('13-5', 1073),
 ('35', 1069),
 ('202', 999),
 ('13-4', 884),
 ('11', 878),
 ('25', 864),
 ('37', 812),
 ('80', 759),
 ('8', 730),
 ('66', 713),
 ('11-1', 669),
 ('10', 666),
 ('9', 666),
 ('81', 585),
 ('720-3', 534),
 ('38', 513),
 ('22', 474),
 ('27', 458),
 ('4-1', 454),
 ('9-1', 409),
 ('2-2', 349),
 ('116-3', 311),
 ('7', 298),
 ('10-5', 288),
 ('10-2', 275),
 ('15', 256),
 ('5-2', 221),
 ('19', 208),
 ('116-2', 198),
 ('33-1', 198),
 ('2', 194),
 ('721', 187),
 ('8155', 164),
 ('3-1', 155),
 ('712', 154),
 ('17', 127),
 ('150', 114),
 ('330', 109),
 ('1-1', 105),
 ('50', 105),
 ('38-1', 102),
 ('1004', 99),
 ('77', 98),
 ('73-1', 92),
 ('11-5', 90),
 ('18', 77),
 ('203', 76),
 ('23', 73),
 ('1', 65),
 ('340-1', 57),
 ('8-2', 56),
 ('8-1', 56),
 ('6003', 56),
 ('333', 55),
 ('16', 53),
 ('340', 48),
 ('11-3', 48),
 ('201', 46),
 ('1001', 43),
 ('11-2', 42),
 ('21', 42),
 ('710', 40),
 ('

In [505]:
All_busline = []
bus_num_ = 0

for j in range(len(in_busline_num)):
    tmp_busline = RouteStationInfo_latlon[RouteStationInfo_latlon['bus_line_no'] == in_busline_num[j][0]]
    tmp_buslineArr = []
    for i in range(len(tmp_busline)):
        tmp = tmp_busline.iloc[i]
        tmp_buslineArr.append((tmp['WGS84위도'], tmp['WGS84경도']))
    All_busline.append(tmp_buslineArr)
    bus_num_ += 1

In [506]:
# 117개의 버스노선이 들어가있다
bus_num_

117

In [554]:
Busline = folium.Map(location = [37.234300,127.213333])

# 왜 앙대닝 흐엉
# for i in range(10):
#     if(len(All_busline[i]) == 0):
#         continue
#     folium.PolyLine(All_busline[i], color="red", weight=2.5, opacity=1).add_to(Busline)



# folium.PolyLine(All_busline[0], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[1], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[2], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[3], color="red", weight=2.5, opacity=1).add_to(Busline)
folium.PolyLine(All_busline[4], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[5], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[6], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[7], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[8], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[9], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[10], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[14], color="blue", weight=2.5, opacity=1).add_to(Busline)

Busline

### 버스 번호가 겹쳐서 망했다! 하하하하하하

In [95]:
bus_dup = []
for a in range(len(in_busline_num)):
    id_ = RouteStationInfo_latlon[RouteStationInfo_latlon['bus_line_no']== in_busline_num[a][0]]['pr_station_id'].nunique()
    bus_dup.append((in_busline_num[a][0], id_))
    
bus_dup

[('13-1', 1),
 ('3', 1),
 ('2-1', 1),
 ('13', 1),
 ('7-1', 2),
 ('20', 1),
 ('13-5', 1),
 ('35', 1),
 ('202', 1),
 ('13-4', 1),
 ('11', 1),
 ('25', 1),
 ('37', 1),
 ('80', 1),
 ('8', 1),
 ('66', 1),
 ('11-1', 1),
 ('10', 2),
 ('9', 2),
 ('81', 1),
 ('720-3', 1),
 ('38', 2),
 ('22', 1),
 ('27', 2),
 ('4-1', 1),
 ('9-1', 2),
 ('2-2', 1),
 ('116-3', 1),
 ('7', 2),
 ('10-5', 1),
 ('10-2', 1),
 ('15', 1),
 ('5-2', 2),
 ('19', 2),
 ('116-2', 1),
 ('33-1', 1),
 ('2', 1),
 ('721', 1),
 ('8155', 1),
 ('3-1', 1),
 ('712', 1),
 ('17', 2),
 ('150', 1),
 ('330', 1),
 ('1-1', 1),
 ('50', 1),
 ('38-1', 1),
 ('1004', 1),
 ('77', 1),
 ('73-1', 1),
 ('11-5', 1),
 ('18', 1),
 ('203', 1),
 ('23', 1),
 ('1', 1),
 ('340-1', 1),
 ('8-2', 1),
 ('8-1', 1),
 ('6003', 1),
 ('333', 1),
 ('16', 1),
 ('340', 1),
 ('11-3', 1),
 ('201', 1),
 ('1001', 1),
 ('11-2', 2),
 ('21', 2),
 ('710', 1),
 ('6', 1),
 ('26', 1),
 ('24', 2),
 ('50-2', 1),
 ('1000', 1),
 ('27-1', 1),
 ('12', 2),
 ('50-5', 1),
 ('5-4', 1),
 ('8156', 

RouteStationMapping과 TripChain을 merge 하여 얻은 것이 Over3Transfer 파일이다.<br>
따라서 RouteStationInfo_latlon를 이용해 노선을 확인할 때 RouteStationMapping과 매치되는 하나의 노선만 확인하면 된다

#### 9-1 노선이 확인이 안된다 ㅜ
RouteStationInfo 의 pr_station_id와 Mapping Table에서 매칭되는 이비노선 ID가 없음<br>
둘 중 한 곳의 누락인가?!

In [286]:
RouteStationMapping[RouteStationMapping['노선명']=='6-3']

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명
885,경기시내,경진여객,4100800,233000045,41008128,6-3
927,경기시내,금강고속,4100900,240000019,41009145,6-3
1332,경기시내,삼영운수,4102100,208000038,41021037,6-3


In [287]:
RouteStationInfo_latlon[RouteStationInfo_latlon['bus_line_no']=='6-3']['pr_station_id'].unique()

array([233000045, 241205001], dtype=int64)

### 중복된 노선 따로 파악하기 - 환승횟수 100 이상의 중복노선만 확인함
```
RouteStation에서 쓸 pr_station_id

7-1 : 233000056
10 : 233000064
9: 233000025
38 : 233000086
27: 233000096
7 : 233000058	
5-2 : 233000019
19: 233000026
17: 233000044
```

In [134]:
busline_dup = {'7-1':233000056, 
               '10' : 233000064,
               '9': 233000025,
               '38' : 233000086,
               '27': 233000096,
               '7' : 233000058,   
               '5-2' : 233000019,
               '19': 233000026,
               '17': 233000044}
busline_dup.keys()

dict_keys(['7-1', '10', '9', '38', '27', '7', '5-2', '19', '17'])

In [135]:
All_busline = []
bus_num_ = 0
busline_dup = {'7-1':233000056, 
               '10' : 233000064,
               '9': 233000025,
               '38' : 233000086,
               '27': 233000096,
               '7' : 233000058,   
               '5-2' : 233000019,
               '19': 233000026,
               '17': 233000044}
dup_arr = busline_dup.keys()

for j in range(len(in_busline_num)):
    tmp_busline = RouteStationInfo_latlon[RouteStationInfo_latlon['bus_line_no'] == in_busline_num[j][0]]
    if in_busline_num[j][0] in dup_arr:
        tmp_busline = tmp_busline[tmp_busline['pr_station_id'] == busline_dup[in_busline_num[j][0]]]
    tmp_buslineArr = []
    for i in range(len(tmp_busline)):
        tmp = tmp_busline.iloc[i]
        tmp_buslineArr.append((tmp['WGS84위도'], tmp['WGS84경도']))
    All_busline.append(tmp_buslineArr)
    bus_num_ += 1

In [138]:
in_busline_num[0][0]

'13-1'

In [210]:
in_busline_num

[('13-1', 1894),
 ('3', 1768),
 ('2-1', 1277),
 ('13', 1184),
 ('7-1', 1174),
 ('20', 1083),
 ('13-5', 1073),
 ('35', 1069),
 ('202', 999),
 ('13-4', 884),
 ('11', 878),
 ('25', 864),
 ('37', 812),
 ('80', 759),
 ('8', 730),
 ('66', 713),
 ('11-1', 669),
 ('10', 666),
 ('9', 666),
 ('81', 585),
 ('720-3', 534),
 ('38', 513),
 ('22', 474),
 ('27', 458),
 ('4-1', 454),
 ('9-1', 409),
 ('2-2', 349),
 ('116-3', 311),
 ('7', 298),
 ('10-5', 288),
 ('10-2', 275),
 ('15', 256),
 ('5-2', 221),
 ('19', 208),
 ('116-2', 198),
 ('33-1', 198),
 ('2', 194),
 ('721', 187),
 ('8155', 164),
 ('3-1', 155),
 ('712', 154),
 ('17', 127),
 ('150', 114),
 ('330', 109),
 ('1-1', 105),
 ('50', 105),
 ('38-1', 102),
 ('1004', 99),
 ('77', 98),
 ('73-1', 92),
 ('11-5', 90),
 ('18', 77),
 ('203', 76),
 ('23', 73),
 ('1', 65),
 ('340-1', 57),
 ('8-2', 56),
 ('8-1', 56),
 ('6003', 56),
 ('333', 55),
 ('16', 53),
 ('340', 48),
 ('11-3', 48),
 ('201', 46),
 ('1001', 43),
 ('11-2', 42),
 ('21', 42),
 ('710', 40),
 ('

In [240]:
only_bus_num = []
for i in in_busline_num:
    only_bus_num.append(i[0])


In [377]:
only_bus_num

['13-1',
 '3',
 '2-1',
 '13',
 '7-1',
 '20',
 '13-5',
 '35',
 '202',
 '13-4',
 '11',
 '25',
 '37',
 '80',
 '8',
 '66',
 '11-1',
 '10',
 '9',
 '81',
 '720-3',
 '38',
 '22',
 '27',
 '4-1',
 '9-1',
 '2-2',
 '116-3',
 '7',
 '10-5',
 '10-2',
 '15',
 '5-2',
 '19',
 '116-2',
 '33-1',
 '2',
 '721',
 '8155',
 '3-1',
 '712',
 '17',
 '150',
 '330',
 '1-1',
 '50',
 '38-1',
 '1004',
 '77',
 '73-1',
 '11-5',
 '18',
 '203',
 '23',
 '1',
 '340-1',
 '8-2',
 '8-1',
 '6003',
 '333',
 '16',
 '340',
 '11-3',
 '201',
 '1001',
 '11-2',
 '21',
 '710',
 '6',
 '26',
 '24',
 '50-2',
 '1000',
 '27-1',
 '12',
 '50-5',
 '5-4',
 '8156',
 '31-3',
 '1008',
 '6002',
 '36',
 '3-2',
 '31',
 '79',
 '8501',
 '17-1',
 '200',
 '709-1',
 '6001',
 '10-4',
 '6-3',
 '1002',
 '78',
 '2000B',
 '76',
 '1004-1',
 '100',
 '4',
 '33-2',
 '35-2',
 '4403',
 '10-1',
 '31-2',
 '2000A',
 '13-2',
 '22-2',
 '50-1',
 '50-4',
 '330-1',
 '6-1',
 '35-1',
 '11-4',
 '29',
 '39',
 '6-2',
 '12-1']

In [548]:
Busline = folium.Map(location = [37.234300,127.213333])

# for i in range(10):
#     if(len(All_busline[i]) == 0):
#         continue
#     folium.PolyLine(All_busline[i], color="red", weight=2.5, opacity=1).add_to(Busline)



folium.PolyLine(All_busline[0], color="red", weight=2.5, opacity=1).add_to(Busline)
folium.PolyLine(All_busline[1], color="yellow", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[2], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[3], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[4], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[5], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[6], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[7], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[8], color="red", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[9], color="blue", weight=2.5, opacity=1).add_to(Busline)
# folium.PolyLine(All_busline[10], color="red", weight=2.5, opacity=1).add_to(Busline)
Busline

### Q2. 왜 이 버스를 많이 탈까?

#### 1. 13-1 번 버스!

In [143]:
Over3Transfer.sample()

Unnamed: 0.1,Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
14884,34943,900361527465,720-1,51,720-1,,,4950,21,21,...,,,20180703180804,20180703185441,4100463.0,4100464.0,1250,3,1,1


In [149]:
Over3Transfer.loc[:1]

Unnamed: 0.1,Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
0,4,900508818160,7,202,5-1,,,10930,24,24,...,,,20180701204621,20180701213623,4108196.0,4199077.0,1350,3,1,1
1,5,900351733805,13-5,400-4,22,,,16150,49,49,...,,,20180701145032,20180701161318,4108106.0,4102942.0,1450,3,1,1


In [155]:
pd.DataFrame(Over3Transfer.iloc[0]).T

Unnamed: 0.1,Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
0,4,900508818160,7,202,5-1,,,10930,24,24,...,,,20180701204621,20180701213623,4108200.0,4199080.0,1350,3,1,1


In [169]:
p_13_1 = Over3Transfer[Over3Transfer['버스노선ID1']== '13-1']
p_13_1 = pd.concat([p_13_1, (Over3Transfer[Over3Transfer['버스노선ID2']== '13-1'])])
p_13_1 = pd.concat([p_13_1, (Over3Transfer[Over3Transfer['버스노선ID3']== '13-1'])])
p_13_1 = pd.concat([p_13_1, (Over3Transfer[Over3Transfer['버스노선ID4']== '13-1'])])
p_13_1 = pd.concat([p_13_1, (Over3Transfer[Over3Transfer['버스노선ID5']== '13-1'])])
# for i in range(len(Over3Transfer)):
#     tmp = tmp_busline.iloc[i]
#     if (tmp['버스노선ID1'] == '13-1') or (tmp['버스노선ID2'] == '13-1') or (tmp['버스노선ID3'] == '13-1') or (tmp['버스노선ID4'] == '13-1') or (tmp['버스노선ID5'] == '13-1'):
#         p_13_1 = pd.concat([p_13_1, (pd.DataFrame(tmp).T)])

In [198]:
p_13_1.head()

Unnamed: 0.1,Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
16,35,900469165316,13-1,202,720-2,,,12900,39,39,...,,,20180701101431,20180701110143,4108087.0,4111742.0,1350,3,1,1
42,101,900494029206,13-1,9-2,5,,,8180,21,21,...,,,20180701103105,20180701105656,4117050.0,4100118.0,870,3,1,4
83,189,100471846143,13-1,13,88,,,8240,20,20,...,,,20180701214644,20180701221814,4114206.0,4117061.0,870,3,1,4
152,338,900460181046,13-1,62-1,25-2,,,10530,40,40,...,,,20180701105321,20180701114145,4160312.0,4151749.0,1350,3,1,1
181,402,100519610238,13-1,1112,16,,,37740,61,61,...,,,20180701065548,20180701080835,4108218.0,4103151.0,2600,3,1,1


In [222]:
routestation_busline = RouteStationInfo_latlon['bus_line_no'].unique()



In [None]:
### Q3. 얼마나 오래 탈까?

In [None]:
### Q4. 

## 정류소명을 이용해 분류해보자

1. 필요한 데이터만 분류 (위의 pr_station_id를 이용해 필요없는 노선 및 정류소는 delete)

In [233]:
RouteStationInfo_latlon.shape

(7505, 14)

In [230]:
RouteStationInfo_latlon.sample()

Unnamed: 0.1,Unnamed: 0,seq,pr_station_id,bus_line_no,bus_line_no_seq,station_nm,station_id,mobile_no,정류소명,표준정류장ID,WGS84위도,WGS84경도,관할관청,이비카드정류장ID
4301,25556,163907,241317004,19-1,58,반도.모아아파트,233001491,37551.0,반도.모아아파트,233001491.0,37.201917,127.11415,경기도 화성시,4170942.0


In [290]:
filteredRoute = RouteStationInfo_latlon.copy()
tmp_df = RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000056]
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000064]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000025]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000086]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000096]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000058]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000019]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000026]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000044]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000060]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==241323004]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000080]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000081]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==241317009]])
tmp_df = pd.concat([tmp_df, RouteStationInfo_latlon[RouteStationInfo_latlon['pr_station_id']==233000045]])

In [291]:
# 중복되는 값 다 지우기
# Get names of indexes for which column Age has value 30
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '7-1' ].index
 
# Delete these row indexes from dataFrame
filteredRoute.drop(indexNames , inplace=True)


indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '10' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '9' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '38' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '27' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '7' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '5-2' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '19' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '17' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '11-2' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '12' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '21' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '24' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '39' ].index
filteredRoute.drop(indexNames , inplace=True)
indexNames = filteredRoute[ filteredRoute['bus_line_no'] == '6-3' ].index
filteredRoute.drop(indexNames , inplace=True)

In [292]:
filteredRoute = pd.concat([filteredRoute, tmp_df])

In [298]:
filteredRoute.shape

(6879, 14)

### 2-1. 이용자별로 많이 타는 노선이 무엇인지 분류 (TripChain 이용)

#### 수집건수 (==환승횟수) 1 인 사람

In [300]:
TripChain.sample()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID1,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드
454335,900384067254,14,1,9000922,1,1,500.0,,,,...,,,,,,,4119151.0,1250,1,;


In [306]:
TripChain['최초승차역ID'] = TripChain['승차역ID1']

In [368]:
one_transfer = TripChain[TripChain['수집건수']==1]
one_transfer.drop_duplicates(subset =["암호화카드번호", "최초승차일시", "최종하차일시"], 
                     keep = 'first', inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [369]:
one_transfer.shape

(613293, 51)

In [370]:
one_t_merge = pd.merge(one_transfer,RouteStationMapping[['표준노선ID', '이비노선ID', '노선명']], 
                  how = 'left', left_on = '버스노선ID1', 
                        right_on = '표준노선ID')

In [371]:
one_t_merge['버스노선ID1'] = one_t_merge['노선명']
one_t_merge = one_t_merge.dropna(subset=['노선명'])
del one_t_merge['노선명']
del one_t_merge['표준노선ID']
del one_t_merge['이비노선ID']
one_t_merge.head()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID1,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드
0,900079697651,5,1,9000923,1,1,500.0,,,,...,,,,,,,4117269.0,1250,1,;
1,900079698254,32,1,9000923,1,1,500.0,,,,...,,,,,,,4107936.0,1550,1,;
2,900079699257,80,1,9000923,1,1,500.0,,,,...,,,,,,,4116717.0,1250,1,;
3,900079701419,64,1,9000923,1,1,530.0,,,,...,,,,,,,4116848.0,2050,1,;
4,900079701419,65,1,9000923,1,1,530.0,,,,...,,,,,,,4100121.0,2050,1,;


#### 수집건수 (==환승횟수) 2 인 사람

In [316]:
two_transfer = TripChain[TripChain['수집건수']==2]
two_transfer.drop_duplicates(subset =["암호화카드번호", "최초승차일시", "최종하차일시"], 
                     keep = 'first', inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [317]:
two_t_merge_1 = pd.merge(two_transfer,RouteStationMapping[['표준노선ID', '이비노선ID', '노선명']], 
                  how = 'left', left_on = '버스노선ID1', 
                        right_on = '표준노선ID')

In [318]:
two_t_merge_1['버스노선ID1'] = two_t_merge_1['노선명']
two_t_merge_1 = two_t_merge_1.dropna(subset=['노선명'])
del two_t_merge_1['노선명']
del two_t_merge_1['표준노선ID']
del two_t_merge_1['이비노선ID']
two_t_merge_1.head()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID1,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드
0,900079696430,56,2,9000923,1,1,500.0,500.0,,,...,,,,,,4116828.0,4116708.0,1350,2,;
1,900079971595,86,2,9000923,1,1,500.0,500.0,,,...,,,,,,4116704.0,4151749.0,1250,2,;
2,900079974199,44,2,9000923,1,1,530.0,583.0,,,...,,,,,,4100084.0,4170526.0,2050,2,;
4,900079985178,50,2,9000923,1,1,500.0,500.0,,,...,,,,,,4120515.0,4199616.0,1250,2,;
7,900517099631,17,2,9000921,1,1,500.0,500.0,,,...,,,,,,4116808.0,4108222.0,1250,2,;


In [320]:
two_t_merge_2 = pd.merge(two_t_merge_1,RouteStationMapping[['표준노선ID', '이비노선ID', '노선명']], 
                  how = 'left', left_on = '버스노선ID2', 
                        right_on = '표준노선ID')

In [321]:
two_t_merge_2['버스노선ID2'] = two_t_merge_2['노선명']
two_t_merge_2 = two_t_merge_2.dropna(subset=['노선명'])
del two_t_merge_2['노선명']
del two_t_merge_2['표준노선ID']
del two_t_merge_2['이비노선ID']
two_t_merge_2.head()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID1,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드
0,900079696430,56,2,9000923,1,1,500.0,500.0,,,...,,,,,,4116828.0,4116708.0,1350,2,;
1,900079971595,86,2,9000923,1,1,500.0,500.0,,,...,,,,,,4116704.0,4151749.0,1250,2,;
3,900079985178,50,2,9000923,1,1,500.0,500.0,,,...,,,,,,4120515.0,4199616.0,1250,2,;
4,900517099631,17,2,9000921,1,1,500.0,500.0,,,...,,,,,,4116808.0,4108222.0,1250,2,;
5,900030099159,3,2,3101915,1,4,500.0,500.0,,,...,,,,,,4117181.0,4117030.0,870,2,;


#### 환승횟수 1-5까지 모두 합치기

In [372]:
one_t_merge = one_t_merge[['암호화카드번호', '버스노선ID1', '버스노선ID2', '버스노선ID3', '버스노선ID4', '버스노선ID5',
          '총통행거리', '총탑승시간','총소요시간', '승차역ID1', '승차역ID2', '승차역ID3', '승차역ID4',
           '승차역ID5', '하차역ID1', '하차역ID2', '하차역ID3', '하차역ID4', '하차역ID5',
           '최초승차일시', '최종하차일시', '최초승차역ID', '최종하차역ID', '총이용금액', '수집건수',
                         '총이용객수','사용자구분']]
two_t_merge_2 = two_t_merge_2[['암호화카드번호', '버스노선ID1', '버스노선ID2', '버스노선ID3', '버스노선ID4', '버스노선ID5',
          '총통행거리', '총탑승시간','총소요시간', '승차역ID1', '승차역ID2', '승차역ID3', '승차역ID4',
           '승차역ID5', '하차역ID1', '하차역ID2', '하차역ID3', '하차역ID4', '하차역ID5',
           '최초승차일시', '최종하차일시', '최초승차역ID', '최종하차역ID', '총이용금액', '수집건수',
                         '총이용객수','사용자구분']]

In [365]:
Over3Transfer = Over3Transfer.drop(Over3Transfer.columns[0], axis=1)
Over3Transfer.head()

Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,승차역ID1,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
0,900508818160,7,202,5-1,,,10930,24,24,4108196.0,...,,,20180701204621,20180701213623,4108196.0,4199077.0,1350,3,1,1
1,900351733805,13-5,400-4,22,,,16150,49,49,4108106.0,...,,,20180701145032,20180701161318,4108106.0,4102942.0,1450,3,1,1
2,900351733805,11,13-5,13-4,,,11350,24,24,4105311.0,...,,,20180701212521,20180701221059,4105311.0,4117053.0,1350,3,1,1
3,900028085150,62-1,88-1,11-1,,,7140,26,26,4116853.0,...,,,20180701182945,20180701191500,4116853.0,4170093.0,870,3,1,4
4,900028088225,720-2,32,30-1,,,13690,46,46,4170102.0,...,,,20180701141014,20180701150226,4170102.0,4130061.0,1350,3,1,1


In [374]:
TripChain_bus = pd.concat([one_t_merge, two_t_merge_2])
TripChain_bus = pd.concat([TripChain_bus, Over3Transfer])
TripChain_bus.drop_duplicates(subset =["암호화카드번호", "최초승차일시", "최종하차일시"], 
                     keep = 'first', inplace = True)
TripChain_bus.head()

Unnamed: 0,암호화카드번호,버스노선ID1,버스노선ID2,버스노선ID3,버스노선ID4,버스노선ID5,총통행거리,총탑승시간,총소요시간,승차역ID1,...,하차역ID4,하차역ID5,최초승차일시,최종하차일시,최초승차역ID,최종하차역ID,총이용금액,수집건수,총이용객수,사용자구분
0,900079697651,62-1,,,,,1700,3,3,4117280.0,...,,,20180701072156,20180701072520,,4117269.0,1250,1,1,1
1,900079698254,35,,,,,23180,66,66,4199619.0,...,,,20180701123653,20180701134223,,4107936.0,1550,1,1,1
2,900079699257,64,,,,,500,1,1,4108130.0,...,,,20180701224424,20180701224543,,4116717.0,1250,1,1,1
3,900079701419,300,,,,,3240,8,8,4100122.0,...,,,20180701085058,20180701085910,,4116848.0,2050,1,1,1
4,900079701419,300,,,,,2940,8,8,4100098.0,...,,,20180701220922,20180701221723,,4100121.0,2050,1,1,1


#### 1. 일반 (1)

In [385]:
p_1 = TripChain_bus[TripChain_bus['사용자구분']==1]

In [387]:
p_1.shape

(711897, 27)

In [386]:
p_1_hash = {}
for a in only_bus_num:
    p_1_hash[a] = 0

In [397]:
p_1_hash.keys()

dict_keys(['13-1', '3', '2-1', '13', '7-1', '20', '13-5', '35', '202', '13-4', '11', '25', '37', '80', '8', '66', '11-1', '10', '9', '81', '720-3', '38', '22', '27', '4-1', '9-1', '2-2', '116-3', '7', '10-5', '10-2', '15', '5-2', '19', '116-2', '33-1', '2', '721', '8155', '3-1', '712', '17', '150', '330', '1-1', '50', '38-1', '1004', '77', '73-1', '11-5', '18', '203', '23', '1', '340-1', '8-2', '8-1', '6003', '333', '16', '340', '11-3', '201', '1001', '11-2', '21', '710', '6', '26', '24', '50-2', '1000', '27-1', '12', '50-5', '5-4', '8156', '31-3', '1008', '6002', '36', '3-2', '31', '79', '8501', '17-1', '200', '709-1', '6001', '10-4', '6-3', '1002', '78', '2000B', '76', '1004-1', '100', '4', '33-2', '35-2', '4403', '10-1', '31-2', '2000A', '13-2', '22-2', '50-1', '50-4', '330-1', '6-1', '35-1', '11-4', '29', '39', '6-2', '12-1'])

In [394]:
# 버스노선ID = 0
버스노선ID1 = np.concatenate([p_1['버스노선ID1'], p_1['버스노선ID2'], p_1['버스노선ID3'], 
                         p_1['버스노선ID4'], p_1['버스노선ID5']])

In [399]:
for elem in 버스노선ID1:
    if elem in p_1_hash.keys():
        p_1_hash[elem] += 1

In [415]:
sorted_p_1_hash = dict(sorted(p_1_hash.items(), key=operator.itemgetter(1), reverse=True))
sorted_p_1_hash

{'13-1': 31406,
 '3': 25539,
 '7-1': 21390,
 '13-5': 20939,
 '2-1': 20308,
 '202': 17821,
 '8': 16189,
 '20': 14970,
 '35': 13889,
 '720-3': 12520,
 '25': 10680,
 '81': 9598,
 '11-1': 9092,
 '7': 8035,
 '116-3': 7309,
 '13': 5077,
 '116-2': 4978,
 '9': 4083,
 '22': 4070,
 '15': 3864,
 '721': 3788,
 '80': 3605,
 '2-2': 3384,
 '13-4': 3372,
 '11': 3243,
 '66': 3063,
 '37': 2775,
 '5-2': 2774,
 '38': 2642,
 '10': 2444,
 '712': 2364,
 '73-1': 1953,
 '8501': 1850,
 '150': 1837,
 '9-1': 1814,
 '4-1': 1711,
 '27': 1603,
 '203': 1284,
 '2': 1120,
 '10-5': 1087,
 '10-2': 1024,
 '19': 891,
 '50': 818,
 '1001': 718,
 '33-1': 658,
 '201': 657,
 '1': 653,
 '1004': 546,
 '710': 543,
 '6003': 497,
 '16': 470,
 '38-1': 468,
 '8155': 453,
 '26': 425,
 '17': 418,
 '31': 412,
 '27-1': 408,
 '3-1': 354,
 '8-2': 346,
 '11-5': 317,
 '1-1': 311,
 '340': 300,
 '1000': 283,
 '330': 245,
 '24': 226,
 '18': 218,
 '6002': 217,
 '333': 205,
 '76': 179,
 '6': 150,
 '709-1': 148,
 '77': 142,
 '21': 127,
 '340-1': 12

#### 2. 어린이 (2)

In [414]:
p_2 = TripChain_bus[TripChain_bus['사용자구분']==2]
p_2_hash = {}
for a in only_bus_num:
    p_2_hash[a] = 0


버스노선ID2 = np.concatenate([p_2['버스노선ID1'], p_2['버스노선ID2'], p_2['버스노선ID3'], 
                         p_2['버스노선ID4'], p_2['버스노선ID5']])

for elem in 버스노선ID2:
    if elem in p_2_hash.keys():
        p_2_hash[elem] += 1
        
sorted_p_2_hash = dict(sorted(p_2_hash.items(), key=operator.itemgetter(1), reverse=True))
sorted_p_2_hash

{'8': 370,
 '13-1': 317,
 '202': 212,
 '3': 196,
 '7': 188,
 '2-1': 162,
 '13-5': 150,
 '7-1': 102,
 '35': 90,
 '11-1': 77,
 '20': 68,
 '720-3': 53,
 '25': 52,
 '9': 48,
 '81': 47,
 '116-3': 44,
 '22': 39,
 '116-2': 37,
 '15': 30,
 '73-1': 24,
 '721': 21,
 '2-2': 19,
 '13': 16,
 '38': 16,
 '5-2': 15,
 '712': 14,
 '80': 13,
 '19': 12,
 '150': 10,
 '201': 10,
 '13-4': 9,
 '11': 9,
 '66': 9,
 '203': 8,
 '1000': 8,
 '31': 8,
 '50': 7,
 '37': 6,
 '4-1': 6,
 '9-1': 6,
 '2': 6,
 '710': 6,
 '27-1': 6,
 '10-5': 5,
 '10-2': 4,
 '10': 3,
 '33-1': 3,
 '16': 3,
 '27': 2,
 '17': 2,
 '1': 2,
 '340': 2,
 '1001': 2,
 '8501': 2,
 '17-1': 2,
 '3-1': 1,
 '38-1': 1,
 '1004': 1,
 '77': 1,
 '18': 1,
 '26': 1,
 '50-2': 1,
 '36': 1,
 '76': 1,
 '8155': 0,
 '330': 0,
 '1-1': 0,
 '11-5': 0,
 '23': 0,
 '340-1': 0,
 '8-2': 0,
 '8-1': 0,
 '6003': 0,
 '333': 0,
 '11-3': 0,
 '11-2': 0,
 '21': 0,
 '6': 0,
 '24': 0,
 '12': 0,
 '50-5': 0,
 '5-4': 0,
 '8156': 0,
 '31-3': 0,
 '1008': 0,
 '6002': 0,
 '3-2': 0,
 '79': 0,
 '2

#### 3. 청소년 (4)

In [413]:
p_4 = TripChain_bus[TripChain_bus['사용자구분']==4]
p_4_hash = {}
for a in only_bus_num:
    p_4_hash[a] = 0


버스노선ID4 = np.concatenate([p_4['버스노선ID1'], p_4['버스노선ID2'], p_4['버스노선ID3'], 
                         p_4['버스노선ID4'], p_4['버스노선ID5']])

for elem in 버스노선ID4:
    if elem in p_4_hash.keys():
        p_4_hash[elem] += 1
        
sorted_p_4_hash = dict(sorted(p_4_hash.items(), key=operator.itemgetter(1), reverse=True))
sorted_p_4_hash

{'8': 3857,
 '13-1': 3786,
 '202': 2854,
 '3': 2736,
 '2-1': 2078,
 '35': 2056,
 '13-5': 1905,
 '7-1': 1776,
 '25': 1400,
 '7': 1334,
 '20': 1275,
 '11-1': 898,
 '81': 894,
 '720-3': 856,
 '116-3': 687,
 '116-2': 576,
 '9': 476,
 '13': 473,
 '2-2': 436,
 '721': 436,
 '22': 381,
 '11': 378,
 '15': 358,
 '73-1': 349,
 '80': 307,
 '38': 295,
 '13-4': 263,
 '5-2': 214,
 '37': 196,
 '712': 156,
 '201': 151,
 '66': 146,
 '10': 135,
 '1001': 134,
 '9-1': 116,
 '4-1': 114,
 '2': 107,
 '710': 98,
 '10-5': 97,
 '50': 84,
 '27': 72,
 '1': 70,
 '8501': 70,
 '1004': 61,
 '150': 60,
 '203': 60,
 '26': 59,
 '10-2': 58,
 '340': 58,
 '16': 56,
 '19': 54,
 '33-1': 51,
 '1000': 51,
 '38-1': 49,
 '27-1': 38,
 '17': 37,
 '8155': 35,
 '3-1': 25,
 '24': 22,
 '50-2': 22,
 '50-5': 17,
 '18': 16,
 '709-1': 16,
 '76': 14,
 '31': 13,
 '77': 11,
 '330': 10,
 '333': 10,
 '8-2': 9,
 '21': 9,
 '31-3': 9,
 '36': 8,
 '1-1': 6,
 '11-5': 6,
 '340-1': 6,
 '5-4': 6,
 '8156': 6,
 '6002': 6,
 '79': 6,
 '23': 5,
 '6': 4,
 '8-

#### 4. 경로 (8)

In [412]:
p_8 = TripChain_bus[TripChain_bus['사용자구분']==8]
p_8_hash = {}
for a in only_bus_num:
    p_8_hash[a] = 0


버스노선ID8 = np.concatenate([p_8['버스노선ID1'], p_8['버스노선ID2'], p_8['버스노선ID3'], 
                         p_8['버스노선ID4'], p_8['버스노선ID5']])

for elem in 버스노선ID8:
    if elem in p_8_hash.keys():
        p_8_hash[elem] += 1
        
sorted_p_8_hash = dict(sorted(p_8_hash.items(), key=operator.itemgetter(1), reverse=True))
sorted_p_8_hash

{'3': 86,
 '13-1': 78,
 '35': 76,
 '8': 75,
 '202': 73,
 '20': 72,
 '7-1': 64,
 '7': 43,
 '2-1': 37,
 '720-3': 35,
 '81': 34,
 '116-3': 32,
 '13-5': 29,
 '25': 28,
 '712': 27,
 '11-1': 19,
 '73-1': 18,
 '22': 14,
 '116-2': 13,
 '15': 12,
 '721': 12,
 '2-2': 7,
 '80': 6,
 '5-2': 5,
 '203': 4,
 '1001': 4,
 '150': 3,
 '340': 3,
 '26': 3,
 '9': 2,
 '16': 2,
 '201': 2,
 '1000': 2,
 '27-1': 2,
 '76': 2,
 '50': 1,
 '710': 1,
 '31': 1,
 '709-1': 1,
 '13': 0,
 '13-4': 0,
 '11': 0,
 '37': 0,
 '66': 0,
 '10': 0,
 '38': 0,
 '27': 0,
 '4-1': 0,
 '9-1': 0,
 '10-5': 0,
 '10-2': 0,
 '19': 0,
 '33-1': 0,
 '2': 0,
 '8155': 0,
 '3-1': 0,
 '17': 0,
 '330': 0,
 '1-1': 0,
 '38-1': 0,
 '1004': 0,
 '77': 0,
 '11-5': 0,
 '18': 0,
 '23': 0,
 '1': 0,
 '340-1': 0,
 '8-2': 0,
 '8-1': 0,
 '6003': 0,
 '333': 0,
 '11-3': 0,
 '11-2': 0,
 '21': 0,
 '6': 0,
 '24': 0,
 '50-2': 0,
 '12': 0,
 '50-5': 0,
 '5-4': 0,
 '8156': 0,
 '31-3': 0,
 '1008': 0,
 '6002': 0,
 '36': 0,
 '3-2': 0,
 '79': 0,
 '8501': 0,
 '17-1': 0,
 '200':

### 2-2. 각 노선별로 어떤 이용자가 많이 타는지 분류

```
위의 나이대별로 많이 타는 노선 TOP 10 중에서 겹치는 버스 노선 7개
: 13-1, 3, 7-1, 2-1, 202, 8, 35
```

In [406]:
common_busline  = ['13-1', '3', '7-1', '2-1', '202', '8', '35']

1. 13-1

In [417]:
# 일반, 어린이, 청소년, 경로 순서
common1 = [sorted_p_1_hash[common_busline[0]], sorted_p_2_hash[common_busline[0]], 
           sorted_p_4_hash[common_busline[0]],sorted_p_8_hash[common_busline[0]]]
common2 = [sorted_p_1_hash[common_busline[1]], sorted_p_2_hash[common_busline[1]], 
           sorted_p_4_hash[common_busline[1]],sorted_p_8_hash[common_busline[1]]]
common3 = [sorted_p_1_hash[common_busline[2]], sorted_p_2_hash[common_busline[2]], 
           sorted_p_4_hash[common_busline[2]],sorted_p_8_hash[common_busline[2]]]
common4 = [sorted_p_1_hash[common_busline[3]], sorted_p_2_hash[common_busline[3]], 
           sorted_p_4_hash[common_busline[3]],sorted_p_8_hash[common_busline[3]]]
common5 = [sorted_p_1_hash[common_busline[4]], sorted_p_2_hash[common_busline[4]], 
           sorted_p_4_hash[common_busline[5]],sorted_p_8_hash[common_busline[4]]]
common6 = [sorted_p_1_hash[common_busline[5]], sorted_p_2_hash[common_busline[5]], 
           sorted_p_4_hash[common_busline[5]],sorted_p_8_hash[common_busline[5]]]
common7 = [sorted_p_1_hash[common_busline[6]], sorted_p_2_hash[common_busline[6]], 
           sorted_p_4_hash[common_busline[6]],sorted_p_8_hash[common_busline[6]]]

In [418]:
common1

[31406, 317, 3786, 78]

2. 3

In [419]:
common2

[25539, 196, 2736, 86]

3. 7-1

In [420]:
common3

[21390, 102, 1776, 64]

4. 2-1

In [422]:
common4

[20308, 162, 2078, 37]

5. 202

In [423]:
common5

[17821, 212, 3857, 73]

6. 8

In [424]:
common6

[16189, 370, 3857, 75]

7. 35

In [425]:
common7

[13889, 90, 2056, 76]

### 3. 정류소명을 기준으로

In [427]:
filteredRoute.shape

(6879, 14)

In [428]:
RouteStationInfo.shape

(37831, 7)

3-1. 학교 정류장 확인

In [449]:
filteredRoute.sample()

Unnamed: 0.1,Unnamed: 0,seq,pr_station_id,bus_line_no,bus_line_no_seq,station_nm,station_id,mobile_no,정류소명,표준정류장ID,WGS84위도,WGS84경도,관할관청,이비카드정류장ID
1516,8742,90649,233000077,1000,194,상안1.2리,233000756,37048.0,상안1.2리,233000756.0,37.182567,126.706467,경기도 화성시,4102857.0


In [470]:
filteredRoute['school_elem'] = filteredRoute['station_nm'].apply(lambda x: True if '초' in x else False)
filteredRoute['school_mid'] = filteredRoute['station_nm'].apply(lambda x: True if '중' in x else False)
filteredRoute['school_high'] = filteredRoute['station_nm'].apply(lambda x: True if '고' in x else False)
# [True for ele in filteredRoute.loc[:] if '초' in ele['station_nm']]
# df['equal_or_lower_than_4?'] = df['set_of_numbers'].apply(lambda x: 'True' if x <= 4 else 'False')

In [529]:
filteredRoute = filteredRoute.drop('school', axis = 1)

In [480]:
school_route = pd.concat([filteredRoute[filteredRoute['school_elem'] == True], 
                                         filteredRoute[filteredRoute['school_mid'] == True]])
school_route = pd.concat([school_route, 
                                         filteredRoute[filteredRoute['school_high'] == True]])
school_route

Unnamed: 0.1,Unnamed: 0,seq,pr_station_id,bus_line_no,bus_line_no_seq,station_nm,station_id,mobile_no,정류소명,표준정류장ID,WGS84위도,WGS84경도,관할관청,이비카드정류장ID,school,school_elem,school_mid,school_high
166,6962,88310,233000005,330,62,비봉초등학교후문,233000722,37171.0,비봉초등학교후문,233000722.0,37.236217,126.868233,경기도 화성시,4102910.0,True,True,False,False
180,7027,88375,233000005,330,127,비봉초등학교후문.삼화3리,233000777,37170.0,비봉초등학교후문.삼화3리,233000777.0,37.236317,126.868150,경기도 화성시,4102909.0,True,True,False,False
264,7121,88581,233000010,33-1,35,발안초등학교,233000632,37464.0,발안초등학교,233000632.0,37.131033,126.912167,경기도 화성시,4170805.0,True,True,False,False
284,7152,88612,233000010,33-1,66,화성중앙병원.발안초등학교,233001026,37269.0,화성중앙병원.발안초등학교,233001026.0,37.131183,126.911500,경기도 화성시,4130354.0,True,True,True,False
457,7348,89123,233000022,15,29,장안초등학교,233001366,37389.0,장안초등학교,233001366.0,37.084817,126.823500,경기도 화성시,4170696.0,True,True,False,False
474,7371,89146,233000024,14,21,장안초등학교,233001366,37389.0,장안초등학교,233001366.0,37.084817,126.823500,경기도 화성시,4170696.0,True,True,False,False
527,7434,89209,233000027,3-1,12,우정초등학교,233001160,36788.0,우정초등학교,233001160.0,37.080700,126.801667,경기도 화성시,4170324.0,True,True,False,False
541,7454,89229,233000028,3,15,우정초등학교,233001160,36788.0,우정초등학교,233001160.0,37.080700,126.801667,경기도 화성시,4170324.0,True,True,False,False
618,7551,89458,233000035,2-2,21,석천초등학교,233001370,36952.0,석천초등학교,233001370.0,37.054833,126.789733,경기도 화성시,4170660.0,True,True,False,False
704,7717,89624,233000040,6,30,석포초등학교,233000446,37442.0,석포초등학교,233000446.0,37.130017,126.840117,경기도 화성시,4170787.0,True,True,False,False


In [482]:
school_line_arr = school_route['bus_line_no'].unique()

In [484]:
school_hash = {}
for a in school_line_arr:
    school_hash[a] = 0


# 버스노선ID8 = np.concatenate([p_8['버스노선ID1'], p_8['버스노선ID2'], p_8['버스노선ID3'], 
#                          p_8['버스노선ID4'], p_8['버스노선ID5']])

for i in range(len(school_route)):
    tmp = school_route.iloc[i]
    if tmp['school_elem'] == True:
        school_hash[tmp['bus_line_no']] += 1
    if tmp['school_mid'] == True:
        school_hash[tmp['bus_line_no']] += 1
    if tmp['school_high'] == True:
        school_hash[tmp['bus_line_no']] += 1
    
sorted_school_hash = dict(sorted(school_hash.items(), key=operator.itemgetter(1), reverse=True))
sorted_school_hash

{'1001': 59,
 '81': 34,
 '22': 31,
 '13-2': 29,
 '19-1': 28,
 '31-1': 28,
 '11-1': 27,
 '340-1': 26,
 '712': 26,
 '9-1': 26,
 '31-3': 26,
 '37': 26,
 '1000': 25,
 '21': 25,
 '50': 24,
 '13-4A': 24,
 '27-2': 23,
 '13-4': 23,
 '1008': 22,
 '26': 21,
 '3-2': 21,
 '79': 21,
 'H8': 20,
 '33-1': 19,
 '340': 19,
 '33-2': 19,
 '13-7': 19,
 '330': 18,
 '12-1': 18,
 '73-1': 17,
 '710': 17,
 '13-5': 17,
 '8501': 16,
 '201': 16,
 '80': 16,
 '13-6': 16,
 '11-3': 15,
 '31': 15,
 '31-2': 15,
 '721': 14,
 '22-3(신경대)': 14,
 '13-3': 14,
 '77': 13,
 '78': 13,
 'H5': 13,
 '13-1': 13,
 '720-3': 13,
 '19-3': 12,
 '22-2': 12,
 '22-3(장전)': 12,
 '38': 12,
 '1002': 12,
 '50-1': 12,
 '150': 11,
 '20-3(왕모대시청)': 11,
 '8156(급행)': 10,
 '205': 10,
 '6004': 10,
 'H2': 10,
 '50-2': 10,
 '10-6': 10,
 '10-7': 10,
 '10-2': 10,
 '10-1': 10,
 '10-3': 10,
 '10-4': 10,
 '10-5': 10,
 '10-8': 10,
 '7': 10,
 '4': 10,
 '4-1': 10,
 '20': 10,
 '28': 10,
 '공영2(송정대광A)': 10,
 '20-3(쌍쏭.대광A)': 10,
 '8-3': 9,
 '116-2': 9,
 '17': 9,
 '200

In [500]:
sorted_school_hash['11-1']

27

In [490]:
school_line_arr

array(['330', '33-1', '15', '14', '3-1', '3', '2-2', '6', '11', '6-2',
       '8', '8-3', '1', '1-1', '11-5', '340', '1000', '330-1', '23', '26',
       '340-1', '1008', '33-2', '712', '73-1', '1001', '8501', '6002',
       '8155', '201', '6002-1', '710', '150', '200', '8156(급행)', '9802',
       '116-5', '205', '6004', '721', '116-2', '9-1', '31-3', '3-2', '50',
       '76', '77', '78', '79', '80', '81', '333', '709', 'H4', 'H5',
       'H6(A)', 'H6(B)', '19-1', '19-2', '29', '19-3', 'H3', 'H2',
       '20(마산포)', '20(어도동내)', '20(어도펜션)', '공영1(사곳.궁평)', '공영1(매화.궁평)',
       '공영1(궁평.사곳)', '공영2(금당2리)', '공영3(청원.삼존)', '공영3(삼존.청원)', '공영3(금당2리)',
       '공영4(마산.고포)', '공영4(형도.고포)', '공영4(솔미.구렛)', '20-1(홍법리)',
       '20-3(왕모대시청)', '20-3(당성왕모대)', '공영3(솔미.구렛)', '20-4A', '20-4B',
       '11-3', '13', '27-1', '27-2', '17-1', '50-2', '5-1A', '22-2', '22',
       '22-3(장전)', '22-3(신경대)', 'H8', '10-6', '10-7', '10-2', '10-1',
       '10-3', '10-4', '10-5', '39-2', '37', '13-3', '13-2', '13-1',
       '1

3-2. 마트 정류장

In [531]:
filteredRoute['mart'] = filteredRoute['station_nm'].apply(lambda x: True if '마트' in x else False)

In [533]:
filteredRoute[filteredRoute['mart']==True]['이비카드정류장ID'].unique()

array([4130356., 4130357., 4130120., 4199077., 4199456., 4170419.,
       4170493., 4130229., 4116672.])

In [537]:
m_arr = filteredRoute[filteredRoute['mart']==True]['bus_line_no'].unique()

In [547]:
'720-3' in m_arr

True

In [522]:
TripChain_bus.shape

(803295, 27)

In [524]:
TripChain_bus['최초승차역ID']=TripChain_bus['승차역ID1']

In [517]:
StationTable[StationTable['이비카드정류장ID'] != np.nan]

Unnamed: 0.1,Unnamed: 0,표준정류장ID,시군명,정류소명,정류소영문명,정류소번호,중앙차로여부,관할관청,위치,WGS84위도,WGS84경도,모바일정류장ID,이비카드정류장ID,station_id
0,0,228003422,용인시,손골마을회관.국제학교,"Songol Community Center,",56443.0,노변정류장,경기도 용인시,,37.342517,127.066817,56443,,228003422
1,1,228003423,용인시,풀잎사랑,Pulipsarang,56444.0,노변정류장,경기도 용인시,,37.341800,127.068983,56444,,228003423
2,2,228003424,용인시,풀잎사랑,Pulipsarang,56445.0,노변정류장,경기도 용인시,,37.341817,127.069083,56445,,228003424
3,3,228003425,용인시,대성공정,Daesung Process,56446.0,노변정류장,경기도 용인시,,37.339350,127.073067,56446,,228003425
4,4,228003426,용인시,대성공정,Daesung Process,56447.0,노변정류장,경기도 용인시,,37.339183,127.073400,56447,,228003426
5,5,228003427,용인시,지플러스.벨몽테,Gieungsa,56448.0,노변정류장,경기도 용인시,,37.338267,127.075033,56448,,228003427
6,6,228003428,용인시,지플러스.벨몽테,Gieungsa,56449.0,노변정류장,경기도 용인시,,37.338333,127.075067,56449,,228003428
7,7,228003429,용인시,동천동용인한빛중학교,Dongcheon-dong Yongin Hanbit Middle School,56450.0,노변정류장,경기도 용인시,,37.337900,127.084967,56450,,228003429
8,8,228003443,용인시,신봉센트레빌6차,Sinbong Centreville 6-cha,56464.0,노변정류장,경기도 용인시,,37.331733,127.064617,56464,,228003443
9,9,228003447,용인시,래미안이스트팰리스4단지,Raemian East Palace 4-danji,56468.0,노변정류장,경기도 용인시,,37.336250,127.086833,56468,,228003447


---

### 이용객 수 다시 전처리

In [3]:
TripChain = pd.read_csv('../Data/PJT001_TripChain.csv')

In [4]:
TripChain.head()

Unnamed: 0,암호화카드번호||','||트랜잭션ID||','||환승횟수||','||교통카드발행사ID||','||총이용객수||','||사용자구분||','||교통수단CD1||','||교통수단CD2||','||교통수단CD3||','||교통수단CD4||',...,'||하차역ID1||','||하차역ID2||','||하차역ID3||','||하차역ID4||','||하차역ID5||','||최초승차역ID||','||최종하차역ID||','||총이용금액||','||수집건수||','||트립체인완료코드
0,900079696430,56,2,9000923,1,1,500.0,500.0,,,...,,,,,,,4116708.0,1350,2,;
1,900079697651,5,1,9000923,1,1,500.0,,,,...,,,,,,,4117269.0,1250,1,;
2,900079698254,32,1,9000923,1,1,500.0,,,,...,,,,,,,4107936.0,1550,1,;
3,900079699257,80,1,9000923,1,1,500.0,,,,...,,,,,,,4116717.0,1250,1,;
4,900079701419,64,1,9000923,1,1,530.0,,,,...,,,,,,,4116848.0,2050,1,;


In [5]:
TripChain.columns=['암호화카드번호', '트랜잭션ID', '환승횟수', '교통카드발행사ID',
       '총이용객수', '사용자구분', '교통수단CD1', '교통수단CD2',
       '교통수단CD3', '교통수단CD4', '교통수단CD5', '버스노선ID1',
       '버스노선ID2', '버스노선ID3', '버스노선ID4', '버스노선ID5',
       '차량ID1', '차량ID2', '차량ID3', '차량ID4',
       '차량ID5', '총통행거리', '총탑승시간', '총소요시간',
       '승차일시1', '승차일시2', '승차일시3', '승차일시4',
       '승차일시5', '하차일시1', '하차일시2', '하차일시3',
       '하차일시4', '하차일시5', '최초승차일시', '최종하차일시',
       '승차역ID1', '승차역ID2', '승차역ID3', '승차역ID4',
       '승차역ID5', '하차역ID1', '하차역ID2', '하차역ID3',
       '하차역ID4', '하차역ID5', '최초승차역ID', '최종하차역ID',
       '총이용금액', '수집건수', '트립체인완료코드']

In [6]:
TripChain['최초승차역ID'] = TripChain['승차역ID1']

In [7]:
TripChain.drop_duplicates(subset =["암호화카드번호", "최초승차일시", "최종하차일시"], 
                     keep = 'first', inplace = True)

In [8]:
TripChain.sample()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID1,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드
46390,900515420833,30,1,3101915,1,4,500.0,,,,...,,,,,,4170093.0,4116825.0,870,1,;


In [114]:
RouteStation_latlon = pd.read_csv('RouteStationInfo_latlon.csv')

In [115]:
RouteStationMapping = pd.read_csv('../Data/PJT001_routestationmapping.csv')

In [116]:
All_bus_id = RouteStation_latlon['pr_station_id'].unique()
# 화성시 bus id만 저장

In [117]:
All_bus_id = list(All_bus_id)
All_bus_id

[241317009,
 241483001,
 241319005,
 241319032,
 241319029,
 241319002,
 241319009,
 241319010,
 241483002,
 241319043,
 233000134,
 241319044,
 241335026,
 241483017,
 233000010,
 233000013,
 233000015,
 233000019,
 233000021,
 233000022,
 233000024,
 233000025,
 233000026,
 233000027,
 233000028,
 233000029,
 233000035,
 233000034,
 233000040,
 233000041,
 233000043,
 233000044,
 233000046,
 233000047,
 233000048,
 233000051,
 233000052,
 233000056,
 233000057,
 233000058,
 233000060,
 233000062,
 233000064,
 233000264,
 241205014,
 241205017,
 233000080,
 241485006,
 241485018,
 241485021,
 233000128,
 241205022,
 241324001,
 233000268,
 233000079,
 233000081,
 233000085,
 241205008,
 241205012,
 241485015,
 241485016,
 241485052,
 241205016,
 241485038,
 233000095,
 233000126,
 233000130,
 241205018,
 241324002,
 241205023,
 233000252,
 241323001,
 233000141,
 241317011,
 241485041,
 241483004,
 241317002,
 233000267,
 241205003,
 241485024,
 241485045,
 233000275,
 233000281,
 241

In [124]:
len(All_bus_id)

137

In [125]:
RouteStation_latlon = RouteStation_latlon.sort_values('pr_station_id', ascending=True)
RouteStation_latlon.shape

(2216, 13)

In [126]:
TripChain['isHwasung'] = 0

In [121]:
new_TripChain = TripChain.copy()

In [122]:
RouteStationMapping.shape

(2127, 6)

In [123]:
RouteStationMapping.sample()

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명
894,경기시내,경진여객,4100800,233000058,41008140,7


In [127]:
RouteStationMapping['isHwasung'] = RouteStationMapping['이비노선ID'].apply(lambda x: True if x in All_bus_id else False)

In [128]:
Route_Mapping = RouteStationMapping[RouteStationMapping['isHwasung'] == True]
Route_Mapping.head()

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명,isHwasung
862,경기시내,경진여객,4100800,233000010,41008001,33-1,True
863,경기시내,경진여객,4100800,233000013,41008101,2,True
864,경기시내,경진여객,4100800,233000015,41008103,4,True
865,경기시내,경진여객,4100800,233000019,41008110,5-2,True
866,경기시내,경진여객,4100800,233000021,41008112,5-4,True


In [129]:
Route_Mapping.shape

(41, 7)

In [130]:
Route_Mapping['p_1'] = Route_Mapping['isHwasung'].apply(lambda x: 0 if x is True else False)
Route_Mapping['p_2'] = Route_Mapping['isHwasung'].apply(lambda x: 0 if x is True else False)
Route_Mapping['p_4'] = Route_Mapping['isHwasung'].apply(lambda x: 0 if x is True else False)
Route_Mapping['p_8'] = Route_Mapping['isHwasung'].apply(lambda x: 0 if x is True else False)
Route_Mapping.drop_duplicates(subset =["이비노선ID", "표준노선ID"], 
                     keep = 'first', inplace = True)
Route_Mapping.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



(41, 8)

In [131]:
Route_Mapping

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명,isHwasung,p_1
862,경기시내,경진여객,4100800,233000010,41008001,33-1,True,0
863,경기시내,경진여객,4100800,233000013,41008101,2,True,0
864,경기시내,경진여객,4100800,233000015,41008103,4,True,0
865,경기시내,경진여객,4100800,233000019,41008110,5-2,True,0
866,경기시내,경진여객,4100800,233000021,41008112,5-4,True,0
867,경기시내,경진여객,4100800,233000022,41008115,15,True,0
868,경기시내,경진여객,4100800,233000024,41008114,14,True,0
869,경기시내,경진여객,4100800,233000025,41008105,9,True,0
870,경기시내,경진여객,4100800,233000026,41008117,19,True,0
871,경기시내,경진여객,4100800,233000027,41008108,3-1,True,0


In [132]:
normal_id_h = Route_Mapping['표준노선ID'].unique()

In [133]:
normal_id_h

array([41008001, 41008101, 41008103, 41008110, 41008112, 41008115,
       41008114, 41008105, 41008117, 41008108, 41008102, 41008107,
       41008118, 41008119, 41008124, 41008125, 41008126, 41008127,
       41008129, 41008130, 41008131, 41008133, 41008134, 41008138,
       41008139, 41008140, 41008142, 41008144, 41008137, 41008145,
       41008146, 41008147, 41008150, 41008156, 41008161, 41061022,
       41075031, 41075017, 41075030, 41075035, 41086009], dtype=int64)

### 해당 노선을 탄 p_1 (일반인)

In [191]:
new_TripChain.drop_

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드,isHwasung
17619,900504225088,64,2,9000927,1,1,500.0,500.0,,,...,,,,,4160308.0,4117044.0,1250,2,;,0


In [134]:
new_TripChain_1 = new_TripChain[new_TripChain['사용자구분']==1]
new_TripChain_1.shape

(943298, 52)

In [109]:
41002045.0 == 41002045

True

In [None]:
# '암호화카드번호', '트랜잭션ID', '환승횟수', '교통카드발행사ID',
#        '총이용객수', '사용자구분', '교통수단CD1', '교통수단CD2',
#        '교통수단CD3', '교통수단CD4', '교통수단CD5', '버스노선ID1',
#        '버스노선ID2', '버스노선ID3', '버스노선ID4', '버스노선ID5',
#        '차량ID1', '차량ID2', '차량ID3', '차량ID4',
#        '차량ID5', '총통행거리', '총탑승시간', '총소요시간',
#        '승차일시1', '승차일시2', '승차일시3', '승차일시4',
#        '승차일시5', '하차일시1', '하차일시2', '하차일시3',
#        '하차일시4', '하차일시5', '최초승차일시', '최종하차일시',
#        '승차역ID1', '승차역ID2', '승차역ID3', '승차역ID4',
#        '승차역ID5', '하차역ID1', '하차역ID2', '하차역ID3',
#        '하차역ID4', '하차역ID5', '최초승차역ID', '최종하차역ID',
#        '총이용금액', '수집건수', '트립체인완료코드' 

In [193]:
Route_Mapping

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명,isHwasung,p_1,p_2,p_3,p_4,p_5
862,경기시내,경진여객,4100800,233000010,41008001,33-1,True,0,0,0,0,0
863,경기시내,경진여객,4100800,233000013,41008101,2,True,0,0,0,0,0
864,경기시내,경진여객,4100800,233000015,41008103,4,True,0,0,0,0,0
865,경기시내,경진여객,4100800,233000019,41008110,5-2,True,0,0,0,0,0
866,경기시내,경진여객,4100800,233000021,41008112,5-4,True,0,0,0,0,0
867,경기시내,경진여객,4100800,233000022,41008115,15,True,0,0,0,0,0
868,경기시내,경진여객,4100800,233000024,41008114,14,True,0,0,0,0,0
869,경기시내,경진여객,4100800,233000025,41008105,9,True,0,0,0,0,0
870,경기시내,경진여객,4100800,233000026,41008117,19,True,0,0,0,0,0
871,경기시내,경진여객,4100800,233000027,41008108,3-1,True,0,0,0,0,0


In [195]:
Route_Mapping['p_1'] = 0
Route_Mapping['p_2'] = 0
Route_Mapping['p_3'] = 0
Route_Mapping['p_4'] = 0
Route_Mapping['p_5'] = 0
Route_Mapping



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명,isHwasung,p_1,p_2,p_3,p_4,p_5
862,경기시내,경진여객,4100800,233000010,41008001,33-1,True,0,0,0,0,0
863,경기시내,경진여객,4100800,233000013,41008101,2,True,0,0,0,0,0
864,경기시내,경진여객,4100800,233000015,41008103,4,True,0,0,0,0,0
865,경기시내,경진여객,4100800,233000019,41008110,5-2,True,0,0,0,0,0
866,경기시내,경진여객,4100800,233000021,41008112,5-4,True,0,0,0,0,0
867,경기시내,경진여객,4100800,233000022,41008115,15,True,0,0,0,0,0
868,경기시내,경진여객,4100800,233000024,41008114,14,True,0,0,0,0,0
869,경기시내,경진여객,4100800,233000025,41008105,9,True,0,0,0,0,0
870,경기시내,경진여객,4100800,233000026,41008117,19,True,0,0,0,0,0
871,경기시내,경진여객,4100800,233000027,41008108,3-1,True,0,0,0,0,0


In [196]:
p_1_id1 = new_TripChain_1['버스노선ID1']

In [197]:
for i in p_1_id1:
    if i in normal_id_h:
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == i, 'p_1'] += 1
Route_Mapping

Unnamed: 0,구분,운수사명,운수사ID,이비노선ID,표준노선ID,노선명,isHwasung,p_1,p_2,p_3,p_4,p_5
862,경기시내,경진여객,4100800,233000010,41008001,33-1,True,83,0,0,0,0
863,경기시내,경진여객,4100800,233000013,41008101,2,True,2,0,0,0,0
864,경기시내,경진여객,4100800,233000015,41008103,4,True,4,0,0,0,0
865,경기시내,경진여객,4100800,233000019,41008110,5-2,True,2,0,0,0,0
866,경기시내,경진여객,4100800,233000021,41008112,5-4,True,1,0,0,0,0
867,경기시내,경진여객,4100800,233000022,41008115,15,True,0,0,0,0,0
868,경기시내,경진여객,4100800,233000024,41008114,14,True,0,0,0,0,0
869,경기시내,경진여객,4100800,233000025,41008105,9,True,3,0,0,0,0
870,경기시내,경진여객,4100800,233000026,41008117,19,True,2,0,0,0,0
871,경기시내,경진여객,4100800,233000027,41008108,3-1,True,0,0,0,0,0


In [149]:
new_TripChain_1['버스노선ID1'] = new_TripChain_1['버스노선ID1'].fillna(0)
new_TripChain_1['버스노선ID2'] = new_TripChain_1['버스노선ID2'].fillna(0)
new_TripChain_1['버스노선ID3'] = new_TripChain_1['버스노선ID3'].fillna(0)
new_TripChain_1['버스노선ID4'] = new_TripChain_1['버스노선ID4'].fillna(0)
new_TripChain_1['버스노선ID5'] = new_TripChain_1['버스노선ID5'].fillna(0)

# hash_p_1 = {}
# tmp
for index, row in new_TripChain_1.iterrows():
    a = row['버스노선ID1']
    b = row['버스노선ID2']
    c = row['버스노선ID3']
    d = row['버스노선ID4']
    e = row['버스노선ID5']
    if a in normal_id_h:
#         Route_Mapping.set_value(Route_Mapping.index[Route_Mapping['표준노선ID'] == a], 'p_1', Route_Mapping[Route_Mapping['p_1']] + 1)
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == a, 'p_1'] += 1
    if b in normal_id_h:
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == b, 'p_1'] += 1
    if c in normal_id_h:
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == c, 'p_1'] += 1
    if d in normal_id_h:
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == d, 'p_1'] += 1
    if e in normal_id_h:
        Route_Mapping.loc[Route_Mapping['표준노선ID'] == e, 'p_1'] += 1
    
#     if b in hash_p_1:
#         hash_p_1[b] = hash_p_1[b] + 1
#     else:
#         hash_p_1[b] = 1
    
#     if c in hash_p_1:
#         hash_p_1[c] = hash_p_1[c] + 1
#     else:
#         hash_p_1[c] = 1
    
#     if d in hash_p_1:
#         hash_p_1[d] = hash_p_1[d] + 1
#     else:
#         hash_p_1[d] = 1
    
#     if e in hash_p_1:
#         hash_p_1[e] = hash_p_1[e] + 1
#     else:
#         hash_p_1[e] = 1
Route_Mapping

# Route_Mapping['p_1'].unique()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

KeyboardInterrupt: 

In [95]:
# Route_Mapping

In [25]:
students = [ ('jack', 34, 'Sydeny' , 'Australia') ,
             ('Riti', 30, 'Delhi' , 'India' ) ,
             ('Vikas', 31, 'Mumbai' , 'India' ) ,
             ('Neelu', 32, 'Bangalore' , 'India' ) ,
             ('John', 16, 'New York' , 'US') ,
             ('Mike', 17, 'las vegas' , 'US')  ]
 
 
#Create a DataFrame object
df_s = pd.DataFrame(students, columns = ['Name' , 'Age', 'City' , 'Country'], index=['a', 'b', 'c' , 'd' , 'e' , 'f']) 

In [26]:
t = 17
df_s.loc[df_s['Age']==t, 'City'] = 18
df_s.loc[df_s['Age']==t, 'City'] += 18
df_s

Unnamed: 0,Name,Age,City,Country
a,jack,34,Sydeny,Australia
b,Riti,30,Delhi,India
c,Vikas,31,Mumbai,India
d,Neelu,32,Bangalore,India
e,John,16,New York,US
f,Mike,17,36,US


In [151]:
Trip5 = TripChain[TripChain['환승횟수']==5]

Trip5.head()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드,isHwasung
170,900028079354,80,5,3101915,1,1,202.0,500.0,500.0,500.0,...,,,4103755.0,1817.0,1817.0,1817.0,3050,5,;,0
641,900480627813,9,5,3101915,1,1,500.0,500.0,209.0,533.0,...,,,4113480.0,4114880.0,4116891.0,4114880.0,2900,5,;,0
1027,900517935036,97,5,3101915,1,4,582.0,533.0,500.0,500.0,...,,,4100606.0,4100605.0,4103534.0,4100605.0,1760,5,;,0
1153,900097650068,8,5,9000923,1,1,500.0,533.0,500.0,500.0,...,,,4100126.0,4116659.0,4103271.0,4116659.0,2900,5,;,0
1324,900524156222,9,5,9000921,1,1,582.0,500.0,500.0,500.0,...,,,4170093.0,1708.0,4101464.0,1708.0,1750,5,;,0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [170]:
Trip5.loc[170, 'isHwasung'] = 0
Trip5.head()

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드,isHwasung
170,900028079354,80,5,3101915,1,1,202.0,500.0,500.0,500.0,...,,,4103755.0,1817.0,1817.0,1817.0,3050,5,;,0
641,900480627813,9,5,3101915,1,1,500.0,500.0,209.0,533.0,...,,,4113480.0,4114880.0,4116891.0,4114880.0,2900,5,;,0
1027,900517935036,97,5,3101915,1,4,582.0,533.0,500.0,500.0,...,,,4100606.0,4100605.0,4103534.0,4100605.0,1760,5,;,0
1153,900097650068,8,5,9000923,1,1,500.0,533.0,500.0,500.0,...,,,4100126.0,4116659.0,4103271.0,4116659.0,2900,5,;,0
1324,900524156222,9,5,9000921,1,1,582.0,500.0,500.0,500.0,...,,,4170093.0,1708.0,4101464.0,1708.0,1750,5,;,0


In [179]:
Trip5['isHwasung'] = 0
for index, row in n.iterrows():
    a = row['버스노선ID1']
    b = row['버스노선ID2']
    c = row['버스노선ID3']
    d = row['버스노선ID4']
    e = row['버스노선ID5']

    if a in normal_id_h:
        Trip5.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['암호화카드번호'] == a, 'isHwasung'] += 1
    if b in normal_id_h:
        Trip5.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID2'] == b, 'isHwasung'] += 1
    if c in normal_id_h:
        Trip5.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID3'] == c, 'isHwasung'] += 1
    if d in normal_id_h:
        Trip5.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID4'] == d, 'isHwasung'] += 1
#     if e in normal_id_h:
#         Trip5.loc[Trip5['버스노선ID5'] == e, 'isHwasung'] += 1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [180]:
Trip5[Trip5['isHwasung']!=0]

Unnamed: 0,암호화카드번호,트랜잭션ID,환승횟수,교통카드발행사ID,총이용객수,사용자구분,교통수단CD1,교통수단CD2,교통수단CD3,교통수단CD4,...,하차역ID2,하차역ID3,하차역ID4,하차역ID5,최초승차역ID,최종하차역ID,총이용금액,수집건수,트립체인완료코드,isHwasung
3242,900200738804,48,5,9000901,1,1,500.0,202.0,120.0,120.0,...,,,60031.0,2750.0,4170347.0,2750.0,2150,5,;,1
32346,900400164539,82,5,9000923,1,1,500.0,500.0,500.0,500.0,...,,,4107928.0,4100140.0,4117239.0,4100140.0,1650,5,;,1
49556,900015115963,74,5,3101000,1,1,105.0,533.0,500.0,500.0,...,,,4130511.0,8001794.0,9009651.0,8001794.0,3700,5,;,1
73461,900385921634,16,5,9000931,1,1,511.0,500.0,500.0,500.0,...,,,4107933.0,4108059.0,4131043.0,4108059.0,1450,5,;,1
75104,900498748981,22,5,3101001,1,1,511.0,500.0,500.0,500.0,...,,,4116882.0,4108220.0,9115604.0,4108220.0,1350,5,;,1
92726,900515625518,5,5,9000931,1,1,500.0,500.0,530.0,500.0,...,,,4116833.0,4113779.0,4117239.0,4113779.0,2050,5,;,2
97275,100036373328,32,5,3104000,1,1,500.0,500.0,500.0,202.0,...,,,1865.0,4196112.0,4117239.0,4196112.0,1550,5,;,1
104882,900512893924,43,5,9000928,1,1,500.0,202.0,500.0,500.0,...,,,4102820.0,1716.0,4170984.0,1716.0,1650,5,;,1
119043,900390974149,38,5,9000921,1,1,500.0,202.0,500.0,500.0,...,,,4170943.0,4130080.0,4100663.0,4130080.0,2050,5,;,1
142344,900526065467,11,5,9000901,1,1,500.0,500.0,500.0,500.0,...,,,4196064.0,4196067.0,4123658.0,4196067.0,1750,5,;,1


In [176]:
Trip5['isHwasung'].unique()

array([0, 1], dtype=int64)

In [181]:
Trip5.loc[198712]

암호화카드번호         900521875020
트랜잭션ID                    74
환승횟수                       5
교통카드발행사ID            9000901
총이용객수                      1
사용자구분                      1
교통수단CD1                  500
교통수단CD2                  500
교통수단CD3                  500
교통수단CD4                  570
교통수단CD5                  205
버스노선ID1           4.1075e+07
버스노선ID2           4.1075e+07
버스노선ID3          4.10311e+07
버스노선ID4          4.10039e+07
버스노선ID5                  NaN
차량ID1            1.41766e+08
차량ID2            1.41766e+08
차량ID3            1.41702e+08
차량ID4            1.41773e+08
차량ID5                  1e+09
총통행거리                  50810
총탑승시간                    107
총소요시간                    107
승차일시1            2.01807e+13
승차일시2            2.01807e+13
승차일시3            2.01807e+13
승차일시4            2.01807e+13
승차일시5            2.01807e+13
하차일시1            2.01807e+13
하차일시2            2.01807e+13
하차일시3            2.01807e+13
하차일시4            2.01807e+13
하차일시5            2.01807e+13
최초승차일시        

In [184]:
TripChain.drop_duplicates(subset =["암호화카드번호", "최초승차일시", "최종하차일시"], 
                     keep = 'first', inplace = True)

In [185]:
sum(TripChain['환승횟수']) / (TripChain.shape[0])

1.5269770879526976

In [186]:
tmp = TripChain[TripChain['환승횟수'] !=0]
Trip12 = tmp[tmp['환승횟수']<=2]

In [None]:
Trip12.shape()

In [190]:
Trip12['isHwasung'] = 0
for index, row in Trip12.iterrows():
    a = row['버스노선ID1']
    b = row['버스노선ID2']
    c = row['버스노선ID3']
    d = row['버스노선ID4']
    e = row['버스노선ID5']

    if a in normal_id_h:
        Trip12.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['암호화카드번호'] == a, 'isHwasung'] += 1
    if b in normal_id_h:
        Trip12.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID2'] == b, 'isHwasung'] += 1
    if c in normal_id_h:
        Trip12.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID3'] == c, 'isHwasung'] += 1
    if d in normal_id_h:
        Trip12.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID4'] == d, 'isHwasung'] += 1
    if e in normal_id_h:
        Trip12.loc[index, 'isHwasung'] += 1
#         Trip5.loc[Trip5['버스노선ID5'] == e, 'isHwasung'] += 1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



KeyboardInterrupt: 

In [187]:
Trip12['isHwasung'].unique()

TypeError: 'tuple' object is not callable

In [150]:
# RouteStationMapping['isHwasung'] = 0

    
# Route_hwa = RouteStationMapping.drop(RouteStationMapping[(RouteStationMapping['이비노선ID'] not in All_bus_id)].index)
# Route_hwa = RouteStationMapping[RouteStationMapping.index.get_level_values('이비노선ID') in All_bus_id]
Route_hwa1 = RouteStationMapping.query('이비노선ID')
Route_hwa1
# RouteStationMapping_h = RouteStationMapping[RouteStationMapping['이비노선ID'] in All_bus_id]
# RouteStationMapping_h.shape

KeyError: "None of [Int64Index([216000044, 232000028, 232000029, 232000061, 232000067, 232000087,\n            232000088, 232000091, 232000093, 221000005,\n            ...\n            218000002, 218000108, 219000008, 229000030, 229000035, 229000060,\n            229000063, 213000024, 229000102, 225000004],\n           dtype='int64', length=2127)] are in the [index]"

In [575]:
# for문을 이용해 화성 버스를 이용한 것만으로 TripChain 걸러내기


for i in range(len(TripChain)):
    tmp = TripChain.iloc[i]
    if(tmp['환승횟수']==1):
        if tmp['버스노선ID1'] in All_bus_id:
            tmp['isHwasung'] = 1
            continue
    if(tmp['환승횟수']==2):
        if tmp['버스노선ID1'] in All_bus_id:
            if tmp['버스노선ID2'] in All_bus_id:
                tmp['isHwasung'] = 1
                continue
    if(tmp['환승횟수']==3):
        if tmp['버스노선ID1'] in All_bus_id:
            if tmp['버스노선ID2'] in All_bus_id:
                if tmp['버스노선ID3'] in All_bus_id:
                    tmp['isHwasung'] = 1
                    continue
    if(tmp['환승횟수']==4):
        if tmp['버스노선ID1'] in All_bus_id:
            if tmp['버스노선ID2'] in All_bus_id:
                if tmp['버스노선ID3'] in All_bus_id:
                    if tmp['버스노선ID4'] in All_bus_id:
                        tmp['isHwasung'] = 1
                                
    if(tmp['환승횟수']==5):
        if tmp['버스노선ID1'] in All_bus_id:
                if tmp['버스노선ID2'] in All_bus_id:
                    if tmp['버스노선ID3'] in All_bus_id:
                            if tmp['버스노선ID4'] in All_bus_id:
                                    if tmp['버스노선ID5'] in All_bus_id:
                                            tmp['isHwasung'] = 1

In [658]:
# filteredRoute['station_nm'].apply(lambda x: True if '초' in x else False)
# t = TripChain.loc[:2]
# for i in range(len(t)):
#     tm = t.iloc[i]
#     if tm['수집건수']==1:


# df_new = new_TripChain.drop(new_TripChain[(new_TripChain['버스노선ID1'] not in All_bus_id) & 
#                                                        (new_TripChain['환승횟수']==1)].index)
# df_new.shape()
# df_new = pd.DataFrame(t.drop(t[(t['버스노선ID1'] not in All_bus_id) & (t['최초승차역ID'] == 4116828)].index))
# df_new

new_TripChain[['버스노선ID1', '버스노선ID2']]

Unnamed: 0,버스노선ID1,버스노선ID2
0,41002045.0,41002044.0
1,41031040.0,
2,41031121.0,
3,41031013.0,
4,41020001.0,
5,41020001.0,
6,41067109.0,
7,41110050.0,
8,41002045.0,
9,41036102.0,


In [577]:
TripChain.shape

(1048575, 53)