In [264]:
import pandas as pd
import numpy as np

In [265]:
full_data = pd.read_csv('./data/final_data/full_data.csv')

In [266]:
print(full_data.shape)

(6202174, 11)


In [267]:
districts = ["강서구","구로구","양천구","영등포구", "동작구", "금천구", "관악구", "서초구", "강남구", "송파구", "강동구", "마포구", "용산구", "성동구", "광진구", "중구", "서대문구", "은평구", "종로구", "성북구", "동대문구", "중랑구", "노원구", "도봉구", "강북구"]
full_data = full_data[(full_data['출발지구'].isin(districts)) & (full_data['목적지구'].isin(districts))]

In [268]:
# 영종동, 신흥동, 동인천동  drop
full_data.drop(full_data[full_data.출발지동 == "영종동"].index, inplace=True)
full_data.drop(full_data[full_data.목적지동 == "영종동"].index, inplace=True)
full_data.drop(full_data[full_data.목적지동 == "신흥동"].index, inplace=True)
full_data.drop(full_data[full_data.목적지동 == "동인천동"].index, inplace=True)

In [269]:
# 2023년 데이터 삭제
full_data.drop(full_data[full_data.year == 2023].index, inplace=True)

In [270]:
full_data.출발지동 = full_data.출발지동.str.replace('제','')
full_data.목적지동 = full_data.목적지동.str.replace('제','')

In [271]:
full_data.shape

(5345867, 11)

In [272]:
# unique한 동 리스트 생성
unique_list = list(set.union(set(full_data.출발지동.unique())))

### 함수

In [273]:
dong_to_gu = pd.read_csv('./data/extra_data/동참고표.csv')
dong_to_gu.출발지동 = dong_to_gu.출발지동.str.replace('제','')
dong_to_gu_dict = dict(zip(dong_to_gu.출발지동, dong_to_gu.출발지구))

In [274]:
def create_gu_col(dongs):
    return dong_to_gu_dict[dongs]

### 행정동별 주민 센터 위치 및 가장 가까운 차고지와의 거리 계산

In [275]:
community_center = pd.read_excel('./data/extra_data/행정동별주민센터.xlsx', sheet_name = 'seoul_edit')
taxi_garage = pd.read_excel('./data/extra_data/장애인콜택시차고지.xlsx')

In [276]:
# 행정동 공백 제거
community_center.행정동 = community_center.행정동.str.strip()
community_center.시군구 = community_center.시군구.str.strip()

# 제 제거
community_center.행정동 = community_center.행정동.str.replace('제','')

In [277]:
# 가장 가까운 차고지와의 거리 계산
def cal_min_distance(location):
    return np.sqrt(np.min(np.sum((taxi_garage.iloc[:,5:] - location) ** 2, axis = 1)))

community_center['nearest_garage_distance'] = community_center.iloc[:, 1:].apply(cal_min_distance, axis = 1)

In [278]:
community_center.head()

Unnamed: 0,시군구,행정동,Latitude,Longitude,nearest_garage_distance
0,종로구,청운효자동,37.584088,126.970609,0.011351
1,종로구,사직동,37.576172,126.968804,0.007382
2,종로구,삼청동,37.584949,126.981746,0.012402
3,종로구,부암동,37.592393,126.964026,0.021851
4,종로구,평창동,37.606363,126.968335,0.033199


In [279]:
call_taxi_unique = set(full_data.출발지동.unique())
community_unique = set(community_center.행정동.unique())
print(set.difference(call_taxi_unique, community_unique))
print(set.difference(community_unique, call_taxi_unique))

set()
{'약수동', '청구동', '항동', '위례동', '기3동', '다산동', '답십리5동', '동화동'}


In [280]:
call_taxi_unique = set(full_data.출발지구.unique())
community_unique = set(community_center.시군구.unique())
print(set.difference(call_taxi_unique, community_unique))
print(set.difference(community_unique, call_taxi_unique))

set()
set()


In [281]:
# join
final_data = pd.merge(full_data, community_center, how = "left", left_on = ["출발지동", "출발지구"], right_on = ["행정동", "시군구"])
final_data.rename(columns = {"Latitude":"출발동_위도", "Longitude":"출발동_경도", "nearest_garage_distance":"출발동_최근접_차고지_거리"}, inplace = True)
final_data.drop(columns = ["시군구","행정동"], inplace = True)

In [282]:
final_data.shape

(5345867, 14)

In [283]:
final_data = pd.merge(final_data, community_center, how = "left", left_on = ["목적지동", "목적지구"], right_on = ["행정동", "시군구"])
final_data.rename(columns = {"Latitude":"목적동_위도", "Longitude":"목적동_경도"}, inplace = True)
final_data.drop(columns = ["시군구","행정동", "nearest_garage_distance"], inplace = True)

In [284]:
final_data.shape

(5345867, 16)

In [285]:
final_data['이동거리'] = np.sqrt((final_data.출발동_위도 - final_data.목적동_위도) ** 2 + (final_data.출발동_경도 - final_data.목적동_경도) ** 2)

In [286]:
print(final_data.shape)
final_data.head()

(5345867, 17)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,이용목적,출발동_위도,출발동_경도,출발동_최근접_차고지_거리,목적동_위도,목적동_경도,이동거리
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,귀가,37.560008,126.985802,0.014748,37.528107,126.969193,0.035966
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,귀가,37.505081,127.013377,0.025657,37.55927,126.848268,0.173774
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,귀가,37.641473,127.01064,0.036339,37.632001,127.067959,0.058096
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,귀가,37.492266,126.958473,0.022919,37.586123,127.02165,0.113139
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,귀가,37.619921,127.062983,0.020209,37.662734,127.069523,0.043309


### 서울시 행정동별 연도별 보건업 및 사회복지 서비스 종사자 수

In [287]:
seoul_companies = pd.read_csv('./data/extra_data/서울시_사업체현황.csv')

In [288]:
seoul_companies.head()

Unnamed: 0,동별,산업대분류별,구분별,시점,사업체현황(산업대분류별/동별)(2017년 이후),Column1
0,합계,합계,종사자수,2018.1.1,5210936.0,
1,합계,합계,종사자수,2019.1.1,5226997.0,
2,합계,합계,종사자수,2020.1.1,5868926.0,
3,합계,합계,종사자수,2021.1.1,5771226.0,
4,합계,농업 임업 및 어업,종사자수,2018.1.1,462.0,


In [289]:
seoul_companies.동별 = seoul_companies.동별.str.replace('제','')

In [290]:
final_data.출발지동 = final_data.출발지동.replace({'명륜3가동':'혜화동', '장안3동':'장안1동', '기2동':'제기동','기1동':'제기동', '전농3동':'전농2동', '용두동':'신설동',
                                                '이문3동':'이문2동', '답십리4동':'답십리2동','답십리3동':'답십리1동','답십리5동':'답십리1동','신당3동':'신당동',
                                                '신당4동':'신당동', '신당2동':'신당동', '신당1동':'신당동','신당6동':'신당동', '장안4동':'장안2동'})

dong_to_gu.출발지동 = dong_to_gu.출발지동.replace({'명륜3가동':'혜화동', '장안3동':'장안1동', '기2동':'제기동','기1동':'제기동', '전농3동':'전농2동', '용두동':'신설동',
                                                '이문3동':'이문2동', '답십리4동':'답십리2동','답십리3동':'답십리1동','답십리5동':'답십리1동','신당3동':'신당동',
                                                '신당4동':'신당동', '신당2동':'신당동', '신당1동':'신당동','신당6동':'신당동', '장안4동':'장안2동'})

dong_to_gu_dict = dict(zip(dong_to_gu.출발지동, dong_to_gu.출발지구))

In [291]:
seoul_companies.동별 = seoul_companies.동별.replace({'기동':'제기동', '공릉1동':'공릉1.3동','용신동':'신설동'})

In [292]:
seoul_companies = seoul_companies[seoul_companies.동별.isin(final_data.출발지동.unique())]

In [293]:
call_taxi_unique = set(final_data.출발지동.unique())
companies_unique = set(seoul_companies.동별.unique())
print(set.difference(call_taxi_unique, companies_unique))
print(set.difference(companies_unique, call_taxi_unique))

set()
set()


In [294]:
seoul_companies['구별'] = seoul_companies.동별.apply(create_gu_col)

In [295]:
# 전체 종사자수와 사회복지 서비스업 종사자 테이블 분리
total_seoul_companies = seoul_companies[seoul_companies['산업대분류별'].str.contains('합계')]
social_seoul_companies = seoul_companies[seoul_companies['산업대분류별'].str.contains('보건업 및 사회복지 서비스업')]

In [296]:
# 시점 column 변경
total_seoul_companies.loc[:,'시점'] = pd.to_datetime(total_seoul_companies.시점).dt.year
social_seoul_companies.loc[:,'시점'] = pd.to_datetime(social_seoul_companies.시점).dt.year

In [297]:
total_seoul_companies.drop(total_seoul_companies[total_seoul_companies.동별 == '합계'].index, inplace=True)
social_seoul_companies.drop(social_seoul_companies[social_seoul_companies.동별 == '합계'].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_seoul_companies.drop(total_seoul_companies[total_seoul_companies.동별 == '합계'].index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_seoul_companies.drop(social_seoul_companies[social_seoul_companies.동별 == '합계'].index, inplace=True)


In [298]:
print(total_seoul_companies.shape)
print(social_seoul_companies.shape)

(1675, 7)
(1672, 7)


In [299]:
total_gangnam_idx = total_seoul_companies[total_seoul_companies.동별 == '신사동'].index[-4:]
social_gangnam_idx = social_seoul_companies[social_seoul_companies.동별 == '신사동'].index[-4:]

In [300]:
total_seoul_companies.loc[list(total_gangnam_idx),'구별'] = '강남구'
social_seoul_companies.loc[list(social_gangnam_idx),'구별'] = '강남구'

In [301]:
# 2022년 데이터를 2021년의 데이터로 imputation
total_2021 = total_seoul_companies[total_seoul_companies.시점 == 2021]
social_2021 = social_seoul_companies[social_seoul_companies.시점 == 2021]

total_2021.loc[:,'시점'] = 2022
social_2021.loc[:,'시점'] = 2022

total_seoul_companies = pd.concat([total_seoul_companies, total_2021])
social_seoul_companies = pd.concat([social_seoul_companies, social_2021])

In [302]:
# 열 이름 변경
total_seoul_companies = total_seoul_companies.rename(columns = {'사업체현황(산업대분류별/동별)(2017년 이후)':'총 종사자 수'})
social_seoul_companies = social_seoul_companies.rename(columns = {'사업체현황(산업대분류별/동별)(2017년 이후)':'보건업 종사자 수'})

In [303]:
# join
final_data = pd.merge(final_data, total_seoul_companies[['구별', '동별', '총 종사자 수', '시점']], how = "left", left_on = ["출발지구","출발지동", "year"], right_on = ["구별","동별","시점"])
final_data = final_data.drop(columns = ["구별", '동별', '시점'])

In [304]:
print(final_data.shape)

(5345867, 18)


In [305]:
# join
final_data = pd.merge(final_data, social_seoul_companies[['구별','동별', '보건업 종사자 수', '시점']], how = "left", left_on = ["출발지구","출발지동", "year"], right_on = ["구별","동별","시점"])
final_data = final_data.drop(columns = ['구별', '동별', '시점'])

In [306]:
print(final_data.shape)
final_data.head()

(5345867, 19)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,이용목적,출발동_위도,출발동_경도,출발동_최근접_차고지_거리,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,귀가,37.560008,126.985802,0.014748,37.528107,126.969193,0.035966,110677.0,2156.0
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,귀가,37.505081,127.013377,0.025657,37.55927,126.848268,0.173774,10055.0,772.0
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,귀가,37.641473,127.01064,0.036339,37.632001,127.067959,0.058096,5715.0,1461.0
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,귀가,37.492266,126.958473,0.022919,37.586123,127.02165,0.113139,1314.0,324.0
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,귀가,37.619921,127.062983,0.020209,37.662734,127.069523,0.043309,6475.0,759.0


In [307]:
final_data.isna().sum()

dispatch_waiting_time     425465
total_waiting_time        716233
cancel_time              4632873
using_time                     0
using_day                      0
year                           0
출발지구                           0
출발지동                           0
목적지구                           0
목적지동                           0
이용목적                           0
출발동_위도                         0
출발동_경도                         0
출발동_최근접_차고지_거리                 0
목적동_위도                         0
목적동_경도                         0
이동거리                           0
총 종사자 수                    16982
보건업 종사자 수                  18005
dtype: int64

In [308]:
def impute_total_worker(dong):
    return total_seoul_companies[(total_seoul_companies.시점 == year-1) & (total_seoul_companies.동별 == dong)]['총 종사자 수']

def impute_social_worker(dong):
    return social_seoul_companies[(social_seoul_companies.시점 == year-1) & (social_seoul_companies.동별 == dong)]['보건업 종사자 수']

In [309]:
condition = (final_data.year == 2021) & (final_data['총 종사자 수'].isna())
year = 2021
final_data.loc[condition, '총 종사자 수'] = final_data.loc[condition, '출발지동'].apply(impute_total_worker).iloc[:,0]
final_data.loc[condition, '보건업 종사자 수'] = final_data.loc[condition, '출발지동'].apply(impute_social_worker).iloc[:,0]

In [310]:
condition = (final_data.year == 2022) & (final_data['총 종사자 수'].isna())
year = 2021
final_data.loc[condition, '총 종사자 수'] = final_data.loc[condition, '출발지동'].apply(impute_total_worker).iloc[:,0]
final_data.loc[condition, '보건업 종사자 수'] = final_data.loc[condition, '출발지동'].apply(impute_social_worker).iloc[:,0]

In [311]:
# 둔촌1동은 모든 시점이 null값을 가진다.
final_data[final_data['보건업 종사자 수'].isna()][['year', '출발지동']].value_counts()

year  출발지동
2021  둔촌1동    288
2020  둔촌1동    273
2022  둔촌1동    245
2018  둔촌1동    142
2019  둔촌1동     75
Name: count, dtype: int64

In [312]:
# 둔촌2동의 데이터로 둔촌1동의 데이터 대
final_data.loc[final_data['보건업 종사자 수'].isna() & (final_data.year == 2018), '보건업 종사자 수'] = social_seoul_companies.loc[(social_seoul_companies.동별 == '둔촌2동')&(social_seoul_companies.시점 == 2018), '보건업 종사자 수'].iloc[0]
final_data.loc[final_data['보건업 종사자 수'].isna() & (final_data.year == 2019), '보건업 종사자 수'] = social_seoul_companies.loc[(social_seoul_companies.동별 == '둔촌2동')&(social_seoul_companies.시점 == 2019), '보건업 종사자 수'].iloc[0]
final_data.loc[final_data['보건업 종사자 수'].isna() & (final_data.year == 2020), '보건업 종사자 수'] = social_seoul_companies.loc[(social_seoul_companies.동별 == '둔촌2동')&(social_seoul_companies.시점 == 2020), '보건업 종사자 수'].iloc[0]
final_data.loc[final_data['보건업 종사자 수'].isna() & (final_data.year == 2021), '보건업 종사자 수'] = social_seoul_companies.loc[(social_seoul_companies.동별 == '둔촌2동')&(social_seoul_companies.시점 == 2021), '보건업 종사자 수'].iloc[0]
final_data.loc[final_data['보건업 종사자 수'].isna() & (final_data.year == 2022), '보건업 종사자 수'] = social_seoul_companies.loc[(social_seoul_companies.동별 == '둔촌2동')&(social_seoul_companies.시점 == 2022), '보건업 종사자 수'].iloc[0]

In [313]:
final_data.isna().sum()

dispatch_waiting_time     425465
total_waiting_time        716233
cancel_time              4632873
using_time                     0
using_day                      0
year                           0
출발지구                           0
출발지동                           0
목적지구                           0
목적지동                           0
이용목적                           0
출발동_위도                         0
출발동_경도                         0
출발동_최근접_차고지_거리                 0
목적동_위도                         0
목적동_경도                         0
이동거리                           0
총 종사자 수                        0
보건업 종사자 수                      0
dtype: int64

In [314]:
# 상대적 종사자 수

for y in [2018, 2019, 2020, 2021, 2022]:
    total_seoul = total_seoul_companies.loc[total_seoul_companies.시점 == y, '총 종사자 수'].sum()
    social_seoul = social_seoul_companies.loc[social_seoul_companies.시점 == y, '보건업 종사자 수'].sum()

    print(total_seoul, social_seoul)

    denom = social_seoul / total_seoul

    final_data.loc[final_data.year == y, '상대적 보건업 종사자 수'] = (final_data.loc[final_data.year == y, '보건업 종사자 수'] / final_data.loc[final_data.year == y, '총 종사자 수']) / denom


5186606.0 376367.0
5200810.0 397524.0
5839379.0 440152.0
5720121.0 444470.0
5720121.0 444470.0


In [315]:
print(final_data.shape)
final_data.head()

(5345867, 20)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,이용목적,출발동_위도,출발동_경도,출발동_최근접_차고지_거리,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수,상대적 보건업 종사자 수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,귀가,37.560008,126.985802,0.014748,37.528107,126.969193,0.035966,110677.0,2156.0,0.26845
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,귀가,37.505081,127.013377,0.025657,37.55927,126.848268,0.173774,10055.0,772.0,1.058052
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,귀가,37.641473,127.01064,0.036339,37.632001,127.067959,0.058096,5715.0,1461.0,3.522944
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,귀가,37.492266,126.958473,0.022919,37.586123,127.02165,0.113139,1314.0,324.0,3.397984
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,귀가,37.619921,127.062983,0.020209,37.662734,127.069523,0.043309,6475.0,759.0,1.615376


In [316]:
fInal_data = final_data.rename(columns = {'총 종사자 수':'총_종사자_수', '보건업 종사자 수':'보건업_종사자_수', '상대적 보건업 종사자 수':'상대적_보건업_종사자_수'})

### 서울시 행정동별 장애인 시설 수

In [317]:
disabled_center = pd.read_csv('./data/extra_data/서울시_장애인시설.csv', encoding='cp949')

In [318]:
# 필요없는 열 삭제
disabled_center.drop(['시설명', '시설코드', '자치구(시)구분', '시설장명', '시군구코드', '시설주소', '전화번호','우편번호' ], axis=1, inplace=True)

# 열 이름 변경
disabled_center.columns = ['시설종류명', '시설종류상세명', '구별', '시설정원','현인원']

# 서울특별시 삭제
disabled_center.drop(disabled_center[disabled_center['구별'].str.contains('서울특별시')].index, inplace=True)

In [319]:
# 구별 갯수 세기
grouped_dis_center = disabled_center['구별'].groupby(disabled_center['구별'])
grouped_dis_center = pd.DataFrame(grouped_dis_center.count())
grouped_dis_center.columns = ['구별_장애인_시설_수']
grouped_dis_center = grouped_dis_center.reset_index()
grouped_dis_center

Unnamed: 0,구별,구별_장애인_시설_수
0,강남구,55
1,강동구,50
2,강북구,30
3,강서구,58
4,관악구,24
5,광진구,14
6,구로구,25
7,금천구,22
8,노원구,58
9,도봉구,23


In [320]:
call_taxi_gus = set(full_data.출발지구.unique())
community_gus = set(grouped_dis_center.구별.unique())
print(set.difference(call_taxi_gus, community_gus))
print(set.difference(community_gus, call_taxi_gus))

set()
set()


In [321]:
# join
final_data = pd.merge(final_data, grouped_dis_center, how = "left", left_on = "출발지구", right_on = "구별")
final_data = final_data.drop(columns = ['구별'])

In [322]:
print(final_data.shape)
final_data.head()

(5345867, 21)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,...,출발동_위도,출발동_경도,출발동_최근접_차고지_거리,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수,상대적 보건업 종사자 수,구별_장애인_시설_수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,...,37.560008,126.985802,0.014748,37.528107,126.969193,0.035966,110677.0,2156.0,0.26845,11
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,...,37.505081,127.013377,0.025657,37.55927,126.848268,0.173774,10055.0,772.0,1.058052,30
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,...,37.641473,127.01064,0.036339,37.632001,127.067959,0.058096,5715.0,1461.0,3.522944,30
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,...,37.492266,126.958473,0.022919,37.586123,127.02165,0.113139,1314.0,324.0,3.397984,24
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,...,37.619921,127.062983,0.020209,37.662734,127.069523,0.043309,6475.0,759.0,1.615376,58


### 행정동별 연도별 장애인 수

In [323]:
seoul_disabled = pd.read_csv('./data/extra_data/서울시_장애인구.csv')

In [324]:
# '-' -> NaN 바꾸기
seoul_disabled.장애인구수 = seoul_disabled.장애인구수.str.replace('-','')

# 장애인구 수 형 변환
seoul_disabled.장애인구수 = pd.to_numeric(seoul_disabled.장애인구수)

# '구'로 구분된 데이터 삭제
seoul_disabled.drop(seoul_disabled[seoul_disabled['동별'].str.contains('구')].index, inplace=True)

# 기타로 분류된 데이터 삭제
seoul_disabled.drop(seoul_disabled[seoul_disabled['동별'].str.contains('기타')].index, inplace=True)

# 시점 열에서 '년' 없애기
seoul_disabled.시점 = seoul_disabled.시점.str.replace(' 년', '')

In [325]:
seoul_disabled.head()

Unnamed: 0,동별,장애유형별,시점,장애인구수
80,사직동,합계,2018,309.0
81,사직동,합계,2019,295.0
82,사직동,합계,2020,284.0
83,사직동,합계,2021,277.0
84,사직동,합계,2022,267.0


In [326]:
seoul_disabled.시점 = pd.to_numeric(seoul_disabled.시점)

In [327]:
final_data.출발지동 = final_data.출발지동.replace({'홍1동':'홍제1동', '홍2동':'홍제2동', '홍3동':'홍제3동'})

dong_to_gu.출발지동 = dong_to_gu.출발지동.replace({'홍1동':'홍제1동', '홍2동':'홍제2동', '홍3동':'홍제3동'})

dong_to_gu_dict = dict(zip(dong_to_gu.출발지동, dong_to_gu.출발지구))

In [328]:
seoul_disabled.동별 = seoul_disabled.동별.replace({'용신동':'신설동', '공릉1동':'공릉1.3동'})

In [329]:
seoul_disabled = seoul_disabled[seoul_disabled.동별.isin(final_data.출발지동.unique())]

In [330]:
call_taxi_dongs = set(final_data.출발지동.unique())
disabled_dongs = set(seoul_disabled.동별.unique())
print(set.difference(call_taxi_dongs, disabled_dongs))
print(set.difference(disabled_dongs, call_taxi_dongs))

{'구로2동', '구로3동', '구로5동', '구의1동', '구로1동', '구로4동', '구의2동', '구의3동', '구산동', '압구정동'}
set()


In [331]:
seoul_disabled = seoul_disabled[seoul_disabled.장애유형별 == '합계']

In [332]:
seoul_disabled['구별'] = seoul_disabled.동별.apply(create_gu_col)

In [333]:
gangnam_idx = seoul_disabled[seoul_disabled.동별 == '신사동'].index[-5:]
seoul_disabled.loc[list(gangnam_idx), '구별'] = '강남구'

In [334]:
# join
final_data = pd.merge(final_data, seoul_disabled[['구별','동별', '장애인구수', '시점']], how = "left", left_on = ["출발지구","출발지동", "year"], right_on = ["구별","동별","시점"])
final_data = final_data.drop(columns = ['구별','동별', '시점'])

In [335]:
print(final_data.shape)
final_data.head()

(5345867, 22)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,...,출발동_경도,출발동_최근접_차고지_거리,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수,상대적 보건업 종사자 수,구별_장애인_시설_수,장애인구수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,...,126.985802,0.014748,37.528107,126.969193,0.035966,110677.0,2156.0,0.26845,11,126.0
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,...,127.013377,0.025657,37.55927,126.848268,0.173774,10055.0,772.0,1.058052,30,532.0
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,...,127.01064,0.036339,37.632001,127.067959,0.058096,5715.0,1461.0,3.522944,30,1794.0
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,...,126.958473,0.022919,37.586123,127.02165,0.113139,1314.0,324.0,3.397984,24,679.0
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,...,127.062983,0.020209,37.662734,127.069523,0.043309,6475.0,759.0,1.615376,58,1056.0


### 행정동별 연도별 전체 인구 수

In [336]:
seoul_population = pd.read_csv('./data/extra_data/서울시_주민등록인구.csv')

In [337]:
# 동별 칼럼 소계 제거
seoul_population.drop(seoul_population[seoul_population['동별'].str.contains('소계')].index, inplace=True)

# 2023년 데이터 삭제
seoul_population.drop(seoul_population[seoul_population['시점'].str.contains('2023 1/4')].index, inplace=True)

In [338]:
seoul_population.head()

Unnamed: 0,구별,동별,시점,인구수
7,종로구,사직동,2018,9818
8,종로구,사직동,2019,9815
9,종로구,사직동,2020,9806
10,종로구,사직동,2021,9636
11,종로구,사직동,2022,9355


In [339]:
seoul_population.동별 = seoul_population.동별.replace({'용신동':'신설동', '공릉1동':'공릉1.3동'})

In [340]:
seoul_population = seoul_population[seoul_population.동별.isin(final_data.출발지동.unique())]

In [341]:
call_taxi_dongs = set(final_data.출발지동.unique())
population_dongs = set(seoul_population.동별.unique())
print(set.difference(call_taxi_dongs, population_dongs))
print(set.difference(population_dongs, call_taxi_dongs))

set()
set()


In [342]:
seoul_population.시점 = pd.to_numeric(seoul_population.시점)

In [343]:
seoul_population['구별'] = seoul_population.동별.apply(create_gu_col)

In [344]:
gangnam_idx = seoul_population[seoul_population.동별 == '신사동'].index[-5:]
seoul_population.loc[list(gangnam_idx), '구별'] = '강남구'

In [345]:
# join
final_data = pd.merge(final_data, seoul_population[['구별','동별', '인구수', '시점']], how = "left", left_on = ["출발지구","출발지동", "year"], right_on = ["구별","동별","시점"])
final_data = final_data.drop(columns = ['구별','동별', '시점'])

In [346]:
# 상대적 장애인구 수

for y in [2018, 2019, 2020, 2021, 2022]:
    total_seoul = seoul_population.loc[seoul_population.시점 == y, '인구수'].sum()
    social_seoul = seoul_disabled.loc[seoul_disabled.시점 == y, '장애인구수'].sum()

    print(total_seoul, social_seoul)

    denom = social_seoul / total_seoul

    final_data.loc[final_data.year == y, '상대적_장애인구수'] = (final_data.loc[final_data.year == y, '장애인구수'] / final_data.loc[final_data.year == y, '인구수']) / denom


9961623 378845.0
9924408 380734.0
9807131 379538.0
9575802 375617.0
9483798 373958.0


In [347]:
print(final_data.shape)
final_data.head()

(5345867, 24)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,...,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수,상대적 보건업 종사자 수,구별_장애인_시설_수,장애인구수,인구수,상대적_장애인구수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,...,37.528107,126.969193,0.035966,110677.0,2156.0,0.26845,11,126.0,3470.0,0.954794
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,...,37.55927,126.848268,0.173774,10055.0,772.0,1.058052,30,532.0,32109.0,0.435666
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,...,37.632001,127.067959,0.058096,5715.0,1461.0,3.522944,30,1794.0,33046.0,1.427487
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,...,37.586123,127.02165,0.113139,1314.0,324.0,3.397984,24,679.0,16527.0,1.0803
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,...,37.662734,127.069523,0.043309,6475.0,759.0,1.615376,58,1056.0,24087.0,1.152789


### 배차와 취소 시간이 모두 null인 데이터 & 승차와 취소 시간이 모두 null인 데이터 삭제

In [348]:
final_data = final_data[~(final_data.dispatch_waiting_time.isna() & final_data.cancel_time.isna())]
final_data = final_data[~(final_data.total_waiting_time.isna() & final_data.cancel_time.isna())]

In [349]:
print(final_data.shape)
final_data.head()

(5342521, 24)


Unnamed: 0,dispatch_waiting_time,total_waiting_time,cancel_time,using_time,using_day,year,출발지구,출발지동,목적지구,목적지동,...,목적동_위도,목적동_경도,이동거리,총 종사자 수,보건업 종사자 수,상대적 보건업 종사자 수,구별_장애인_시설_수,장애인구수,인구수,상대적_장애인구수
0,136.0,165.0,,새벽,평일,2018,중구,명동,용산구,한강로동,...,37.528107,126.969193,0.035966,110677.0,2156.0,0.26845,11,126.0,3470.0,0.954794
1,140.0,162.0,,새벽,평일,2018,서초구,반포1동,강서구,등촌3동,...,37.55927,126.848268,0.173774,10055.0,772.0,1.058052,30,532.0,32109.0,0.435666
2,139.0,160.0,,새벽,평일,2018,강북구,인수동,노원구,하계2동,...,37.632001,127.067959,0.058096,5715.0,1461.0,3.522944,30,1794.0,33046.0,1.427487
3,207.0,218.0,,새벽,평일,2018,관악구,청림동,성북구,안암동,...,37.586123,127.02165,0.113139,1314.0,324.0,3.397984,24,679.0,16527.0,1.0803
4,210.0,232.0,,새벽,평일,2018,노원구,월계1동,노원구,상계5동,...,37.662734,127.069523,0.043309,6475.0,759.0,1.615376,58,1056.0,24087.0,1.152789


In [350]:
final_data.to_csv('./data/final_data/final_data.csv', index = False)