In [520]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm


tqdm.pandas()
plt.style.use('ggplot')
pd.options.display.float_format = '{:.2f}'.format

df = pd.read_csv('../data/socar_reservation_triplog.csv', parse_dates=['reservation_start_at', 'reservation_return_at', 'member_created_date'])

In [521]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751548 entries, 0 to 751547
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   reservation_id           751548 non-null  int64         
 1   car_id                   751548 non-null  int64         
 2   member_id_encrypted      751548 non-null  object        
 3   region                   751548 non-null  object        
 4   reservation_return_at    751548 non-null  datetime64[ns]
 5   reservation_start_at     751548 non-null  datetime64[ns]
 6   member_age               751548 non-null  int64         
 7   member_gender            699017 non-null  object        
 8   member_created_date      751548 non-null  datetime64[ns]
 9   member_total_distance    738010 non-null  float64       
 10  is_vroom                 751548 non-null  bool          
 11  car_name                 751548 non-null  object        
 12  zone_name       

In [522]:
def preprocess(df):
    # 사용하지 않는 컬럼 제거
    df = df.drop(columns=['reservation_id', 'car_id', 'reservation_created_lat', 'reservation_created_lng'])

    # 활용할 컬럼의 null 행 제거
    COLS = ['member_age', 'member_gender', 'trip']
    df = df.dropna(subset=COLS)

    df = df.rename(columns={'member_id_encrypted':'member_id'})

    # 이용시간이 너무 짧은 것 제거


    return df.copy()


In [523]:
df = preprocess(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 516650 entries, 5735 to 751547
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   member_id              516650 non-null  object        
 1   region                 516650 non-null  object        
 2   reservation_return_at  516650 non-null  datetime64[ns]
 3   reservation_start_at   516650 non-null  datetime64[ns]
 4   member_age             516650 non-null  int64         
 5   member_gender          516650 non-null  object        
 6   member_created_date    516650 non-null  datetime64[ns]
 7   member_total_distance  509060 non-null  float64       
 8   is_vroom               516650 non-null  bool          
 9   car_name               516650 non-null  object        
 10  zone_name              516068 non-null  object        
 11  zone_address           516650 non-null  object        
 12  zone_lat               516650 non-null  f

In [517]:
code = pd.read_csv('../data/region_data/행정구역코드_sgis_201906.csv')
code.head()

Unnamed: 0,시도코드,시도명칭,시군구코드,시군구명칭,읍면동코드,읍면동명칭
0,11,서울특별시,11010,종로구,1101053,사직동
1,11,서울특별시,11010,종로구,1101054,삼청동
2,11,서울특별시,11010,종로구,1101055,부암동
3,11,서울특별시,11010,종로구,1101056,평창동
4,11,서울특별시,11010,종로구,1101057,무악동


In [449]:
code.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   시도코드    3512 non-null   int64 
 1   시도명칭    3512 non-null   object
 2   시군구코드   3512 non-null   int64 
 3   시군구명칭   3512 non-null   object
 4   읍면동코드   3512 non-null   int64 
 5   읍면동명칭   3512 non-null   object
dtypes: int64(3), object(3)
memory usage: 164.8+ KB


In [450]:
code['name'] = code.시도명칭.add(' ').add(code.시군구명칭)
code = code.drop_duplicates('name')
code.head()

Unnamed: 0,시도코드,시도명칭,시군구코드,시군구명칭,읍면동코드,읍면동명칭,name
0,11,서울특별시,11010,종로구,1101053,사직동,서울특별시 종로구
17,11,서울특별시,11020,중구,1102052,소공동,서울특별시 중구
32,11,서울특별시,11030,용산구,1103051,후암동,서울특별시 용산구
48,11,서울특별시,11040,성동구,1104052,왕십리2동,서울특별시 성동구
65,11,서울특별시,11050,광진구,1105053,화양동,서울특별시 광진구


In [452]:
# regions : 쏘카 이용정보 triplog에 등장하는 unique한 지역들

regions = list()
for triplog in df.trip.str.split(','):
    regions.extend(triplog)
regions = list(set(regions))


regions = pd.DataFrame({'name':regions})
regions.head()

Unnamed: 0,name
0,경상북도 군위군
1,충청북도 충주시
2,서울특별시 양천구
3,서울특별시 서초구
4,서울특별시 금천구


In [453]:
regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    249 non-null    object
dtypes: object(1)
memory usage: 2.1+ KB


In [454]:
regions.shape

(249, 1)

In [455]:
regions = regions.merge(code[['name', '시군구코드']], how='left', on='name')
regions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249 entries, 0 to 248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    249 non-null    object 
 1   시군구코드   248 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB


In [456]:
regions.shape

(249, 2)

In [457]:
regions[regions.시군구코드.isna()]

Unnamed: 0,name,시군구코드
40,세종특별자치시,


In [458]:
regions.iloc[40, 1] = 29010

In [460]:
regions = regions.rename(columns={'시군구코드':'rcode'})
regions.rcode = regions.rcode.astype(int)

In [462]:
regions.head()

Unnamed: 0,name,rcode
0,경상북도 군위군,37310
1,충청북도 충주시,33020
2,서울특별시 양천구,11150
3,서울특별시 서초구,11220
4,서울특별시 금천구,11180


In [463]:
import os

path = '../data/region_data'

# foods = pd.read_csv(os.path.join(path, 'KC_618_LLR_RSTRT_CNBAS_TRND_2020.csv'))
stores = pd.read_csv(os.path.join(path, 'KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019.csv'))
# tours = pd.read_csv(os.path.join(path, 'YN_REGNAL_TOUR_ND_TRNSPORT_GOODS_20200831.csv'))

In [464]:
stores.head()

Unnamed: 0,sido_nm,sgg_nm,hadm_cd,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum,FILE_NAME,base_ymd
0,강원도,강릉시,42150,308,4685,460,162905,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214
1,강원도,고성군,42820,105,795,117,18628,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214
2,강원도,동해시,42170,85,1929,247,68772,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214
3,강원도,삼척시,42230,135,1597,209,45409,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214
4,강원도,속초시,42210,79,2594,210,63203,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214


In [465]:
stores[stores.sgg_nm.isna()]

Unnamed: 0,sido_nm,sgg_nm,hadm_cd,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum,FILE_NAME,base_ymd
165,세종특별자치시,,36110,152,3886,301,229563,KC_619_DMSTC_TRV_CNSMP_STATN_BIZAEA_MAP_2019,20200214


In [466]:
stores['name'] = stores.sido_nm + ' ' + stores.sgg_nm
stores = stores.drop(columns=['FILE_NAME', 'base_ymd', 'hadm_cd'])
stores = stores.merge(code[['name', '시군구코드']], how='left', on='name')

stores = stores.rename(columns={'시군구코드':'rcode'})

In [467]:
stores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sido_nm          250 non-null    object 
 1   sgg_nm           249 non-null    object 
 2   atrctn_cnt       250 non-null    int64  
 3   rstrt_cnt        250 non-null    int64  
 4   shopng_cnt       250 non-null    int64  
 5   residnt_cnt_sum  250 non-null    int64  
 6   name             249 non-null    object 
 7   rcode            249 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 17.6+ KB


In [468]:
stores.loc[stores.name.isna(), 'name'] = '세종특별자치시'
stores.loc[stores.rcode.isna(), 'rcode'] = 29010

In [469]:
stores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sido_nm          250 non-null    object 
 1   sgg_nm           249 non-null    object 
 2   atrctn_cnt       250 non-null    int64  
 3   rstrt_cnt        250 non-null    int64  
 4   shopng_cnt       250 non-null    int64  
 5   residnt_cnt_sum  250 non-null    int64  
 6   name             250 non-null    object 
 7   rcode            250 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 17.6+ KB


In [470]:
regions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249 entries, 0 to 248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    249 non-null    object
 1   rcode   249 non-null    int32 
dtypes: int32(1), object(1)
memory usage: 4.9+ KB


In [471]:
stores.columns

Index(['sido_nm', 'sgg_nm', 'atrctn_cnt', 'rstrt_cnt', 'shopng_cnt',
       'residnt_cnt_sum', 'name', 'rcode'],
      dtype='object')

In [472]:
regions = regions.merge(stores[['atrctn_cnt', 'rstrt_cnt', 'shopng_cnt', 'residnt_cnt_sum', 'rcode']], how='left', on='rcode')

In [473]:
# triplog에 존재하는 모든 지역에 대한 특성정보를 추가함
# 이 테이블을 참조하여 이용정보에서 방문한 모든 지역의 특성을 조회할수있음
regions.head(10)

Unnamed: 0,name,rcode,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum
0,경상북도 군위군,37310,153,421,82,16253
1,충청북도 충주시,33020,260,4534,520,165765
2,서울특별시 양천구,11150,27,3716,432,385006
3,서울특별시 서초구,11220,87,6711,310,379715
4,서울특별시 금천구,11180,26,3305,301,244507
5,전라남도 함평군,36430,170,443,89,21135
6,부산광역시 동래구,21060,85,4146,320,214009
7,충청남도 서산시,34050,153,3201,311,131876
8,경기도 성남시 중원구,31022,30,2671,271,185399
9,경기도 용인시 수지구,31193,30,2254,218,286691


In [474]:
feature_cols = [col for col in regions.columns if col[-3:] == 'cnt']

# 인구 1000명당 관광지, 식당, 쇼핑점 수로 변환
for col in feature_cols:
    regions[col] = regions[col] * 1000 / regions.residnt_cnt_sum

In [475]:
# 시군구별 인구1000명당 관광지, 식당, 쇼핑점 수
regions

Unnamed: 0,name,rcode,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum
0,경상북도 군위군,37310,9.41,25.90,5.05,16253
1,충청북도 충주시,33020,1.57,27.35,3.14,165765
2,서울특별시 양천구,11150,0.07,9.65,1.12,385006
3,서울특별시 서초구,11220,0.23,17.67,0.82,379715
4,서울특별시 금천구,11180,0.11,13.52,1.23,244507
...,...,...,...,...,...,...
244,경상북도 구미시,37050,0.56,24.06,2.05,324508
245,제주특별자치도 제주시,39010,1.60,24.96,1.88,369821
246,전라북도 전주시 완산구,35011,0.42,19.72,2.04,282095
247,경기도 고양시 덕양구,31101,0.32,12.30,1.17,351337


In [476]:
regions.sort_values('atrctn_cnt', ascending=False)

Unnamed: 0,name,rcode,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum
224,경상남도 산청군,38370,16.95,28.47,4.48,24133
127,전라남도 신안군,36480,15.04,27.32,4.03,21343
96,경상남도 합천군,38400,13.46,25.16,4.66,30450
120,전라북도 임실군,35350,12.31,30.65,6.40,16409
166,전라북도 진안군,35320,11.86,23.26,5.57,15435
...,...,...,...,...,...,...
240,경기도 수원시 권선구,31012,0.09,11.14,1.19,309292
160,경기도 군포시,31160,0.07,10.66,1.26,235162
2,서울특별시 양천구,11150,0.07,9.65,1.12,385006
228,경기도 성남시 분당구,31023,0.07,12.18,0.83,417115


In [477]:
regions.sort_values('shopng_cnt', ascending=False)

Unnamed: 0,name,rcode,atrctn_cnt,rstrt_cnt,shopng_cnt,residnt_cnt_sum
92,충청북도 보은군,33320,6.29,33.30,7.49,22433
187,강원도 정선군,32350,5.03,39.01,7.12,26840
201,전라북도 무주군,35330,7.49,31.80,6.82,16415
182,충청남도 태안군,34380,3.37,37.96,6.60,44834
28,강원도 양양군,32410,6.99,48.66,6.47,19462
...,...,...,...,...,...,...
109,경기도 수원시 영통구,31014,0.04,11.55,0.86,289907
228,경기도 성남시 분당구,31023,0.07,12.18,0.83,417115
3,서울특별시 서초구,11220,0.23,17.67,0.82,379715
9,경기도 용인시 수지구,31193,0.10,7.86,0.76,286691


In [478]:
regions = regions.set_index('name')

In [479]:
# trip log에서 방문한 지역의 3가지 특성정보를 불러온 뒤 평균냄
# 해당 이용 방문지의 평균적인 특성을 나타내는 변수
# ex. 관광지만 돌아다니면 -> 인구대비 관광지 평균이 높음
def get_trip_feature(triplog, region_table):
    triplog = triplog.split(',')
    mean_features = region_table.loc[triplog, ['atrctn_cnt', 'rstrt_cnt', 'shopng_cnt']].values.mean(axis=0)
    return mean_features



In [480]:
trip_feautures = df.trip.progress_map(lambda log: get_trip_feature(log, regions))

100%|██████████| 516650/516650 [06:04<00:00, 1417.56it/s]


In [495]:
trip_df = pd.DataFrame(trip_feautures.values)
trip_df

Unnamed: 0,0
0,"[1.138925919768058, 22.30863221929183, 2.25880..."
1,"[0.2977804499337986, 19.86939453775826, 1.5718..."
2,"[0.21045855204516192, 14.676389026443498, 1.38..."
3,"[3.358291123724975, 19.721326430014567, 3.0401..."
4,"[0.23434012795714923, 16.173188513614047, 1.32..."
...,...
516645,"[1.134770624858102, 17.080522566638404, 2.0172..."
516646,"[0.43563249335589493, 13.839466231644632, 1.43..."
516647,"[0.1561441628473658, 9.977831927302796, 0.9060..."
516648,"[0.2689360940699067, 17.398650014892347, 1.405..."


In [497]:
trip_df['atrctn'] = trip_df[0].map(lambda x: x[0])
trip_df['rstrt'] = trip_df[0].map(lambda x: x[1])
trip_df['shopng'] = trip_df[0].map(lambda x: x[2])

In [498]:
trip_df.head()

Unnamed: 0,0,atrctn,rstrt,shopng
0,"[1.138925919768058, 22.30863221929183, 2.25880...",1.14,22.31,2.26
1,"[0.2977804499337986, 19.86939453775826, 1.5718...",0.3,19.87,1.57
2,"[0.21045855204516192, 14.676389026443498, 1.38...",0.21,14.68,1.38
3,"[3.358291123724975, 19.721326430014567, 3.0401...",3.36,19.72,3.04
4,"[0.23434012795714923, 16.173188513614047, 1.32...",0.23,16.17,1.32


In [524]:
df = df.reset_index(drop=True)

In [525]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516650 entries, 0 to 516649
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   member_id              516650 non-null  object        
 1   region                 516650 non-null  object        
 2   reservation_return_at  516650 non-null  datetime64[ns]
 3   reservation_start_at   516650 non-null  datetime64[ns]
 4   member_age             516650 non-null  int64         
 5   member_gender          516650 non-null  object        
 6   member_created_date    516650 non-null  datetime64[ns]
 7   member_total_distance  509060 non-null  float64       
 8   is_vroom               516650 non-null  bool          
 9   car_name               516650 non-null  object        
 10  zone_name              516068 non-null  object        
 11  zone_address           516650 non-null  object        
 12  zone_lat               516650 non-null  floa

In [526]:
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516650 entries, 0 to 516649
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       516650 non-null  object 
 1   atrctn  516650 non-null  float64
 2   rstrt   516650 non-null  float64
 3   shopng  516650 non-null  float64
dtypes: float64(3), object(1)
memory usage: 15.8+ MB


In [538]:
pd.concat([df, trip_df], axis=1).drop(columns=['trip', 0]).info

Unnamed: 0,member_id,region,reservation_return_at,reservation_start_at,member_age,member_gender,member_created_date,member_total_distance,is_vroom,car_name,zone_name,zone_address,zone_lat,zone_lng,zone_type1,zone_type2,zone_type3,atrctn,rstrt,shopng
0,1mGAeA7IOfhkzhQFwLxRBgGzPowVDcSQVK2/V8kDMPM=,서울특별시 서대문구,2019-02-04 23:04:15,2019-02-01 15:30:00,35,male,2015-04-11,8774.00,False,볼트EV (제주),yesAPM주차장,서울 서대문구 대현동 145,37.56,126.94,TRANSFER_SUBWAY,COMMERCIAL_HOTSPOT,ETC,1.14,22.31,2.26
1,wbThAF/3/JTlPt/WYFh5smLsm5SiYeHRKUwSdSSTPoU=,울산광역시 북구,2019-02-03 19:09:33,2019-02-02 08:30:00,35,male,2018-01-27,222.00,False,더뉴레이,경성유료주차장,울산 북구 화봉동 1466,35.59,129.37,LIVING_APT,LIVING_VILLA,ETC,0.30,19.87,1.57
2,KkyDio56SwjqcZ7on2ABGMAKLFbEB7hRfcZ4DJ5aBlg=,서울특별시 동대문구,2019-02-02 09:27:15,2019-02-02 06:30:00,41,male,2014-11-13,191.00,True,카니발 11인승,삼육서울병원 정산소 옆 주차장,서울 동대문구 휘경동 283-5,37.59,127.06,LIVING_ETC,COMMERCIAL_HOTSPOT,SCHOOL_OUT,0.21,14.68,1.38
3,Au2om8v5WY/HEZd81G/gqfWVO14pTUs124HNylTSPjU=,서울특별시 강서구,2019-02-06 16:33:35,2019-02-02 18:00:00,32,female,2015-09-26,19256.00,False,볼트EV (제주),볏골공원 공영주차장,서울 강서구 화곡동 98-86,37.54,126.84,LIVING_VILLA,LIVING_ETC,TRANSFER_SUBWAY,3.36,19.72,3.04
4,KCBn3ApdPczY3rDgOmFRYXMbpX9PgsH/LiAUqp3DJnc=,부산광역시 강서구,2019-02-05 10:48:56,2019-02-04 12:00:00,35,male,2013-07-23,1421.00,False,레이,직원전용주차장,부산 강서구 대저2동 2764-2,35.17,128.95,TRANSFER_STATION,ETC,ETC,0.23,16.17,1.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516645,X8aIsbFWJdDldihinn7oWYt46MX/OE57k6OSE0VzPOc=,전라북도 전주시 완산구,2019-08-17 22:03:46,2019-08-17 10:40:00,28,male,2017-04-30,704.00,True,더뉴아반떼,대명유료주차장,전북 전주시 완산구 중화산동2가 629-2,35.82,127.12,LIVING_APT,LIVING_VILLA,COMMERCIAL_HOTSPOT,1.13,17.08,2.02
516646,isNozhU/SlfjlXncrzi20TgQEev2hiSf+JH/13Vzt1Y=,경기도 의정부시,2019-08-16 20:29:58,2019-08-15 21:40:00,31,male,2018-03-09,10799.00,False,올뉴K3,AJ파크 의정부 센트럴타워,경기 의정부시 의정부동 494 센트럴타워,37.74,127.04,TRANSFER_SUBWAY,COMMERCIAL_HOTSPOT,,0.44,13.84,1.44
516647,DBVfEBiHB2Fq4PgtorxeD6SKjkvP+yeofER1mjQ6OD0=,서울특별시 강남구,2019-08-16 00:39:36,2019-08-15 22:00:00,28,male,2015-06-10,9080.00,False,더뉴레이,대승주차장,서울 강남구 대치동 984-12,37.50,127.07,LIVING_APT,LIVING_VILLA,COMMERCIAL_HOTSPOT,0.16,9.98,0.91
516648,2akRtQ2yPc0zFkPX1EGe72+tJPXrOhFKHmlOiwiwYY0=,서울특별시 강서구,2019-08-16 04:39:18,2019-08-15 21:40:00,21,male,2016-08-04,932.00,False,넥스트스파크,삼영주차장,서울 강서구 화곡동 110-65,37.54,126.84,LIVING_VILLA,LIVING_ETC,ETC,0.27,17.40,1.41


In [548]:
df[df.member_total_distance.eq(0)].groupby('member_id')['region'].count().gt(1)

0