In [9]:
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib import font_manager, rc
import seaborn as sns 
sns.set_style("darkgrid")
%matplotlib inline 
rc("font", family="Malgun Gothic")
plt.rcParams["axes.unicode_minus"] = False 

import warnings
import os
warnings.filterwarnings(action='ignore')

In [10]:
data_path = os.getcwd()

In [11]:
df_triplog = pd.read_csv(os.path.join(data_path, 'socar_reservation_triplog.csv'))

In [13]:
df_triplog.columns

Index(['reservation_id', 'car_id', 'member_id_encrypted', 'region',
       'reservation_return_at', 'reservation_start_at', 'member_age',
       'member_gender', 'member_created_date', 'member_total_distance',
       'is_vroom', 'car_name', 'zone_name', 'zone_address', 'zone_lat',
       'zone_lng', 'zone_type1', 'zone_type2', 'zone_type3',
       'reservation_created_lat', 'reservation_created_lng', 'trip'],
      dtype='object')

## 추가, 수정 컬럼
- usage_time : reservation_return_at - reservation_start_at (분 단위)
- trip : 지역구 중복 제거 (순서X)

In [25]:
def preprocess(df):
    # 사용 컬럼 선택
    use_col = ['reservation_id', 'region', 'reservation_return_at', 'reservation_start_at', 'member_age', 
           'member_gender', 'member_total_distance', 'is_vroom', 'car_name', 'zone_address', 'zone_lat', 'zone_lng','trip']
    df = df[use_col]
    
    # member_gender 와 trip 결측 데이터 제거
    df = df.loc[df['member_gender'].notnull() & df['trip'].notnull()]
    

    df['reservation_return_at'] =  pd.to_datetime(df['reservation_return_at'], format='%Y-%m-%d %H:%M:%S')
    df['reservation_start_at'] =  pd.to_datetime(df['reservation_start_at'], format='%Y-%m-%d %H:%M:%S')
    df.dropna(how='any', inplace=True)

    # 사용시간 (분), 5분 미만 데이터는 제외
    df['usage_time'] = (df['reservation_return_at'] - df['reservation_start_at']).astype('timedelta64[m]')
    df = df.loc[df['usage_time']>5]
    
    # trip의 지역구 중복 제거
    df['trip'] = df['trip'].apply(lambda x : ','.join(list(set(x.split(',')))))
    return df.copy()

In [29]:
a.columns

Index(['reservation_id', 'region', 'reservation_return_at',
       'reservation_start_at', 'member_age', 'member_gender',
       'member_total_distance', 'is_vroom', 'car_name', 'zone_address',
       'zone_lat', 'zone_lng', 'trip', 'usage_time'],
      dtype='object')

In [5]:
df_triplog.columns

Index(['reservation_id', 'car_id', 'member_id_encrypted', 'region',
       'reservation_return_at', 'reservation_start_at', 'member_age',
       'member_gender', 'member_created_date', 'member_total_distance',
       'is_vroom', 'car_name', 'zone_name', 'zone_address', 'zone_lat',
       'zone_lng', 'zone_type1', 'zone_type2', 'zone_type3',
       'reservation_created_lat', 'reservation_created_lng', 'trip'],
      dtype='object')

In [6]:
min(df_triplog['reservation_start_at']), max(df_triplog['reservation_start_at'])

('2018-12-25 14:10:00', '2019-11-30 23:30:00')

In [8]:
df_triplog['member_gender'].unique()

array(['male', 'female', nan], dtype=object)

In [7]:
df_triplog = df_triplog.loc[df_triplog['member_gender'].notnull() & df_triplog['trip'].notnull()]

In [8]:
df_usage.columns

Index(['reservation_id', 'region', 'reservation_return_at',
       'reservation_start_at', 'member_age', 'member_gender', 'car_name',
       'zone_name', 'zone_address', 'zone_lat', 'zone_lng', 'zone_type1',
       'zone_type2', 'zone_type3'],
      dtype='object')

In [11]:
use_col = ['reservation_id', 'region', 'reservation_return_at', 'reservation_start_at', 'member_age', 
           'member_gender', 'zone_address', 'zone_lat', 'zone_lng']
df_usage = df_usage[use_col]

use_col = ['reservation_id', 'member_gender', 'member_total_distance', 'is_vroom', 'car_name', 'trip']
df_triplog = df_triplog[use_col]

In [9]:
df_join = pd.merge(df_usage, df_triplog, how='inner', on='reservation_id')

In [18]:
df_usage.loc[df_usage['reservation_id']==20283135]

Unnamed: 0,reservation_id,region,reservation_return_at,reservation_start_at,member_age,member_gender,car_name,zone_name,zone_address,zone_lat,zone_lng,zone_type1,zone_type2,zone_type3
6433,20283135,서울특별시 동대문구,2019-02-02 09:27:15,2019-02-02 06:30:00,41,male,카니발 11인승,회기역(삼육서울병원),서울 동대문구 휘경동 283-5,37.589523,127.063044,SCHOOL_OUT,COMMERCIAL_HOTSPOT,LIVING_ETC


In [19]:
df_triplog.loc[df_triplog['reservation_id']==20283135]

Unnamed: 0,reservation_id,car_id,member_id_encrypted,region,reservation_return_at,reservation_start_at,member_age,member_gender,member_created_date,member_total_distance,...,zone_name,zone_address,zone_lat,zone_lng,zone_type1,zone_type2,zone_type3,reservation_created_lat,reservation_created_lng,trip
6297,20283135,11865,KkyDio56SwjqcZ7on2ABGMAKLFbEB7hRfcZ4DJ5aBlg=,서울특별시 동대문구,2019-02-02 09:27:15,2019-02-02 06:30:00,41,male,2014-11-13,191.0,...,삼육서울병원 정산소 옆 주차장,서울 동대문구 휘경동 283-5,37.589523,127.063044,LIVING_ETC,COMMERCIAL_HOTSPOT,SCHOOL_OUT,,,경기도 구리시


In [10]:
df_join.columns

Index(['reservation_id', 'region_x', 'reservation_return_at_x',
       'reservation_start_at_x', 'member_age_x', 'member_gender_x',
       'car_name_x', 'zone_name_x', 'zone_address_x', 'zone_lat_x',
       'zone_lng_x', 'zone_type1_x', 'zone_type2_x', 'zone_type3_x', 'car_id',
       'member_id_encrypted', 'region_y', 'reservation_return_at_y',
       'reservation_start_at_y', 'member_age_y', 'member_gender_y',
       'member_created_date', 'member_total_distance', 'is_vroom',
       'car_name_y', 'zone_name_y', 'zone_address_y', 'zone_lat_y',
       'zone_lng_y', 'zone_type1_y', 'zone_type2_y', 'zone_type3_y',
       'reservation_created_lat', 'reservation_created_lng', 'trip'],
      dtype='object')

In [17]:
df_join

Unnamed: 0,reservation_id,region_x,reservation_return_at_x,reservation_start_at_x,member_age_x,member_gender_x,car_name_x,zone_name_x,zone_address_x,zone_lat_x,...,zone_name_y,zone_address_y,zone_lat_y,zone_lng_y,zone_type1_y,zone_type2_y,zone_type3_y,reservation_created_lat,reservation_created_lng,trip
0,20277450,서울특별시 서대문구,2019-02-04 23:04:15,2019-02-01 15:30:00,35,male,볼트EV (제주),yesAPM주차장,서울 서대문구 대현동 145,37.557541,...,yesAPM주차장,서울 서대문구 대현동 145,37.557541,126.944977,TRANSFER_SUBWAY,COMMERCIAL_HOTSPOT,ETC,,,"광주광역시 남구,광주광역시 남구,광주광역시 동구,광주광역시 동구,광주광역시 동구,광..."
1,20283135,서울특별시 동대문구,2019-02-02 09:27:15,2019-02-02 06:30:00,41,male,카니발 11인승,회기역(삼육서울병원),서울 동대문구 휘경동 283-5,37.589523,...,삼육서울병원 정산소 옆 주차장,서울 동대문구 휘경동 283-5,37.589523,127.063044,LIVING_ETC,COMMERCIAL_HOTSPOT,SCHOOL_OUT,,,경기도 구리시
2,20300527,서울특별시 강서구,2019-02-06 16:33:35,2019-02-02 18:00:00,32,female,볼트EV (제주),볏골공원 공영주차장,서울 강서구 화곡동 98-86,37.541569,...,볏골공원 공영주차장,서울 강서구 화곡동 98-86,37.541569,126.844612,LIVING_VILLA,LIVING_ETC,TRANSFER_SUBWAY,,,"충청북도 보은군,서울특별시 양천구,강원도 원주시,경상북도 의성군,경상북도 의성군,경..."
3,20319207,부산광역시 강서구,2019-02-05 10:48:56,2019-02-04 12:00:00,35,male,레이,직원전용주차장,부산 강서구 대저2동 2764-2,35.171036,...,직원전용주차장,부산 강서구 대저2동 2764-2,35.171036,128.951645,TRANSFER_STATION,ETC,ETC,,,부산광역시 사하구
4,20320848,부산광역시 강서구,2019-02-05 12:00:12,2019-02-04 12:40:00,44,male,티볼리(경유),직원전용주차장,부산 강서구 대저2동 2764-2,35.171036,...,직원전용주차장,부산 강서구 대저2동 2764-2,35.171036,128.951645,TRANSFER_STATION,ETC,ETC,,,"부산광역시 금정구,부산광역시 금정구,부산광역시 동래구,부산광역시 해운대구"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394462,27626153,경기도 고양시 덕양구,2019-11-30 23:14:33,2019-11-30 21:20:00,28,male,더뉴레이,이케아 고양점,경기 고양시 덕양구 도내동 1003 이케아 고양점,37.629789,...,이케아 고양점,경기 고양시 덕양구 도내동 1003 이케아 고양점,37.629789,126.862958,LIVING_APT,COMMERCIAL_OFFICE,COMMERCIAL_HOTSPOT,37.627940,126.868969,경기도 고양시 덕양구
394463,27626177,경기도 김포시,2019-11-30 23:18:07,2019-11-30 21:20:00,28,male,아반떼AD,산타마리아 상가주차장,경기 김포시 장기동 1618,37.644100,...,산타마리아 상가주차장,경기 김포시 장기동 1618,37.644100,126.667442,LIVING_ETC,TRANSFER_SUBWAY,COMMERCIAL_HOTSPOT,37.644402,126.667155,경기도 김포시
394464,27626714,서울특별시 강남구,2019-11-30 23:54:04,2019-11-30 22:00:00,37,male,더뉴레이,군인공제회관,서울 강남구 도곡동 467-13,37.488924,...,군인공제회관,서울 강남구 도곡동 467-13,37.488924,127.052720,LIVING_APT,LIVING_VILLA,TRANSFER_SUBWAY,37.487583,127.053556,서울특별시 강남구
394465,27626872,경기도 화성시,2019-11-30 23:35:34,2019-11-30 22:10:00,29,male,아반떼AD,세림타운,경기 화성시 반송동 46,37.207886,...,세림타운,경기 화성시 반송동 46,37.207886,127.062042,LIVING_ETC,ETC,ETC,37.208541,127.061897,경기도 화성시


In [13]:
len(df_join.loc[df_join['zone_type1_x'] != df_join['zone_type1_y']])

43843

In [15]:
df_join.loc[df_join['zone_type1_x'] != df_join['zone_type1_y'], ['zone_type1_x', 'zone_type1_y']]

Unnamed: 0,zone_type1_x,zone_type1_y
1,SCHOOL_OUT,LIVING_ETC
6,TRANSFER_SUBWAY,LIVING_APT
18,,
27,,
33,,
...,...,...
394413,COMMERCIAL_HOTSPOT,TRANSFER_SUBWAY
394417,,LIVING_ETC
394420,COMMERCIAL_HOTSPOT,LIVING_APT
394436,LIVING_VILLA,SCHOOL_OUT


In [13]:
df_join.columns

Index(['reservation_id', 'region', 'reservation_return_at',
       'reservation_start_at', 'member_age', 'member_gender_x', 'zone_address',
       'zone_lat', 'zone_lng', 'member_gender_y', 'member_total_distance',
       'is_vroom', 'car_name', 'trip'],
      dtype='object')

In [14]:
# member gender가 다른 경우 제외
df_join = df_join.loc[df_join['member_gender_x'] == df_join['member_gender_y']]

In [15]:
df_join.drop(columns='member_gender_y', inplace=True)

In [16]:
df_join.rename(columns={'member_gender_x': 'member_gender'}, inplace=True)

In [17]:
df_join.columns

Index(['reservation_id', 'region', 'reservation_return_at',
       'reservation_start_at', 'member_age', 'member_gender', 'zone_address',
       'zone_lat', 'zone_lng', 'member_total_distance', 'is_vroom', 'car_name',
       'trip'],
      dtype='object')

In [18]:
df_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394399 entries, 0 to 394466
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   reservation_id         394399 non-null  int64  
 1   region                 394399 non-null  object 
 2   reservation_return_at  394399 non-null  object 
 3   reservation_start_at   394399 non-null  object 
 4   member_age             394399 non-null  int64  
 5   member_gender          394399 non-null  object 
 6   zone_address           394399 non-null  object 
 7   zone_lat               394399 non-null  float64
 8   zone_lng               394399 non-null  float64
 9   member_total_distance  389313 non-null  float64
 10  is_vroom               394399 non-null  bool   
 11  car_name               394399 non-null  object 
 12  trip                   394399 non-null  object 
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 39.5+ MB


In [19]:
df_join['reservation_return_at'] =  pd.to_datetime(df_join['reservation_return_at'], format='%Y-%m-%d %H:%M:%S')
df_join['reservation_start_at'] =  pd.to_datetime(df_join['reservation_start_at'], format='%Y-%m-%d %H:%M:%S')

In [20]:
# 결측치 5개 제거
df_join.dropna(how='any', inplace=True)

In [None]:
df_join.dropna(how='any', inplace=True)

# 사용시간 (분), 5분 미만 데이터는 제외
df_join['usage_time'] = (df_join['reservation_return_at'] - df_join['reservation_start_at']).astype('timedelta64[m]')
df_join = df_join.loc[df_join['usage_time']>5]

In [21]:
# 사용시간 (분), 5분 미만 데이터는 제외
df_join['usage_time'] = (df_join['reservation_return_at'] - df_join['reservation_start_at']).astype('timedelta64[m]')
df_join = df_join.loc[df_join['usage_time']>5]

In [22]:
len(df_join)

389308

In [23]:
df_join.columns

Index(['reservation_id', 'region', 'reservation_return_at',
       'reservation_start_at', 'member_age', 'member_gender', 'zone_address',
       'zone_lat', 'zone_lng', 'member_total_distance', 'is_vroom', 'car_name',
       'trip', 'usage_time'],
      dtype='object')

In [24]:
# trip의 지역구 중복 제거
df_join['trip'] = df_join['trip'].apply(lambda x : ','.join(list(set(x.split(',')))))[0]

In [30]:
df_join.head()

Unnamed: 0,reservation_id,region,reservation_return_at,reservation_start_at,member_age,member_gender,zone_address,zone_lat,zone_lng,member_total_distance,is_vroom,car_name,trip,usage_time
0,20277450,서울특별시 서대문구,2019-02-04 23:04:15,2019-02-01 15:30:00,35,male,서울 서대문구 대현동 145,37.557541,126.944977,8774.0,False,볼트EV (제주),"충청남도 예산군,전라남도 화순군,광주광역시 북구,전라북도 정읍시,광주광역시 남구,광...",4774.0
1,20283135,서울특별시 동대문구,2019-02-02 09:27:15,2019-02-02 06:30:00,41,male,서울 동대문구 휘경동 283-5,37.589523,127.063044,191.0,True,카니발 11인승,"충청남도 예산군,전라남도 화순군,광주광역시 북구,전라북도 정읍시,광주광역시 남구,광...",177.0
2,20300527,서울특별시 강서구,2019-02-06 16:33:35,2019-02-02 18:00:00,32,female,서울 강서구 화곡동 98-86,37.541569,126.844612,19256.0,False,볼트EV (제주),"충청남도 예산군,전라남도 화순군,광주광역시 북구,전라북도 정읍시,광주광역시 남구,광...",5673.0
3,20319207,부산광역시 강서구,2019-02-05 10:48:56,2019-02-04 12:00:00,35,male,부산 강서구 대저2동 2764-2,35.171036,128.951645,1421.0,False,레이,"충청남도 예산군,전라남도 화순군,광주광역시 북구,전라북도 정읍시,광주광역시 남구,광...",1368.0
4,20320848,부산광역시 강서구,2019-02-05 12:00:12,2019-02-04 12:40:00,44,male,부산 강서구 대저2동 2764-2,35.171036,128.951645,1271.0,False,티볼리(경유),"충청남도 예산군,전라남도 화순군,광주광역시 북구,전라북도 정읍시,광주광역시 남구,광...",1400.0


In [None]:
# 위도, 경도 이상치는 없는 것 같다

In [32]:
min(df_join['zone_lat']), max(df_join['zone_lat'])

(35.083052, 37.760854)

In [33]:
min(df_join['zone_lng']), max(df_join['zone_lng'])

(126.597065, 128.952002)

In [None]:
######################################

### 같은 reservation_id 임에도 성별이 다른 경우 제거

In [73]:
df_usage.loc[df_usage['reservation_id']==20877853, 'member_gender']

69487    female
Name: member_gender, dtype: object

In [74]:
df_triplog.loc[df_triplog['reservation_id']==20877853, 'member_gender']

67728    male
Name: member_gender, dtype: object

In [76]:
df_join = df_join.loc[df_join['member_gender_x'] == df_join['member_gender_y']]