In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt

In [2]:
with open('../dataset/따릉이_대여이력_2019v2.pkl','rb') as f:
    df2019=pickle.load(f)
    
with open('../dataset/따릉이_대여이력_2020v2.pkl','rb') as f:
    df2020=pickle.load(f)

In [3]:
df2019.head()

Unnamed: 0,rent_id,rent_time,return_id,return_time,travel_time,travel_distance,fare,sex,age
0,1421,2019-01-01 00:00:44,1420,2019-01-01 00:03:53,189.0,580.0,BIL_004,\N,AGE_002
1,1408,2019-01-01 00:02:16,1433,2019-01-01 00:07:07,291.0,1020.0,BIL_002,F,AGE_005
2,347,2019-01-01 00:04:12,387,2019-01-01 00:07:07,175.0,550.0,BIL_014,M,AGE_003
3,615,2019-01-01 00:03:09,612,2019-01-01 00:07:27,258.0,700.0,BIL_002,\N,AGE_002
4,1177,2019-01-01 00:01:44,1152,2019-01-01 00:08:09,385.0,1090.0,BIL_006,\N,AGE_002


In [4]:
fare2019=sorted(df2019['fare'].unique().tolist())
fare2020=sorted(df2019['fare'].unique().tolist())

In [5]:
fare2019

['BIL_001',
 'BIL_002',
 'BIL_004',
 'BIL_005',
 'BIL_006',
 'BIL_007',
 'BIL_008',
 'BIL_011',
 'BIL_012',
 'BIL_014',
 'BIL_015',
 'BIL_016',
 'BIL_017',
 'BIL_020']

In [6]:
fare2020

['BIL_001',
 'BIL_002',
 'BIL_004',
 'BIL_005',
 'BIL_006',
 'BIL_007',
 'BIL_008',
 'BIL_011',
 'BIL_012',
 'BIL_014',
 'BIL_015',
 'BIL_016',
 'BIL_017',
 'BIL_020']

#### 정기권 / 일일권 타입

정기권 - 001,002,004,005 / 011,012,014,015

일일권 - 006,007 / 016/017

In [7]:
regular=['BIL_001','BIL_002','BIL_004','BIL_005','BIL_011','BIL_012','BIL_014','BIL_015']
daily=['BIL_006','BIL_007','BIL_016','BIL_017']
others=['BIL_008', 'BIL_020','BIL_021']

In [8]:
df2019['type']=df2019['fare'].replace(regular,'Regular').replace(daily,'Daily').replace(others,'Others')
df2020['type']=df2020['fare'].replace(regular,'Regular').replace(daily,'Daily').replace(others,'Others')

In [9]:
def drop_others(df):
    df=df.drop(df[df['type']=='Others'].index).reset_index(drop=True)
    
    return df

In [10]:
df2019=drop_others(df2019)
df2020=drop_others(df2020)

### 이용시간 전처리

In [11]:
times={'BIL_001':'1H',
       'BIL_002':'1H',
       'BIL_004':'1H',
       'BIL_005':'1H',
       'BIL_006':'1H',
       'BIL_007':'1H',
       'BIL_008':'OTHER',
       'BIL_020':'OTHER',
       'BIL_021':'OTHER',
       'BIL_011':'2H',
       'BIL_012':'2H',
       'BIL_014':'2H',
       'BIL_015':'2H',
       'BIL_016':'2H',
       'BIL_017':'2H'}

In [12]:
df2019['times']=df2019['fare'].map(times)
df2020['times']=df2020['fare'].map(times)

In [13]:
df2019=df2019.drop(df2019[(df2019['travel_time']<60)].index).reset_index(drop=True)
df2020=df2020.drop(df2020[(df2020['travel_time']<60)].index).reset_index(drop=True)


df2019=df2019.drop(df2019[(df2019['times']=='1H') & (df2019['travel_time']>14400)].index).reset_index(drop=True)
df2020=df2020.drop(df2020[(df2020['times']=='1H') & (df2020['travel_time']>14400)].index).reset_index(drop=True)
df2019=df2019.drop(df2019[(df2019['times']=='2H') & (df2019['travel_time']>21600)].index).reset_index(drop=True)
df2020=df2020.drop(df2020[(df2020['times']=='2H') & (df2020['travel_time']>21600)].index).reset_index(drop=True)

### 이용 거리 전처리

In [14]:
df2019=df2019.drop(df2019[df2019['travel_distance']==0].index).reset_index(drop=True)
df2020=df2020.drop(df2020[df2020['travel_distance']==0].index).reset_index(drop=True)

df2019=df2019.drop(df2019[(df2019['times']=='1H') & (df2019['travel_distance']>65200)].index).reset_index(drop=True)
df2020=df2020.drop(df2020[(df2020['times']=='1H') & (df2020['travel_distance']>65200)].index).reset_index(drop=True)

df2019=df2019.drop(df2019[(df2019['times']=='2H') & (df2019['travel_distance']>97800)].index).reset_index(drop=True)
df2020=df2020.drop(df2020[(df2020['times']=='2H') & (df2020['travel_distance']>97800)].index).reset_index(drop=True)

### 연/월/일/주말 주중 구분

In [56]:
def year_month_day(df):
    df['year']=df['rent_time'].dt.year
    df['month']=df['rent_time'].dt.month
    df['day']=df['rent_time'].dt.day
    df['dayofweek']=df['rent_time'].dt.dayofweek
    df['hour']=df['rent_time'].dt.hour

    weeks={0:'Mon',
          1:'Tue',
          2:'Wed',
          3:'Thu',
          4:'Fri',
          5:'Sat',
          6:'Sun'}

    df['dayofweek']=df['dayofweek'].map(weeks)
    
    return df

In [57]:
df2019=year_month_day(df2019)
df2020=year_month_day(df2020)

In [17]:
type(df2019['month'].iloc[0])

numpy.int64

In [18]:
def week_hol(df):
    df_week=df.drop(df[(df['dayofweek']=='Sat') | (df['dayofweek']=='Sun')].index).reset_index(drop=True)
    df_hol=df.iloc[df[(df['dayofweek']=='Sat') | (df['dayofweek']=='Sun')].index].reset_index(drop=True)
    
    return df_week, df_hol

In [61]:
df2019_week, df2019_hol=week_hol(df2019)

print('2019 주중 : ',df2019_week.shape)
print('2019 주말 : ',df2019_hol.shape)

2019 주중 :  (13377729, 17)
2019 주말 :  (4738456, 17)


In [62]:
# 2019년 공휴일

df2019_new_hol=df2019_week.iloc[df2019_week[((df2019_week['month']==1)&(df2019_week['day']==1))|
                     ((df2019_week['month']==2)&(df2019_week['day']==4))|
                     ((df2019_week['month']==2)&(df2019_week['day']==5))|
                     ((df2019_week['month']==2)&(df2019_week['day']==6))|
                     ((df2019_week['month']==3)&(df2019_week['day']==1))|
                     ((df2019_week['month']==5)&(df2019_week['day']==5))|
                     ((df2019_week['month']==5)&(df2019_week['day']==12))|
                     ((df2019_week['month']==6)&(df2019_week['day']==6))|
                     ((df2019_week['month']==8)&(df2019_week['day']==15))|
                     ((df2019_week['month']==9)&(df2019_week['day']==12))|
                     ((df2019_week['month']==9)&(df2019_week['day']==13))|
                     ((df2019_week['month']==9)&(df2019_week['day']==14))|
                     ((df2019_week['month']==10)&(df2019_week['day']==3))|
                     ((df2019_week['month']==10)&(df2019_week['day']==9))|
                     ((df2019_week['month']==12)&(df2019_week['day']==25))].index].reset_index(drop=True)

df2019_week=df2019_week.drop(df2019_week[((df2019_week['month']==1)&(df2019_week['day']==1))|
                     ((df2019_week['month']==2)&(df2019_week['day']==4))|
                     ((df2019_week['month']==2)&(df2019_week['day']==5))|
                     ((df2019_week['month']==2)&(df2019_week['day']==6))|
                     ((df2019_week['month']==3)&(df2019_week['day']==1))|
                     ((df2019_week['month']==5)&(df2019_week['day']==5))|
                     ((df2019_week['month']==5)&(df2019_week['day']==12))|
                     ((df2019_week['month']==6)&(df2019_week['day']==6))|
                     ((df2019_week['month']==8)&(df2019_week['day']==15))|
                     ((df2019_week['month']==9)&(df2019_week['day']==12))|
                     ((df2019_week['month']==9)&(df2019_week['day']==13))|
                     ((df2019_week['month']==9)&(df2019_week['day']==14))|
                     ((df2019_week['month']==10)&(df2019_week['day']==3))|
                     ((df2019_week['month']==10)&(df2019_week['day']==9))|
                     ((df2019_week['month']==12)&(df2019_week['day']==25))].index).reset_index(drop=True)

print(df2019_week.shape)
print(df2019_new_hol.shape)

(13000705, 17)
(377024, 17)


In [63]:
df2019_hol=pd.concat([df2019_hol, df2019_new_hol]).reset_index(drop=True)
print(df2019_hol.shape)

(5115480, 17)


In [20]:
df2020_week, df2020_hol=week_hol(df2020)

print('2020 주중 : ',df2020_week.shape)
print('2020 주말 : ',df2020_hol.shape)

2020 주중 :  (5744157, 16)
2020 주말 :  (2202585, 16)


In [64]:
# 2020년 공휴일

df2020_new_hol=df2020_week.iloc[df2020_week[((df2020_week['month']==1)&(df2020_week['day']==1))|
                     ((df2020_week['month']==1)&(df2020_week['day']==24))|
                     ((df2020_week['month']==1)&(df2020_week['day']==25))|
                     ((df2020_week['month']==1)&(df2020_week['day']==26))|
                     ((df2020_week['month']==3)&(df2020_week['day']==1))|
                     ((df2020_week['month']==4)&(df2020_week['day']==30))|
                     ((df2020_week['month']==5)&(df2020_week['day']==5))|
                     ((df2020_week['month']==6)&(df2020_week['day']==6))|
                     ((df2020_week['month']==8)&(df2020_week['day']==15))|
                     ((df2020_week['month']==8)&(df2020_week['day']==17))|
                     ((df2020_week['month']==9)&(df2020_week['day']==30))|
                     ((df2020_week['month']==10)&(df2020_week['day']==1))|
                     ((df2020_week['month']==10)&(df2020_week['day']==2))|
                     ((df2020_week['month']==10)&(df2020_week['day']==3))|
                     ((df2020_week['month']==10)&(df2020_week['day']==9))|
                     ((df2020_week['month']==12)&(df2020_week['day']==25))].index].reset_index(drop=True)

df2020_week=df2020_week.drop(df2020_week[((df2020_week['month']==1)&(df2020_week['day']==1))|
                     ((df2020_week['month']==1)&(df2020_week['day']==24))|
                     ((df2020_week['month']==1)&(df2020_week['day']==25))|
                     ((df2020_week['month']==1)&(df2020_week['day']==26))|
                     ((df2020_week['month']==3)&(df2020_week['day']==1))|
                     ((df2020_week['month']==4)&(df2020_week['day']==30))|
                     ((df2020_week['month']==5)&(df2020_week['day']==5))|
                     ((df2020_week['month']==6)&(df2020_week['day']==6))|
                     ((df2020_week['month']==8)&(df2020_week['day']==15))|
                     ((df2020_week['month']==8)&(df2020_week['day']==17))|
                     ((df2020_week['month']==9)&(df2020_week['day']==30))|
                     ((df2020_week['month']==10)&(df2020_week['day']==1))|
                     ((df2020_week['month']==10)&(df2020_week['day']==2))|
                     ((df2020_week['month']==10)&(df2020_week['day']==3))|
                     ((df2020_week['month']==10)&(df2020_week['day']==9))|
                     ((df2020_week['month']==12)&(df2020_week['day']==25))].index).reset_index(drop=True)

print(df2020_week.shape)
print(df2020_new_hol.shape)

(5580480, 16)
(163677, 16)


In [65]:
df2020_hol=pd.concat([df2020_hol, df2020_new_hol]).reset_index(drop=True)
print(df2020_hol.shape)

(2366262, 16)


### 19-20 주중/주말 pickle 파일로 저장

In [66]:
## Save pickle
with open("../dataset/2019_week.pkl","wb") as fw:
    pickle.dump(df2019_week, fw)

with open("../dataset/2019_hol.pkl","wb") as fw:
    pickle.dump(df2019_hol, fw)
    
with open("../dataset/2020_week.pkl","wb") as fw:
    pickle.dump(df2020_week, fw)
    
with open("../dataset/2020_hol.pkl","wb") as fw:
    pickle.dump(df2020_hol, fw)

#### 반기 구분

In [None]:
def make_half(df):
    first=[1,2,3,4,5,6]
    second=[7,8,9,10,11,12]
    
    df_first=df[df['month'].isin(first)].reset_index(drop=True)
    df_second=df[df['month'].isin(second)].reset_index(drop=True)
    
    return df_first, df_second

#### 분기 구분

In [21]:
def make_quarter(df):
    first=[1,2,3]
    second=[4,5,6]
    third=[7,8,9]
    fourth=[10,11,12]
    
    df_first=df[df['month'].isin(first)].reset_index(drop=True)
    df_second=df[df['month'].isin(second)].reset_index(drop=True)
    df_third=df[df['month'].isin(third)].reset_index(drop=True)
    df_fourth=df[df['month'].isin(fourth)].reset_index(drop=True)
    
    return df_first, df_second, df_third, df_fourth

In [24]:
df2019_week.head()

Unnamed: 0,rent_id,rent_time,return_id,return_time,travel_time,travel_distance,fare,sex,age,type,times,year,month,day,dayofweek,hour
0,1421,2019-01-01 00:00:44,1420,2019-01-01 00:03:53,189.0,580.0,BIL_004,\N,AGE_002,Regular,1H,2019,1,1,Tue,0
1,1408,2019-01-01 00:02:16,1433,2019-01-01 00:07:07,291.0,1020.0,BIL_002,F,AGE_005,Regular,1H,2019,1,1,Tue,0
2,347,2019-01-01 00:04:12,387,2019-01-01 00:07:07,175.0,550.0,BIL_014,M,AGE_003,Regular,2H,2019,1,1,Tue,0
3,615,2019-01-01 00:03:09,612,2019-01-01 00:07:27,258.0,700.0,BIL_002,\N,AGE_002,Regular,1H,2019,1,1,Tue,0
4,1177,2019-01-01 00:01:44,1152,2019-01-01 00:08:09,385.0,1090.0,BIL_006,\N,AGE_002,Daily,1H,2019,1,1,Tue,0


In [38]:
df2019_week.groupby('rent_id')['type'].value_counts(normalize=True).unstack()

type,Daily,Regular
rent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,0.178073,0.821927
1002,0.190514,0.809486
1003,0.158406,0.841594
1004,0.176554,0.823446
1006,0.115788,0.884212
...,...,...
9993,0.059322,0.940678
9997,0.625000,0.375000
9998,0.357143,0.642857
99998,,1.000000


In [68]:
# 대여소 기준 연도에 따른 주중/주말 이용권 비율 데이터프레임 생성

def ratio_by_station(df):
    grouped=df.groupby('rent_id')['type'].value_counts(normalize=True).unstack()
    
    station_id=grouped.index
    daily=grouped['Daily'].values
    regular=grouped['Regular'].values
    
    new_df=pd.DataFrame()
    new_df['station_id']=station_id
    new_df['Daily']=daily
    new_df['Regular']=regular
    
    return new_df

In [101]:
df2019_week_station=ratio_by_station(df2019_week)
df2019_week_station.rename(columns={'Daily':'19_week_daily', 'Regular':'19_week_regular'}, inplace=True)
df2019_hol_station=ratio_by_station(df2019_hol)
df2019_hol_station.rename(columns={'Daily':'19_hol_daily', 'Regular':'19_hol_regular'}, inplace=True)
df2020_week_station=ratio_by_station(df2020_week)
df2020_week_station.rename(columns={'Daily':'20_week_daily', 'Regular':'20_week_regular'}, inplace=True)
df2020_hol_station=ratio_by_station(df2020_hol)
df2020_hol_station.rename(columns={'Daily':'20_hol_daily', 'Regular':'20_hol_regular'}, inplace=True)

df_station=pd.merge(df2019_week_station, df2019_hol_station, left_on='station_id',right_on='station_id',how='outer')
df_station=pd.merge(df_station, df2020_week_station, left_on='station_id',right_on='station_id',how='outer')
df_station=pd.merge(df_station, df2020_hol_station, left_on='station_id',right_on='station_id',how='outer')

In [102]:
print('19년도 주중 : ',df2019_week_station.shape)
print('19년도 주말 : ',df2019_hol_station.shape)
print('20년도 주중 : ',df2020_week_station.shape)
print('20년도 주말 : ',df2020_hol_station.shape)

19년도 주중 :  (1559, 3)
19년도 주말 :  (1556, 3)
20년도 주중 :  (2164, 3)
20년도 주말 :  (2105, 3)


In [103]:
df_station=df_station.reset_index(drop=True)
df_station.head(20)

Unnamed: 0,station_id,19_week_daily,19_week_regular,19_hol_daily,19_hol_regular,20_week_daily,20_week_regular,20_hol_daily,20_hol_regular
0,1001,0.172905,0.827095,0.316646,0.683354,0.258624,0.741376,0.392308,0.607692
1,1002,0.183894,0.816106,0.35635,0.64365,0.283254,0.716746,0.454603,0.545397
2,1003,0.154073,0.845927,0.25914,0.74086,0.238314,0.761686,0.364192,0.635808
3,1004,0.172014,0.827986,0.301563,0.698438,0.272103,0.727897,0.428854,0.571146
4,1006,0.113201,0.886799,0.241314,0.758686,0.197472,0.802528,0.357414,0.642586
5,1007,0.212272,0.787728,0.443936,0.556064,0.272926,0.727074,0.541985,0.458015
6,1008,0.159479,0.840521,0.293281,0.706719,0.242745,0.757255,0.403878,0.596122
7,1009,0.204528,0.795472,0.341662,0.658338,0.266499,0.733501,0.417925,0.582075
8,101,0.147218,0.852782,0.248787,0.751213,0.19162,0.80838,0.306781,0.693219
9,1010,0.142545,0.857455,0.230505,0.769495,0.198139,0.801861,0.319519,0.680481


In [96]:
sta_info=pd.read_csv('../dataset/con_sta_info.csv')
sta_info.head()

Unnamed: 0,ID,stationName,Y,X,Xn,Yn,Elev,BikeRoadNM,ToBikeRoad
0,101,101. (구)합정동 주민센터,37.549561,126.905754,947509.050934,1950191.0,9.152323,한강자전거길,771.0
1,102,102. 망원역 1번출구 앞,37.555649,126.910629,947943.924406,1950863.0,6.963737,홍제천자전거길,1065.0
2,103,103. 망원역 2번출구 앞,37.554951,126.910835,947961.634042,1950786.0,6.865565,한강자전거길,1146.0
3,104,104. 합정역 1번출구 앞,37.550629,126.914986,948325.239132,1950304.0,18.136753,한강자전거길,676.0
4,105,105. 합정역 5번출구 앞,37.550007,126.914825,948310.657239,1950235.0,17.49843,한강자전거길,962.0


In [104]:
id_info=sta_info['ID'].unique().tolist()
id_info=[str(i) for i in id_info]
id_info[:5]

['101', '102', '103', '104', '105']

In [105]:
type(id_info[0])

str

In [106]:
df_station=df_station[df_station['station_id'].isin(id_info)].reset_index(drop=True)
df_station.head()

Unnamed: 0,station_id,19_week_daily,19_week_regular,19_hol_daily,19_hol_regular,20_week_daily,20_week_regular,20_hol_daily,20_hol_regular
0,1001,0.172905,0.827095,0.316646,0.683354,0.258624,0.741376,0.392308,0.607692
1,1002,0.183894,0.816106,0.35635,0.64365,0.283254,0.716746,0.454603,0.545397
2,1003,0.154073,0.845927,0.25914,0.74086,0.238314,0.761686,0.364192,0.635808
3,1004,0.172014,0.827986,0.301563,0.698438,0.272103,0.727897,0.428854,0.571146
4,1006,0.113201,0.886799,0.241314,0.758686,0.197472,0.802528,0.357414,0.642586


In [108]:
df_station.shape

(1527, 9)

## 19->20 차이가 큰 정류소 찾기

#### 편하게 보기위해서 정기권만 가지고 확인

In [110]:
df_station_reg=df_station[['station_id','19_week_regular','19_hol_regular','20_week_regular','20_hol_regular']]
df_station_reg.head()

Unnamed: 0,station_id,19_week_regular,19_hol_regular,20_week_regular,20_hol_regular
0,1001,0.827095,0.683354,0.741376,0.607692
1,1002,0.816106,0.64365,0.716746,0.545397
2,1003,0.845927,0.74086,0.761686,0.635808
3,1004,0.827986,0.698438,0.727897,0.571146
4,1006,0.886799,0.758686,0.802528,0.642586


In [112]:
df_station_reg['week diff']=abs(df_station_reg['20_week_regular']-df_station_reg['19_week_regular'])
df_station_reg['hol diff']=abs(df_station_reg['20_hol_regular']-df_station_reg['19_hol_regular'])

df_station_reg.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,station_id,19_week_regular,19_hol_regular,20_week_regular,20_hol_regular,week diff,hol diff
0,1001,0.827095,0.683354,0.741376,0.607692,0.085718,0.075662
1,1002,0.816106,0.64365,0.716746,0.545397,0.099361,0.098253
2,1003,0.845927,0.74086,0.761686,0.635808,0.084241,0.105052
3,1004,0.827986,0.698438,0.727897,0.571146,0.100089,0.127291
4,1006,0.886799,0.758686,0.802528,0.642586,0.084271,0.116101


In [113]:
def make_sign(data1, data2):
    if data1 > data2 :
        return '-'
    elif data2 > data1 :
        return '+'
    else:
        return '0'

In [115]:
df_station_reg['week diff sign']=df_station_reg.apply(lambda x:make_sign(x['19_week_regular'], x['20_week_regular']), axis=1)
df_station_reg['hol diff sign']=df_station_reg.apply(lambda x:make_sign(x['19_hol_regular'], x['20_hol_regular']), axis=1)

df_station_reg.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,station_id,19_week_regular,19_hol_regular,20_week_regular,20_hol_regular,week diff,hol diff,week diff sign,hol diff sign
0,1001,0.827095,0.683354,0.741376,0.607692,0.085718,0.075662,-,-
1,1002,0.816106,0.64365,0.716746,0.545397,0.099361,0.098253,-,-
2,1003,0.845927,0.74086,0.761686,0.635808,0.084241,0.105052,-,-
3,1004,0.827986,0.698438,0.727897,0.571146,0.100089,0.127291,-,-
4,1006,0.886799,0.758686,0.802528,0.642586,0.084271,0.116101,-,-


In [117]:
df_station_reg['week diff sign'].value_counts()

-    1496
+      31
Name: week diff sign, dtype: int64

In [174]:
# 정기권의 이용 비율이 증가한 대여소 (주중)

station_week_plus=df_station_reg[df_station_reg['week diff sign']=='+']['station_id'].tolist()
print(station_week_plus)
print('크기 : ',len(station_week_plus))

['1025', '1101', '1241', '1296', '1304', '1370', '1376', '1424', '1652', '1679', '1910', '198', '2141', '2180', '2225', '2275', '2357', '2364', '2385', '2392', '315', '322', '330', '337', '346', '3527', '447', '449', '462', '575', '802']
크기 :  31


In [118]:
df_station_reg['hol diff sign'].value_counts()

-    1472
+      55
Name: hol diff sign, dtype: int64

In [179]:
# 정기권의 이용 비율이 증가한 대여소 (주말)

station_hol_plus=df_station_reg[df_station_reg['hol diff sign']=='+']['station_id'].tolist()
print(station_hol_plus)
print('크기 : ',len(station_hol_plus))

['1108', '1220', '127', '1296', '1304', '1316', '136', '1370', '1457', '1510', '1670', '1910', '199', '1993', '2061', '2127', '2130', '2141', '2180', '2233', '2239', '2266', '2272', '2280', '2328', '2333', '2406', '2409', '2502', '2609', '2610', '2612', '307', '315', '322', '330', '336', '346', '3529', '3532', '354', '380', '391', '453', '454', '455', '462', '575', '580', '608', '640', '802', '812', '846', '963']
크기 :  55


In [182]:
# 정기권의 이용 비율이 증가한 대여소 교집합 (주중-주말)

print(set(station_week_plus).intersection(set(station_hol_plus)))
print(len(set(station_week_plus).intersection(set(station_hol_plus))))

{'322', '330', '802', '1304', '2180', '462', '346', '2141', '1296', '1370', '315', '1910', '575'}
13


In [120]:
df_station_reg['week diff'].describe()

count    1527.000000
mean        0.059412
std         0.030902
min         0.000593
25%         0.038381
50%         0.055540
75%         0.077492
max         0.207296
Name: week diff, dtype: float64

In [133]:
df_station_reg['hol diff'].describe()

count    1527.000000
mean        0.072593
std         0.039381
min         0.000163
25%         0.045215
50%         0.070242
75%         0.095023
max         0.284067
Name: hol diff, dtype: float64

### Quantile 별 소속 대여소 구분

#### 주중

In [164]:
week_qt1=df_station_reg['week diff'].quantile(.25)
week_qt2=df_station_reg['week diff'].quantile(.5)
week_qt3=df_station_reg['week diff'].quantile(.75)

In [165]:
station_week_qt1=df_station_reg[df_station_reg['week diff'] < week_qt1]['station_id'].tolist()
station_week_qt2=df_station_reg[(df_station_reg['week diff'] >= week_qt1)&(df_station_reg['week diff'] < week_qt2)]['station_id'].tolist()
station_week_qt3=df_station_reg[(df_station_reg['week diff'] >= week_qt2)&(df_station_reg['week diff'] < week_qt3)]['station_id'].tolist()
station_week_qt4=df_station_reg[df_station_reg['week diff'] >= week_qt3]['station_id'].tolist()

In [166]:
iqr=week_qt3-week_qt1
out_up=week_qt3+1.5*iqr
out_down=week_qt1-1.5*iqr

print(out_up, out_down)

0.1361574894727374 -0.02028431900019459


In [167]:
# 이상치로 간주된 정류소

station_outlier_week=df_station_reg[df_station_reg['week diff'] > out_up]['station_id'].tolist()
print(station_outlier_week)
print('크기 : ',len(station_outlier_week))

['1039', '1052', '1056', '1058', '1359', '1445', '1450', '1452', '1664', '1816', '1824', '1825', '1826', '1847', '1857', '1858', '1937', '1951', '1952', '1987', '2038', '2130', '221', '3525', '502', '524', '928', '938', '966']
크기 :  29


#### 주말

In [168]:
hol_qt1=df_station_reg['hol diff'].quantile(.25)
hol_qt2=df_station_reg['hol diff'].quantile(.5)
hol_qt3=df_station_reg['hol diff'].quantile(.75)

In [169]:
station_hol_qt1=df_station_reg[df_station_reg['hol diff'] < hol_qt1]['station_id'].tolist()
station_hol_qt2=df_station_reg[(df_station_reg['hol diff'] >= hol_qt1)&(df_station_reg['hol diff'] < hol_qt2)]['station_id'].tolist()
station_hol_qt3=df_station_reg[(df_station_reg['hol diff'] >= hol_qt2)&(df_station_reg['hol diff'] < hol_qt3)]['station_id'].tolist()
station_hol_qt4=df_station_reg[df_station_reg['hol diff'] >= hol_qt3]['station_id'].tolist()

In [170]:
iqr=hol_qt3-hol_qt1
out_up=hol_qt3+1.5*iqr
out_down=hol_qt1-1.5*iqr

print(out_up, out_down)

0.16973560896893147 -0.029498094943539402


In [171]:
# 이상치로 간주된 정류소

station_outlier_hol=df_station_reg[df_station_reg['hol diff'] > out_up]['station_id'].tolist()
print(station_outlier_hol)
print('크기 : ',len(station_outlier_hol))

['1038', '1057', '1058', '1406', '1421', '1819', '1826', '1835', '1836', '1847', '1857', '1858', '1929', '1952', '1953', '1969', '1999', '206', '2201', '2287', '2309', '2341', '3524', '3528', '3539', '827', '902']
크기 :  27


In [173]:
# 주중/주말 모두 변화가 큰 대여소

set(station_outlier_week).intersection(set(station_outlier_hol))

{'1058', '1826', '1847', '1857', '1858', '1952'}