## Import library

In [1]:
import pandas as pd
import numpy as np
import os, random, warnings, gc, psutil, datetime
from tqdm import tqdm_notebook, tqdm

from multiprocessing import Pool

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import lightgbm as lgbm

from glob import glob
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans, KMeans

# Set options
pd.set_option('max_columns',500)
pd.set_option('max_rows',500)
pd.options.display.max_colwidth = 300

warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_palette('bright')

In [2]:
path = '../raw_dataset/'

## Load Dataset

In [3]:
df_bus = pd.read_csv(path + 'bus_bts.csv')

In [4]:
df_bus['geton_date'] = pd.to_datetime(df_bus['geton_date'])

In [5]:
# 주중에 정기적으로 타는 사람들
df_bus['weekday'] = df_bus['geton_date'].dt.weekday

df_weekday = df_bus[df_bus['weekday']<5]

df_commuter = df_weekday.groupby(['user_card_id','geton_station_code']).size().reset_index()
df_commuter.columns = ['user_card_id','geton_station_code','num_usage']

df_commuter = df_commuter[df_commuter['num_usage']>=10].reset_index(drop=True)
df_commuter = df_commuter.groupby('geton_station_code')['user_card_id'].count()

df_commuter = df_commuter.reset_index()
df_commuter.columns = ['station_code','regular_commuter_count']

In [6]:
# 해당 정류장에 12시 이후에 몇명이 내렸는지
df_afternoon = df_bus[df_bus['getoff_time']>='12:00:00'][['bus_route_id','getoff_date','getoff_station_code','getoff_time','user_category','user_count']]
df_afternoon_getoff_amount = df_afternoon.groupby(['bus_route_id','getoff_date','getoff_station_code'])['user_count'].sum().reset_index()
df_afternoon_getoff_amount = df_afternoon_getoff_amount.rename(columns = {'user_count' : 'afternoon_takeoff'})

In [7]:
# 해당 정류장에 같은 노선의 버스가 직전 몇분 전에 왔었는지
first_passenger_tagtime = df_bus.groupby(['geton_date', 'bus_route_id', 'vhc_id','geton_station_code'])['geton_time'].min().reset_index()


first_passenger_tagtime = first_passenger_tagtime.sort_values(by=['geton_date','bus_route_id','geton_station_code','geton_time']).reset_index(drop=True)

first_passenger_tagtime['geton_time_second']= first_passenger_tagtime['geton_time'].apply(lambda x: 60*60 *int(x.split(':')[0] ) +\
                                                                                                    60 * int(x.split(':')[1]) +\
                                                                                                          int(x.split(':')[2]) )

first_passenger_tagtime['next_bus_time_diff'] = first_passenger_tagtime.groupby(['geton_date','bus_route_id','geton_station_code'])['geton_time_second'].diff()
date_route_stataion_waittime = first_passenger_tagtime.groupby(['geton_date','bus_route_id','geton_station_code'])['next_bus_time_diff'].mean().reset_index()

date_route_stataion_waittime = date_route_stataion_waittime.groupby(['geton_date','bus_route_id'])['next_bus_time_diff'].mean()
date_route_stataion_waittime =date_route_stataion_waittime.reset_index()

In [8]:
# 6~9, 9~12시 사이에 각기 다른 집단의 사람들이 몇명 탑승했는지
bus_sample = df_bus[['geton_date','geton_station_code','geton_time','user_category','user_count']].copy()
bus_sample['geton_morning'] = bus_sample['geton_time'].apply(lambda x: int(x.split(':')[0]) <=9 )
bus_passender_cluster_count = bus_sample.groupby(['geton_date','geton_station_code','geton_morning','user_category'])['user_count'].sum().reset_index()

bus_passender_cluster_count_morning = bus_passender_cluster_count[bus_passender_cluster_count['geton_morning']==True]
geton_bus_passender_cluster_count_morning = pd.pivot_table( bus_passender_cluster_count_morning, index = ['geton_date', 'geton_station_code'],
                                columns=['user_category'], values = ['user_count'], aggfunc='sum').reset_index()

geton_bus_passender_cluster_count_morning.columns = ['geton_date', 'geton_station_code']  +\
                            ['getin_user_count1_morning','getin_user_count2_morning','getin_user_count4_morning','getin_user_count6_morning','d1','d2','d3','d4']

geton_bus_passender_cluster_count_morning = geton_bus_passender_cluster_count_morning.drop(['d1','d2','d3','d4'],1)


In [9]:
def calculate_getoff_time(val):
    if val <= 9 :
        return 0
    elif val <= 12:
        return 1
    else:
        return 2

In [10]:
# 6~9, 9~12시 사이에 각기 다른 집단의 사람들이 몇명 내렸는지
bus_sample = df_bus[['geton_date','getoff_station_code','getoff_time','user_category','user_count']].copy()
bus_sample = bus_sample[bus_sample['getoff_time'].notnull()]
bus_sample['getoff_hour'] =  bus_sample['getoff_time'].apply(lambda x: int(x.split(':')[0]) )
bus_sample['getoff_hour'] =  bus_sample['getoff_hour'].apply(calculate_getoff_time)

bus_passender_cluster_count = bus_sample.groupby(['geton_date','getoff_station_code','getoff_hour','user_category'])['user_count'].sum().reset_index()

takeoff_bus_passender_cluster_count_noon = bus_passender_cluster_count[bus_passender_cluster_count['getoff_hour']==1]


takeoff_bus_passender_cluster_count_noon = pd.pivot_table( takeoff_bus_passender_cluster_count_noon, index = ['geton_date', 'getoff_station_code'],
                                                                    columns=['user_category'], values = ['user_count'], aggfunc='sum').reset_index()



takeoff_bus_passender_cluster_count_noon.columns = ['geton_date', 'getoff_station_code']  +\
                            ['takeoff_user_count1_noon','takeoff_user_count2_noon','takeoff_user_count4_noon','takeoff_user_count6_noon','d1','d2','d3','d4']

takeoff_bus_passender_cluster_count_noon = takeoff_bus_passender_cluster_count_noon.drop(['d1','d2','d3','d4'],1)


## Main Dataset

In [11]:
train = pd.read_csv(path+'train.csv', parse_dates =['date'])
test = pd.read_csv(path+'test.csv', parse_dates =['date'])

In [12]:
n_trn = len(train)
target_col = '18~20_ride'

In [13]:
# Make a whole dataset
combined = train.append(test, ignore_index=True)

In [14]:
combined.head()

Unnamed: 0,10~11_ride,10~11_takeoff,11~12_ride,11~12_takeoff,18~20_ride,6~7_ride,6~7_takeoff,7~8_ride,7~8_takeoff,8~9_ride,8~9_takeoff,9~10_ride,9~10_takeoff,bus_route_id,date,id,in_out,latitude,longitude,station_code,station_name
0,2.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,5.0,0.0,4270000,2019-09-01,0,시외,33.4899,126.49373,344,제주썬호텔
1,5.0,0.0,6.0,0.0,5.0,1.0,0.0,4.0,0.0,4.0,0.0,2.0,0.0,4270000,2019-09-01,1,시외,33.48944,126.48508,357,한라병원
2,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,4270000,2019-09-01,2,시외,33.48181,126.47352,432,정존마을
3,14.0,0.0,16.0,0.0,53.0,0.0,0.0,17.0,0.0,6.0,0.0,26.0,0.0,4270000,2019-09-01,3,시내,33.50577,126.49252,1579,제주국제공항(600번)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4270000,2019-09-01,4,시내,33.25579,126.4126,1646,중문관광단지입구


In [15]:
# Sequence of the station
combined['station_sequence'] = 1
combined['station_reverse_sequence'] = combined[::-1].groupby(['date','bus_route_id'])['station_sequence'].cumsum()[::-1]
combined['station_sequence'] = combined.groupby(['date','bus_route_id'])['station_sequence'].cumsum()

In [16]:
# Change the dtype of "date"
combined['weekday'] = combined['date'].dt.weekday.astype(np.int8)

In [17]:
# Holidays
national_holidays = [datetime.date(2019, 9,12),datetime.date(2019, 9,13), datetime.date(2019, 9,14),
                        datetime.date(2019, 10,3), datetime.date(2019, 10,9)]
combined['is_national_holiday'] = combined['date'].apply(lambda x: x in national_holidays).astype(np.int8)

In [18]:
# Sum-up the number of passengers for two intervers
morining_getin_cols = ['6~7_ride', '7~8_ride', '8~9_ride']
noon_getin_cols = ['9~10_ride', '10~11_ride', '11~12_ride']

morning_takeoff_cols = ['6~7_takeoff', '7~8_takeoff','8~9_takeoff']
noon_takeoff_cols = ['9~10_takeoff', '10~11_takeoff', '11~12_takeoff']

# Morning getin/takeoff & Noon getin/takeoff
combined['morning_getin'] = combined[morining_getin_cols].sum(axis=1)
combined['morning_takeoff'] = combined[morning_takeoff_cols].sum(axis=1)

combined['noon_getin'] = combined[noon_getin_cols].sum(axis=1)
combined['noon_takeoff'] = combined[noon_takeoff_cols].sum(axis=1)

combined = combined.drop(morining_getin_cols  + noon_getin_cols + morning_takeoff_cols + noon_takeoff_cols ,1)

In [19]:
# STATION_CODE

# Sum of passenger per morning (getin)  
station_morning_getin_sum = combined.groupby(['date','station_code'])['morning_getin'].sum().reset_index()
station_morning_getin_sum = station_morning_getin_sum.rename(columns = {'morning_getin': 'station_morning_getin_sum'})

# Sum of passenger per morning (takeoff)  
station_morning_takeoff_sum = combined.groupby(['date','station_code'])['morning_takeoff'].sum().reset_index()
station_morning_takeoff_sum = station_morning_takeoff_sum.rename(columns = {'morning_takeoff': 'station_morning_takeoff_sum'})

# Merge
combined = pd.merge(combined, station_morning_getin_sum , on =['date','station_code'], how='left')
combined = pd.merge(combined, station_morning_takeoff_sum , on =['date','station_code'], how='left')

In [20]:
# BUS_ROUTE

# Sum of passenger per morning (getin)  
bus_route_getin_sum = combined.groupby(['date','bus_route_id'])['morning_getin'].sum().reset_index()
bus_route_getin_sum = bus_route_getin_sum.rename(columns = {'morning_getin': 'bus_route_getin_sum'})

# Sum of passenger per morning (takeoff)  
bus_route_takeoff_sum = combined.groupby(['date','bus_route_id'])['morning_takeoff'].sum().reset_index()
bus_route_takeoff_sum = bus_route_takeoff_sum.rename(columns = {'morning_takeoff': 'bus_route_takeoff_sum'})

# Merge
combined = pd.merge(combined, bus_route_getin_sum , on =['date','bus_route_id'], how='left')
combined = pd.merge(combined, bus_route_takeoff_sum , on =['date','bus_route_id'], how='left')

In [21]:
# STATION_CODE

# mean of passenger per morning (getin)  -- noon getin not working
station_morning_getin_mean = combined.groupby(['date','station_code'])['morning_getin'].mean().reset_index()
station_morning_getin_mean = station_morning_getin_mean.rename(columns = {'morning_getin': 'station_morning_getin_mean'})

# mean of passenger per morning (getin)  -- noon getin not working
station_morning_takeoff_mean = combined.groupby(['date','station_code'])['morning_takeoff'].mean().reset_index()
station_morning_takeoff_mean = station_morning_takeoff_mean.rename(columns = {'morning_takeoff': 'station_morning_takeoff_mean'})

# Merge
combined = pd.merge(combined, station_morning_getin_mean , on =['date','station_code'], how='left')
combined = pd.merge(combined, station_morning_takeoff_mean , on =['date','station_code'], how='left')


In [22]:
# BUS_ROUTE

# mean of passenger per morning (getin)  
bus_route_getin_mean = combined.groupby(['date','bus_route_id'])['morning_getin'].mean().reset_index()
bus_route_getin_mean = bus_route_getin_mean.rename(columns = {'morning_getin': 'bus_route_getin_mean'})

# mean of passenger per morning (takeoff)  
bus_route_takeoff_mean = combined.groupby(['date','bus_route_id'])['morning_takeoff'].mean().reset_index()
bus_route_takeoff_mean = bus_route_takeoff_mean.rename(columns = {'morning_takeoff': 'bus_route_takeoff_mean'})

# Merge
combined = pd.merge(combined, bus_route_getin_mean , on =['date','bus_route_id'], how='left')
combined = pd.merge(combined, bus_route_takeoff_mean , on =['date','bus_route_id'], how='left')

In [23]:
# Kmeans

combined['bus_route_station'] = combined['bus_route_id'].astype(np.str)+'_'+combined['station_code'].astype(np.str)

In [24]:
# Kmeans1
df_cluster = combined[['date','bus_route_id','station_code','morning_getin']].copy()
df_cluster['bus_route_station'] = df_cluster['bus_route_id'].astype(np.str)+'_'+df_cluster['station_code'].astype(np.str)
df_cluster_pivot = pd.pivot_table(data = df_cluster, index='bus_route_station', columns='date',
                                  values='morning_getin', aggfunc='sum').fillna(0)

kmeans = MiniBatchKMeans(n_clusters=200, random_state=1993)

%time kmeans.fit(df_cluster_pivot)

df_cluster_pivot['kmeans1'] = kmeans.predict(df_cluster_pivot)

combined = pd.merge(combined, df_cluster_pivot[['kmeans1']], left_on = 'bus_route_station', right_index=True, 
                   how='left')

Wall time: 381 ms


In [25]:
# Kmeans2
df_cluster = combined[['date','bus_route_id','station_code','noon_getin']].copy()
df_cluster['bus_route_station'] = df_cluster['bus_route_id'].astype(np.str)+'_'+df_cluster['station_code'].astype(np.str)
df_cluster_pivot = pd.pivot_table(data = df_cluster, index='bus_route_station', columns='date',
                                  values='noon_getin', aggfunc='sum').fillna(0)

kmeans = MiniBatchKMeans(n_clusters=200, random_state=1993)

%time kmeans.fit(df_cluster_pivot)

df_cluster_pivot['kmeans2'] = kmeans.predict(df_cluster_pivot)

combined = pd.merge(combined, df_cluster_pivot[['kmeans2']], left_on = 'bus_route_station', right_index=True, 
                   how='left')

Wall time: 536 ms


In [26]:
# Merge 
combined = pd.merge(combined, df_commuter, on = 'station_code',how='left')

In [27]:
# Afternoon-Getoff-Amount
df_afternoon_getoff_amount['getoff_date'] = pd.to_datetime(df_afternoon_getoff_amount['getoff_date'] )
combined = pd.merge(combined, df_afternoon_getoff_amount,
                 left_on = ['bus_route_id','date','station_code'],
                 right_on = ['bus_route_id','getoff_date','getoff_station_code'],
                 how='left')

combined = combined.drop(['getoff_date','getoff_station_code'], 1)
combined['afternoon_takeoff'] = combined['afternoon_takeoff'].fillna(0)

In [28]:
# 이전 버스와의 배차간격?
date_route_stataion_waittime['geton_date'] = pd.to_datetime(date_route_stataion_waittime['geton_date'] )
combined = pd.merge(combined, date_route_stataion_waittime, 
                                         left_on =['date','bus_route_id'] ,
                                         right_on =['geton_date','bus_route_id'],
                                             how='left')
combined = combined.drop(['geton_date'],1)


In [29]:
# 6~9, 9~12시 사이에 각기 다른 집단의 사람들이 몇명 탑승했는지
geton_bus_passender_cluster_count_morning['geton_date'] = pd.to_datetime(geton_bus_passender_cluster_count_morning['geton_date'] )
combined = pd.merge( combined, geton_bus_passender_cluster_count_morning , left_on = ['date','station_code'],
                                         right_on = ['geton_date', 'geton_station_code'],
                                         how = 'left')

combined = combined.drop(['geton_station_code','geton_date'],1)

In [30]:
# 6~9, 9~12시 사이에 각기 다른 집단의 사람들이 몇명 내렸는지
takeoff_bus_passender_cluster_count_noon['geton_date'] = pd.to_datetime(takeoff_bus_passender_cluster_count_noon['geton_date'] )
combined = pd.merge( combined, takeoff_bus_passender_cluster_count_noon , left_on = ['date','station_code'],
                                                                         right_on = ['geton_date', 'getoff_station_code'],
                                                                         how = 'left')

combined = combined.drop(['getoff_station_code','geton_date'],1)

In [31]:
# 기상데이터 -- 12시 전에 수집된 강수량
df_rain = pd.read_csv('../preprocessed_external_dataset/hourly_rain.csv')
df_rain['date'] = pd.to_datetime(df_rain['date'])

combined = pd.merge(combined, df_rain, on='date', how='left')

# 기상데이터 -- 전날 강수량
df_daily_rain = pd.read_csv('../preprocessed_external_dataset/daily_rain.csv')
df_daily_rain['date'] = pd.to_datetime(df_daily_rain['date'])
df_daily_rain.columns = ['prev_date','prev_daily_rain']

combined['prev_date'] = pd.to_datetime(combined['date']) - pd.Timedelta('1 day')
combined = pd.merge(combined, df_daily_rain, on='prev_date', how='left')

# 기상데이터 -- 12시 전에 수집된 운집량
df_cloud = pd.read_csv('../preprocessed_external_dataset/hourly_cloud.csv')
df_cloud['date'] = pd.to_datetime(df_rain['date'])

combined = pd.merge(combined, df_cloud, on='date', how='left')

In [32]:
# Google-Map을 통한 정류장의 주소정보
# 위경도 좌표를 입력값으로 해당 좌표의 주소를 크롤링한 pickle파일입니다.
geo_df2 = pd.read_pickle('../preprocessed_external_dataset/second_whole_dict.pickle')
combined['latlong_second'] = combined['latitude'].astype(np.str) +'_'+ combined['longitude'].astype(np.str)
combined['latlong_second'] = combined['latlong_second'].apply(lambda x: geo_df2.get(x))

In [33]:
# 주소 별 거주자 수
df_pop = pd.read_csv('../preprocessed_external_dataset/제주도_거주자수.csv')
combined['district'] = combined['latlong_second'].apply(lambda x: x.split(' ')[1].split(' ')[0])
combined = pd.merge(combined, df_pop, on='district', how='left')

In [35]:
# Drop unnecessary columns
drop_cols = ['id','date', 'station_name','bus_route_station',] +\
                ['getin_user_count4_morning', 'takeoff_user_count4_noon', 'getin_user_count6_morning', 'takeoff_user_count6_noon',
                'prev_date','district',
                ]

In [36]:
train, test = combined[:n_trn].drop(drop_cols,1) , combined[n_trn:].drop(drop_cols,1)

In [37]:
train.to_pickle('preprocessed_train.pickle')
test.to_pickle('preprocessed_test.pickle')

In [39]:
# Label Encoding
cat_cols = ['bus_route_id','station_code','in_out','latlong_second'
            ]

for col in tqdm_notebook(cat_cols):
    lbl = LabelEncoder()
    lbl.fit( train[col].tolist() + test[col].tolist() )
    train[col] = lbl.transform( train[[col]]  )
    test[col] = lbl.transform( test[[col]]  )

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [40]:
# Before modeling
train_set = train.drop([target_col]+[],1)
test_set = test.drop([target_col]+[],1)

train_label = train[target_col]
test_label = test[target_col]

In [41]:
# Basic LGBM Model
n_splits= 5
NUM_BOOST_ROUND = 100000
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt',
              'random_state':1993,
              'learning_rate':0.01,
              'subsample':0.7,
              'tree_learner': 'serial',
              'colsample_bytree':0.78,
              'early_stopping_rounds':50,
              'subsample_freq': 1,
              'reg_lambda':7,
              'reg_alpha': 5,
              'num_leaves': 96,
              'seed' : SEED
            }

In [None]:
seeds = [1993]

outer_oof_train = np.zeros( train.shape[0] )
outer_oof_test = np.zeros( test.shape[0] )

for seed in tqdm_notebook(seeds):
    
    cv_list = []

    oof_train = np.zeros( train.shape[0] )
    final_test = np.zeros( test.shape[0] )

    kfolds = StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=seed )

    for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate(kfolds.split(train_set, train_set['bus_route_id'])) ):

        X_train, y_train = train_set.iloc[trn_ind], train_label[trn_ind]
        X_valid, y_valid = train_set.iloc[val_ind], train_label[val_ind]
        
        dtrain = lgbm.Dataset( X_train, y_train )
        dvalid = lgbm.Dataset( X_valid, y_valid ,reference=dtrain)
        
        lgbm_param['seed'] = seed

        model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, valid_sets=(dtrain, dvalid), valid_names=('train','valid'), 
                            categorical_feature=['bus_route_id','station_code','weekday',\
                                                'kmeans1','kmeans2',
#                                                  'latlong_second'
#                                               'route_kmeans_cluster',
                                                ] ,
                           verbose_eval= 100)

        valid_pred = model.predict(X_valid)
        test_pred  = model.predict(test_set)

        oof_train[val_ind] += valid_pred
        final_test += test_pred

        cv_list.append( sqrt(mean_squared_error(y_valid, valid_pred)) )

        print('='*80)

    final_test /= n_splits

    print(f"Average CV : {np.mean(cv_list)}")
    print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")
    
    outer_oof_train += oof_train
    outer_oof_test += final_test
    
outer_oof_train /= len(seeds)
outer_oof_test /= len(seeds)

print(f"Overall for OOF: {sqrt(mean_squared_error(train_label, outer_oof_train))}")


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.94329	valid's rmse: 2.89743
[200]	train's rmse: 2.32829	valid's rmse: 2.38356
[300]	train's rmse: 2.08703	valid's rmse: 2.22306
[400]	train's rmse: 1.96039	valid's rmse: 2.1634
[500]	train's rmse: 1.87659	valid's rmse: 2.13815
[600]	train's rmse: 1.81553	valid's rmse: 2.1249


In [162]:
df_imp = pd.DataFrame(data = {'col': model.feature_name(),
                              'imp': model.feature_importance()
                              })
df_imp = df_imp.sort_values(by='imp', ascending=False)
df_imp

Unnamed: 0,col,imp
4,station_code,14359
0,bus_route_id,10088
11,noon_getin,7601
9,morning_getin,4530
28,takeoff_user_count1_noon,4067
25,next_bus_time_diff,4029
12,noon_takeoff,3774
33,hourly_cloud,3753
18,station_morning_takeoff_mean,3640
22,kmeans2,3562


In [None]:
oof_train = [x if x>0 else 0 for x in  oof_train]
final_test = [x if x>0 else 0 for x in  final_test]

In [None]:
print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")

In [None]:
sns.distplot( np.log1p( train_label ) )

In [None]:
sns.distplot( np.log1p( oof_train ) )
sns.distplot( np.log1p( final_test ) )

In [97]:
df_sub = pd.read_csv(path + 'submission_sample.csv')
df_sub['18~20_ride'] = final_test

df_sub.to_csv('ggg.csv',index=False)