In [5]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import gc
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import mean_absolute_error
from datetime import datetime
from xgboost import XGBRegressor
from haversine import haversine


In [6]:
# Load the raw train / test tables.
# NOTE(review): these are absolute paths into one user's profile directory, so
# the notebook only runs on that machine; consider a configurable data folder.
train = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_CODE_REVIEW\제주도 도로\train.csv')
test = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_CODE_REVIEW\제주도 도로\test.csv')

# Preview the first rows of each table (train first, then test).
display(train.head(), test.head())

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,height_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted
0,TEST_000000,20220825,목,17,3,107,연삼로,0,0,70.0,...,0.0,0,산지2교,33.499427,126.541298,없음,제주은행사거리,33.500772,126.543837,있음
1,TEST_000001,20220809,화,12,2,103,일반국도12호선,0,0,70.0,...,0.0,3,중문입구,33.258507,126.427003,없음,관광단지입구,33.258119,126.41584,없음
2,TEST_000002,20220805,금,2,1,103,일반국도16호선,0,0,60.0,...,0.0,0,도순3교,33.25896,126.476508,없음,도순2교,33.259206,126.474687,없음
3,TEST_000003,20220818,목,23,3,103,일반국도11호선,0,0,70.0,...,0.0,0,아라주공아파트,33.473494,126.545647,없음,인다마을,33.471061,126.545467,없음
4,TEST_000004,20220810,수,17,3,106,번영로,0,0,70.0,...,0.0,0,부록교 시종점,33.501477,126.569223,없음,봉개교 시종점,33.496863,126.58123,없음


## Feature Engineering

In [None]:
def cyclical_feature(df):
    """Add sine / cosine encodings of ``base_hour`` to ``df`` in place.

    The hour is mapped onto a 24-step circle and the two coordinates are
    written to the ``sin_date`` and ``cos_date`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``base_hour`` column. It is modified in place.

    Returns
    -------
    None
    """
    hour_angle = df['base_hour'] * (2 * np.pi) / 24
    df['sin_date'] = np.sin(hour_angle)
    df['cos_date'] = np.cos(hour_angle)

def group_time(df):
    """Bucket ``base_hour`` into a time-of-day label.

    Writes the label to ``df['group_time']`` (in place) and returns that column.

    Buckets: hour < 6 -> '새벽', 6 <= hour < 12 -> '아침',
    12 <= hour < 19 -> '오후', 19 <= hour <= 24 -> '저녁'.
    Rows matching none of the buckets are labelled 'Na'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``base_hour`` column. It is modified in place.

    Returns
    -------
    pandas.Series
        The ``group_time`` column of ``df``.
    """
    hour = df['base_hour']
    buckets = (
        ('새벽', hour < 6),
        ('아침', (hour >= 6) & (hour < 12)),
        ('오후', (hour >= 12) & (hour < 19)),
        ('저녁', (hour >= 19) & (hour <= 24)),
    )

    # Start from the fallback label; every matched row is overwritten below.
    df['group_time'] = 'Na'
    for label, mask in buckets:
        df.loc[mask, 'group_time'] = label

    return df['group_time']

def make_month(df):
    """Return the calendar month (1-12) of each ``base_date`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``base_date`` column holding dates such as ``20220623``
        (integers or strings).

    Returns
    -------
    pandas.Series
        Month numbers, aligned with ``df``'s index.
    """
    parsed_dates = pd.to_datetime(df['base_date'].astype('str'))

    # `pd.to_datetime` on a Series returns a Series, so the datetime fields are
    # only reachable through the `.dt` accessor (a bare `.month` raises
    # AttributeError).
    return parsed_dates.dt.month

def group_season(df):
    """Label each row with a season derived from ``df['month']``.

    Writes the label to ``df['season']`` (in place) and returns that column.
    Months 3-5 -> '봄', 6-8 -> '여름', 9-11 -> '가을', 12/1/2 -> '겨울';
    any other value is labelled 'Na'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``month`` column. It is modified in place.

    Returns
    -------
    pandas.Series
        The ``season`` column of ``df``.
    """
    months_by_season = {
        '봄': [3, 4, 5],
        '여름': [6, 7, 8],
        '가을': [9, 10, 11],
        '겨울': [12, 1, 2],
    }

    # Fallback label first; recognised months overwrite it.
    df['season'] = 'Na'
    for label, months in months_by_season.items():
        df.loc[df['month'].isin(months), 'season'] = label

    return df['season']

def make_week(df):
    """Flag weekend rows based on ``base_date``.

    Writes the flag to ``df['week']`` (in place) and returns that column:
    0 for Monday-Friday, 1 for Saturday/Sunday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``base_date`` column holding dates such as ``20220623``.
        It is modified in place.

    Returns
    -------
    pandas.Series
        The ``week`` column of ``df``.
    """
    parsed_dates = pd.to_datetime(df['base_date'].astype('str'))

    # Timestamp.weekday(): Monday is 0 ... Sunday is 6.
    df['week'] = [stamp.weekday() for stamp in parsed_dates]

    # Both masks are taken from the raw weekday numbers before collapsing them.
    is_weekday = df['week'] <= 4
    is_weekend = df['week'] > 4
    df.loc[is_weekday, 'week'] = 0
    df.loc[is_weekend, 'week'] = 1

    return df['week']

def vacation(df):
    """Label each row as 'vacation' or 'semester' from ``df['month']``.

    Writes the label to ``df['vacation']`` (in place) and returns that column.
    Months 1, 2, 7, 8 -> 'vacation'; months 3-6 and 9-12 -> 'semester';
    any other value -> 'Na'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``month`` column. It is modified in place.

    Returns
    -------
    pandas.Series
        The ``vacation`` column of ``df``.
    """
    month = df['month']
    break_months = [7, 8, 1, 2]
    term_months = [3, 4, 5, 6, 9, 10, 11, 12]

    # Fallback label first; recognised months overwrite it.
    df['vacation'] = 'Na'
    df.loc[month.isin(break_months), 'vacation'] = 'vacation'
    df.loc[month.isin(term_months), 'vacation'] = 'semester'

    return df['vacation']

def make_holiday(path):
    """Read a holiday calendar CSV and build a ``base_date`` lookup table.

    The CSV must contain ``Year``, ``Month``, ``Day`` and ``Info`` columns.
    Month and day are left-padded with zeros to two characters and joined
    with the year into a ``YYYYMMDD`` string.

    Parameters
    ----------
    path : str or path-like
        Location of the holiday CSV file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without ``Year``/``Month``/``Day``/``Info``, plus a
        string ``base_date`` column and a ``holiday`` column that is 1 on
        every row.
    """
    calendar = pd.read_csv(path)

    year = calendar['Year'].astype('str')
    month = calendar['Month'].astype('str').str.zfill(2)
    day = calendar['Day'].astype('str').str.zfill(2)

    calendar['base_date'] = year + month + day
    calendar['holiday'] = 1

    return calendar.drop(columns = ['Year', 'Month', 'Day', 'Info'])

def make_holiday2(df,holiday):
    """Flag rows of ``df`` whose ``base_date`` appears in ``holiday``.

    ``df['base_date']`` is converted to strings in place so it can be joined
    against ``holiday['base_date']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``base_date`` column. That column is overwritten with its
        string form.
    holiday : pandas.DataFrame
        Lookup table with ``base_date`` and ``holiday`` columns.

    Returns
    -------
    pandas.Series
        The ``holiday`` column of the left-joined frame, with unmatched rows
        set to 0. It is taken from the merge result, so it carries the merge's
        fresh index rather than ``df``'s index.
    """
    df['base_date'] = df['base_date'].astype('str')

    joined = df.merge(holiday, how = 'left', on = 'base_date')
    holiday_flag = joined['holiday'].fillna(0)

    return holiday_flag

def make_post_holiday(holiday, df):
    """Flag rows of ``df`` whose date equals a holiday date minus one day.

    Parameters
    ----------
    holiday : pandas.DataFrame
        Lookup table with a ``base_date`` column (parseable by
        ``pd.to_datetime``) and a ``holiday`` flag column. It is not modified.
    df : pandas.DataFrame
        Frame with a ``base_date`` column holding dates such as ``20220623``.
        A parsed ``post_date`` datetime column is added to it in place.

    Returns
    -------
    pandas.Series
        Named ``post_holiday`` and aligned with ``df``'s index: the matching
        ``holiday`` value where the row's date is a shifted holiday date,
        0 elsewhere.
    """
    # NOTE(review): the shift is *minus* one day, so this marks the day before
    # each holiday even though the feature is called "post"; make_pre_holiday
    # shifts by plus one day. Confirm whether the two offsets were swapped.
    shifted_dates = pd.to_datetime(holiday['base_date']) - pd.Timedelta(days = 1)

    # Build the lookup separately so the caller's `holiday` frame is left as is.
    lookup = pd.Series(holiday['holiday'].to_numpy(), index = shifted_dates.to_numpy())
    # Keep one entry per date so a repeated holiday date cannot break the lookup.
    lookup = lookup[~lookup.index.duplicated(keep = 'first')]

    df['post_date'] = pd.to_datetime(df['base_date'].astype('str'))

    # Mapping only the date column (rather than merging the whole frame) keeps
    # df's index and cannot collide with a 'post_holiday' column already in df.
    post_flag = df['post_date'].map(lookup).fillna(0)

    return post_flag.rename('post_holiday')

def make_pre_holiday(holiday, df):
    """Flag rows of ``df`` whose date equals a holiday date plus one day.

    Parameters
    ----------
    holiday : pandas.DataFrame
        Lookup table with a ``base_date`` column (parseable by
        ``pd.to_datetime``) and a ``holiday`` flag column. It is not modified.
    df : pandas.DataFrame
        Frame with a ``base_date`` column holding dates such as ``20220623``.
        A parsed ``pre_date`` datetime column is added to it in place.

    Returns
    -------
    pandas.Series
        Named ``pre_holiday`` and aligned with ``df``'s index: the matching
        ``holiday`` value where the row's date is a shifted holiday date,
        0 elsewhere.
    """
    # NOTE(review): the shift is *plus* one day, so this marks the day after
    # each holiday even though the feature is called "pre"; make_post_holiday
    # shifts by minus one day. Confirm whether the two offsets were swapped.
    shifted_dates = pd.to_datetime(holiday['base_date']) + pd.Timedelta(days = 1)

    # Build the lookup separately so the caller's `holiday` frame is left as is.
    lookup = pd.Series(holiday['holiday'].to_numpy(), index = shifted_dates.to_numpy())
    # Keep one entry per date so a repeated holiday date cannot break the lookup.
    lookup = lookup[~lookup.index.duplicated(keep = 'first')]

    df['pre_date'] = pd.to_datetime(df['base_date'].astype('str'))

    # Mapping only the date column (rather than merging the whole frame) keeps
    # df's index and cannot collide with a 'pre_holiday' column already in df.
    pre_flag = df['pre_date'].map(lookup).fillna(0)

    return pre_flag.rename('pre_holiday')

def rest_day(df):
    """Add a ``rest`` flag to ``df``: 1 if any day-off indicator is set, else 0.

    The indicators combined are ``week``, ``pre_holiday``, ``holiday`` and
    ``post_holiday``. A row is flagged when their sum is at least 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four indicator columns (numbers or numeric
        strings). Only the ``rest`` column is written; the indicator columns
        are left unchanged.

    Returns
    -------
    None
    """
    indicator_cols = ['week', 'pre_holiday', 'holiday', 'post_holiday']

    # The indicators must be summed as numbers: casting `week` to str (as this
    # function used to) makes both the addition and the `>= 1` comparison
    # raise TypeError.
    total = df[indicator_cols].apply(pd.to_numeric).sum(axis = 1)

    df['rest'] = (total >= 1).astype('int64')

def make_dist(df):
    """Compute the haversine distance between each row's start and end point.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``start_latitude``, ``start_longitude``, ``end_latitude``
        and ``end_longitude`` columns.

    Returns
    -------
    list
        One ``haversine(start, end)`` result per row, in row order.
    """
    origins = zip(df['start_latitude'], df['start_longitude'])
    destinations = zip(df['end_latitude'], df['end_longitude'])

    distances = []
    for origin, destination in zip(origins, destinations):
        distances.append(haversine(origin, destination))

    return distances

def make_cluster(train, test):
    """Assign each row to one of four K-means clusters of its start coordinates.

    The model is seeded with four fixed (latitude, longitude) centres, fitted
    on the train start points, and then used to label the test start points.
    Both frames receive a ``location_cluster`` column in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with ``start_latitude`` and ``start_longitude`` columns.

    Returns
    -------
    tuple
        ``(train, test)`` - the same two objects, now carrying
        ``location_cluster``.
    """
    coordinate_columns = ['start_latitude', 'start_longitude']

    # Fixed starting centres for the four clusters, as (latitude, longitude).
    initial_centers = np.array([
        [33.26345514655621116162365069612860679626464843, 126.5203815031463392415389535017311573028564453],
        [33.37082277149481512878992361947894096374511718, 126.2976713570606790426609222777187824249267578],
        [33.48077890914120757770433556288480758666992187, 126.4946717292079512162672472186386585235595703],
        [33.41815597422977646147046471014618873596191406, 126.7739831436176700663054361939430236816406250],
    ])

    model = KMeans(n_clusters=4, init=initial_centers, random_state = 2)
    train['location_cluster'] = model.fit_predict(train[coordinate_columns])
    test['location_cluster'] = model.predict(test[coordinate_columns])

    return train, test

def jeju_dist(df):
    """Compute the haversine distance from each row's end point to a fixed
    reference point, ``(33.4996213, 126.5311884)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``end_latitude`` and ``end_longitude`` columns.

    Returns
    -------
    list
        One ``haversine(end_point, reference)`` result per row, in row order.
    """
    # NOTE(review): presumably the centre of Jeju City, going by the function name.
    reference_point = (33.4996213, 126.5311884)

    distances = []
    for end_point in zip(df['end_latitude'], df['end_longitude']):
        distances.append(haversine(end_point, reference_point))

    return distances

def seogwi_dist(df):
    """Compute the haversine distance from each row's end point to Seogwipo.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``end_latitude`` and ``end_longitude`` columns.

    Returns
    -------
    list
        One ``haversine(end_point, seogwi_location)`` result per row, in row
        order.
    """
    # This function previously reused the coordinates from jeju_dist
    # (33.4996213, 126.5311884), which made the two features identical.
    # NOTE(review): approximate (latitude, longitude) of Seogwipo city centre -
    # confirm the exact reference point that should be used.
    seogwi_location = (33.2541205, 126.560076)
    end_location = tuple(zip(df['end_latitude'], df['end_longitude']))
    hsine = [haversine(i, seogwi_location) for i in end_location]

    return hsine