### 필요 라이브러리 정의

In [11]:
import pandas as pd
import folium as g
from folium.plugins import MarkerCluster
from haversine import haversine
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

pd.options.display.float_format = '{:.5f}'.format

---

### 데이터 로드

In [12]:
train = pd.read_parquet('./jeju_data/train.parquet')
test = pd.read_parquet('./jeju_data/test.parquet')
weather = pd.read_csv('./jeju_data/jeju_weather.csv', encoding='cp949')
weather = weather[weather['지점명']=='제주']

---

### feature만드는 함수

In [13]:
def make_year(df):
    dt = df['base_date'].astype('str')
    month_data = pd.to_datetime(dt)
    md = month_data.dt.year
    return md


def make_month(df):
    dt = df['base_date'].astype('str')
    month_data = pd.to_datetime(dt)
    md = month_data.dt.month
    return md


def make_day(df):
    dt = df['base_date'].astype('str')
    month_data = pd.to_datetime(dt)
    md = month_data.dt.day
    return md


def turn_road_rate(df):
    df.loc[(df['start_turn_restricted'] == '있음') & (df['road_rating'] == 107), 'turn_road_rate'] = 0
    df.loc[(df['start_turn_restricted'] == '있음') & (df['road_rating'] == 103), 'turn_road_rate'] = 1
    df.loc[(df['start_turn_restricted'] == '없음') & (df['road_rating'] == 107), 'turn_road_rate'] = 2
    df.loc[(df['start_turn_restricted'] == '있음') & (df['road_rating'] == 106), 'turn_road_rate'] = 3
    df.loc[(df['start_turn_restricted'] == '없음') & (df['road_rating'] == 103), 'turn_road_rate'] = 4
    df.loc[(df['start_turn_restricted'] == '없음') & (df['road_rating'] == 106), 'turn_road_rate'] = 5
    return df['turn_road_rate']


def end_turn_road_rate(df):
    df.loc[(df['end_turn_restricted'] == '있음') & (df['road_rating'] == 107), 'end_turn_road_rate'] = 0
    df.loc[(df['end_turn_restricted'] == '있음') & (df['road_rating'] == 103), 'end_turn_road_rate'] = 1
    df.loc[(df['end_turn_restricted'] == '없음') & (df['road_rating'] == 107), 'end_turn_road_rate'] = 2
    df.loc[(df['end_turn_restricted'] == '있음') & (df['road_rating'] == 106), 'end_turn_road_rate'] = 3
    df.loc[(df['end_turn_restricted'] == '없음') & (df['road_rating'] == 103), 'end_turn_road_rate'] = 4
    df.loc[(df['end_turn_restricted'] == '없음') & (df['road_rating'] == 106), 'end_turn_road_rate'] = 5
    return df['end_turn_road_rate']


def make_dist(df):
    dist = []
    for i in tqdm(range(len(df))):
        start_location = (df['start_latitude'][i], df['start_longitude'][i])
        end_location = (df['end_latitude'][i], df['end_longitude'][i])
        
        dist.append(haversine(start_location, end_location))

    return dist


def make_week(df):
    dt = df['base_date'].astype('str')
    data = pd.to_datetime(dt)

    b_list = []
    
    for i in tqdm(range(len(df))):
        b_list.append(data[i].weekday())
    
    return b_list


def week_mapping(df):
    if df['week'] <= 4:
        val = 0
    else:
        val = 1
    return val


# cyclical continuous features - 24-hour time 주기성을 가지는 데이터를 알맞게 변환
def cyclical_feature(df):
    df['sin_time'] = np.sin(2*np.pi*df.base_hour/24)
    df['cos_time'] = np.cos(2*np.pi*df.base_hour/24)
    
    
def over_max_speed(df):
    df.loc[(df['maximum_speed_limit'] == 30), 'over_max_speed'] = 1
    df.loc[(df['maximum_speed_limit'] == 40), 'over_max_speed'] = 1
    df.loc[(df['maximum_speed_limit'] == 50), 'over_max_speed'] = 0
    df.loc[(df['maximum_speed_limit'] == 60), 'over_max_speed'] = 0
    df.loc[(df['maximum_speed_limit'] == 70), 'over_max_speed'] = 0
    df.loc[(df['maximum_speed_limit'] == 80), 'over_max_speed'] = 0
    
    return df['over_max_speed']


def make_Ymd(df):
    dt = df['일시'].astype('str')
    month_data = pd.to_datetime(dt).dt.strftime("%Y%m%d")
    return month_data

from geopy.geocoders import Nominatim
def geocoding_reverse(lat_lng_str): 
    geolocoder = Nominatim(user_agent = 'South Korea', timeout=None)
    address = geolocoder.reverse(lat_lng_str)

    return address

# train['location'] = train['start_latitude'].astype(str) + ',' + train['start_longitude'].astype(str)
# geocoding_reverse(train['location'][10])


### 함수적용

In [15]:
tra = make_dist(train)
train['year'] = make_year(train)
train['month'] = make_month(train)
train['day'] = make_day(train)
train['turn_road_rate'] = turn_road_rate(train)
train['end_turn_road_rate'] = end_turn_road_rate(train)
train['distance'] = tra
train['week'] = make_week(train)
train['week'] = train.apply(week_mapping, axis=1)
train['over_max_speed'] = over_max_speed(train)
# cyclical_feature(train)

tes = make_dist(test)
test['year'] = make_year(test)
test['month'] = make_month(test)
test['day'] = make_day(test)
test['turn_road_rate'] = turn_road_rate(test)
test['end_turn_road_rate'] = end_turn_road_rate(test)
test['distance'] = tes
test['week'] = make_week(test)
test['week'] = test.apply(week_mapping, axis=1)
test['over_max_speed'] = over_max_speed(test)

# cyclical_feature(test)

100%|██████████| 4701217/4701217 [01:49<00:00, 42939.86it/s]
100%|██████████| 4701217/4701217 [00:39<00:00, 118103.76it/s]
100%|██████████| 291241/291241 [00:06<00:00, 42892.52it/s]
100%|██████████| 291241/291241 [00:02<00:00, 122640.70it/s]


In [16]:
weather['일시'] = make_Ymd(weather)
weather = weather.drop(['지점', '지점명'], axis=1)
weather = weather.rename(columns={'일시': 'base_date'})
train['base_date'] = train['base_date'].astype('str')
merge = pd.merge(train, weather, on='base_date', how='left')

In [5]:
# def road_division(df):    
#     df.loc[(df['road_name']).str.contains('지방도'), 'road_division'] = 0
#     df.loc[(df['road_name']).str.contains('일반국도'), 'road_division'] = 1
#     df.loc[((df['road_name']).str.contains('지방도', na=False)) & ((train['road_name']).str.contains('일반국도', na=False)), 'road_division'] = 2

# train['road_division'] = road_division(train)

In [20]:
merge['일강수량(mm)'] = merge['일강수량(mm)'].fillna(0)


In [23]:
train = merge

In [7]:
# def road_division():
#     if train[train['road_name'].str.contains('지방도')]: return 0
#     elif train[train['road_name'].str.contains('일반국도')]: return 1
#     else: return 2

# train['road_division'] = train['road_division'].apply(road_division())

In [45]:
Train = train.copy()

In [24]:
indexl = train[train['road_name'] == '-'].index
train.drop(indexl, inplace=True)

tindexl = test[test['road_name'] == '-'].index
test.drop(tindexl, inplace=True)

---

In [25]:
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted', 'road_name', 'start_node_name', 'end_node_name']
for i in str_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])


In [26]:
X = train.drop(
    ['id', 'base_date', 'target'], axis=1)

y = train['target']

test = test.drop(
    ['id', 'base_date'], axis=1
)

In [27]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1103)

In [23]:
# cat_best_parm = {'learning_rate': 0.21020261303127669, 'bagging_temperature': 75.5373108410501, 'n_estimators': 4891, 'max_depth': 12, 'random_strength': 19, 'colsample_bylevel': 0.45647247075797176, 'l2_leaf_reg': 4.82164247062514e-07, 'min_child_samples': 82, 'max_bin': 448, 'od_type': 'IncToDec'}
# xgb_best_param = {'n_estimators': 2626, 'max_depth': 15, 'min_child_weight': 39, 'gamma': 1, 'colsample_bytree': 0.8320047615067258, 'lambda': 8.576465850923702, 'alpha': 1.2726833950057483, 'subsample': 0.8}

In [10]:
# from catboost import CatBoostRegressor
# LR = CatBoostRegressor(iterations=1000).fit(x_train, y_train)
# y_pred = LR.predict(x_val)
# mae = mean_absolute_error(y_val, y_pred)
# print(mae)

In [28]:
LR = XGBRegressor().fit(x_train, y_train)
y_pred = LR.predict(x_val)
mae = mean_absolute_error(y_val, y_pred)
print(mae)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


3.767482240889293


In [1]:
# xgb_model = XGBRegressor(**xgb_best_param).fit(X, y)a

In [None]:
y_pred = xgb_model.predict(test)

In [None]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['target'] = y_pred
sample_submission.to_csv("./submit_xgb.csv", index=False)

In [10]:
# import optuna
# from optuna import Trial
# from optuna.samplers import TPESampler

# # 1. Define an objective function to be maximized.
# def objective_xgb(trial: Trial, x, y):


# # 2. Suggest values for the hyperparameters using a trial object
#     param = {
#         "n_estimators": trial.suggest_int('n_estimators', 500, 4000),
#         'max_depth': trial.suggest_int('max_depth', 8, 16),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#         'gamma': trial.suggest_int('gamma', 1, 3),
#         'learning_rate': 0.01,
#         'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#         'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#         'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
#         'random_state': 42
#     }

#     x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

#     model = XGBRegressor(**param)
#     xgb_model = model.fit(x_train, y_train, verbose=False, eval_set=[(x_val, y_val)])
#     y_pred = xgb_model.predict(x_val)
#     score = mean_absolute_error(y_val, y_pred)

#     return score


# study = optuna.create_study(direction='minimize', sampler=TPESampler())
# study.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=30)
# print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))