In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
tada_eta = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/tada_eta.xlsx')
print(tada_eta.columns)
tada_eta.head()

Index(['ATA', 'id', 'api_eta', 'created_at_kst', 'month', 'pickup_lng',
       'pickup_lat', 'pickup_gu', 'driver_id', 'driver_lng', 'driver_lat',
       'hour'],
      dtype='object')


Unnamed: 0,ATA,id,api_eta,created_at_kst,month,pickup_lng,pickup_lat,pickup_gu,driver_id,driver_lng,driver_lat,hour
0,10.6,R4L9FL6NVEL4D0,5.65,2019-12-01T00:55:09,12,127.091399,37.624545,노원구,DST52944,127.069978,37.616472,0
1,9.2,R3A7JBY7CS6TPD,8.6,2019-12-01T00:09:39,12,127.023351,37.625564,강북구,DPC72995,127.026479,37.604711,0
2,10.85,R2BIUY1TDK14XE,10.53,2019-12-15T00:18:42,12,126.881154,37.481641,금천구,DPP94923,126.906225,37.452167,0
3,5.52,R31PU5GNHSGU6C,1.92,2019-07-31T00:24:11,7,126.988375,37.567897,중구,DCT73585,126.99117,37.566416,0
4,8.6,R41WR6N1VT42DJ,7.1,2019-07-05T00:40:34,7,126.988684,37.566716,중구,DPJ86237,127.00686,37.571164,0


In [5]:
tada_eta['distance'] = ((tada_eta['pickup_lat']-tada_eta['driver_lat'])**2 + (tada_eta['pickup_lng']-tada_eta['driver_lng'])**2)*100000
tada_eta = tada_eta.drop(['id', 'created_at_kst', 'driver_id','pickup_gu'],1)

  


In [6]:
tada_eta = tada_eta.sample(frac=1, random_state=0).reset_index(drop=True)
tada_eta.head()

Unnamed: 0,ATA,api_eta,month,pickup_lng,pickup_lat,driver_lng,driver_lat,hour,distance
0,6.13,8.32,12,126.913779,37.54901,126.913082,37.527922,22,44.521977
1,5.37,3.05,12,127.089688,37.50171,127.084503,37.498104,11,3.989066
2,6.27,5.0,7,126.972378,37.56936,126.972541,37.576297,19,4.814699
3,10.52,8.47,12,126.974081,37.519973,127.001186,37.526196,21,77.341173
4,12.45,6.65,7,127.025221,37.52464,127.026035,37.512465,12,14.888851


In [7]:
tada_eta.shape

(13916, 9)

In [8]:
train = tada_eta[:12000]
test = tada_eta[12000:]

In [9]:
x_train = np.asarray(train.drop('ATA',1))
y_train = np.asarray(train['ATA'])
x_test = np.asarray(test.drop('ATA',1))
y_test = np.asarray(test['ATA'])
eta_features = [x for i,x in enumerate(tada_eta.columns) if i!=0]

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
eta_features

['api_eta',
 'month',
 'pickup_lng',
 'pickup_lat',
 'driver_lng',
 'driver_lat',
 'hour',
 'distance']

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# each feature should be centered (zero mean) and with unit variance
X_train_normalized = StandardScaler().fit(x_train).transform(x_train)  
pca_train = PCA(n_components = 6).fit(X_train_normalized)
X_train_pca = pca_train.transform(X_train_normalized)

In [12]:
X_test_normalized = StandardScaler().fit(x_test).transform(x_test)  
pca_test = PCA(n_components = 6).fit(X_test_normalized)
X_test_pca = pca_test.transform(X_test_normalized)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

# train set과 test set의 input과 output를 각각 이어 붙여서 X,y를 정의
X = np.concatenate((X_train_pca,X_test_pca), axis=0)
y = np.concatenate((y_train,y_test), axis=0)

In [14]:
# 전체 데이터 X에서 training data에 해당하는 index는 -1, test data에 해당하는 index는 0이 되도록,
# 여기서는 [-1, -1, ... , -1, 0, 0, ... , 0] 같은 형태의 1차원 배열 test fold와 predefined split을 정의
pds = PredefinedSplit(test_fold=[-1]*len(X_train_pca)+[0]*len(X_test_pca))


In [15]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
# Hyperparameter 조합 정의
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [2, 3, 4, 5],
              'min_samples_leaf': [1, 20, 100],
              'learning_rate': [0.01, 0.02, 0.05],
              'max_features': [1,2,3],
              'min_samples_split':[2,3],
              'loss': ['ls']}

In [16]:
# grid search 모델 정의, 학습 및 model selection
grid_search = GridSearchCV(estimator=ensemble.GradientBoostingRegressor(), 
                           param_grid=param_grid, 
                           cv=pds, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X, y)
print(grid_search.best_params_)

Fitting 1 folds for each of 648 candidates, totalling 648 fits




{'learning_rate': 0.05, 'loss': 'ls', 'max_depth': 3, 'max_features': 2, 'min_samples_leaf': 20, 'min_samples_split': 3, 'n_estimators': 200}


In [17]:
mse = mean_squared_error(y_test, grid_search.predict(X_test_pca))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
print("The initial error of API ETA on test set: {:.4f}".format(mean_squared_error(y_test, x_test[:,0]) ))

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, grid_search.predict(X_test_pca))
print("The mean absolute error (MAE) on test set: {:.4f}".format(mae))
print("The initial error of API ETA on test set: {:.4f}".format(mean_absolute_error(y_test, x_test[:,0]) ))

The mean squared error (MSE) on test set: 8.1941
The initial error of API ETA on test set: 13.7135
The mean absolute error (MAE) on test set: 2.1119
The initial error of API ETA on test set: 2.7047
