In [0]:
# Change directory to the VSCode workspace root (the parent of the current
# directory) so that relative path loads work correctly. Turn this addition off
# with the DataScience.changeDirOnImportExport setting.
import os
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as exc:
    # Best-effort only: keep running from the current directory, but report the
    # failure instead of hiding it. Catching OSError (rather than a bare
    # `except`) lets KeyboardInterrupt / SystemExit propagate.
    print(f'Could not change working directory: {exc}')


 # 範例 : 計程車費率預測
 https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

 # [作業目標]
 - 使用並觀察特徵組合, 在計程車費率預測競賽的影響

 # [作業重點]
 - 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[8]~In[9], Out[9])
 - 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[10], Out[10])

In [2]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Read the taxi-fare sample from the data directory.
data_path = 'data/'
csv_file = data_path + 'taxi_data1.csv'
df = pd.read_csv(csv_file)

# Split off the regression target; `pop` removes the column from the feature
# frame and returns it in a single step.
train_Y = df.pop('fare_amount')

# Show (rows, feature columns) as the cell output.
df.shape



(5000, 6)

In [3]:
# Decompose the pickup timestamp into numeric calendar/clock features.
#
# The raw column holds strings shaped like 'YYYY-mm-dd HH:MM:SS UTC'. Parsing is
# done once with pd.to_datetime and the parts are read through the vectorized
# `.dt` accessor instead of a per-row strptime/strftime round trip. Unlike the
# row-wise version, this cell can be re-run safely: an already-parsed datetime
# column is passed through unchanged.
pickup_ts = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
df['pickup_datetime'] = pickup_ts

# Column order (year -> second) is kept, and each part is cast to int64 so the
# dtypes do not depend on the installed pandas version.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[f'pickup_{part}'] = getattr(pickup_ts.dt, part).astype('int64')

df.shape



(5000, 12)

In [4]:
# Baseline: score the current features with linear regression and gradient
# boosted trees (mean of 5-fold cross_val_score using each estimator's default
# scorer).

# The parsed timestamp itself is not numeric, so drop it before scaling.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['pickup_datetime'], errors='ignore')

# NOTE(review): the scaler is fit on all rows before cross-validation, so the
# per-fold min/max are not computed on the training folds only. Wrap the scaler
# and estimator in a Pipeline if a strictly leak-free estimate is needed.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)

Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Fixed seed so the boosting scores are reproducible from run to run.
GDBT = GradientBoostingRegressor(random_state=42)
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



Linear Reg Score : 0.02687687147563449
Gradient Boosting Reg Score : 0.7117873622216987


In [5]:
# Add three features: longitude difference, latitude difference, and the
# straight-line distance between pickup and dropoff in raw coordinate units.
lon_delta = df['dropoff_longitude'] - df['pickup_longitude']
lat_delta = df['dropoff_latitude'] - df['pickup_latitude']

df['longitude_diff'] = lon_delta
df['latitude_diff'] = lat_delta
df['distance_2D'] = (lon_delta**2 + lat_delta**2)**0.5

# Preview the new columns next to the coordinates they were derived from.
preview_cols = [
    'distance_2D', 'longitude_diff', 'latitude_diff',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
df[preview_cols].head()



Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [6]:
# Result: accuracy goes up once the difference/distance features are included.
train_X = scaler.fit_transform(df)

linear_scores = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_scores.mean()}')

gdbt_scores = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_scores.mean()}')


Linear Reg Score : 0.027551630955711625
Gradient Boosting Reg Score : 0.804506059384505


 # 作業1
 * 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?

In [7]:
# Take a quick look at how the pickup and dropoff latitudes are distributed.
for latitude_col in ('pickup_latitude', 'dropoff_latitude'):
    print(df[latitude_col].describe())


count    5000.000000
mean       39.981148
std         5.989760
min       -73.999735
25%        40.735483
50%        40.752824
75%        40.767127
max        41.366138
Name: pickup_latitude, dtype: float64
count    5000.000000
mean       39.972626
std         6.016378
min       -74.002015
25%        40.733386
50%        40.752687
75%        40.767461
max        41.366138
Name: dropoff_latitude, dtype: float64


In [8]:
# Reference latitude in degrees used for the longitude correction; the latitude
# summary printed earlier shows medians of about 40.75.
reference_latitude_deg = 40.7

# Length of one degree of longitude relative to one degree of latitude at the
# reference latitude (cosine of the latitude in radians).
longitude_ratio = np.cos(reference_latitude_deg/180*np.pi)

# Distance with the longitude difference shrunk by that ratio.
scaled_lon_diff = df['longitude_diff']*longitude_ratio
df['distance_real'] = np.sqrt(scaled_lon_diff**2 + df['latitude_diff']**2)

# Compare the corrected distance against the naive one and its inputs.
comparison_cols = [
    'distance_2D', 'distance_real', 'longitude_diff', 'latitude_diff',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
df[comparison_cols].head()


Unnamed: 0,distance_2D,distance_real,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.007569,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,0.018289,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.007763,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.016885,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.032629,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [9]:
# 觀察結果 
df_temp = df.drop(['distance_2D'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


Linear Reg Score : 0.027504012313373648
Gradient Boosting Reg Score : 0.8054747748148199


 # 作業2
 * 試著只使用新特徵估計目標值(忽略原特徵)，跟作業1的結果比較起來效果如何?

In [10]:
# Score the models using only the corrected distance feature; the double
# brackets keep the selection two-dimensional for the scaler.
only_distance = df.loc[:, ['distance_real']]
train_X = scaler.fit_transform(only_distance)

linear_scores = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_scores.mean()}')

gdbt_scores = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_scores.mean()}')



Linear Reg Score : 0.0014467232932046097
Gradient Boosting Reg Score : 0.7188173679091587
