# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
***
- 使用程車費率預測競賽練習時間欄位處理

In [2]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
pick_up_df=df[['pickup_datetime']]
df=df.loc[:,df.columns!='pickup_datetime'].astype('float')
df = pd.merge(df,pick_up_df, left_index=True, right_index=True)
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime
0,-73.99058,40.761071,-73.981128,40.758634,2.0,2011-10-21 23:54:10 UTC
1,-73.988403,40.723431,-73.989647,40.741695,1.0,2015-02-03 10:42:03 UTC
2,-74.015785,40.71511,-74.012029,40.707888,2.0,2014-03-16 18:58:58 UTC
3,-73.977322,40.787275,-73.95803,40.778838,3.0,2009-06-13 16:10:54 UTC
4,-73.989683,40.729717,-73.98249,40.761887,3.0,2014-06-12 03:25:56 UTC


In [3]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%Y')).astype('float')
df['pickup_month'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%m')).astype('float')
df['pickup_day'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%d')).astype('float')
df['pickup_hour'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%H')).astype('float')
df['pickup_minute'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%M')).astype('float')
df['pickup_second'] = df['pickup_datetime'].map(lambda x: datetime.datetime.strftime(x, '%S')).astype('float')
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,-73.99058,40.761071,-73.981128,40.758634,2.0,2011-10-21 23:54:10,2011.0,10.0,21.0,23.0,54.0,10.0
1,-73.988403,40.723431,-73.989647,40.741695,1.0,2015-02-03 10:42:03,2015.0,2.0,3.0,10.0,42.0,3.0
2,-74.015785,40.71511,-74.012029,40.707888,2.0,2014-03-16 18:58:58,2014.0,3.0,16.0,18.0,58.0,58.0
3,-73.977322,40.787275,-73.95803,40.778838,3.0,2009-06-13 16:10:54,2009.0,6.0,13.0,16.0,10.0,54.0
4,-73.989683,40.729717,-73.98249,40.761887,3.0,2014-06-12 03:25:56,2014.0,6.0,12.0,3.0,25.0,56.0


In [4]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.026876871475641616
Gradient Boosting Reg Score : 0.7113108866038375


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [5]:
# 加入星期幾與第幾周兩個特徵
from sklearn.preprocessing import LabelEncoder
"""
Your Code Here
"""
df['week_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%a')).astype('str')
df['week_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%W')).astype('float')
df['week_day'] = LabelEncoder().fit_transform(df['week_day'])
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,week_year
0,-73.99058,40.761071,-73.981128,40.758634,2.0,2011-10-21 23:54:10,2011.0,10.0,21.0,23.0,54.0,10.0,0,42.0
1,-73.988403,40.723431,-73.989647,40.741695,1.0,2015-02-03 10:42:03,2015.0,2.0,3.0,10.0,42.0,3.0,5,5.0
2,-74.015785,40.71511,-74.012029,40.707888,2.0,2014-03-16 18:58:58,2014.0,3.0,16.0,18.0,58.0,58.0,3,10.0
3,-73.977322,40.787275,-73.95803,40.778838,3.0,2009-06-13 16:10:54,2009.0,6.0,13.0,16.0,10.0,54.0,2,23.0
4,-73.989683,40.729717,-73.98249,40.761887,3.0,2014-06-12 03:25:56,2014.0,6.0,12.0,3.0,25.0,56.0,4,23.0


In [6]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
df_temp =df_temp.astype("float")
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.028036235932507102
Gradient Boosting Reg Score : 0.7065930289158124


In [7]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,week_year,day_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2.0,2011-10-21 23:54:10,2011.0,10.0,21.0,23.0,54.0,10.0,0,42.0,-0.02545
1,-73.988403,40.723431,-73.989647,40.741695,1.0,2015-02-03 10:42:03,2015.0,2.0,3.0,10.0,42.0,3.0,5,5.0,0.333601
2,-74.015785,40.71511,-74.012029,40.707888,2.0,2014-03-16 18:58:58,2014.0,3.0,16.0,18.0,58.0,58.0,3,10.0,-0.967083
3,-73.977322,40.787275,-73.95803,40.778838,3.0,2009-06-13 16:10:54,2009.0,6.0,13.0,16.0,10.0,54.0,2,23.0,-0.888817
4,-73.989683,40.729717,-73.98249,40.761887,3.0,2014-06-12 03:25:56,2014.0,6.0,12.0,3.0,25.0,56.0,4,23.0,0.782427


In [8]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
df_temp =df_temp.astype("float")
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.027628129687838364
Gradient Boosting Reg Score : 0.7124815505745139


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

In [9]:
# 加上"年週期"與"周週期"特徵
df['year_cycle'] = df['pickup_month']/6 + df['pickup_day']/180
df['year_cycle'] = df['year_cycle'].map(lambda x:math.cos(x*math.pi))
df['week_cycle'] = df['week_day']/3.5 + df['pickup_hour']/84
df['week_cycle'] = df['week_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,week_year,day_cycle,year_cycle,week_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2.0,2011-10-21 23:54:10,2011.0,10.0,21.0,23.0,54.0,10.0,0,42.0,-0.02545,0.777146,0.757972
1,-73.988403,40.723431,-73.989647,40.741695,1.0,2015-02-03 10:42:03,2015.0,2.0,3.0,10.0,42.0,3.0,5,5.0,0.333601,0.45399,-0.988831
2,-74.015785,40.71511,-74.012029,40.707888,2.0,2014-03-16 18:58:58,2014.0,3.0,16.0,18.0,58.0,58.0,3,10.0,-0.967083,-0.275637,-0.222521
3,-73.977322,40.787275,-73.95803,40.778838,3.0,2009-06-13 16:10:54,2009.0,6.0,13.0,16.0,10.0,54.0,2,23.0,-0.888817,-0.97437,0.680173
4,-73.989683,40.729717,-73.98249,40.761887,3.0,2014-06-12 03:25:56,2014.0,6.0,12.0,3.0,25.0,56.0,4,23.0,0.782427,-0.978148,-0.532032


In [10]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
df_temp =df_temp.astype("float")
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.027521418600309367
Gradient Boosting Reg Score : 0.7102399202867165
