In [17]:
import pandas as pd
import numpy as np 
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'C:/Users/francis/Machine_Learning/francis/Documents/GitHub/4th-ML100Days/data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

df.head()
Y_train = df['fare_amount']
df = df.drop('fare_amount',axis = 1)
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [18]:
# 時間轉換方式 : 使用 datetime.strptime 解析(parse)時間字串 / 使用 datetime.strftime 匯出時間格式(format)
#先將時間字串匯出成時間格式
# 參考官網 https://docs.python.org/3/library/datetime.html
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x,'%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x:datetime.datetime.strftime(x,'%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x:datetime.datetime.strftime(x,'%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x:datetime.datetime.strftime(x,'%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x:datetime.datetime.strftime(x,'%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x:datetime.datetime.strftime(x,'%S')).astype('int64')

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [21]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
#並對資料使用minmaxscaler處理
df_temp = df.drop(['pickup_datetime'],axis = 1)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Regression Score: {cross_val_score(Linear,X_train,Y_train,cv = 5).mean()}')
GDBR = GradientBoostingRegressor()
print(f'GDBR Score : {cross_val_score(GDBR,X_train,Y_train,cv = 5).mean()}')

Linear Regression Score: 0.026876871475636888
GDBR Score : 0.7105442634968269


In [23]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12+df['pickup_minute']/720+df['pickup_day']/43200
df['day_cycle'] = df['day_cycle'].apply(lambda x : math.sin(x * math.pi))
df['day_cycle'].head()

0   -0.024650
1    0.333601
2   -0.967856
3   -0.887447
4    0.780430
Name: day_cycle, dtype: float64

In [25]:
# 結果 : 預測力反而下降
df_temp_1 = df.drop(['pickup_datetime'] , axis = 1)
df_temp_1 = scaler.fit_transform(df_temp_1)
print(f'LinearRegression Score : {cross_val_score(Linear,X_train,Y_train,cv = 5).mean()}')
print(f'GDBR Score : {cross_val_score(GDBR,X_train,Y_train,cv = 5).mean()}')

LinearRegression Score : 0.026876871475636888
GDBR Score : 0.7100800778175612


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?


效果不會改善非常多

# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

效果有改善但沒有非常多


In [45]:
df['day of week'] = df['pickup_datetime'].apply(lambda x : datetime.datetime.strftime(x,'%w')).astype('int64')
df['week of year'] = df['pickup_datetime'].apply(lambda x : datetime.datetime.strftime(x,'%W')).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_cycle,day of week,week of year,year_cycle,week_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,-0.02465,5,42,0.777146,3.130952
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,0.333601,2,5,0.45399,0.690476
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,-0.967856,0,10,-0.275637,1.071429
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,-0.887447,6,23,-0.97437,1.904762
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.78043,4,23,-0.978148,1.75


In [46]:
df_temp_1 = df.drop(['pickup_datetime'] , axis = 1)
df_temp_1 = scaler.fit_transform(df_temp_1)
print(f'Linear Regression Score : {cross_val_score(Linear,X_train,Y_train,cv =  5).mean()}')
print(f'Gradient Boosting Regressor Score : {cross_val_score(GDBR,X_train,Y_train,cv = 5).mean()}')

Linear Regression Score : 0.026876871475636888
Gradient Boosting Regressor Score : 0.7104646368267894


In [47]:
df['year_cycle'] = df['pickup_month'] / 6 + df['pickup_day'] / 180
df['year_cycle'] = df['year_cycle'].apply(lambda x :  math.cos(x*math.pi))
df['week_cycle'] = df['day of week'] / 3.5 + df['pickup_hour'] / 84
df['week_cycle'] = df['week_cycle'].apply(lambda x : math.sin(x*math.pi))

In [48]:
df_temp_1 = df.drop(['pickup_datetime'] , axis = 1)
df_temp_1 = scaler.fit_transform(df_temp_1)
print(f'Linear Regression Score : {cross_val_score(Linear,X_train,Y_train,cv = 5).mean()}')
print(f'Gradient Boosting Score : {cross_val_score(GDBR,X_train,Y_train,cv = 5).mean()}')

Linear Regression Score : 0.026876871475636888
Gradient Boosting Score : 0.7115735064697783
