# Construction of Date/Time Features for NYC Taxi Trip Data

In [1]:
library(dplyr)
library(lubridate)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date



In [2]:
# Load the one-percent sample of 2013 NYC taxi trip records.
trip <- read.csv(file = "NYC_Taxi_2013_One_Percent_Trip.csv")

In [3]:
# First look at the raw data: row/column counts, per-column summary,
# and the first six records.
c(nrow(trip), ncol(trip))
summary(object = trip)
head(x = trip)

                            medallion      
 36CD8C29F77E2F6CBFFA96246F56DF76:    665  
 3717C04FF182C423838EE021476A7A69:    608  
 02F59ACEA8585B113601E19A3A5F4F47:    560  
 9CCF07B476B482C2050A8C63360586F7:    560  
 9F1F27280ACE1C0159EAFB1A7FF03771:    559  
 87EC61B520189EFEAD8FDFBABD756E4A:    546  
 (Other)                         :1700489  
                           hack_license     vendor_id      rate_code     
 149974E97D992C3A227ED35C5EB3AA6B:    448   CMT:847533   Min.   :  0.00  
 5BD83749A65720DA48E6DFEA47562405:    351   VTS:856454   1st Qu.:  1.00  
 D99D198395EC876497BBE06F1A26E954:    342                Median :  1.00  
 D20DFC59B81A2918082D920FAC2D01B3:    337                Mean   :  1.03  
 676BC3464C5238DCCB33E719A5759ED4:    336                3rd Qu.:  1.00  
 7B4B3A68E8CC41CEA47AAA2295A8E61E:    336                Max.   :210.00  
 (Other)                         :1701837                                
 store_and_fwd_flag             pickup_datetime   
    :

medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,2,631,2.5,-73.95961,40.71594,-73.96342,40.69217
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,2,666,1.8,-73.97337,40.76396,-73.9773,40.78214
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,2,1401,4.4,-73.97784,40.78282,-74.00089,40.73708
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,1,751,2.8,-73.98033,40.78438,-73.98429,40.74869
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,1,261,1.5,-73.98669,40.74947,-73.99068,40.76252
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,3,1484,2.3,-73.97337,40.74757,-73.98941,40.71875


## Reformat the Pick-Up and Drop-Off Date/Time Values

In [4]:
# List the available columns, then duplicate the raw pickup/dropoff
# timestamps into working columns so the originals are never modified.
colnames(trip)
trip[["pu_DT"]] <- trip[["pickup_datetime"]]
trip[["do_DT"]] <- trip[["dropoff_datetime"]]
head(trip[, c("pu_DT", "do_DT")])
class(trip[["pu_DT"]])

pu_DT,do_DT
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM


In [5]:
# The regex steps below need plain strings, so coerce both working
# timestamp columns to character in one pass.
trip[c("pu_DT", "do_DT")] <- lapply(trip[c("pu_DT", "do_DT")], as.character)
class(trip[["do_DT"]])

In [6]:
# Keep only the date part of each timestamp by deleting everything from
# the first space onward (e.g. "3/8/2013 9:55:27 PM" -> "3/8/2013").
trip[["pickUp_dateString"]] <- gsub(
  pattern = " .*$", replacement = "", x = trip[["pu_DT"]]
)
trip[["dropOff_dateString"]] <- gsub(
  pattern = " .*$", replacement = "", x = trip[["do_DT"]]
)
# Preview the four most recently added columns.
head(trip[, tail(seq_len(ncol(trip)), 4)])

pu_DT,do_DT,pickUp_dateString,dropOff_dateString
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,3/8/2013,3/8/2013
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,3/9/2013,3/9/2013
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,3/9/2013,3/9/2013
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,3/9/2013,3/9/2013
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,3/9/2013,3/9/2013
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,3/10/2013,3/10/2013


In [7]:
# Keep only the time part of each timestamp by deleting everything up to
# and including the first space (e.g. "3/8/2013 9:55:27 PM" -> "9:55:27 PM").
trip[["pu_fullTimeString"]] <- gsub(
  pattern = "^.*? ", replacement = "", x = trip[["pu_DT"]]
)
trip[["do_fullTimeString"]] <- gsub(
  pattern = "^.*? ", replacement = "", x = trip[["do_DT"]]
)
# n records the column count at this point; later cells use it as an
# anchor when previewing newly added columns.
n <- ncol(trip)
head(trip[, n - c(5, 4, 1, 0)])

pu_DT,do_DT,pu_fullTimeString,do_fullTimeString
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,9:55:27 PM,10:05:59 PM
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,6:51:52 PM,7:02:59 PM
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,7:04:35 PM,7:27:57 PM
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,8:33:06 PM,8:45:38 PM
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,11:10:38 PM,11:14:59 PM
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,12:32:28 AM,12:57:12 AM


In [8]:
# Drop the trailing " AM"/" PM" marker, leaving the clock time "h:mm:ss".
trip[["pu_binnedTime"]] <- gsub(
  pattern = " .*$", replacement = "", x = trip[["pu_fullTimeString"]]
)
trip[["do_binnedTime"]] <- gsub(
  pattern = " .*$", replacement = "", x = trip[["do_fullTimeString"]]
)
head(trip[, seq(n - 5, n + 2)])

pu_DT,do_DT,pickUp_dateString,dropOff_dateString,pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,3/8/2013,3/8/2013,9:55:27 PM,10:05:59 PM,9:55:27,10:05:59
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,3/9/2013,3/9/2013,6:51:52 PM,7:02:59 PM,6:51:52,7:02:59
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,3/9/2013,3/9/2013,7:04:35 PM,7:27:57 PM,7:04:35,7:27:57
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,3/9/2013,3/9/2013,8:33:06 PM,8:45:38 PM,8:33:06,8:45:38
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,3/9/2013,3/9/2013,11:10:38 PM,11:14:59 PM,11:10:38,11:14:59
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,3/10/2013,3/10/2013,12:32:28 AM,12:57:12 AM,12:32:28,12:57:12


In [9]:
# Cut everything from the first colon onward so only the hour "bin" of the
# event occurrence remains (e.g. "9:55:27" -> "9").
trip[["pu_binnedTime"]] <- gsub(
  pattern = ":.*$", replacement = "", x = trip[["pu_binnedTime"]]
)
trip[["do_binnedTime"]] <- gsub(
  pattern = ":.*$", replacement = "", x = trip[["do_binnedTime"]]
)
head(trip[, seq(n - 5, n + 2)])

pu_DT,do_DT,pickUp_dateString,dropOff_dateString,pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,3/8/2013,3/8/2013,9:55:27 PM,10:05:59 PM,9,10
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,3/9/2013,3/9/2013,6:51:52 PM,7:02:59 PM,6,7
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,3/9/2013,3/9/2013,7:04:35 PM,7:27:57 PM,7,7
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,3/9/2013,3/9/2013,8:33:06 PM,8:45:38 PM,8,8
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,3/9/2013,3/9/2013,11:10:38 PM,11:14:59 PM,11,11
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,3/10/2013,3/10/2013,12:32:28 AM,12:57:12 AM,12,12


In [10]:
# Isolate the AM/PM marker by deleting everything up to and including the
# last space of the time string (e.g. "9:55:27 PM" -> "PM").
trip[["pu_amPM"]] <- gsub(
  pattern = "^.* ", replacement = "", x = trip[["pu_fullTimeString"]]
)
trip[["do_amPM"]] <- gsub(
  pattern = "^.* ", replacement = "", x = trip[["do_fullTimeString"]]
)
head(trip[, n + c(-5, -4, -1, 0:4)])

pu_DT,do_DT,pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime,pu_amPM,do_amPM
3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,9:55:27 PM,10:05:59 PM,9,10,PM,PM
3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,6:51:52 PM,7:02:59 PM,6,7,PM,PM
3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,7:04:35 PM,7:27:57 PM,7,7,PM,PM
3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,8:33:06 PM,8:45:38 PM,8,8,PM,PM
3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,11:10:38 PM,11:14:59 PM,11,11,PM,PM
3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,12:32:28 AM,12:57:12 AM,12,12,AM,AM


In [11]:
# Convert the "binned time" hour strings to numbers so they can be shifted
# onto a 24-hour clock.
trip$pu_binnedTime <- as.numeric(trip$pu_binnedTime)
trip$do_binnedTime <- as.numeric(trip$do_binnedTime)

# For pickup/dropoff times after noon (12 PM) and before midnight (12 AM),
# add 12 to the binned hour, e.g. 1 PM => 13 and 8 PM => 20.
#
# The shift is done with plain vector arithmetic on the column itself:
#   * assigning the result of lapply() would turn the numeric column into a
#     list column, and
#   * `trip[mask, ]$col <- value` errors when the mask contains NA or
#     selects no rows.
# which() drops NA entries, so rows whose hour could not be parsed stay NA.
pu_pm_rows <- which(trip$pu_amPM == "PM" &
                      trip$pu_binnedTime >= 1 &
                      trip$pu_binnedTime <= 11)
trip$pu_binnedTime[pu_pm_rows] <- trip$pu_binnedTime[pu_pm_rows] + 12

do_pm_rows <- which(trip$do_amPM == "PM" &
                      trip$do_binnedTime >= 1 &
                      trip$do_binnedTime <= 11)
trip$do_binnedTime[do_pm_rows] <- trip$do_binnedTime[do_pm_rows] + 12

In [12]:
# Preview the time-string, hour-bin and AM/PM columns after the PM shift.
head(trip[, seq(n - 1, ncol(trip))])

pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime,pu_amPM,do_amPM
9:55:27 PM,10:05:59 PM,21,22,PM,PM
6:51:52 PM,7:02:59 PM,18,19,PM,PM
7:04:35 PM,7:27:57 PM,19,19,PM,PM
8:33:06 PM,8:45:38 PM,20,20,PM,PM
11:10:38 PM,11:14:59 PM,23,23,PM,PM
12:32:28 AM,12:57:12 AM,12,12,AM,AM


In [13]:
# For pickup/dropoff times occurring during the midnight (12 AM - 1 AM) bin,
#    convert the binnedTime value to 0.
#
# The column is indexed directly (rather than `trip[mask, ]$col <- 0`) so
# the assignment is a harmless no-op when no row matches, and which() drops
# NA comparisons so rows with a missing hour or AM/PM marker are left as-is.
pu_midnight_rows <- which(trip$pu_amPM == "AM" & trip$pu_binnedTime == 12)
trip$pu_binnedTime[pu_midnight_rows] <- 0

do_midnight_rows <- which(trip$do_amPM == "AM" & trip$do_binnedTime == 12)
trip$do_binnedTime[do_midnight_rows] <- 0

In [14]:
# Preview the same columns again after mapping the 12 AM bin to hour 0.
head(trip[, seq(n - 1, ncol(trip))])

pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime,pu_amPM,do_amPM
9:55:27 PM,10:05:59 PM,21,22,PM,PM
6:51:52 PM,7:02:59 PM,18,19,PM,PM
7:04:35 PM,7:27:57 PM,19,19,PM,PM
8:33:06 PM,8:45:38 PM,20,20,PM,PM
11:10:38 PM,11:14:59 PM,23,23,PM,PM
12:32:28 AM,12:57:12 AM,0,0,AM,AM


In [15]:
# Build string versions of the hour bins, left-padding single-digit hours
# with "0" (the regex only matches strings that are exactly one digit).
trip[["pickUp_Hour"]] <- gsub(
  "^(\\d)$", "0\\1", as.character(trip[["pu_binnedTime"]])
)
trip[["dropOff_Hour"]] <- gsub(
  "^(\\d)$", "0\\1", as.character(trip[["do_binnedTime"]])
)
head(trip[, seq(n - 1, ncol(trip))])

pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime,pu_amPM,do_amPM,pickUp_Hour,dropOff_Hour
9:55:27 PM,10:05:59 PM,21,22,PM,PM,21,22
6:51:52 PM,7:02:59 PM,18,19,PM,PM,18,19
7:04:35 PM,7:27:57 PM,19,19,PM,PM,19,19
8:33:06 PM,8:45:38 PM,20,20,PM,PM,20,20
11:10:38 PM,11:14:59 PM,23,23,PM,PM,23,23
12:32:28 AM,12:57:12 AM,0,0,AM,AM,0,0


In [16]:
# Turn the padded hour strings into factors in place, then add factor
# versions of the date strings as pickUp_Date / dropOff_Date.
trip[c("pickUp_Hour", "dropOff_Hour")] <-
  lapply(trip[c("pickUp_Hour", "dropOff_Hour")], as.factor)

trip[["pickUp_Date"]] <- as.factor(trip[["pickUp_dateString"]])
trip[["dropOff_Date"]] <- as.factor(trip[["dropOff_dateString"]])

In [17]:
# Show the full column list and preview every column added since `n`
# was recorded.
names(trip)
head(trip[, seq(n - 1, ncol(trip))])

pu_fullTimeString,do_fullTimeString,pu_binnedTime,do_binnedTime,pu_amPM,do_amPM,pickUp_Hour,dropOff_Hour,pickUp_Date,dropOff_Date
9:55:27 PM,10:05:59 PM,21,22,PM,PM,21,22,3/8/2013,3/8/2013
6:51:52 PM,7:02:59 PM,18,19,PM,PM,18,19,3/9/2013,3/9/2013
7:04:35 PM,7:27:57 PM,19,19,PM,PM,19,19,3/9/2013,3/9/2013
8:33:06 PM,8:45:38 PM,20,20,PM,PM,20,20,3/9/2013,3/9/2013
11:10:38 PM,11:14:59 PM,23,23,PM,PM,23,23,3/9/2013,3/9/2013
12:32:28 AM,12:57:12 AM,0,0,AM,AM,0,0,3/10/2013,3/10/2013


## Create Weekday, Holiday, and Time of Day Features

In [18]:
# Parse each "%m/%d/%Y" date and translate it into a day-of-week label
# (Sun-Sat) with lubridate::wday(), stored as a factor.
trip[["pickUp_Day"]] <- as.factor(
  wday(as.Date(trip[["pickUp_Date"]], "%m/%d/%Y"), label = TRUE)
)
trip[["dropOff_Day"]] <- as.factor(
  wday(as.Date(trip[["dropOff_Date"]], "%m/%d/%Y"), label = TRUE)
)

# Sanity check on the last four columns: March 8, 2013 was in fact a Friday.
head(trip[, tail(seq_len(ncol(trip)), 4)])

pickUp_Date,dropOff_Date,pickUp_Day,dropOff_Day
3/8/2013,3/8/2013,Fri,Fri
3/9/2013,3/9/2013,Sat,Sat
3/9/2013,3/9/2013,Sat,Sat
3/9/2013,3/9/2013,Sat,Sat
3/9/2013,3/9/2013,Sat,Sat
3/10/2013,3/10/2013,Sun,Sun


In [19]:
# Flag Saturday/Sunday events with 1 and every other day with 0.
# The logical test is converted straight to numeric (TRUE -> 1, FALSE -> 0).
pickup_on_weekend <- trip$pickUp_Day == "Sat" | trip$pickUp_Day == "Sun"
dropoff_on_weekend <- trip$dropOff_Day == "Sat" | trip$dropOff_Day == "Sun"
trip[["pickUp_isWeekend"]] <- as.numeric(pickup_on_weekend)
trip[["dropOff_isWeekend"]] <- as.numeric(dropoff_on_weekend)

# Preview the last six columns and cross-tabulate day against the flag to
# confirm only Sat/Sun are marked.
head(trip[, tail(seq_len(ncol(trip)), 6)])
table(trip[["pickUp_Day"]], trip[["pickUp_isWeekend"]])
table(trip[["dropOff_Day"]], trip[["dropOff_isWeekend"]])

pickUp_Date,dropOff_Date,pickUp_Day,dropOff_Day,pickUp_isWeekend,dropOff_isWeekend
3/8/2013,3/8/2013,Fri,Fri,0,0
3/9/2013,3/9/2013,Sat,Sat,1,1
3/9/2013,3/9/2013,Sat,Sat,1,1
3/9/2013,3/9/2013,Sat,Sat,1,1
3/9/2013,3/9/2013,Sat,Sat,1,1
3/10/2013,3/10/2013,Sun,Sun,1,1


       
             0      1
  Sun        0 225556
  Mon   221549      0
  Tues  240791      0
  Wed   245845      0
  Thurs 250569      0
  Fri   263159      0
  Sat        0 256518

       
             0      1
  Sun        0 227571
  Mon   221386      0
  Tues  240371      0
  Wed   245600      0
  Thurs 249975      0
  Fri   262644      0
  Sat        0 256440

In [20]:
# Holidays flagged in 2013 (dates in the same "%m/%d/%Y" text form as
# pickUp_Date / dropOff_Date, without zero padding):
#   federal holidays: New Year's Day, MLK Day, Presidents' Day,
#     Memorial Day, Independence Day, Labor Day, Columbus Day,
#     Veterans Day, Thanksgiving, Christmas Day
#   additional (non-federal) days: Mother's Day, Father's Day,
#     day after Thanksgiving
#
# NOTE: Columbus Day 2013 fell on Monday 10/14/2013. Earlier versions of
# this cell used "10/4/2013" (a Friday), so previously exported CSVs and
# the feature description further down still reflect that date.
holiday_dates <- c(
  "1/1/2013",    # New Year's Day
  "1/21/2013",   # MLK Day
  "2/18/2013",   # Presidents' Day
  "5/12/2013",   # Mother's Day
  "5/27/2013",   # Memorial Day
  "6/16/2013",   # Father's Day
  "7/4/2013",    # Independence Day
  "9/2/2013",    # Labor Day
  "10/14/2013",  # Columbus Day
  "11/11/2013",  # Veterans Day
  "11/28/2013",  # Thanksgiving
  "11/29/2013",  # day after Thanksgiving
  "12/25/2013"   # Christmas Day
)

# %in% compares the factor's labels against the date strings and never
# returns NA, so every row gets a definite 0/1 flag.
trip$pickUp_isHoliday <- ifelse(trip$pickUp_Date %in% holiday_dates, 1, 0)
trip$dropOff_isHoliday <- ifelse(trip$dropOff_Date %in% holiday_dates, 1, 0)

# Number of trips picked up / dropped off on a holiday.
sum(trip$pickUp_isHoliday)
sum(trip$dropOff_isHoliday)

In [21]:
# 4-part times of day: "morning", "afternoon", "evening", "night"
#      using BINNED times [5AM-10AM], [11AM-4PM], [5PM-10PM], [11PM-4AM]

# Map a vector of 24-hour bin values (0-23) to its four-level label.
#   hour: numeric vector of hour bins (a list of single numbers is coerced)
# Returns a character vector the same length as `hour`; values that match
# no range are labelled "none" and NA hours stay NA.
# A single vectorised pass replaces the previous cascade of row-subset
# reassignments, which re-filtered the whole data frame at every step and
# failed outright when an hour was NA.
quad_time_of_day <- function(hour) {
  hour <- as.numeric(hour)
  ifelse(hour >= 5 & hour <= 10, "morning",
    ifelse(hour >= 11 & hour <= 16, "afternoon",
      ifelse(hour >= 17 & hour <= 22, "evening",
        ifelse(hour == 23 | hour <= 4, "night", "none"))))
}

trip$pickUp_TimeOfDay.quad <- quad_time_of_day(trip$pu_binnedTime)
trip$dropOff_TimeOfDay.quad <- quad_time_of_day(trip$do_binnedTime)

head(trip[ , c("pu_binnedTime","pickUp_TimeOfDay.quad", "do_binnedTime", "dropOff_TimeOfDay.quad")])

pu_binnedTime,pickUp_TimeOfDay.quad,do_binnedTime,dropOff_TimeOfDay.quad
21,evening,22,evening
18,evening,19,evening
19,evening,19,evening
20,evening,20,evening
23,night,23,night
0,night,0,night


In [22]:
# 7-part times of day:  early morning, morning, midday, afternoon, evening, night, late night
#    4AM-6AM, 7AM-10AM, 11AM-1PM, 2PM-5PM, 6PM-8PM, 9PM-11PM, 12AM-3AM

# Map a vector of 24-hour bin values (0-23) to its seven-level label.
#   hour: numeric vector of hour bins (a list of single numbers is coerced)
# Returns a character vector the same length as `hour`; values that match
# no range are labelled "none" and NA hours stay NA.
# The ranges are tested in the same order as before, but in one vectorised
# pass instead of repeated row-subset reassignments on the data frame
# (which copied the data at every step and failed when an hour was NA).
sept_time_of_day <- function(hour) {
  hour <- as.numeric(hour)
  ifelse(hour >= 4 & hour <= 6, "earlyMorning",
    ifelse(hour >= 7 & hour <= 10, "morning",
      ifelse(hour >= 11 & hour <= 13, "midday",
        ifelse(hour >= 14 & hour <= 17, "afternoon",
          ifelse(hour >= 18 & hour <= 20, "evening",
            ifelse(hour >= 21, "night",
              ifelse(hour <= 3, "lateNight", "none")))))))
}

trip$pickUp_TimeOfDay.sept <- sept_time_of_day(trip$pu_binnedTime)
trip$dropOff_TimeOfDay.sept <- sept_time_of_day(trip$do_binnedTime)

In [23]:
# Convert the four 'time of day' columns into ordered factors. Any value
# outside the listed levels (e.g. a leftover "none") becomes NA.
quad_levels <- c("morning", "afternoon", "evening", "night")
sept_levels <- c("earlyMorning", "morning", "midday", "afternoon",
                 "evening", "night", "lateNight")

trip[["pickUp_TimeOfDay.quad"]] <- factor(
  trip[["pickUp_TimeOfDay.quad"]], levels = quad_levels, ordered = TRUE
)
trip[["dropOff_TimeOfDay.quad"]] <- factor(
  trip[["dropOff_TimeOfDay.quad"]], levels = quad_levels, ordered = TRUE
)

trip[["pickUp_TimeOfDay.sept"]] <- factor(
  trip[["pickUp_TimeOfDay.sept"]], levels = sept_levels, ordered = TRUE
)
trip[["dropOff_TimeOfDay.sept"]] <- factor(
  trip[["dropOff_TimeOfDay.sept"]], levels = sept_levels, ordered = TRUE
)

# Level counts for the last four columns.
summary(trip[, tail(seq_len(ncol(trip)), 4)])

 pickUp_TimeOfDay.quad dropOff_TimeOfDay.quad  pickUp_TimeOfDay.sept
 morning  :345891      morning  :331867       earlyMorning: 70941   
 afternoon:484032      afternoon:485392       morning     :294203   
 evening  :589354      evening  :586880       midday      :247452   
 night    :284710      night    :299848       afternoon   :319960   
                                              evening     :311432   
                                              night       :279356   
                                              lateNight   :180643   
  dropOff_TimeOfDay.sept
 earlyMorning: 67904    
 morning     :284760    
 midday      :244973    
 afternoon   :318043    
 evening     :313373    
 night       :283016    
 lateNight   :191918    

In [24]:
# First six rows with every feature created so far.
head(x = trip)

medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,pickUp_Day,dropOff_Day,pickUp_isWeekend,dropOff_isWeekend,pickUp_isHoliday,dropOff_isHoliday,pickUp_TimeOfDay.quad,dropOff_TimeOfDay.quad,pickUp_TimeOfDay.sept,dropOff_TimeOfDay.sept
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/8/2013 9:55:27 PM,3/8/2013 10:05:59 PM,2,631,2.5,...,Fri,Fri,0,0,0,0,evening,evening,night,night
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 6:51:52 PM,3/9/2013 7:02:59 PM,2,666,1.8,...,Sat,Sat,1,1,0,0,evening,evening,evening,evening
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 7:04:35 PM,3/9/2013 7:27:57 PM,2,1401,4.4,...,Sat,Sat,1,1,0,0,evening,evening,evening,evening
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 8:33:06 PM,3/9/2013 8:45:38 PM,1,751,2.8,...,Sat,Sat,1,1,0,0,evening,evening,evening,evening
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/9/2013 11:10:38 PM,3/9/2013 11:14:59 PM,1,261,1.5,...,Sat,Sat,1,1,0,0,night,night,night,night
00005007A9F30E289E760362F69E4EAD,A9AE329EA1138052DAC8FDFD8BA86603,CMT,1,N,3/10/2013 12:32:28 AM,3/10/2013 12:57:12 AM,3,1484,2.3,...,Sun,Sun,1,1,0,0,night,night,lateNight,lateNight


## More Date Reformatting

In [25]:
# Make pickUp/dropOff_Day an ordered factor with levels Sun..Sat.
#
# The day labels come from lubridate::wday(label = TRUE), whose abbreviations
# differ between versions ("Tues"/"Thurs" vs "Tue"/"Thu"). Passing labels that
# are not in `levels` to factor() silently turns them into NA, so each label
# is matched on its first two letters and mapped onto the fixed label set
# below. Labels already spelled "Tues"/"Thurs" come out unchanged.
day_levels <- c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")

# Map a vector/factor of English day abbreviations onto `day_levels`.
# Returns an ordered factor; unrecognised or missing labels become NA.
as_ordered_weekday <- function(day) {
  idx <- match(substr(as.character(day), 1, 2), substr(day_levels, 1, 2))
  factor(day_levels[idx], levels = day_levels, ordered = TRUE)
}

trip$pickUp_Day <- as_ordered_weekday(trip$pickUp_Day)
trip$dropOff_Day <- as_ordered_weekday(trip$dropOff_Day)
summary(trip[ , c("pickUp_Day", "dropOff_Day")])

 pickUp_Day     dropOff_Day   
 Sun  :225556   Sun  :227571  
 Mon  :221549   Mon  :221386  
 Tues :240791   Tues :240371  
 Wed  :245845   Wed  :245600  
 Thurs:250569   Thurs:249975  
 Fri  :263159   Fri  :262644  
 Sat  :256518   Sat  :256440  

In [30]:
# Check that pickUp_dateString has the expected format and type before
# splitting it further.
head(trip[["pickUp_dateString"]])
class(trip[["pickUp_dateString"]])

In [31]:
# Month of pickup: delete everything from the first "/" onward
# (e.g. "3/8/2013" -> "3") and store the result as a factor.
trip[["pickUp_Month"]] <- as.factor(
  gsub(pattern = "/.*$", replacement = "", x = trip[["pickUp_dateString"]])
)

levels(trip[["pickUp_Month"]])

In [32]:
# Month of dropoff, extracted the same way as the pickup month.
trip[["dropOff_Month"]] <- as.factor(
  gsub(pattern = "/.*$", replacement = "", x = trip[["dropOff_dateString"]])
)

levels(trip[["dropOff_Month"]])

In [33]:
# Split the day (of month) from the full date: capture the text between the
# first and second "/" (e.g. "3/8/2013" -> "8") and store it as a factor.
trip$pickUp_DayOfMonth <- as.factor(
  gsub("^.*?/(.*?)/.*$", "\\1", trip$pickUp_dateString)
)
trip$dropOff_DayOfMonth <- as.factor(
  gsub("^.*?/(.*?)/.*$", "\\1", trip$dropOff_dateString)
)

# Show all day-of-month levels on a single line. `collapse` (not `sep`) is
# what joins the elements of one vector into a single string.
paste(levels(trip$pickUp_DayOfMonth), collapse = " ")

In [34]:
# Confirm that the only year seen in the data is 2013: strip everything up
# to the last "/" of each pickup date and list the distinct values.
years <- as.factor(gsub("^.*/", "", trip[["pickUp_dateString"]]))
levels(years) # confirmed only 2013, no need to process and add a feature for these

In [38]:
# Assemble the data frame written to the date/time feature CSV.
# Identifier columns (medallion, hack_license, pickup_datetime) come from
# the original data; the rest are the final versions of the features
# created above.
feature_cols <- c(
  "pickUp_Hour", "dropOff_Hour",
  "pickUp_Date", "dropOff_Date",
  "pickUp_Month", "dropOff_Month",
  "pickUp_DayOfMonth", "dropOff_DayOfMonth",
  "pickUp_Day", "dropOff_Day",
  "pickUp_isWeekend", "dropOff_isWeekend",
  "pickUp_isHoliday", "dropOff_isHoliday",
  "pickUp_TimeOfDay.quad", "dropOff_TimeOfDay.quad",
  "pickUp_TimeOfDay.sept", "dropOff_TimeOfDay.sept"
)

export <- cbind(
  select(trip, medallion, hack_license, pickup_datetime),
  trip[, feature_cols]
)

dim(export)
summary(export)

                            medallion      
 36CD8C29F77E2F6CBFFA96246F56DF76:    665  
 3717C04FF182C423838EE021476A7A69:    608  
 02F59ACEA8585B113601E19A3A5F4F47:    560  
 9CCF07B476B482C2050A8C63360586F7:    560  
 9F1F27280ACE1C0159EAFB1A7FF03771:    559  
 87EC61B520189EFEAD8FDFBABD756E4A:    546  
 (Other)                         :1700489  
                           hack_license                 pickup_datetime   
 149974E97D992C3A227ED35C5EB3AA6B:    448   4/3/2013 7:34:00 PM :     13  
 5BD83749A65720DA48E6DFEA47562405:    351   11/1/2013 8:05:00 PM:     12  
 D99D198395EC876497BBE06F1A26E954:    342   3/16/2013 7:58:00 PM:     12  
 D20DFC59B81A2918082D920FAC2D01B3:    337   3/28/2013 9:25:00 AM:     12  
 676BC3464C5238DCCB33E719A5759ED4:    336   3/9/2013 10:58:00 PM:     12  
 7B4B3A68E8CC41CEA47AAA2295A8E61E:    336   3/9/2013 9:57:00 PM :     12  
 (Other)                         :1701837   (Other)             :1703914  
  pickUp_Hour       dropOff_Hour        pickUp_D

In [39]:
# Write the feature table to disk. Row names are written as well, which is
# where the extra index column ("X") in the CSV comes from.
write.csv(x = export, file = "trip_DateTimeFeatures.csv")

## Info Posted for the Features of "trip_DateTimeFeatures.csv"

zipped CSV file available here:  
https://drive.google.com/open?id=0B5IBlGlKA58cVUhUX3JRdmdFN3c


The CSV file contains 1,703,987 observations with 21 feature columns and an index column ("X") unhelpfully provided by Rstudio when I wrote the data to a CSV file.  The file contains only date/time features created from the pickup and dropoff information in the Data Science 450 capstone NYC taxi trip data ("NYC_Taxi_2013_One_Percent_Trip.CSV").  Features were not created from the pickup information in the NYC taxi fare data ("NYC_Taxi_2013_One_Percent_Fare.CSV") because, line for line in the original CSV files, the pickup date and time information, as well as the medallion and hack license information, are identical.  



---------------EXPLANATIONS BY FEATURE---------------

<b>"X"</b> - index created by 'write.csv' function in Rstudio

<b>"medallion"</b> - original feature from "NYC_Taxi_2013_One_Percent_Trip"

<b>"hack_license"</b> - original feature from "NYC_Taxi_2013_One_Percent_Trip"

<b>"pickup_datetime"</b> - original feature from "NYC_Taxi_2013_One_Percent_Trip"

<b>"pickUp_Hour"</b> - the hour "bin" in which the trip pickup occurred, in a 24-hour format ranging from 00 (for pickups between midnight and 1:00 AM) to 23 (for pickups between 11:00 PM and midnight). This feature was written and saved as a factor with levels labelled "00" to "23", but when this CSV file is read into R, the values tend to be converted to integers 0-23, losing this formatting.

<b>"dropOff_Hour"</b> - the hour "bin" in which the trip dropoff occurred, in the same 24-hour format used for "pickUp_Hour" 

<b>"pickUp_Date"</b> - the date of the trip pickup in the format "%m/%d/%Y"

<b>"dropOff_Date"</b> - the date of the trip dropoff in the format "%m/%d/%Y" 

<b>"pickUp_Month"</b> - the month number of the trip pickup.  In R this can be converted to the month name as an ordered factor using (from base package) "DATAFRAME\$pickUp_Month <- month.abb[DATAFRAME\$pickUp_Month]". The result of this is a vector of character (string) class, which can then be converted to a factor with or without ordered levels.  To convert this feature directly to a factor with ordered levels of month abbreviations use package lubridate and its function "month": "DATAFRAME\$pickUp_Month <- month(ymd(010101) + months(DATAFRAME\$pickUp_Month - 1), label=T, abbr=T)"

<b>"dropOff_Month"</b> - the month number of the trip dropoff.  This can be converted in R using the same approaches described for "pickUp_Month"

<b>"pickUp_DayOfMonth"</b> - the day number of the original pickup date/time (eg. "3" in "1/3/2013")

<b>"dropOff_DayOfMonth"</b> -   the day number of the original dropoff date/time (eg. "3" in "1/3/2013")

<b>"pickUp_Day"</b> - the abbreviated day of the week of the trip pickup ("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", and "Sun")

<b>"dropOff_Day"</b> - the abbreviated day of the week of the trip dropoff in the same format as "pickUp_Day" 

<b>"pickUp_isWeekend"</b> - binary feature indicating whether the trip pickup occurred between 00:00-23:59, inclusive, on a Saturday or Sunday (value "1"), or not ("0").  

<b>"dropOff_isWeekend"</b> - binary feature indicating whether the trip dropoff occurred on a Saturday or Sunday, defined and formatted in the same manner as "pickUp_isWeekend"

<b>"pickUp_isHoliday"</b> - binary feature indicating whether the trip pickup occurred between 00:00-23:59, inclusive, on a holiday (value "1"), or not ("0").  The holidays covered by this data (the federal holidays plus Mother's Day, Father's Day, and the day after Thanksgiving) were:
<ul>        <li>New Years Day               1/1/2013</li>
			<li>MLK Day                     1/21/2013</li>
			<li>Presidents' Day             2/18/2013</li>
			<li>Mother's Day                5/12/2013</li>
			<li>Memorial Day                5/27/2013</li>
            <li>Father's Day                6/16/2013</li>
			<li>Independence Day            7/4/2013</li>
			<li>Labor Day                   9/2/2013</li>
			<li>Columbus Day                10/4/2013</li>
			<li>Veterans' Day               11/11/2013</li>
			<li>Thanksgiving                11/28/2013</li>
            <li>day after Thanksgiving      11/29/2013</li>
			<li>Christmas Day               12/25/2013</li>
            </ul>
            
<b>"dropOff_isHoliday"</b> - binary feature indicating whether the trip dropoff occurred on one of the holidays listed above, defined and formatted in the same manner as "pickUp_isHoliday" 

<b>"pickUp_TimeOfDay.quad"</b> - factor with four levels indicating the time of day of the trip pickup based on the following level/time mappings.  The divisions in the 24-hour day were made to roughly approximate the human workday cycle rather than to divide the 24-hour period into equal time spans.  
<ul>        <li>"morning"           05:00-10:59</li>
            <li>"afternoon"         11:00-16:59</li>
            <li>"evening"           17:00-22:59</li>
            <li>"night"             23:00-23:59 and 00:00-04:59</li>
</ul>

<b>"dropOff_TimeOfDay.quad"</b> - factor with four levels indicating the time of day of the trip dropoff, based on the same level/time mappings used for "pickUp_TimeOfDay.quad"

<b>"pickUp_TimeOfDay.sept"</b> - factor with seven levels indicating the time of day of the trip pickup based on the following level/time mappings. These divisions were made to roughly approximate the human workday cycle rather than to divide the 24-hour period into equal time spans.
<ul>        <li>"earlyMorning"      04:00-06:59</li>
            <li>"morning"           07:00-10:59</li>
            <li>"midday"            11:00-13:59</li>
            <li>"afternoon"         14:00-17:59</li>
            <li>"evening"           18:00-20:59</li>
            <li>"night"             21:00-23:59</li>
            <li>"lateNight"         00:00-03:59</li>
</ul>

<b>"dropOff_TimeOfDay.sept"</b> - factor with seven levels indicating the time of day of the trip dropoff, based on the same level/time mappings used for "pickUp_TimeOfDay.sept"