In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessMonthBegin
import datetime
from math import radians, cos, sin, asin, sqrt
def haversine(df):
    """
    Calculate the great circle distance, in miles, between the start and
    end station of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'start station longitude',
        'start station latitude', 'end station longitude' and
        'end station latitude', with coordinates in decimal degrees.

    Returns
    -------
    pandas.Series
        One distance in miles per row, indexed like ``df`` so the result
        lines up with the input rows when assigned back as a column.

    Notes
    -----
    ``df`` is left unmodified: the degree-to-radian conversion happens on
    temporary arrays rather than being written back into the frame.
    Rows with missing coordinates yield NaN.
    """
    # Radius of earth in miles is 3958.8
    earth_radius_miles = 3958.8

    # convert decimal degrees to radians (np.radians returns new arrays,
    # so the caller's coordinate columns are never overwritten)
    start_lon = np.radians(df['start station longitude'].to_numpy(dtype=float))
    start_lat = np.radians(df['start station latitude'].to_numpy(dtype=float))
    end_lon = np.radians(df['end station longitude'].to_numpy(dtype=float))
    end_lat = np.radians(df['end station latitude'].to_numpy(dtype=float))

    # haversine formula, evaluated for all rows at once
    dlon = end_lon - start_lon
    dlat = end_lat - start_lat
    a = np.sin(dlat / 2) ** 2 + np.cos(start_lat) * np.cos(end_lat) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal or
    # identical points; clip so sqrt/arcsin stay inside their domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return pd.Series(earth_radius_miles * c, index=df.index)



In [2]:
# Location of the downsampled rider CSV (the .gz suffix lets pandas infer the compression).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
RIDER_CSV_PATH = r'C:\Users\mmotd\OneDrive\Documents\Boot Camp Files\Capstone\downsampled_rider.csv.gz'
rider_df = pd.read_csv(RIDER_CSV_PATH)

In [3]:
def feature_engineering(rider_df):
    """
    Add weekend, trip-distance and federal-holiday features to the rider trips.

    Parameters
    ----------
    rider_df : pandas.DataFrame
        Trip records. The columns 'dayofweek', 'start_date',
        'start station longitude', 'start station latitude',
        'end station longitude' and 'end station latitude' are read.
        The columns 'Unnamed: 0' and 'Sample_num' are dropped when present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with:
        * 'weekend'    - 1 when 'dayofweek' is 'Saturday' or 'Sunday', else 0
        * 'miles'      - great-circle trip distance from ``haversine``
        * 'start_date' - converted to datetime64
        * 'holidays'   - True when the trip's calendar day is a US federal holiday
    """
    # drop() returns a new frame, so the caller's frame is left untouched.
    # errors='ignore' keeps the function usable on frames that have already
    # lost these bookkeeping columns (e.g. when the cell is re-run).
    rider_df = rider_df.drop(columns=['Unnamed: 0', 'Sample_num'], errors='ignore')

    # Built directly from the column so the result shares the frame's index;
    # a list-built Series gets a fresh 0..n-1 index and can misalign.
    rider_df['weekend'] = rider_df['dayofweek'].isin(['Saturday', 'Sunday']).astype('int64')

    coordinate_columns = ['start station longitude', 'start station latitude',
                          'end station longitude', 'end station latitude']
    # Pass an explicit copy so nothing haversine does can leak into rider_df,
    # and assign positionally so the row order, not the index labels, decides
    # which distance goes to which trip.
    rider_df['miles'] = haversine(rider_df[coordinate_columns].copy()).to_numpy()

    rider_df['start_date'] = pd.to_datetime(rider_df['start_date'])
    # Holidays are whole days, so compare on the date with time-of-day removed.
    trip_days = rider_df['start_date'].dt.normalize()

    if trip_days.notna().any():
        # USFederalHolidayCalendar.holidays() returns the actual (observed)
        # holiday dates. A CustomBusinessMonthBegin date_range would instead
        # yield the first business day of every month.
        holiday_days = USFederalHolidayCalendar().holidays(start=trip_days.min(),
                                                           end=trip_days.max())
        rider_df['holidays'] = trip_days.isin(holiday_days)
    else:
        # No usable dates (empty frame or all NaT): nothing can be a holiday.
        rider_df['holidays'] = False

    return rider_df

In [4]:
# Rebind rider_df to whatever feature_engineering returns for the loaded frame.
# NOTE(review): this overwrites the raw frame, so re-running this cell feeds
# the previous result back in — re-run the load cell first for a clean start.
rider_df = feature_engineering(rider_df)