In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessMonthBegin
import datetime
from math import radians, cos, sin, asin, sqrt
def haversine(df):
    """
    Calculate the great circle distance, in miles, between the start and
    end station of every row (coordinates specified in decimal degrees).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'start station longitude',
        'start station latitude', 'end station longitude' and
        'end station latitude'.

    Returns
    -------
    pandas.Series
        One distance per row, carrying the same index as ``df`` so it can
        be assigned straight back onto the source frame.

    Notes
    -----
    The computation is vectorised and works on separate arrays, so ``df``
    itself is never modified (the coordinates stay in degrees).
    """
    earth_radius_miles = 3958.8

    # convert decimal degrees to radians (on fresh arrays, not on df)
    start_lon = np.radians(df['start station longitude'].to_numpy(dtype=float))
    start_lat = np.radians(df['start station latitude'].to_numpy(dtype=float))
    end_lon = np.radians(df['end station longitude'].to_numpy(dtype=float))
    end_lat = np.radians(df['end station latitude'].to_numpy(dtype=float))

    # haversine formula
    dlon = end_lon - start_lon
    dlat = end_lat - start_lat
    a = np.sin(dlat / 2) ** 2 + np.cos(start_lat) * np.cos(end_lat) * np.sin(dlon / 2) ** 2
    # Floating point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return pd.Series(earth_radius_miles * c, index=df.index)



In [2]:
# Read the rider CSV, parsing the four timestamp columns into datetimes.
# NOTE(review): the path is absolute and specific to one machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
rider_df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\mmotd\OneDrive\Documents\Boot Camp Files\Capstone\downsampled_rider.csv.gz',
    parse_dates=['starttime', 'stoptime', 'start_date', 'stop_date'],
)

In [3]:
def data_cleaning(rider_df):
    """
    Build the analysis table from the raw ride records.

    Adds a 'miles' column (haversine distance between the start and end
    station) and a constant 'counts' column of 1, removes the columns
    'Unnamed: 0' and 'Sample_num' together with the four coordinate
    columns, and uses 'starttime' as the index.

    Parameters
    ----------
    rider_df : pandas.DataFrame
        Raw rides; must contain the coordinate columns, 'Unnamed: 0',
        'Sample_num' and 'starttime'.

    Returns
    -------
    pandas.DataFrame
        A new frame. The frame passed in is left untouched, so the raw data
        stays available to the caller.
    """
    coordinate_columns = ['start station longitude', 'start station latitude',
                          'end station longitude', 'end station latitude']

    # Give haversine its own copy of the coordinates and take the distances
    # by position, so they line up row-for-row whatever index the input has.
    miles = haversine(rider_df[coordinate_columns].copy()).to_numpy()

    return (
        rider_df
        .assign(miles=miles, counts=1)
        .drop(columns=['Unnamed: 0', 'Sample_num'] + coordinate_columns)
        .set_index('starttime')
    )

In [4]:
# Rebind rider_df to the cleaned, starttime-indexed table.
# Running this cell a second time fails: the cleaned frame no longer has the
# coordinate columns that data_cleaning reads, so reload the CSV first.
rider_df = data_cleaning(rider_df=rider_df)

In [81]:
def feature_engineering(rider_df):
    """
    Add calendar features derived from the 'starttime' column.

    Columns written (in this order):

    * 'dayofweek'  - 0 (Monday) .. 6 (Sunday)
    * 'weekend'    - 1 for Saturday/Sunday, else 0
    * 'start_date' - ``datetime.date`` of the start time
    * 'holidays'   - True when the start date is a US federal holiday
    * 'month'      - zero-padded month string, '01' .. '12'
    * 'quarter'    - 'Q1' .. 'Q4'
    * 'summer_dst' - 1 when US/Eastern is on daylight saving time that day
    * 'winter_dst' - 1 when US/Eastern is on standard time that day
    * 'not_dst'    - 1 when neither of the two flags above is set
    * 'dayofyear'  - 1 .. 366
    * 'year'       - calendar year

    Parameters
    ----------
    rider_df : pandas.DataFrame
        Must contain a 'starttime' column of tz-naive timestamps; they are
        interpreted as US/Eastern wall-clock time for the DST flags.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added
        (the frame is modified in place).

    Notes
    -----
    Every feature is computed with index-preserving vectorised operations,
    so the frame does not need a default RangeIndex.
    """
    starttime = pd.to_datetime(rider_df['starttime'])
    midnight = starttime.dt.normalize()

    rider_df['dayofweek'] = starttime.dt.dayofweek
    # dayofweek is numeric (Monday=0), so the weekend is 5 and 6
    rider_df['weekend'] = (rider_df['dayofweek'] >= 5).astype(int)
    rider_df['start_date'] = starttime.dt.date

    # Federal holidays for the span covered by the data
    known_days = midnight.dropna()
    if known_days.empty:
        holiday_dates = pd.DatetimeIndex([])
    else:
        holiday_dates = USFederalHolidayCalendar().holidays(
            start=known_days.min(), end=known_days.max())
    rider_df['holidays'] = midnight.isin(holiday_dates)

    rider_df['month'] = starttime.dt.strftime('%m')
    rider_df['quarter'] = starttime.dt.quarter.map({1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'})

    # Distinguish winter (standard) time from summer (daylight saving) time.
    # The US/Eastern UTC offset at local midnight is -5h in winter and -4h in
    # summer; a changeover day is classified by the regime it starts in.
    # Midnight is never skipped or repeated in US/Eastern, so localising it
    # is always unambiguous.
    local_midnight = midnight.dt.tz_localize('US/Eastern')
    utc_offset_hours = (
        midnight.dt.tz_localize('UTC') - local_midnight.dt.tz_convert('UTC')
    ) / pd.Timedelta(hours=1)
    rider_df['summer_dst'] = (utc_offset_hours == -4).astype(int)
    rider_df['winter_dst'] = (utc_offset_hours == -5).astype(int)
    # 0 means the row falls in one of the two regimes, 1 means it falls in
    # neither (only possible when the start time is missing).
    rider_df['not_dst'] = 1 - (rider_df['summer_dst'] | rider_df['winter_dst'])

    rider_df['dayofyear'] = starttime.dt.dayofyear
    rider_df['year'] = starttime.dt.year
    return rider_df

In [103]:
def time_agg(df, variable, frequency, aggfunc='count'):
    """
    Aggregate one column over fixed time periods and add calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index; the periods are formed from the
        index because the grouper is given a frequency but no key.
    variable : str
        Name of the column to aggregate.
    frequency : str
        Offset alias for the period length, passed to ``pd.Grouper``.
    aggfunc : str or callable, default 'count'
        Aggregation applied to ``variable`` within each period. The default
        counts the non-null values; pass e.g. 'sum' to total a measure such
        as mileage.

    Returns
    -------
    pandas.DataFrame
        One row per period, holding the period label column, the aggregated
        ``variable`` and the columns added by ``feature_engineering``.
    """
    per_period = df.groupby(pd.Grouper(freq=frequency))[variable].agg(aggfunc)
    # reset_index turns the period labels back into a regular column, which
    # feature_engineering reads from.
    agg_df = per_period.reset_index()
    return feature_engineering(agg_df)

In [104]:
# Number of rides per month, with the calendar features attached.
monthly_rides = time_agg(df=rider_df, variable='counts', frequency='m')

In [110]:
# Monthly ride counts, one coloured area per year.
px.area(
    data_frame=monthly_rides,
    x='month',
    y='counts',
    color='year',
    title='Area Plot of Rides per Year',
)

In [106]:
# Total miles ridden per month, with the calendar features attached.
# The mileage has to be summed: time_agg counts rows, which would only give
# the number of rides that have a mileage value, not the distance covered.
monthly_miles = feature_engineering(
    rider_df.groupby(pd.Grouper(freq='m'))['miles'].sum().reset_index()
)

In [109]:
# Monthly 'miles' values, one coloured area per year.
px.area(
    data_frame=monthly_miles,
    x='month',
    y='miles',
    color='year',
    title='Area Plot of Miles per Year',
)