In [3]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import swifter
import pickle
import re

In [4]:
riders = pd.read_csv('../data/output.csv',\
                     usecols = ['tripduration','starttime','stoptime','start station id', 'start station name',\
                               'end station id', 'bikeid'],\
                     parse_dates = ['starttime', 'stoptime'],\
                     dtype = {'start station id': 'Int32', 'end station id': 'Int32'})

In [5]:
riders.head(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,bikeid
0,362,2017-09-01 00:00:17,2017-09-01 00:06:19,3331,Riverside Dr & W 104 St,3328,14530
1,188,2017-09-01 00:00:21,2017-09-01 00:03:30,3101,N 12 St & Bedford Ave,3100,15475
2,305,2017-09-01 00:00:25,2017-09-01 00:05:30,3140,1 Ave & E 78 St,3141,30346
3,223,2017-09-01 00:00:52,2017-09-01 00:04:36,236,St Marks Pl & 2 Ave,473,28056
4,758,2017-09-01 00:01:01,2017-09-01 00:13:40,3427,Lafayette St & Jersey St,3431,25413


In [6]:
def cleaning_ridersdata(df):
    
    """This function will clean the riders data.
    The data cleaning function contains several new features.
    """
    
    df.starttime.astype('M8[us]')
    df.stoptime.astype('M8[us]')
    
    # Drop missing values
    df = df.dropna(subset=['start station id', 'end station id']).reset_index()
    
    # create year column
    df['year'] = df.starttime.dt.year
    print('completed year column')
    
    # create date column
    df['start_date'] = df.starttime.dt.date
    df['stop_date'] = df.stoptime.dt.date
    print('completed date column')
    
    # create hour column
    df['start_hour'] = df.starttime.dt.hour
    df['stop_hour'] = df.stoptime.dt.hour
    print('completed hour column')
    
    # create minutes column
    df['start_min'] = df.starttime.dt.minute
    df['stop_min'] = df.starttime.dt.minute
    print('completed minutes column')
    
    # For easier computation and analysis, we decided to round the minutes by 20 minutes time frame
    df['start_min'] = df['start_min'].apply(lambda x: '00' if x < 20 else '20' if x < 40 else '40')
    df['stop_min'] = df['stop_min'].apply(lambda x: '00' if x < 20 else '20' if x < 40 else '40')
    
    
    # create season column
    df['season'] = df.starttime.dt.month.apply(lambda x: 'winter' if x <= 2 else 'spring' if x <= 5 else \
    'summer' if x <= 8 else 'fall' if x <= 11 else 'winter')
    print('completed season column')
    
    # create day of week column
    df['dayofweek'] = df['starttime'].dt.weekday.apply(lambda x: 'Monday' if x == 0 else 'Tuesday' if x==1 else \
    'Wednesday'if x == 2 else 'Thursday' if x == 3 else 'Friday' if x == 4 else 'Saturday' if x == 5 else 'Sunday')
    print('completed week column')
    
    # create interval column
    df['start_interval'] = df.apply(lambda x: str(x['start_hour']) + ":" + str(x['start_min']), axis=1)
    df['stop_interval'] = df.apply(lambda x: str(x['stop_hour']) + ":" + str(x['stop_min']), axis=1)
    

In [27]:
riders.to_csv("../Data/riders_cleaned.csv")

In [None]:
cleaning_ridersdata(riders)

completed year column
completed date column
completed hour column
completed minutes column
completed season column
completed week column


In [8]:
riders = pd.read_csv('../data/riders_cleaned.csv.gz',\
                    usecols = ['tripduration','starttime','stoptime','start station id', 'start station name',\
                               'end station id', 'bikeid', 'start_date', 'stop_date', 'start_hour', 'season',\
                              'stop_hour', 'start_min', 'stop_min','dayofweek'],\
                     dtype = {'start station id': 'Int32', 'end station id': 'Int32'})

In [5]:
riders.dtypes

tripduration           int64
starttime             object
stoptime              object
start station id       Int32
start station name    object
end station id         Int32
bikeid                 int64
start_date            object
stop_date             object
start_hour             int64
stop_hour              int64
start_min              int64
stop_min               int64
dtype: object

In [9]:
# A function that creates a dataframe that aggregates a bike demands in a hour interval
def create_outgoing_rides(df):
    outgoing_rides = df.groupby(['start station id','start_date','season','dayofweek','start_hour'])\
    [['starttime']].count().reset_index().rename(columns={'starttime':'outgoing_bike_count'})
        
    print('completed first job')
        
    return outgoing_rides

In [11]:
outgoing = create_outgoing_rides(riders)

completed first job


In [12]:
outgoing

Unnamed: 0,start station id,start_date,season,dayofweek,start_hour,outgoing_bike_count
0,72,2013-06-01,summer,Saturday,0,1
1,72,2013-06-01,summer,Saturday,7,1
2,72,2013-06-01,summer,Saturday,8,2
3,72,2013-06-01,summer,Saturday,10,1
4,72,2013-06-01,summer,Saturday,11,1
...,...,...,...,...,...,...
22289226,4249,2020-11-30,fall,Monday,17,1
22289227,4249,2020-11-30,fall,Monday,18,1
22289228,4249,2020-11-30,fall,Monday,19,2
22289229,4249,2020-11-30,fall,Monday,20,2


In [13]:
outgoing['bike_demand'] = outgoing.apply(lambda x: 'High' if x['outgoing_bike_count'] > outgoing.outgoing_bike_count.describe()['75%'] else \
'Medium' if x['outgoing_bike_count']>outgoing.outgoing_bike_count.describe()['25%'] else 'Low', axis=1)

KeyboardInterrupt: 

In [None]:
outgoing.to_csv("../Data/outgoing.csv")

In [None]:
# A function that creates a dataframe that aggregates a dock demands in a hour interval
def create_incoming_rides(df):
    incoming_rides = df.groupby(['end station id','stoptime_date','season','dayofweek','stop_hour'])\
    [['stoptime']].count().reset_index().rename(columns={'stoptime':'incoming_bike_count'})
        
    print('completed first job')
    
    return incoming_rides

In [None]:
incoming = create_incoming_rides(riders)

In [None]:
incoming['dock_demand'] = incoming.apply(lambda x: 'High' if x['incoming_bike_count'] > incoming.incoming_bike_count.describe()['75%'] else \
'Medium' if x['incoming_bike_count'] > incoming.incoming_bike_count.describe()['25%'] else 'Low', axis = 1)
        

In [None]:
incoming.to_csv("../Data/incoming.csv")