In [9]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import swifter
import pickle
import re

In [15]:
riders = pd.read_csv('../data/output.csv',\
                     parse_dates = ['starttime', 'stoptime'],\
                     dtype = {'start station id': 'Int32', 'end station id': 'Int32'})

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
riders.head(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,362,2017-09-01 00:00:17,2017-09-01 00:06:19,3331,Riverside Dr & W 104 St,40.801343,-73.971146,3328,W 100 St & Manhattan Ave,40.795,-73.9645,14530,Subscriber,1993,1
1,188,2017-09-01 00:00:21,2017-09-01 00:03:30,3101,N 12 St & Bedford Ave,40.720798,-73.954847,3100,Nassau Ave & Newell St,40.724813,-73.947526,15475,Subscriber,1988,1
2,305,2017-09-01 00:00:25,2017-09-01 00:05:30,3140,1 Ave & E 78 St,40.771404,-73.953517,3141,1 Ave & E 68 St,40.765005,-73.958185,30346,Subscriber,1969,1
3,223,2017-09-01 00:00:52,2017-09-01 00:04:36,236,St Marks Pl & 2 Ave,40.728419,-73.98714,473,Rivington St & Chrystie St,40.721101,-73.991925,28056,Subscriber,1993,1
4,758,2017-09-01 00:01:01,2017-09-01 00:13:40,3427,Lafayette St & Jersey St,40.724305,-73.99601,3431,E 35 St & 3 Ave,40.746524,-73.977885,25413,Subscriber,1987,1


In [18]:
def cleaning_ridersdata(df):
    df.starttime.astype('M8[us]')
    df.stoptime.astype('M8[us]')
    
    
    # create year column
    df['year'] = df.starttime.dt.year
    print('completed year column')
    
    # create date column
    df['start_date'] = df.starttime.dt.date
    df['stop_date'] = df.stoptime.dt.date
    print('completed date column')
    
    # create hour column
    df['start_hour'] = df.starttime.dt.hour
    df['stop_hour'] = df.stoptime.dt.hour
    print('completed hour column')
    
    # create minutes column
    df['start_min'] = df.starttime.dt.minute
    df['stop_min'] = df.starttime.dt.minute
    print('completed minutes column')
    
    # For easier computation and analysis, we decided to round the minutes by 20 minutes time frame
    df['start_min'] = df['start_min'].apply(lambda x: '00' if x < 20 else '20' if x < 40 else '40')
    df['stop_min'] = df['stop_min'].apply(lambda x: '00' if x < 20 else '20' if x < 40 else '40')
    
    
    # create season column
    df['season'] = df.starttime.dt.month.apply(lambda x: 'winter' if x <= 2 else 'spring' if x <= 5 else \
    'summer' if x <= 8 else 'fall' if x <= 11 else 'winter')
    print('completed season column')
    
    # create day of week column
    df['dayofweek'] = df['starttime'].dt.weekday.apply(lambda x: 'Monday' if x == 0 else 'Tuesday' if x==1 else \
    'Wednesday'if x == 2 else 'Thursday' if x == 3 else 'Friday' if x == 4 else 'Saturday' if x == 5 else 'Sunday')
    print('completed week column')
    
    # create interval column
    df['start_interval'] = df.apply(lambda x: str(x['start_hour']) + ":" + str(x['start_min']), axis=1)
    df['stop_interval'] = df.apply(lambda x: str(x['stop_hour']) + ":" + str(x['stop_min']), axis=1)
    

In [None]:
cleaning_ridersdata(riders)

In [7]:
riders.head(10)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,start_date,stop_date,start_hour,stop_hour,start_min,stop_min,season,dayofweek,start_interval,stop_interval
0,362,2017-09-01 00:00:17,2017-09-01 00:06:19,3331,Riverside Dr & W 104 St,40.801343,-73.971146,3328,W 100 St & Manhattan Ave,40.795,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
1,188,2017-09-01 00:00:21,2017-09-01 00:03:30,3101,N 12 St & Bedford Ave,40.720798,-73.954847,3100,Nassau Ave & Newell St,40.724813,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
2,305,2017-09-01 00:00:25,2017-09-01 00:05:30,3140,1 Ave & E 78 St,40.771404,-73.953517,3141,1 Ave & E 68 St,40.765005,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
3,223,2017-09-01 00:00:52,2017-09-01 00:04:36,236,St Marks Pl & 2 Ave,40.728419,-73.98714,473,Rivington St & Chrystie St,40.721101,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
4,758,2017-09-01 00:01:01,2017-09-01 00:13:40,3427,Lafayette St & Jersey St,40.724305,-73.99601,3431,E 35 St & 3 Ave,40.746524,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
5,2089,2017-09-01 00:01:20,2017-09-01 00:36:09,3016,Kent Ave & N 7 St,40.720368,-73.961651,3358,Garfield Pl & 8 Ave,40.671198,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
6,121,2017-09-01 00:01:22,2017-09-01 00:03:23,3357,W 106 St & Amsterdam Ave,40.800836,-73.966449,3343,W 107 St & Columbus Ave,40.799757,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
7,408,2017-09-01 00:01:39,2017-09-01 00:08:27,470,W 20 St & 8 Ave,40.743453,-74.00004,388,W 26 St & 10 Ave,40.749718,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
8,485,2017-09-01 00:01:41,2017-09-01 00:09:46,513,W 56 St & 10 Ave,40.768254,-73.988639,529,W 42 St & 8 Ave,40.75757,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00
9,220,2017-09-01 00:01:40,2017-09-01 00:05:21,405,Washington St & Gansevoort St,40.739323,-74.008119,358,Christopher St & Greenwich St,40.732916,...,2017-09-01,2017-09-01,0,0,0,0,fall,Friday,0:00,0:00


In [14]:
riders.to_csv("../Data/riders_cleaned.csv")

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+------------+--------+----------+
| Column     | Found  | Expected |
+------------+--------+----------+
| birth year | object | float64  |
+------------+--------+----------+

The following columns also raised exceptions on conversion:

- birth year
  ValueError("could not convert string to float: '\\\\N'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'birth year': 'object'}

to the call to `read_csv`/`read_table`.

In [11]:
# A function that creates a dataframe that aggregates a bike demands in a hour interval
def create_outgoing_rides(df):
    with ProgressBar():
        outgoing_rides = df.groupby(['start station id','start_date','season','dayofweek','start_hour'])\
        [['starttime']].count().reset_index().rename(columns={'starttime':'outgoing_bike_count'}).compute(scheduler="processes")
    
        outgoing_rides['bike_demand']=outgoing_rides.swifter.apply(lambda x: 'High' if x['outgoing_bike_count']> outgoing_rides.outgoing_bike_count.describe()['75%'] else \
        'Medium' if x['outgoing_bike_count']>outgoing_rides.outgoing_bike_count.describe()['25%'] else 'Low', axis=1)

IndentationError: expected an indented block (<ipython-input-11-6778a0b27894>, line 4)

In [None]:
# A function that creates a dataframe that aggregates a dock demands in a hour interval
def create_incoming_rides(df):
    with ProgressBar():
        incoming_rides = df.groupby(['end station id','stoptime_date','season','dayofweek','stoptime_interval'])\
        [['stoptime']].count().reset_index().rename(columns={'stoptime':'incoming_bike_count'}).compute(scheduler="processes")
        incoming_rides['dock_demand']=incoming_rides.swifter.apply(lambda x: 'High' if x['incoming_bike_count']>4 else \
        'Medium' if x['incoming_bike_count']>1 else 'Low', axis=1) # 75 percentile is 4 and 25 percentile is 1.
    
    