In [60]:
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from pandas import to_datetime


In [52]:
def get_tripdata_year(year):
    """
    Fetch and convert every monthly trip file of one year.

    Calls ``get_tripdata(year, month)`` once for each calendar month,
    January (1) through December (12), in ascending order.
    """
    for month in range(1, 13):
        get_tripdata(year, month)

In [58]:
def get_tripdata(year,month):
    """
    Download one month of trip data, convert it to parquet and clean up.

    The archive ``<year><month as two digits>-citibike-tripdata.csv.zip`` is
    fetched from the tripdata S3 bucket with ``wget`` and extracted with
    ``unzip`` into the current working directory. The extracted csv is handed
    to ``preparefile`` and afterwards both the zip and the csv are deleted.
    Assumes the archive contains a csv named like the archive without its
    ``.zip`` suffix.

    Parameters
    ----------
    year : int
        Year of the data set, e.g. 2019.
    month : int
        Calendar month; it is zero-padded to two digits in the file name.

    Raises
    ------
    subprocess.CalledProcessError
        If ``wget`` or ``unzip`` exits with a non-zero status.
    """
    ym = f"{year}{month:02d}"
    filename = ym + '-citibike-tripdata.csv'
    zipname = filename + '.zip'
    print(filename)
    try:
        # Argument lists (no shell) plus check=True: a failed download or
        # extraction stops here instead of surfacing later as a confusing
        # error while reading a csv that was never created.
        subprocess.run(['wget', 'https://s3.amazonaws.com/tripdata/' + zipname], check=True)
        subprocess.run(['unzip', zipname], check=True)
        preparefile(filename)
    finally:
        # Remove the temporary files even when one of the steps above failed,
        # so an aborted run does not leave large downloads behind.
        for leftover in (zipname, filename):
            if os.path.exists(leftover):
                os.remove(leftover)

In [55]:
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """
    Calculate the great-circle distance between two coordinates.

    Latitude and longitude are given in decimal degrees. All arithmetic is
    done with elementwise numpy functions, so scalars, numpy arrays and
    pandas Series can be passed.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees
    lat2, lon2 : latitude and longitude of the second point, in degrees
    radius : radius of the sphere; the result is in the same unit. The
        default 3959 is approximately the Earth's mean radius in miles,
        so by default the distance is returned in miles.

    Returns
    -------
    Distance along the sphere's surface, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0
    h = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    # Floating-point rounding can push h marginally above 1 for (nearly)
    # antipodal points, which would make arcsin return NaN; clamp it.
    h = np.minimum(h, 1.0)

    return radius * (2 * np.arcsin(np.sqrt(h)))

In [64]:
def preparefile(filename):
    """
    Clean and convert a citibike tripdata csv file to parquet.

    Reads ``filename`` with pandas, parses the start/stop timestamps, adds
    the derived columns ``tripminutes``, ``start hour``, ``start day``,
    ``weekday``, ``month``, ``year``, ``weekend``, ``age`` and ``distance``,
    converts several columns to category / nullable-integer dtypes and
    writes the result next to the csv with the extension ``.parquet``.

    Parameters
    ----------
    filename : str
        Path of the csv file to convert.

    Returns
    -------
    str
        Path of the parquet file that was written.

    Raises
    ------
    OSError
        If the csv file cannot be read; the message is printed first.
    """
    try:
        df = pd.read_csv(filename)
    except IOError as e:
        # Report and re-raise. A bare `exit` here is only a name lookup, not
        # a call, so execution would fall through and fail on the unbound
        # `df` with a misleading UnboundLocalError.
        print(e)
        raise

    # convert start and end time columns to datetime
    df['starttime'] = pd.to_datetime(df['starttime'])
    df['stoptime'] = pd.to_datetime(df['stoptime'])
    starttime = df['starttime']

    # convert trip duration from seconds to whole minutes (floor division)
    df['tripminutes'] = df['tripduration'] // 60

    # extract date and time components, convert to category
    df['start hour'] = starttime.dt.hour.astype('category')
    df['start day'] = starttime.dt.day.astype('category')
    df['weekday'] = starttime.dt.weekday.astype('category')  # day of week, Monday=0
    df['month'] = starttime.dt.month.astype('category')
    df['year'] = starttime.dt.year.astype('category')

    # create weekend column as True for Saturday (5) and Sunday (6);
    # vectorised comparison instead of a Python-level loop over every row
    df['weekend'] = starttime.dt.weekday >= 5

    # convert object columns to categories
    cols = ['start station name', 'end station name', 'bikeid', 'usertype', 'gender']
    for col in cols:
        df[col] = df[col].astype('category')

    # ensure station ids are integer (nullable Int64 because values may be missing)
    df['start station id'] = df['start station id'].astype('Int64')
    df['end station id'] = df['end station id'].astype('Int64')

    # Create age column, omitting rows w/o gender specified to avoid default 1969
    # and born before 1946. Age is taken relative to the year of the trip
    # itself rather than a fixed year, so files from any year give the
    # rider's age at the time of the ride.
    skip = (df['gender'] == 0) | (df['birth year'] < 1946)
    df['age'] = (starttime.dt.year - df['birth year']).mask(skip)

    # calculate distance between start and end stations
    df['distance'] = haversine(
        df['start station latitude'],
        df['start station longitude'],
        df['end station latitude'],
        df['end station longitude'],
    )

    # write file with same name and extension parquet; splitext strips only
    # the final extension, so dots elsewhere in the path are preserved
    parquetfile = os.path.splitext(filename)[0] + '.parquet'
    df.to_parquet(parquetfile)
    return parquetfile


In [66]:
# Process all twelve monthly trip files for 2019; each file name is printed as it is handled.
get_tripdata_year(2019)

201901-citibike-tripdata.csv
201902-citibike-tripdata.csv
201903-citibike-tripdata.csv
201904-citibike-tripdata.csv
201905-citibike-tripdata.csv
201906-citibike-tripdata.csv
201907-citibike-tripdata.csv
201908-citibike-tripdata.csv
201909-citibike-tripdata.csv
201910-citibike-tripdata.csv
201911-citibike-tripdata.csv
201912-citibike-tripdata.csv


In [67]:
# Process all twelve monthly trip files for 2020; each file name is printed as it is handled.
get_tripdata_year(2020)

202001-citibike-tripdata.csv
202002-citibike-tripdata.csv
202003-citibike-tripdata.csv
202004-citibike-tripdata.csv
202005-citibike-tripdata.csv
202006-citibike-tripdata.csv
202007-citibike-tripdata.csv
202008-citibike-tripdata.csv
202009-citibike-tripdata.csv
202010-citibike-tripdata.csv
202011-citibike-tripdata.csv
202012-citibike-tripdata.csv
