In [1]:
#Imports
import pandas as pd
from zipfile import ZipFile
from StringIO import StringIO
import urllib2
import numpy as np
import datetime
from scipy.stats import chisquare

In [2]:
# Pull down Citibike data into memory. 
ride_frames = []

# A list of months, only valid for 01-06
months = ['04']#['01', '02', '03', '04', '05', '06']
for month in months:
    url = "https://s3.amazonaws.com/tripdata/2015%s-citibike-tripdata.zip" % month
    r = urllib2.urlopen(url).read()
    f = ZipFile(StringIO(r))
    csv = f.open("2015%s-citibike-tripdata.csv" % month)
    frame = pd.read_csv(csv, parse_dates=['starttime'])
    ride_frames.append(frame)
    
# Concatenate all the individual months into one.
rides = pd.concat(ride_frames, ignore_index=True)

In [3]:
# Add separate day and date attributes to the dataframe
rides['day'] = rides['starttime'].dt.dayofweek
rides['date'] = rides['starttime'].dt.dayofyear

In [4]:
# Create separate variables for the weekend and weekday rides.
weekends = rides[(rides.day == 5) | (rides.day == 6)] 
weekdays = rides[(rides.day != 5) & (rides.day != 6)]

# Get the number of weekends and weekdays in the sample to normalize the values
num_weekends = len(np.unique(weekends['date']))
num_weekdays = len(np.unique(weekdays['date']))

# Get the average number of rides for customers/subscribers on weekdays/weekends
custWknds = float(len(weekends[weekends.usertype == 'Customer']))/num_weekends
subWknds = float(len(weekends[weekends.usertype == 'Subscriber']))/num_weekends
custWkday = float(len(weekdays[weekdays.usertype == 'Customer']))/num_weekdays
subWkday = float(len(weekdays[weekdays.usertype == 'Subscriber']))/num_weekdays

In [6]:
# Test whether the total customers on weekends is different than weekday
# The expected value for the weekday is the weekday count and the expected value for the weekend is
# The proportion of customers on weekdays times the total number of rides on weekends.

# Get the average total number of rides for weekends and weekdays
totalWknds = custWknds + subWknds
totalWkday = custWkday + subWkday

# Get the percent of rides that are made by customers on weekdays
custWkdayPercent = custWkday/totalWkday

# The sample values are the customer rides for weekday/weekend
sample = [custWkday, custWknds]

# Make the expected value for the weekend the customer weekday percent
# scaled to the total number of weekend rides
expected = [custWkday, totalWknds*custWkdayPercent]

chisquare(sample, expected)

(7229.5300960536615, 0.0)