In [None]:
import pandas as pd
import numpy as np

In [None]:
# Source: MTA turnstile export whose file name ends in 201128.
TURNSTILE_URL = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_201128.txt'

# Read the comma-separated text file straight from the URL and preview it.
df = pd.read_csv(TURNSTILE_URL)
df.head()

In [None]:
# Combine the separate DATE and TIME text columns into a single timestamp.
# DATE and TIME are kept: the daily aggregation cell still groups by DATE.
df['DateTime'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'])

# Record what day of the week each entry is from.
df['Day'] = df['DateTime'].dt.day_name()

# SCP represents a specific turnstile.
# C/A represents a control area. This is a bank of turnstiles.
# UNIT represents a remote unit. Usually a whole station, or an area in a complex station.
# Combining all three gives a unique ID for any turnstile.
df['Unit_ID'] = df['SCP'] + ' ' + df['C/A'] + ' ' + df['UNIT']

# The EXITS header arrives padded with trailing spaces. Strip whitespace from
# every column name rather than matching one exact padded string, so the cell
# keeps working if the amount of padding changes and is safe to re-run.
df.columns = df.columns.str.strip()

# ENTRIES and EXITS are cumulative counters, so the traffic in one reporting
# interval is the difference between consecutive rows of the same turnstile.
# Sort by time first so the difference never depends on the file's row order;
# the result keeps the original index, so assignment realigns it with df.
readings_by_turnstile = df.sort_values('DateTime', kind='stable').groupby('Unit_ID')
df['Four Hour Entries'] = readings_by_turnstile['ENTRIES'].diff()
df['Four Hour Exits'] = readings_by_turnstile['EXITS'].diff()

# Interval counts above this are treated as impossible (e.g. a counter reset).
MAX_FOUR_HOUR_COUNT = 20000


def _replace_implausible(counts):
    """Replace missing, negative and oversized counts with the turnstile mean.

    Parameters
    ----------
    counts : pd.Series
        Interval counts for a single turnstile.

    Returns
    -------
    pd.Series
        Same index as ``counts``; every NaN, negative value or value above
        ``MAX_FOUR_HOUR_COUNT`` is replaced by the mean of the remaining
        values (NaN if the turnstile has no plausible value at all).
    """
    implausible = counts.isna() | (counts < 0) | (counts > MAX_FOUR_HOUR_COUNT)
    return counts.mask(implausible, counts.mask(implausible).mean())


# Unit_ID already contains C/A, so grouping by Unit_ID alone is sufficient.
# NaN is handled inside the helper instead of via fillna(-1, inplace=True) on
# a column selection, which does not reliably write back to df.
for count_col in ('Four Hour Entries', 'Four Hour Exits'):
    df[count_col] = df.groupby('Unit_ID')[count_col].transform(_replace_implausible)


In [None]:
# Sum the interval counts per turnstile per calendar date.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple (['a', 'b'] without the inner brackets) is rejected by pandas 2.x.
df_daily = df.groupby(['Unit_ID', 'DATE'])[['Four Hour Entries', 'Four Hour Exits']].sum()
df_daily.head(20)