In [1]:
import pandas as pd
import numpy as np

from dateutil.parser import parse
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#store the files for reference
files = !ls | grep turnstile.*txt
working_file = files[0]
zip_codes_raw = 'zips.csv'

In [3]:
df = pd.read_csv(working_file,
                usecols = ['C/A', 'UNIT', 'SCP', 'STATION', 'DATE', 'TIME', 'ENTRIES', 'EXITS                                                               '])
df.rename(columns = {
    'C/A': 'area',
    'UNIT': 'unit',
    'SCP': 'scp',
    'STATION': 'station',
    'DATE': 'date',
    'TIME': 'time',
    'ENTRIES': 'entries',
    'EXITS                                                               ': 'exits'},
    inplace = True)
# group = df.groupby(['area','unit', 'scp', 'station', 'date', 'time', 'entries', 'exits'])
# date = pd.DataFrame(group.size().reset_index())
# del date[0]
# date = date.sort_values(by='date').groupby(['station'])


# date['entries'] = date['entries'] - date['entries'].shift()
# date['exits'] = date['exits'] - date['exits'].shift()
# date['daily'] = date['entries']- date['exits']
# date = date[date.time == '20:00:00']


df['hourly_entries'] = df.sort_values(['station','area','unit','scp','date','time'])['entries'].diff()
df['hourly_exits'] = df.sort_values(['station','area','unit','scp','date','time'])['exits'].diff()

hourly = ['hourly_entries', 'hourly_exits']

for item in hourly:
    df[item].fillna(0, inplace=True)

In [4]:
zip_codes = pd.read_csv(zip_codes_raw)

In [5]:
df = df.merge(zip_codes, on='station')

In [6]:
df.isnull().sum()

area              0
unit              0
scp               0
station           0
date              0
time              0
entries           0
exits             0
hourly_entries    0
hourly_exits      0
zip_code          0
dtype: int64

In [7]:
#set all negative hourly entries to absolute value
for item in hourly:
    df.loc[df[item] < 0, item] = abs(df[item])
    df[df[item] < 0]

In [8]:
df['hourly_entries'].describe(percentiles=[0.01, 0.999])

count    1.942560e+05
mean     1.410978e+06
std      4.068987e+07
min      0.000000e+00
1%       0.000000e+00
50%      8.700000e+01
99.9%    3.211954e+08
max      2.108298e+09
Name: hourly_entries, dtype: float64

In [9]:
df['hourly_exits'].describe(percentiles=[0.01, 0.999])

count    1.942560e+05
mean     1.145393e+06
std      3.617327e+07
min      0.000000e+00
1%       0.000000e+00
50%      5.700000e+01
99.9%    2.989942e+08
max      2.087387e+09
Name: hourly_exits, dtype: float64

In [10]:
for item in hourly:
    df['good'] = df[item] <= 5000

In [11]:
hourly

['hourly_entries', 'hourly_exits']

In [12]:
df['traffic'] = df[hourly].sum(axis=1)

In [13]:
#zip_codes with median income > $200k
#and % commute to work by public transporation >= 70%
potential_census_zips = [10024, 11217, 11238]

In [14]:
potential_stations = df[df['zip_code'].isin(potential_census_zips)]
potential_stations = potential_stations[potential_stations['good']==True]

In [15]:
len(potential_stations)

4822

In [16]:
stations = list(potential_stations['station'].unique())
stations

['PROSPECT PARK',
 'ATL AV-BARCLAY',
 '79 ST',
 'ATLANTIC AV',
 '81 ST-MUSEUM',
 'LAFAYETTE AV',
 'CLASSON AV',
 'NEVINS ST',
 'GRAND ARMY PLAZ',
 'EASTN PKWY-MUSM']

In [23]:
potential_stations[potential_stations['date'] == '04/30/2016'].sort_values(by=['unit','date','time'])

Unnamed: 0,area,unit,scp,station,date,time,entries,exits,hourly_entries,hourly_exits,zip_code,good,traffic
187911,R608,R056,00-00-00,NEVINS ST,04/30/2016,04:00:00,847462,1493474,40.0,45.0,11217,True,85.0
187953,R608,R056,00-00-01,NEVINS ST,04/30/2016,04:00:00,8592045,4496105,13.0,14.0,11217,True,27.0
187995,R608,R056,00-03-00,NEVINS ST,04/30/2016,04:00:00,3899353,4156076,30.0,18.0,11217,True,48.0
188037,R608,R056,00-03-01,NEVINS ST,04/30/2016,04:00:00,2764594,1916360,15.0,16.0,11217,True,31.0
188079,R608,R056,00-03-02,NEVINS ST,04/30/2016,04:00:00,662640,271304,16.0,9.0,11217,True,25.0
188121,R609,R056,01-00-00,NEVINS ST,04/30/2016,04:00:00,791555,813572,8.0,3.0,11217,True,11.0
188167,R609,R056,01-00-01,NEVINS ST,04/30/2016,04:00:00,1458431,1004935,13.0,13.0,11217,True,26.0
188213,R609,R056,01-00-02,NEVINS ST,04/30/2016,04:00:00,3728682,1483432,21.0,6.0,11217,True,27.0
188259,R609,R056,01-03-00,NEVINS ST,04/30/2016,04:00:00,3592894,10454816,9.0,15.0,11217,True,24.0
188305,R609,R056,01-03-01,NEVINS ST,04/30/2016,04:00:00,1908826,1591480,16.0,14.0,11217,True,30.0


In [None]:
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

In [None]:
potential_stations['datetime'] = (potential_stations['date'] + ' ' + potential_stations['time']).apply(str)
potential_stations['datetime'] = potential_stations['datetime'].apply(parse)
potential_stations['parsed_time'] = potential_stations['time'].apply(parse)

In [None]:
#potential_stations = potential_stations.set_index(potential_stations['datetime'])

In [None]:
station = potential_stations[potential_stations['station'] == 'PROSPECT PARK']

In [None]:
station[station['time'] == '00:00:00']

In [None]:
station.columns.values

In [None]:
station = station.groupby(by=['parsed_time']).sum()[['hourly_entries', 'hourly_exits', 'traffic']]

In [None]:
station

In [None]:
station.index

In [None]:
fig = plt.figure()
ax = plt.axes()

ax.plot(station.index, station['traffic'])