### Problem Statement
[insert]


### Import Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# https://data.ny.gov/Transportation/Turnstile-Usage-Data-2016/ekwu-khcy
# https://data.ny.gov/api/views/ekwu-khcy/rows.csv?accessType=DOWNLOAD
# parse_dates = pd.datetools.to_datetime
data = pd.read_csv('Turnstile_Usage_Data__2016.csv',parse_dates=True, dtype={'C/A': str,'Unit':str,'SCP':str,'Station':str,'Line Name':str, 'Division':str,'Date': object, 'Time':object,'Description':str,'Entries':int,'Exits':int})
data = pd.DataFrame(data) # change to pandas dataframe
# parse spaces from columns; to use better approach when 
data=data.rename(columns = {'Exits                                                     ':'Exits'})
# Do we need to convert Date column to datetime?
# data['Date'] = data['Date'].to_datetime()
# data['Date'] = data['Date'].dt.date
data.head()

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,A002,R051,02-00-00,59 ST,NQR456,BMT,03/04/2016,23:00:00,REGULAR,5572864,1881239
1,A002,R051,02-00-00,59 ST,NQR456,BMT,03/04/2016,19:00:00,REGULAR,5572521,1881206
2,A002,R051,02-00-00,59 ST,NQR456,BMT,03/04/2016,15:00:00,REGULAR,5571587,1881113
3,A002,R051,02-00-00,59 ST,NQR456,BMT,03/04/2016,11:00:00,REGULAR,5571313,1881031
4,A002,R051,02-00-00,59 ST,NQR456,BMT,03/04/2016,08:10:05,REGULAR,5571173,1880736


In [3]:
# data['Date'] = data['Date'].astype('datetime64') # convert data column to date type
# print(type(data['Date']))
# remove first few columns, deemed irrelevant
# data = data.iloc[:,:]
# data.head()

### Process Data

In [4]:
data = data.groupby(["Station", "Date",'Time'],as_index=False).max() # sort by station, date
data.head()

Unnamed: 0,Station,Date,Time,C/A,Unit,SCP,Line Name,Division,Description,Entries,Exits
0,1 AV,01/02/2016,03:00:00,H008,R248,01-00-04,L,BMT,REGULAR,952275081,996872113
1,1 AV,01/02/2016,07:00:00,H008,R248,01-00-04,L,BMT,REGULAR,952275131,996872154
2,1 AV,01/02/2016,11:00:00,H008,R248,01-00-04,L,BMT,REGULAR,952275240,996872335
3,1 AV,01/02/2016,15:00:00,H008,R248,01-00-04,L,BMT,REGULAR,952275606,996872571
4,1 AV,01/02/2016,19:00:00,H008,R248,01-00-04,L,BMT,REGULAR,952276196,996873070


In [5]:
data = data.groupby(['Station','Date']).agg(lambda x: max(x) - min(x))

In [6]:
# Groupby Station, Day
data = data.groupby(["Station", "Date"],as_index=False).max() # sort by station, date
data = pd.DataFrame(data)
data.head()

Unnamed: 0,Entries,Exits
0,1752,1411
1,1533,923
2,2234,2251
3,2254,2335
4,2669,2262


In [7]:
# Data presented as cumulative; add columns to track the increment
# Add Traffic_Tot column as sum of Entry and Exit to represent activity
data['Entry_Inc'] = abs(data.Entries.diff())
# data['Entry_Inc2'] = data['Entry_Inc']

data['Exits_Inc'] = abs(data.Exits.diff())
data['Traffic_Tot'] = np.add(data.Entry_Inc, data.Exits_Inc)
data.head()

Unnamed: 0,Entries,Exits,Entry_Inc,Exits_Inc,Traffic_Tot
0,1752,1411,,,
1,1533,923,219.0,488.0,707.0
2,2234,2251,701.0,1328.0,2029.0
3,2254,2335,20.0,84.0,104.0
4,2669,2262,415.0,73.0,488.0


In [8]:
# filter outliers above specified quantile
quant_filt = 0.99
q = data["Entry_Inc"].quantile(quant_filt)
data = data[data["Entry_Inc"] < q]

r = data["Exits_Inc"].quantile(quant_filt)
data = data[data["Exits_Inc"] < r]

s = data["Traffic_Tot"].quantile(quant_filt)
data = data[data["Traffic_Tot"] < s]

# data = data.dropna() # apparently no issues with NaN cells; running as best practice

data.head()

Unnamed: 0,Entries,Exits,Entry_Inc,Exits_Inc,Traffic_Tot
1,1533,923,219.0,488.0,707.0
2,2234,2251,701.0,1328.0,2029.0
3,2254,2335,20.0,84.0,104.0
4,2669,2262,415.0,73.0,488.0
5,2661,2082,8.0,180.0,188.0


### Sort Data
Data is sorted by Station.

In [9]:
# data_stndatetime = data.groupby([ "Station", "Date",'Time']).sum() # sort by station, date and time
# data_stndatetime.head()

In [10]:
data_stn = data.groupby([ "Station"]).sum() # sort by station
# data_stn = pd.DataFrame(data_stn) # convert to panda dataframe
data_stn['Station_ID'] = data_stn.index # Make Station_ID a non-index column
data_stn = data_stn.sort_values(['Traffic_Tot'], ascending=[False]) # Sort stations by traffic
data_stn['Idx'] = np.arange(data_stn.shape[0]) # add enumerated column
data_stn = data_stn.set_index('Idx') # assign enumerated column as index column
data_stn.head()

KeyError: 'Station'

## Prep Data for Charting

In [None]:
# demographic data: NYC census
# map of startups in NYC
# calendar of tech events
# unicorn 

# import seaborn as sns
sns.set_style("whitegrid")


In [None]:
data_stn = data_stn.iloc[:20,:] # limit to 20 most active stations

data_stn.head()


In [None]:
# verify results: check http://web.mta.info/nyct/facts/ffsubway.htm
ax = sns.barplot(x="Traffic_Tot", y='Station_ID', data=data_stn)
# gut feeling is that 23rd St is not a top station.  If more time, would
# further analyze 23rd Street for outliers.


In [None]:
# PROBLEM STATEMENT
# Optimize utilization of street teams by using data science by utilizing MTA 
# and [Demographic] data

# PRESENTATION
# Executive Summary / Scenario
# MTA findings
# Demographic findings
# Combined MTA/Demo Analysis
# Conclusion
# Q&A

# TODO:
# Clean up JN
# day/hour analyses
# combine demographic data
# Map which includes demographic data (ie income) and station activity (by size of dot)
# Analyze top stations by time period (season, month, week, day, hour)
# add lat/long
# vet data better / graph 

# Tomorrow morning
# Finished pres, practice

### Top Station Activity: Penn Station 34th Street

In [None]:
data_Penn = data[data['Station'] == '34 ST-PENN STA']
# data_Penn.head()
np.max(data_Penn['Traffic_Tot'])


In [None]:
y = pd.Series(data_Penn['Traffic_Tot'])
x = pd.Series(data_Penn['Date'])

plt.plot(x,y)
# axes = plt.gca()
# axes.set_ylim([1000,2000])
plt.show()

### Top Station Activity: 23rd St

In [None]:
data_23ST = data[data['Station'] == '23 ST']
# data_23ST.head()
np.max(data_23ST['Traffic_Tot'])


In [None]:
y = pd.Series(data_23ST['Traffic_Tot'])
x = pd.Series(data_23ST['Date'])

plt.plot(x,y)
# axes = plt.gca()
# axes.set_ylim([1000,2000])
plt.show()

### Top Station Activity: 72nd St

In [None]:
data_72ST = data[data['Station'] == '72 ST']
y = pd.Series(data_72ST['Traffic_Tot'])
x = pd.Series(data_72ST['Date'])

plt.plot(x,y)
# axes = plt.gca()
# axes.set_ylim([1000,2000])
plt.show()

In [None]:
data = pd.DataFrame(data)
data['Date'] = pd.to_datetime(data['Date'])
type(data['Date'])