In [None]:
import datetime
import time
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Importing Data

In [16]:
# Source file "turnstile_170401" on the MTA developer site
# (presumably the week ending 2017-04-01 -- TODO confirm against the MTA file listing).
apr1 = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_170401.txt'

# Read the comma-separated file straight from the URL; this cell needs network access.
df1 = pd.read_csv(apr1)

# Initial Data Cleaning

In [17]:
# Inspect the raw column names: the last header ('EXITS') is padded with trailing whitespace.
df1.columns.values

array(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE',
       'TIME', 'DESC', 'ENTRIES',
       'EXITS                                                               '], dtype=object)

In [25]:
# DATE and TIME are plain object columns, so the frame has no datetime column
# that could serve as a time series yet.
df1.dtypes

C/A         object
UNIT        object
SCP         object
STATION     object
LINENAME    object
DIVISION    object
DATE        object
TIME        object
DESC        object
ENTRIES      int64
EXITS        int64
dtype: object

In [21]:
# Header clean-up: remove leading/trailing blanks from every column label.
df1.columns = df1.columns.str.strip()

# Join the DATE and TIME strings and parse the result into a real datetime column.
df1['Datetime'] = pd.to_datetime(df1['DATE'].str.cat(df1['TIME'], sep=' '),
                                 format='%m/%d/%Y %H:%M:%S')

df1.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,Datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,00:00:00,REGULAR,6108321,2069313,2017-03-25 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,04:00:00,REGULAR,6108343,2069319,2017-03-25 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,08:00:00,REGULAR,6108360,2069347,2017-03-25 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,12:00:00,REGULAR,6108462,2069456,2017-03-25 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,16:00:00,REGULAR,6108689,2069525,2017-03-25 16:00:00


In [7]:
# Second cleaning pass, done as one method chain:
#   * drop the raw TIME column;
#   * keep the raw date column (it is only renamed, to 'Date');
#   * replace the upper-case headers with friendlier names.

dict_col_rename = {
    'C/A': 'C_A',
    'UNIT': 'Unit',
    'STATION': 'Station',
    'LINENAME': 'Linename',
    'DIVISION': 'Division',
    'DESC': 'Desc',
    'ENTRIES': 'Entries',
    'EXITS': 'Exits',
    'DATE': 'Date',
}

df2 = df1.drop(columns=['TIME']).rename(columns=dict_col_rename)

df2.head()

Unnamed: 0,C_A,Unit,SCP,Station,Linename,Division,Date,Desc,Entries,Exits,Datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108321,2069313,2017-03-25 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108343,2069319,2017-03-25 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108360,2069347,2017-03-25 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108462,2069456,2017-03-25 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108689,2069525,2017-03-25 16:00:00


# Removing Erroneous Observations

In [5]:
# Uniqueness check: count the non-null values behind every (turnstile, timestamp)
# key and show the five largest counts. Any count above 1 means a duplicated reading.
(df2
 .groupby(['C_A', 'Unit', 'SCP', 'Station', 'Datetime'])
 .count()
 .reset_index()
 .sort_values(by='Entries', ascending=False)
 .head(5)
)

Unnamed: 0,C_A,Unit,SCP,Station,Datetime,Linename,Division,Date,Desc,Entries,Exits
0,A002,R051,02-00-00,59 ST,2017-03-25 00:00:00,1,1,1,1,1,1
132213,R147,R033,04-00-01,TIMES SQ-42 ST,2017-03-31 16:00:00,1,1,1,1,1,1
132239,R147,R033,04-00-02,TIMES SQ-42 ST,2017-03-29 00:00:00,1,1,1,1,1,1
132240,R147,R033,04-00-02,TIMES SQ-42 ST,2017-03-29 04:00:00,1,1,1,1,1,1
132241,R147,R033,04-00-02,TIMES SQ-42 ST,2017-03-29 08:00:00,1,1,1,1,1,1


In [6]:
# The counts above show no repeated timestamps, but look at one turnstile's
# readings for 2017-03-25 anyway.
# (`datetime` is imported in the notebook's top import cell.)
mask = ((df2["C_A"] == "A002") &
        (df2["Unit"] == "R051") &
        (df2["SCP"] == "02-00-00") &
        (df2["Station"] == "59 ST") &
        # .dt.date yields plain date objects, so compare against a plain date.
        (df2["Datetime"].dt.date == datetime.date(2017, 3, 25)))
df2[mask].head()

Unnamed: 0,C_A,Unit,SCP,Station,Linename,Division,Date,Desc,Entries,Exits,Datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108321,2069313,2017-03-25 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108343,2069319,2017-03-25 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108360,2069347,2017-03-25 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108462,2069456,2017-03-25 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108689,2069525,2017-03-25 16:00:00


In [7]:
df2.Desc.value_counts()

REGULAR       197257
RECOVR AUD      1111
Name: Desc, dtype: int64

In [8]:
# Open questions to revisit if time allows:
#   - Does Desc take values other than the ones counted so far?
#   - Are there other fields that should be checked for odd values?

# De-duplicate on the turnstile/timestamp key, keeping the first row found for each key.
df_no_dupe = df2.drop_duplicates(
    subset=['C_A', 'Unit', 'SCP', 'Station', 'Datetime'],
    keep='first',
)

# Check uniqueness again after data cleaning to confirm cleanness.

In [9]:
df_no_dupe.head()

Unnamed: 0,C_A,Unit,SCP,Station,Linename,Division,Date,Desc,Entries,Exits,Datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108321,2069313,2017-03-25 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108343,2069319,2017-03-25 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108360,2069347,2017-03-25 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108462,2069456,2017-03-25 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/25/2017,REGULAR,6108689,2069525,2017-03-25 16:00:00


# Calculating the Entries and Exits per Day

In [11]:
# Cumulative counter reading at the earliest timestamp of each day, per turnstile.
#
# The rows are sorted by Datetime (stable sort) before grouping so that `.first()`
# really is the earliest reading of the day, whatever the row order of the source
# file. Entries and Exits come from one groupby instead of two copies of the same chain.
#
# NOTE(review): 'Date' is still the raw MM/DD/YYYY string, so groups are ordered
# lexicographically; that is chronological only while all dates fall in one year.
daily_keys = ['C_A', 'Unit', 'SCP', 'Station', 'Date']

daily_first = (df_no_dupe
               .sort_values('Datetime', kind='mergesort')
               .groupby(daily_keys)[['Entries', 'Exits']]
               .first()
               .reset_index()
              )

# Separate frames (same columns as before); copies so later cells can add columns freely.
df_daily_entries = daily_first[daily_keys + ['Entries']].copy()
df_daily_exits = daily_first[daily_keys + ['Exits']].copy()


In [12]:
# Pair each daily reading with the previous day's date and reading for the same
# turnstile, so that day-over-day differences can be computed.
#
# The columns are selected with a list ([[...]]): the tuple form
# groupby(...)["Date", "Entries"] is rejected by pandas >= 2.0. GroupBy.shift
# replaces the per-group lambda transform and gives the same result.
df_daily_entries[["Prev_date", "Prev_entries"]] = (df_daily_entries
                                                   .groupby(["C_A", "Unit", "SCP", "Station"])[["Date", "Entries"]]
                                                   .shift(1))

df_daily_exits[["Prev_date", "Prev_exits"]] = (df_daily_exits
                                               .groupby(["C_A", "Unit", "SCP", "Station"])[["Date", "Exits"]]
                                               .shift(1))

# The first day of every turnstile has no predecessor (Prev_date is null): drop those rows.
df_daily_entries.dropna(subset=["Prev_date"], axis=0, inplace=True)
df_daily_exits.dropna(subset=["Prev_date"], axis=0, inplace=True)

In [13]:
df_daily_entries.head()

Unnamed: 0,C_A,Unit,SCP,Station,Date,Entries,Prev_date,Prev_entries
1,A002,R051,02-00-00,59 ST,03/26/2017,6109170,03/25/2017,6108321.0
2,A002,R051,02-00-00,59 ST,03/27/2017,6109800,03/26/2017,6109170.0
3,A002,R051,02-00-00,59 ST,03/28/2017,6111349,03/27/2017,6109800.0
4,A002,R051,02-00-00,59 ST,03/29/2017,6112817,03/28/2017,6111349.0
5,A002,R051,02-00-00,59 ST,03/30/2017,6114493,03/29/2017,6112817.0


In [14]:
# Preview rows whose reading is lower than the previous day's, i.e. the counter
# appears to be running backwards.
df_daily_entries.loc[lambda d: d["Entries"] < d["Prev_entries"]].head()

# WTC: Is this something that can be solved by sorting before applying the transform above?

Unnamed: 0,C_A,Unit,SCP,Station,Date,Entries,Prev_date,Prev_entries
239,A011,R080,01-00-00,57 ST-7 AV,03/26/2017,887123235,03/25/2017,887125400.0
240,A011,R080,01-00-00,57 ST-7 AV,03/27/2017,887121510,03/26/2017,887123235.0
241,A011,R080,01-00-00,57 ST-7 AV,03/28/2017,887118538,03/27/2017,887121510.0
242,A011,R080,01-00-00,57 ST-7 AV,03/29/2017,887115153,03/28/2017,887118538.0
243,A011,R080,01-00-00,57 ST-7 AV,03/30/2017,887111638,03/29/2017,887115153.0


In [16]:
##### HAVE NOT EDITED YET

# Pick a value from one of the counters reversed above & check for it 
# What's the deal with counter being in reverse
# mask = ((turnstiles_df["C/A"] == "A011") & 
# (turnstiles_df["UNIT"] == "R080") & 
# (turnstiles_df["SCP"] == "01-00-00") & 
# (turnstiles_df["STATION"] == "57 ST-7 AV") &
# (turnstiles_df["DATE_TIME"].dt.date == datetime.datetime(2016, 8, 27).date()))
# turnstiles_df[mask].head()

In [17]:
# Count, per turnstile (C_A / Unit / SCP / Station), the days on which the
# entries reading dropped below the previous day's.

(df_daily_entries
    .query("Entries < Prev_entries")
    .groupby(["C_A", "Unit", "SCP", "Station"])
    .size())

C_A    Unit  SCP       Station        
A011   R080  01-00-00  57 ST-7 AV         6
A025   R023  01-03-01  34 ST-HERALD SQ    6
             01-03-02  34 ST-HERALD SQ    6
A035   R170  00-00-01  14 ST-UNION SQ     2
A049   R088  02-05-00  CORTLANDT ST       5
A066   R118  00-00-00  CANAL ST           6
C019   R232  00-00-02  45 ST              2
H003   R163  01-00-02  6 AV               6
H023   R236  00-06-00  DEKALB AV          6
J034   R007  00-00-02  104 ST             6
JFK03  R536  00-00-01  JFK JAMAICA CT1    1
             00-00-05  JFK JAMAICA CT1    1
             00-03-02  JFK JAMAICA CT1    1
K026   R100  00-00-01  METROPOLITAN AV    4
N063A  R011  00-00-04  42 ST-PORT AUTH    6
             00-00-05  42 ST-PORT AUTH    6
             00-00-08  42 ST-PORT AUTH    6
N078   R175  01-03-00  14 ST              6
N111   R284  00-06-01  CLINTON-WASH AV    6
N137   R354  00-06-01  104 ST             6
N203   R195  00-00-01  161/YANKEE STAD    6
N305   R017  01-03-04  LEXINGTON AV/5

In [18]:
def get_daily_counts(row, max_counter, cols):
    """Return the absolute change between two counter readings held in ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row[name]`` for both names in ``cols``.
    max_counter : number
        Largest change accepted as plausible.
    cols : sequence of two names
        ``cols[0]`` names the current reading, ``cols[1]`` the previous one.

    Returns
    -------
    number
        ``abs(current - previous)`` when that does not exceed ``max_counter``.
        Otherwise both readings are printed and the smaller reading is returned,
        unless that also exceeds ``max_counter``, in which case 0 is returned.
    """
    current, previous = row[cols[0]], row[cols[1]]

    # A negative difference is treated as a counter running in reverse: use its magnitude.
    delta = current - previous
    delta = -delta if delta < 0 else delta

    if delta > max_counter:
        # Implausibly large jump (possibly a reset): report both readings and
        # fall back to the smaller of the two.
        print(current, previous)
        delta = min(current, previous)
        if delta > max_counter:
            # Even the fallback is too large to trust.
            return 0
    return delta

# A day-over-day change above 1,000,000 is treated as a probable counter reset.
# get_daily_counts then falls back to the smaller of the two readings, and to 0
# when that is still above the limit (different counters have different cycle limits).
df_daily_entries["Daily_Entries"] = df_daily_entries.apply(
    get_daily_counts, axis=1, max_counter=1000000, cols=['Entries', 'Prev_entries'])
df_daily_exits["Daily_Exits"] = df_daily_exits.apply(
    get_daily_counts, axis=1, max_counter=1000000, cols=['Exits', 'Prev_exits'])

153 284994405.0
838882067 3322993.0
1792 8034135.0
134217746 573098.0
33 1174380855.0
167779695 661998.0
554 2713008.0
0 6471847.0


In [19]:
df_daily_exits.head()

Unnamed: 0,C_A,Unit,SCP,Station,Date,Exits,Prev_date,Prev_exits,Daily_Exits
1,A002,R051,02-00-00,59 ST,03/26/2017,2069608,03/25/2017,2069313.0,295.0
2,A002,R051,02-00-00,59 ST,03/27/2017,2069820,03/26/2017,2069608.0,212.0
3,A002,R051,02-00-00,59 ST,03/28/2017,2070422,03/27/2017,2069820.0,602.0
4,A002,R051,02-00-00,59 ST,03/29/2017,2070999,03/28/2017,2070422.0,577.0
5,A002,R051,02-00-00,59 ST,03/30/2017,2071599,03/29/2017,2070999.0,600.0


In [20]:
# Combine daily entries and exits into one table, matched on turnstile and date.
df_daily_entries.merge(df_daily_exits, on=['C_A', 'Unit', 'SCP', 'Station', 'Date', 'Prev_date'])

Unnamed: 0,C_A,Unit,SCP,Station,Date,Entries,Prev_date,Prev_entries,Daily_Entries,Exits,Prev_exits,Daily_Exits
0,A002,R051,02-00-00,59 ST,03/26/2017,6109170,03/25/2017,6108321.0,849.0,2069608,2069313.0,295.0
1,A002,R051,02-00-00,59 ST,03/27/2017,6109800,03/26/2017,6109170.0,630.0,2069820,2069608.0,212.0
2,A002,R051,02-00-00,59 ST,03/28/2017,6111349,03/27/2017,6109800.0,1549.0,2070422,2069820.0,602.0
3,A002,R051,02-00-00,59 ST,03/29/2017,6112817,03/28/2017,6111349.0,1468.0,2070999,2070422.0,577.0
4,A002,R051,02-00-00,59 ST,03/30/2017,6114493,03/29/2017,6112817.0,1676.0,2071599,2070999.0,600.0
5,A002,R051,02-00-00,59 ST,03/31/2017,6116109,03/30/2017,6114493.0,1616.0,2072198,2071599.0,599.0
6,A002,R051,02-00-01,59 ST,03/26/2017,5549522,03/25/2017,5548867.0,655.0,1227166,1226992.0,174.0
7,A002,R051,02-00-01,59 ST,03/27/2017,5549998,03/26/2017,5549522.0,476.0,1227299,1227166.0,133.0
8,A002,R051,02-00-01,59 ST,03/28/2017,5551095,03/27/2017,5549998.0,1097.0,1227583,1227299.0,284.0
9,A002,R051,02-00-01,59 ST,03/29/2017,5552220,03/28/2017,5551095.0,1125.0,1227937,1227583.0,354.0


# Calculating the Entries and Exits per Hour


In [21]:
# Pair every reading with the previous reading of the same turnstile so the
# cumulative Entries/Exits counters can be differenced ("de-cumulated").
# NOTE(review): this relies on df_no_dupe rows being in chronological order
# within each turnstile -- confirm, or sort by Datetime first.

df_shift = (df_no_dupe
            .drop(columns=['Linename', 'Division'])
            .copy()
           )

# The columns are selected with a list ([[...]]): the tuple form
# groupby(...)['Datetime', 'Entries', 'Exits'] is rejected by pandas >= 2.0.
# GroupBy.shift replaces the per-group lambda transform and gives the same result.
df_shift[['Datetime_Prev', 'Entries_Prev', 'Exits_Prev']] = (df_shift
            .groupby(['C_A', 'Unit', 'SCP', 'Station'])[['Datetime', 'Entries', 'Exits']]
            .shift(1))


# df_shift.columns
# df_shift['Datetime_Prev', 'Entries_Prev', 'Exits_Prev'] = (
#     df_no_dupe#[['C_A', 'Unit', 'SCP', 'Station', 'Datetime', 'Entries', 'Exits']]
#             .groupby(['C_A', 'Unit', 'SCP', 'Station'])['Datetime', 'Entries', 'Exits']
#             #.groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES"]
#             .transform(lambda grp: grp.shift(1))
#             #.shift(periods = 1)
#             #.rename(columns = {'Entries' : 'Entries_Shift', 'Exits' : 'Exits_Shift', 
#             #                   'Datetime' : 'Prev_datetime'})
#            )
df_shift.head()

Unnamed: 0,C_A,Unit,SCP,Station,Date,Desc,Entries,Exits,Datetime,Datetime_Prev,Entries_Prev,Exits_Prev
0,A002,R051,02-00-00,59 ST,03/25/2017,REGULAR,6108321,2069313,2017-03-25 00:00:00,NaT,,
1,A002,R051,02-00-00,59 ST,03/25/2017,REGULAR,6108343,2069319,2017-03-25 04:00:00,2017-03-25 00:00:00,6108321.0,2069313.0
2,A002,R051,02-00-00,59 ST,03/25/2017,REGULAR,6108360,2069347,2017-03-25 08:00:00,2017-03-25 04:00:00,6108343.0,2069319.0
3,A002,R051,02-00-00,59 ST,03/25/2017,REGULAR,6108462,2069456,2017-03-25 12:00:00,2017-03-25 08:00:00,6108360.0,2069347.0
4,A002,R051,02-00-00,59 ST,03/25/2017,REGULAR,6108689,2069525,2017-03-25 16:00:00,2017-03-25 12:00:00,6108462.0,2069456.0


In [22]:
# df_w_shift = pd.concat([df_no_dupe, df_shift], axis = 1)

# Replace the cumulative readings with the change since the previous reading,
# then discard rows containing any null (e.g. each turnstile's first reading,
# which has no predecessor).
# Caution: Entries/Exits are overwritten, so running this cell twice subtracts
# the previous reading a second time.
df_shift = (df_shift
            .assign(Entries=df_shift['Entries'] - df_shift['Entries_Prev'],
                    Exits=df_shift['Exits'] - df_shift['Exits_Prev'])
            .dropna(how='any'))

# df_w_shift = (df_w_shift
#        .drop('Entries_Shift', 1)
#        .drop('Exits_Shift', 1)
#        .dropna(how = 'any')
#       )

df_shift.tail()

Unnamed: 0,C_A,Unit,SCP,Station,Date,Desc,Entries,Exits,Datetime,Datetime_Prev,Entries_Prev,Exits_Prev
198363,TRAM2,R469,00-05-01,RIT-ROOSEVELT,03/31/2017,REGULAR,0.0,0.0,2017-03-31 05:00:00,2017-03-31 01:00:00,5554.0,294.0
198364,TRAM2,R469,00-05-01,RIT-ROOSEVELT,03/31/2017,REGULAR,0.0,0.0,2017-03-31 09:00:00,2017-03-31 05:00:00,5554.0,294.0
198365,TRAM2,R469,00-05-01,RIT-ROOSEVELT,03/31/2017,REGULAR,0.0,0.0,2017-03-31 13:00:00,2017-03-31 09:00:00,5554.0,294.0
198366,TRAM2,R469,00-05-01,RIT-ROOSEVELT,03/31/2017,REGULAR,0.0,0.0,2017-03-31 17:00:00,2017-03-31 13:00:00,5554.0,294.0
198367,TRAM2,R469,00-05-01,RIT-ROOSEVELT,03/31/2017,REGULAR,0.0,0.0,2017-03-31 21:00:00,2017-03-31 17:00:00,5554.0,294.0


In [23]:
# Challenge 3 - Total Daily Entries
#df3or4['Datetime'].dt.date == datetime.datetime(YYYY, MM, DD).date()

In [None]:
# def get_daily_counts(row, max_counter):
#     counter = abs(row["ENTRIES"] - row["PREV_ENTRIES"])
    
#     if counter > max_counter:
#         print(row["ENTRIES"], row["PREV_ENTRIES"])
#         return 0
#     return counter

# # If counter is > 1Million, then the counter might have been reset.  
# # Just set it to zero as different counters have different cycle limits
# _ = turnstiles_daily.apply(get_daily_counts, axis=1, max_counter=1000000)