## 01-mta_pandas

Topic: Challenge Set 1  
Subject: Explore MTA turnstile data  
Date: 01/14/2018  
Name: Jon Kislin  
Worked with: Jit, Browning, Jaydon  

#### Setup

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt

%matplotlib inline

#### Challenge 1 - Import the data into pandas

In [3]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Download weekly MTA turnstile files and stack them into one DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers that are substituted into the file name
        (e.g. 170506).

    Returns
    -------
    pandas.DataFrame
        The weekly frames concatenated in the order given. Row labels are
        not renumbered, so every week keeps its own labels.
    """
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    # One read per requested week; the frames are combined in a single concat.
    weekly_frames = [pd.read_csv(url.format(week_num)) for week_num in week_nums]
    return pd.concat(weekly_frames)
        
week_nums = [170506, 170513, 170520, 170527]
turnstiles_df = get_data(week_nums)

In [11]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529


#### Challenge 2: Convert data into timeseries

Some preliminary inspection:

In [13]:
# Remove leading/trailing whitespace from every column name
# (presumably the raw header pads some names -- verify against the file).
turnstiles_df.columns = list(map(str.strip, turnstiles_df.columns))

In [14]:
turnstiles_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [15]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529


In [29]:
# Number of observations per date, across all stations.
# DATE is still text (MM/DD/YYYY) here, so sort_index() orders it lexically;
# that matches calendar order only while all rows fall in the same year.
turnstiles_df["DATE"].value_counts().sort_index()

04/29/2017    28387
04/30/2017    28041
05/01/2017    28082
05/02/2017    28165
05/03/2017    28115
05/04/2017    28494
05/05/2017    28006
05/06/2017    28023
05/07/2017    27946
05/08/2017    28693
05/09/2017    28344
05/10/2017    28104
05/11/2017    27908
05/12/2017    27843
05/13/2017    27993
05/14/2017    27983
05/15/2017    28150
05/16/2017    28081
05/17/2017    27977
05/18/2017    27845
05/19/2017    27446
05/20/2017    27908
05/21/2017    27877
05/22/2017    27925
05/23/2017    27979
05/24/2017    28055
05/25/2017    27953
05/26/2017    27766
Name: DATE, dtype: int64

In [32]:
turnstiles_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

Getting down to business:

In [34]:
# Boolean filter for the rows of one C/A / UNIT / SCP / STATION combination.
mask = (
    turnstiles_df["C/A"].eq("A002")
    & turnstiles_df["UNIT"].eq("R051")
    & turnstiles_df["SCP"].eq("02-00-00")
    & turnstiles_df["STATION"].eq("59 ST")
)
turnstiles_df[mask].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529


In [36]:
# Join the DATE and TIME text columns and parse the result into a single
# datetime column; the explicit format avoids per-row format inference.
turnstiles_df["DATE_TIME"] = pd.to_datetime(
    turnstiles_df["DATE"] + " " + turnstiles_df["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)

In [38]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315,2017-04-29 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319,2017-04-29 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353,2017-04-29 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453,2017-04-29 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529,2017-04-29 16:00:00


In [40]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315,2017-04-29 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319,2017-04-29 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353,2017-04-29 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453,2017-04-29 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529,2017-04-29 16:00:00


In [41]:
# Same kind of filter as before, for a different turnstile; the result now
# also shows the parsed DATE_TIME column.
mask = (
    turnstiles_df["C/A"].eq("R626")
    & turnstiles_df["UNIT"].eq("R062")
    & turnstiles_df["SCP"].eq("00-00-00")
    & turnstiles_df["STATION"].eq("CROWN HTS-UTICA")
)
turnstiles_df[mask].head()
# TODO: what's the point of this exactly?

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
191040,R626,R062,00-00-00,CROWN HTS-UTICA,34,IRT,04/29/2017,00:00:00,REGULAR,12797792,2847652,2017-04-29 00:00:00
191041,R626,R062,00-00-00,CROWN HTS-UTICA,34,IRT,04/29/2017,04:00:00,REGULAR,12797825,2847689,2017-04-29 04:00:00
191042,R626,R062,00-00-00,CROWN HTS-UTICA,34,IRT,04/29/2017,08:00:00,REGULAR,12798036,2847709,2017-04-29 08:00:00
191043,R626,R062,00-00-00,CROWN HTS-UTICA,34,IRT,04/29/2017,12:00:00,REGULAR,12798568,2847766,2017-04-29 12:00:00
191044,R626,R062,00-00-00,CROWN HTS-UTICA,34,IRT,04/29/2017,16:00:00,REGULAR,12799068,2847868,2017-04-29 16:00:00


In [43]:
# Sanity check: ("C/A", "UNIT", "SCP", "STATION", "DATE_TIME") should be unique.
# Count the non-null ENTRIES readings per key and show the largest counts
# first; any count above 1 is a duplicated reading.
(
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])["ENTRIES"]
    .count()
    .reset_index()
    .sort_values("ENTRIES", ascending=False)
    .head(5)
)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
108172,G001,R151,00-00-00,CONEY IS-STILLW,2017-05-09 05:00:00,2
697077,R504,R276,00-00-01,VERNON-JACKSON,2017-05-26 20:00:00,2
0,A002,R051,02-00-00,59 ST,2017-04-29 00:00:00,1
523407,R145,R032,00-00-02,TIMES SQ-42 ST,2017-05-13 00:00:00,1
523387,R145,R032,00-00-02,TIMES SQ-42 ST,2017-05-09 16:00:00,1


In [48]:
turnstiles_df.DESC.value_counts()

REGULAR       782645
RECOVR AUD      2444
Name: DESC, dtype: int64

In [49]:
# Order the readings newest-first within each turnstile, then drop rows that
# repeat the same turnstile + timestamp. By default drop_duplicates keeps the
# first occurrence it encounters.
turnstiles_df = (
    turnstiles_df
    .sort_values(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], ascending=False)
    .drop_duplicates(subset=["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
)

In [50]:
turnstiles_df.DESC.value_counts() ## duplicates dropped.

REGULAR       782645
RECOVR AUD      2442
Name: DESC, dtype: int64

In [51]:
# Discard the EXITS and DESC columns; errors="ignore" lets this cell be
# re-run after the columns are already gone.
turnstiles_df = turnstiles_df.drop(["EXITS", "DESC"], axis="columns", errors="ignore")
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,DATE_TIME
195462,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/26/2017,21:00:00,5554,2017-05-26 21:00:00
195461,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/26/2017,17:00:00,5554,2017-05-26 17:00:00
195460,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/26/2017,13:00:00,5554,2017-05-26 13:00:00
195459,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/26/2017,09:00:00,5554,2017-05-26 09:00:00
195458,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/26/2017,05:00:00,5554,2017-05-26 05:00:00


#### Challenge 3: total daily entries

In [71]:
# One row per turnstile per calendar day (a DataFrame after reset_index()).
# first() takes the first row of each group in the frame's current order, so
# this relies on turnstiles_df having been sorted newest-first earlier: the
# value kept is then the latest reading of the day.
turnstiles_daily = (
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"])["ENTRIES"]
    .first()
    .reset_index()
)

turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES
0,A002,R051,02-00-00,59 ST,04/29/2017,6158521
1,A002,R051,02-00-00,59 ST,04/30/2017,6159015
2,A002,R051,02-00-00,59 ST,05/01/2017,6160374
3,A002,R051,02-00-00,59 ST,05/02/2017,6161830
4,A002,R051,02-00-00,59 ST,05/03/2017,6163254


In [73]:
# Add PREV_DATE / PREV_ENTRIES: within each turnstile, the DATE and ENTRIES
# of the preceding row (the first row of every turnstile gets NaN).
# The columns are selected with a list ([[...]]): tuple-style selection
# (["DATE", "ENTRIES"]) on a groupby was deprecated and is rejected by
# pandas 2.x. The built-in groupby shift replaces the per-group lambda.
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES"]] = (
    turnstiles_daily
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE", "ENTRIES"]]
    .shift(1)
)

In [74]:
turnstiles_daily.head(50)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES
0,A002,R051,02-00-00,59 ST,04/29/2017,6158521,,
1,A002,R051,02-00-00,59 ST,04/30/2017,6159015,04/29/2017,6158521.0
2,A002,R051,02-00-00,59 ST,05/01/2017,6160374,04/30/2017,6159015.0
3,A002,R051,02-00-00,59 ST,05/02/2017,6161830,05/01/2017,6160374.0
4,A002,R051,02-00-00,59 ST,05/03/2017,6163254,05/02/2017,6161830.0
5,A002,R051,02-00-00,59 ST,05/04/2017,6164865,05/03/2017,6163254.0
6,A002,R051,02-00-00,59 ST,05/05/2017,6166192,05/04/2017,6164865.0
7,A002,R051,02-00-00,59 ST,05/06/2017,6167090,05/05/2017,6166192.0
8,A002,R051,02-00-00,59 ST,05/07/2017,6167857,05/06/2017,6167090.0
9,A002,R051,02-00-00,59 ST,05/08/2017,6169303,05/07/2017,6167857.0


In [75]:
# Rows without a PREV_DATE have no previous day to diff against; remove them.
turnstiles_daily = turnstiles_daily.dropna(subset=["PREV_DATE"])

In [76]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES
1,A002,R051,02-00-00,59 ST,04/30/2017,6159015,04/29/2017,6158521.0
2,A002,R051,02-00-00,59 ST,05/01/2017,6160374,04/30/2017,6159015.0
3,A002,R051,02-00-00,59 ST,05/02/2017,6161830,05/01/2017,6160374.0
4,A002,R051,02-00-00,59 ST,05/03/2017,6163254,05/02/2017,6161830.0
5,A002,R051,02-00-00,59 ST,05/04/2017,6164865,05/03/2017,6163254.0


In [77]:
# Sanity check: on which rows is the counter lower than on the previous day?
turnstiles_daily[turnstiles_daily["ENTRIES"].lt(turnstiles_daily["PREV_ENTRIES"])].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES
884,A010,R080,00-00-05,57 ST-7 AV,05/15/2017,3576,05/14/2017,430596.0
953,A011,R080,01-00-00,57 ST-7 AV,04/30/2017,887015788,04/29/2017,887017761.0
954,A011,R080,01-00-00,57 ST-7 AV,05/01/2017,887013153,04/30/2017,887015788.0
955,A011,R080,01-00-00,57 ST-7 AV,05/02/2017,887009858,05/01/2017,887013153.0
956,A011,R080,01-00-00,57 ST-7 AV,05/03/2017,887006216,05/02/2017,887009858.0


In [84]:
# What's the deal with the counter going in reverse?
# Look at the individual readings in turnstiles_df for one affected
# turnstile on 2017-05-14.
mask = ((turnstiles_df["C/A"] == "A010") &
        (turnstiles_df["UNIT"] == "R080") &
        (turnstiles_df["SCP"] == "00-00-05") &
        (turnstiles_df["STATION"] == "57 ST-7 AV") &
        # The setup cell imports the class as `dt`
        # (`from datetime import datetime as dt`); the bare name `datetime`
        # is undefined on a fresh kernel. `.dt` on the left-hand side is the
        # unrelated pandas datetime accessor.
        (turnstiles_df["DATE_TIME"].dt.date == dt(2017, 5, 14).date()))
turnstiles_df[mask].head()

# Entries go up while DATE_TIME goes down: the rows are listed newest-first.

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,DATE_TIME
1307,A010,R080,00-00-05,57 ST-7 AV,NQRW,BMT,05/14/2017,20:00:00,430596,2017-05-14 20:00:00
1306,A010,R080,00-00-05,57 ST-7 AV,NQRW,BMT,05/14/2017,16:00:00,430273,2017-05-14 16:00:00
1305,A010,R080,00-00-05,57 ST-7 AV,NQRW,BMT,05/14/2017,12:00:00,430026,2017-05-14 12:00:00
1304,A010,R080,00-00-05,57 ST-7 AV,NQRW,BMT,05/14/2017,08:00:00,429872,2017-05-14 08:00:00
1303,A010,R080,00-00-05,57 ST-7 AV,NQRW,BMT,05/14/2017,04:00:00,429856,2017-05-14 04:00:00


In [85]:
# How many turnstiles show a counter that went down, and on how many days each?
(
    turnstiles_daily
    .loc[lambda daily: daily["ENTRIES"] < daily["PREV_ENTRIES"]]
    .groupby(["C/A", "UNIT", "SCP", "STATION"])
    .size()
)

# size() vs count(): size() counts every row in a group (NaN included),
# count() only counts non-NaN values.

C/A    UNIT  SCP       STATION        
A010   R080  00-00-05  57 ST-7 AV          1
A011   R080  01-00-00  57 ST-7 AV         27
A025   R023  01-03-01  34 ST-HERALD SQ    27
             01-03-02  34 ST-HERALD SQ    27
A037   R170  05-00-00  14 ST-UNION SQ      1
A049   R088  02-05-00  CORTLANDT ST       18
A054   R227  01-06-00  RECTOR ST           1
A060   R001  00-00-06  WHITEHALL S-FRY     1
A066   R118  00-00-00  CANAL ST           27
A071   R044  02-00-02  CHAMBERS ST         1
B016   R098  00-03-01  CHURCH AV           1
B022   R229  00-05-00  AVENUE M            1
C021   R212  00-00-02  59 ST              27
E013   R373  00-00-02  20 AV               1
H003   R163  01-00-02  6 AV               27
H023   R236  00-06-00  DEKALB AV          26
H037   R349  00-00-00  SUTTER AV           1
H041   R152  00-05-01  CANARSIE-ROCKAW     1
J024   R437  00-00-00  CRESCENT ST         1
J034   R007  00-00-02  104 ST             24
JFK01  R535  00-00-02  HOWARD BCH JFK      1
JFK03  R536  00-

In [86]:
def get_daily_counts(row, max_counter):
    """Return the absolute day-over-day change in ENTRIES for one row.

    Parameters
    ----------
    row : mapping
        Must provide "ENTRIES" and "PREV_ENTRIES".
    max_counter : number
        Largest change accepted as a real daily count.

    Returns
    -------
    number
        abs(ENTRIES - PREV_ENTRIES), or 0 when that exceeds ``max_counter``.
        In the latter case the two raw readings are printed for inspection.
    """
    delta = abs(row["ENTRIES"] - row["PREV_ENTRIES"])
    if delta > max_counter:
        # see how large these values get..
        print(row["ENTRIES"], row["PREV_ENTRIES"])
        return 0
    return delta

# Exploratory pass only: the result is discarded (assigned to `_`).
# get_daily_counts prints the ENTRIES / PREV_ENTRIES pair of every row whose
# absolute difference exceeds 1,000,000 and returns 0 for it -- a jump that
# large suggests the counter was reset, and different counters have different
# cycle limits.
_ = turnstiles_daily.apply(get_daily_counts, axis=1, max_counter=1000000)

1810 1272207.0
371 100838424.0
96 2605955.0
574 4607298.0
16796105 61127.0
142 16796105.0
231 7586045.0
1425 11823537.0
342 7896924.0
976 5439401.0
83942693 381404.0
33554825 3050267.0
52 1946567.0
2164 13435856.0
1346 1356336.0
117440513 471859.0
691259285 8105279.0
288 5961768.0
1448 10512102.0
646 2525748.0
183 605785512.0
4241 20236930.0
503286695 14838374.0
1370 856595404.0
70041 19461075.0
33554892 2724829.0
1050 2136240.0
117442931 19188583.0
1560221276 2525075.0


In [87]:
# slightly different approach
def get_daily_counts(row, max_counter):
    """Return a plausible daily entry count for one row.

    The count is the absolute difference between ENTRIES and PREV_ENTRIES
    (the absolute value covers counters that run backwards). If that
    difference exceeds ``max_counter``, the row is printed and the smaller
    of the two readings is used instead -- the working assumption being that
    the counter was reset, so the smaller reading is what accumulated since
    the reset. If even that exceeds ``max_counter``, 0 is returned.

    Parameters
    ----------
    row : mapping
        Must provide "ENTRIES" and "PREV_ENTRIES"; "C/A", "UNIT", "SCP" and
        "STATION" are read only when the row is printed.
    max_counter : number
        Largest value accepted as a real daily count.
    """
    counter = abs(row["ENTRIES"] - row["PREV_ENTRIES"])
    if counter > max_counter:
        print(row["ENTRIES"], row["PREV_ENTRIES"], row['C/A'], row['UNIT'],
              row['SCP'], row['STATION'])
        counter = min(row["ENTRIES"], row["PREV_ENTRIES"])
        if counter > max_counter:
            # Still too big to be a believable daily count.
            return 0
    return counter

# A day-over-day difference above 1,000,000 is treated as a probable counter
# reset: get_daily_counts then falls back to the smaller of the two readings
# and returns 0 only if that is still above the limit (different counters
# have different cycle limits).
turnstiles_daily["DAILY_ENTRIES"] = turnstiles_daily.apply(get_daily_counts, axis=1, max_counter=1000000)

1810 1272207.0 A037 R170 05-00-00 14 ST-UNION SQ
371 100838424.0 A054 R227 01-06-00 RECTOR ST
96 2605955.0 A071 R044 02-00-02 CHAMBERS ST
574 4607298.0 B016 R098 00-03-01 CHURCH AV
16796105 61127.0 B022 R229 00-05-00 AVENUE M
142 16796105.0 B022 R229 00-05-00 AVENUE M
231 7586045.0 E013 R373 00-00-02 20 AV
1425 11823537.0 H037 R349 00-00-00 SUTTER AV
342 7896924.0 J024 R437 00-00-00 CRESCENT ST
976 5439401.0 N063 R011 02-00-00 42 ST-PORT AUTH
83942693 381404.0 N138 R355 01-06-01 111 ST
33554825 3050267.0 N223 R156 01-06-01 BEDFORD PK BLVD
52 1946567.0 N319 R298 01-06-01 NORTHERN BLVD
2164 13435856.0 N501 R020 01-03-03 47-50 STS ROCK
1346 1356336.0 PTH03 R552 00-00-02 JOURNAL SQUARE
117440513 471859.0 R160A R164 00-05-00 66 ST-LINCOLN
691259285 8105279.0 R161B R452 00-00-03 72 ST
288 5961768.0 R165 R167 01-00-02 86 ST
1448 10512102.0 R165 R167 01-00-04 86 ST
646 2525748.0 R169 R168 01-00-02 96 ST
183 605785512.0 R169 R168 01-03-01 96 ST
4241 20236930.0 R240 R047 00-00-01 GRD CNTRL-42 ST

In [88]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES,DAILY_ENTRIES
1,A002,R051,02-00-00,59 ST,04/30/2017,6159015,04/29/2017,6158521.0,494.0
2,A002,R051,02-00-00,59 ST,05/01/2017,6160374,04/30/2017,6159015.0,1359.0
3,A002,R051,02-00-00,59 ST,05/02/2017,6161830,05/01/2017,6160374.0,1456.0
4,A002,R051,02-00-00,59 ST,05/03/2017,6163254,05/02/2017,6161830.0,1424.0
5,A002,R051,02-00-00,59 ST,05/04/2017,6164865,05/03/2017,6163254.0,1611.0
