In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


import sys
sys.path.append('./code')
import get_mta_data as gmd
import process_mta_data as pmd

import datetime

## Import data

In [2]:
# Load the locally stored MTA turnstile data through the project helper.
# NOTE(review): the two list arguments are presumably years and months
# (2019, April) — confirm against get_mta_data.load_local_data.
mta = gmd.load_local_data([2019], [4])

## Tidy up data

We apply the cleaning functions that are becoming standard for us.

In [3]:
# Normalize the column names with the project helper and rebind `mta` to the result.
# NOTE(review): the exact renaming rules live in process_mta_data — not visible here.
mta = pmd.clean_col_names(mta)

In [4]:
# Add timestamp information via the project helper.
# NOTE(review): presumably this creates the combined `datetime` column that
# appears in the frame displayed later — confirm in process_mta_data.
mta = pmd.add_datetime(mta)

In [5]:
# Convert the date column to a datetime type via the project helper.
# NOTE(review): inferred from the helper's name; which column(s) it touches
# should be confirmed in process_mta_data.
mta = pmd.convert_date_to_datetime(mta)

Remove duplicate values

In [6]:
# Tally how many rows carry each DESC label.
mta['DESC'].value_counts()

REGULAR       747615
RECOVR AUD      3621
Name: DESC, dtype: int64

There are 3621 rows with DESC 'RECOVR AUD'. In Lara's presentation, these were shown to be duplicates in at least some cases. We will remove them all.

In [7]:
# Drop every row whose DESC is 'RECOVR AUD' and keep the rest.
# .copy() makes the result an independent frame rather than a slice of the
# previous one, so later column assignments on `mta` do not raise
# SettingWithCopyWarning.
mta = mta[mta.DESC != 'RECOVR AUD'].copy()
# Re-count the DESC labels to verify that the filter worked.
mta.DESC.value_counts()

REGULAR    747615
Name: DESC, dtype: int64

This confirms that the 'RECOVR AUD' entries have been removed.

Here's what the data looks like now:

In [8]:
# Display the frame as it stands now; pandas truncates the rendering of a
# large frame to its first and last rows.
mta

Unnamed: 0,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,00:00:00,REGULAR,6999064,2373568,2019-03-30 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,04:00:00,REGULAR,6999084,2373576,2019-03-30 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,08:00:00,REGULAR,6999107,2373622,2019-03-30 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,12:00:00,REGULAR,6999214,2373710,2019-03-30 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,16:00:00,REGULAR,6999451,2373781,2019-03-30 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
751231,R161B,R452,00-03-02,72 ST,123,IRT,2019-04-25,01:00:00,REGULAR,20605432,8466842,2019-04-25 01:00:00
751232,R161B,R452,00-03-02,72 ST,123,IRT,2019-04-25,05:00:00,REGULAR,20605443,8466850,2019-04-25 05:00:00
751233,R161B,R452,00-03-02,72 ST,123,IRT,2019-04-25,09:00:00,REGULAR,20606268,8467023,2019-04-25 09:00:00
751234,R161B,R452,00-03-02,72 ST,123,IRT,2019-04-25,13:00:00,REGULAR,20607206,8467308,2019-04-25 13:00:00


Let's check the value counts of the `datetime` column.

In [9]:
# Count how many rows share each timestamp.
mta['datetime'].value_counts()

2019-04-01 16:00:00    2480
2019-04-03 12:00:00    2480
2019-04-04 08:00:00    2480
2019-03-31 20:00:00    2480
2019-03-31 16:00:00    2480
                       ... 
2019-04-18 02:03:31       1
2019-04-10 20:33:25       1
2019-04-02 04:48:27       1
2019-03-31 06:21:57       1
2019-04-05 11:19:48       1
Name: datetime, Length: 53309, dtype: int64

This shows that some entries were recorded at odd times, while far more fall on standardized times.  
So, let's isolate the entries recorded at 00:00:00.

In [19]:
# Boolean filter marking the rows whose TIME value equals "00:00:00".
mask = mta['TIME'].eq("00:00:00")
# Keep only the rows selected by the filter.
mta_midnight = mta.loc[mask]

In [39]:
# Take the first midnight row of each (CA, UNIT, SCP, STATION, DATE) group
# and preview the result.
turnstile_day_keys = ["CA", "UNIT", "SCP", "STATION", "DATE"]
mta_midnight.groupby(turnstile_day_keys).nth(0).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,LINENAME,DIVISION,TIME,DESC,ENTRIES,EXITS,datetime
CA,UNIT,SCP,STATION,DATE,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A002,R051,02-00-00,59 ST,2019-03-30,NQR456W,BMT,00:00:00,REGULAR,6999064,2373568,2019-03-30
A002,R051,02-00-00,59 ST,2019-03-31,NQR456W,BMT,00:00:00,REGULAR,6999957,2373867,2019-03-31
A002,R051,02-00-00,59 ST,2019-04-01,NQR456W,BMT,00:00:00,REGULAR,7000528,2374095,2019-04-01
A002,R051,02-00-00,59 ST,2019-04-02,NQR456W,BMT,00:00:00,REGULAR,7002087,2374579,2019-04-02
A002,R051,02-00-00,59 ST,2019-04-03,NQR456W,BMT,00:00:00,REGULAR,7003680,2375133,2019-04-03


In [40]:
# Look for a second midnight row within each (CA, UNIT, SCP, STATION, DATE)
# group; an empty result means no group holds more than one row.
(
    mta_midnight
    .groupby(["CA", "UNIT", "SCP", "STATION", "DATE"])
    .nth(1)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,LINENAME,DIVISION,TIME,DESC,ENTRIES,EXITS,datetime
CA,UNIT,SCP,STATION,DATE,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


This suggests that, among the midnight-only entries, no turnstile has more than one reading per date, i.e. there are no duplicate entries.