In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('./code')
import get_mta_data as gmd
import process_mta_data as pmd

#### Challenge 1   
  
- Open up a new IPython notebook
- Download a few MTA turnstile data files
- Load the files into a pandas DataFrame (hint: `pd.read_csv()` to load files and `pd.concat()` to combine DataFrames)

In [2]:
# Choose the period to analyse (year 2019, month 4) and ask the helper module
# for the matching list of data filenames.
years, months = [2019], [4]
files = gmd.generate_filenames(years, months)

In [3]:
# Read each raw file into its own DataFrame, then stack them into a single
# frame named `mta` with a fresh 0..n-1 index.
mta_df_list = []
for file in files:
    mta_df_list.append(pd.read_csv('../data/raw/' + file))
mta = pd.concat(mta_df_list, ignore_index=True)

In [4]:
# Rename the column at position 10 (a column, not a row) to 'EXITS'.
# NOTE(review): presumably the raw header carries stray characters such as
# padding whitespace — verify against the CSV header.
exits_col = mta.columns[10]
mta.rename(columns={exits_col: 'EXITS'}, inplace=True)

#### Challenge 2

- Let's turn this into a time series.

- Our pandas DataFrame has columns called `DATE` and `TIME` (what datatype did pandas assign to these columns on import?). In Python and pandas we can convert date and time information to _datetime_ objects, which allow us to do time-based operations.

- Using either [pd.to_datetime](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html) in pandas or the [Python datetime library](https://docs.python.org/3/library/datetime.html), combine the `DATE` and `TIME` columns into a single new column of the datetime datatype.

In [5]:
# Combine the DATE and TIME strings into one datetime column.
mta['DATETIME'] = pd.to_datetime(mta['DATE'] + ' ' + mta['TIME'],
                                 format='%m/%d/%Y %H:%M:%S')
# Convert DATE itself with an explicit format (the date part of the format
# used above) so parsing is unambiguous and does not rely on inference.
mta['DATE'] = pd.to_datetime(mta['DATE'], format='%m/%d/%Y')
# Order rows by turnstile, then chronologically within each turnstile.
mta.sort_values(by=['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME'], inplace=True)


#### Challenge 3

- Each row is a turnstile, identified by a combination of  the `C/A`, `UNIT`, `SCP`, and `STATION` columns, with information on entries and exits at that turnstile every n hours. (What is n?) We want total daily entries. 
- Group the data so that it represents **daily entries** for each turnstile (hint: `pd.groupby` or `DataFrame.groupby`)

In [6]:
# Peek at the first few rows to see how far apart consecutive readings are.
mta.head(n=5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
611115,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,00:00:00,REGULAR,6999064,2373568,2019-03-30 00:00:00
611116,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,04:00:00,REGULAR,6999084,2373576,2019-03-30 04:00:00
611117,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,08:00:00,REGULAR,6999107,2373622,2019-03-30 08:00:00
611118,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,12:00:00,REGULAR,6999214,2373710,2019-03-30 12:00:00
611119,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,16:00:00,REGULAR,6999451,2373781,2019-03-30 16:00:00


n is 4: consecutive readings for a turnstile are four hours apart (00:00, 04:00, 08:00, ...).

In [7]:
# Count non-null ENTRIES per (turnstile, timestamp); a count above 1 means the
# same reading appears more than once. Largest counts are shown first.
reading_keys = ['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME']
rows_per_reading = mta.groupby(reading_keys)['ENTRIES'].count().reset_index()
rows_per_reading.sort_values(by='ENTRIES', ascending=False)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
161388,JFK03,R536,00-03-02,JFK JAMAICA CT1,2019-04-07 01:00:00,2
665566,R283,R221,00-00-01,167 ST,2019-04-16 04:00:00,2
160330,JFK03,R536,00-00-02,JFK JAMAICA CT1,2019-04-07 01:00:00,2
160841,JFK03,R536,00-00-05,JFK JAMAICA CT1,2019-04-07 01:00:00,2
214239,N071,R013,00-00-03,34 ST-PENN STA,2019-04-20 08:00:00,2
...,...,...,...,...,...,...
271364,N141,R356,00-00-03,OZONE PK LEFFRT,2019-04-21 08:00:00,1
271365,N141,R356,00-00-03,OZONE PK LEFFRT,2019-04-21 12:00:00,1
271366,N141,R356,00-00-03,OZONE PK LEFFRT,2019-04-21 16:00:00,1
271367,N141,R356,00-00-03,OZONE PK LEFFRT,2019-04-21 20:00:00,1


In [8]:
# Inspect one duplicated reading: all rows of a single turnstile within a
# one-day window around the duplicated timestamp.
is_turnstile = (
    (mta['C/A'] == 'JFK03')
    & (mta['UNIT'] == 'R536')
    & (mta['SCP'] == '00-03-02')
    & (mta['STATION'] == 'JFK JAMAICA CT1')
)
in_window = (
    (mta['DATETIME'] >= '2019-04-07 00:00:00')
    & (mta['DATETIME'] <= '2019-04-07 20:00:00')
)
mta.loc[is_turnstile & in_window]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
442916,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,01:00:00,REGULAR,827,122,2019-04-07 01:00:00
442917,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,01:00:00,RECOVR AUD,780,120,2019-04-07 01:00:00
442918,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,05:00:00,REGULAR,880,151,2019-04-07 05:00:00
442919,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,09:00:00,REGULAR,1008,196,2019-04-07 09:00:00
442920,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,13:00:00,REGULAR,1188,248,2019-04-07 13:00:00
442921,JFK03,R536,00-03-02,JFK JAMAICA CT1,E,IND,2019-04-07,17:00:00,REGULAR,1546,349,2019-04-07 17:00:00


In [16]:
# De-duplicate readings, preferring the REGULAR row over a RECOVR AUD row.
#
# `mta_backup` is not assigned in any earlier cell, so on a fresh kernel take
# the snapshot here; an existing snapshot is left untouched so re-running this
# cell always restarts from the same data.
if 'mta_backup' not in globals():
    mta_backup = mta.copy()
mta = mta_backup.copy()

# Sorting by DESC last puts 'RECOVR AUD' before 'REGULAR' within a timestamp,
# so keep='last' retains the REGULAR row whenever both exist.
mta.sort_values(by=['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME', 'DESC'], inplace=True)
mta.drop_duplicates(subset=['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME'], keep='last', inplace=True)

# Blank out (set to NaN) the DESC of any remaining RECOVR AUD row; the rows
# themselves are kept. `where` avoids the chained-indexing assignment.
mta['DESC'] = mta['DESC'].where(mta['DESC'] != 'RECOVR AUD')
mta['DESC'].value_counts()

REGULAR    810059
Name: DESC, dtype: int64

In [17]:
# Step towards daily counts: turn the cumulative ENTRIES / EXITS counters into
# per-reading deltas (INS / OUTS) for each turnstile.
# NOTE(review): restarting from mta_backup means the de-duplication done in the
# previous cell is not applied here — confirm that is intended.
mta = mta_backup.copy()

# Diff within each turnstile so the first reading of every turnstile is NaN and
# a delta never spans two turnstiles, regardless of row order. When each
# turnstile's rows are contiguous this matches the duplicated()/shift(1) form.
turnstile_keys = ['C/A', 'UNIT', 'SCP', 'STATION']
by_turnstile = mta.groupby(turnstile_keys, sort=False)
mta['INS'] = by_turnstile['ENTRIES'].diff()
mta['OUTS'] = by_turnstile['EXITS'].diff()

# Distribution of entry deltas, ordered by value, to expose negative and
# implausibly large jumps.
mta['INS'].value_counts().sort_index()

-2.071639e+09    1
-1.835264e+09    1
-1.200927e+08    1
-6.702047e+07    1
-1.671309e+07    1
                ..
 2.339553e+08    1
 3.107406e+08    1
 1.053482e+09    1
 1.431673e+09    1
 2.011374e+09    1
Name: INS, Length: 3767, dtype: int64

In [22]:
# Only the last expression of a cell is rendered, so display the rows with an
# implausibly large entry delta explicitly instead of discarding the result.
display(mta[mta['INS'] > 500000])
# reset_index returns a new frame for display; `mta` keeps its original index.
mta.reset_index(drop=True)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,INS,OUTS
611115,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,00:00:00,REGULAR,6999064,2373568,2019-03-30 00:00:00,,
611116,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,04:00:00,REGULAR,6999084,2373576,2019-03-30 04:00:00,20.0,8.0
611117,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,08:00:00,REGULAR,6999107,2373622,2019-03-30 08:00:00,23.0,46.0
611118,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,12:00:00,REGULAR,6999214,2373710,2019-03-30 12:00:00,107.0,88.0
611119,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-30,16:00:00,REGULAR,6999451,2373781,2019-03-30 16:00:00,237.0,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203852,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-04-26,05:00:00,REGULAR,5554,376,2019-04-26 05:00:00,0.0,0.0
203853,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-04-26,09:00:00,REGULAR,5554,376,2019-04-26 09:00:00,0.0,0.0
203854,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-04-26,13:00:00,REGULAR,5554,376,2019-04-26 13:00:00,0.0,0.0
203855,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-04-26,17:00:00,REGULAR,5554,376,2019-04-26 17:00:00,0.0,0.0


#### Challenge 4

We will plot the daily time series for a single turnstile.

In ipython notebook, add this to the beginning of your next cell:
```
    %matplotlib inline
```
This will make your matplotlib graphs integrate nicely with the
notebook. 

To plot the time series, import matplotlib with
```
    import matplotlib.pyplot as plt
```
- Select one turnstile (combination of [`C/A`, `UNIT`, `SCP`, `STATION`])
- Filter your pandas DataFrame so it only represents a single turnstile (make sure that turnstile has entries for multiple days).
- Using matplotlib, create a plot with the days on the X axis and the daily entries for that turnstile on the Y axis

You can use the following as a starting point:
```
    plt.figure(figsize=(10,3))
    plt.plot(<days>,<entries>)
```

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): `mta_group` is not assigned in any cell shown above, so this
# raises NameError on a fresh kernel. Presumably it was a groupby of `mta`
# created in a since-removed cell — TODO confirm and define it before this line.
mta_group.groups

In [None]:
import matplotlib.dates as mdates

# Select every reading of a single turnstile. `mta` is used because
# `mta_sorted` is never defined in this notebook; the marker-only plot below
# does not depend on row order.
is_ts0 = (
    (mta['C/A'] == 'A002')
    & (mta['UNIT'] == 'R051')
    & (mta['SCP'] == '02-00-00')
    & (mta['STATION'] == '59 ST')
)
ts0 = mta[is_ts0]

fig, ax = plt.subplots(figsize=(10, 3))
# 'o' reproduces the marker-only style plot_date used by default; plot_date is
# deprecated in recent matplotlib, and plot handles datetime x-values itself.
ax.plot(ts0['DATETIME'], ts0['ENTRIES'], 'o')
ax.set(title='Turnstile A002 / R051 / 02-00-00 (59 ST)',
       xlabel='DATETIME',
       ylabel='ENTRIES (counter reading)')
ax.tick_params(axis='x', labelrotation=90)