In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# Challenge 1 and 2
apr1 = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_170401.txt'

df1 = pd.read_csv(apr1)

In [3]:
df1.columns.values

array(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE',
       'TIME', 'DESC', 'ENTRIES',
       'EXITS                                                               '], dtype=object)

In [4]:
# Clean the data

# Create datetime column with datetime datatype
df1['Datetime'] = pd.to_datetime(df1.DATE + ' ' + df1.TIME,
   format = '%m/%d/%Y %H:%M:%S')

# Strip whitespace from column names
df1.columns = [s.strip() for s in df1.columns.values]

# Chain methods together to achieve several goals:
    # drop old date and time columns
    # rename columns
    # set the index

dict_col_rename = {'C/A' : 'C_A', 'UNIT' : 'Unit', 'STATION' : 'Station', 'LINENAME' : 'Linename',
                  'DIVISION' : 'Division', 'DESC' : 'Desc', 'ENTRIES' : 'Entries', 'EXITS' : 'Exits'}    


df1 = (df1
       .drop('DATE', 1)
       .drop('TIME', 1)
       .rename(columns = dict_col_rename)
       .set_index(['C_A', 'Unit', 'SCP', 'Station', 'Datetime'])
      )

In [5]:
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Linename,Division,Desc,Entries,Exits
C_A,Unit,SCP,Station,Datetime,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A002,R051,02-00-00,59 ST,2017-03-25 00:00:00,NQR456W,BMT,REGULAR,6108321,2069313
A002,R051,02-00-00,59 ST,2017-03-25 04:00:00,NQR456W,BMT,REGULAR,6108343,2069319
A002,R051,02-00-00,59 ST,2017-03-25 08:00:00,NQR456W,BMT,REGULAR,6108360,2069347
A002,R051,02-00-00,59 ST,2017-03-25 12:00:00,NQR456W,BMT,REGULAR,6108462,2069456
A002,R051,02-00-00,59 ST,2017-03-25 16:00:00,NQR456W,BMT,REGULAR,6108689,2069525


In [11]:
#df1.Datetime.value_counts()
#df1.reset_index().Datetime.value_counts()
df1.columns

Index(['Linename', 'Division', 'Desc', 'Entries', 'Exits'], dtype='object')

In [None]:
# Krishna builds a boolean "mask" that includes a consition using datetime.datetime
# (or something like that).

In [72]:
# Use concat (as a join) to fix ("de-cumulate") the columns Entries and Exits

df2 = (df1[['Entries', 'Exits']]
       .shift()
       .rename(columns = {'Entries' : 'Entries_Shift', 'Exits' : 'Exits_Shift'})
      )


In [102]:
df3 = pd.concat([df1, df2], axis = 1)

df3['Entries'] = df3['Entries'] - df3['Entries_Shift']
df3['Exits'] = df3['Exits'] - df3['Exits_Shift']

df3 = (df3
       .drop('Entries_Shift', 1)
       .drop('Exits_Shift', 1)
       .dropna(how = 'any')
      )

df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Linename,Division,Desc,Entries,Exits
C_A,Unit,SCP,Station,Datetime,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A002,R051,02-00-00,59 ST,2017-03-25 04:00:00,NQR456W,BMT,REGULAR,22.0,6.0
A002,R051,02-00-00,59 ST,2017-03-25 08:00:00,NQR456W,BMT,REGULAR,17.0,28.0
A002,R051,02-00-00,59 ST,2017-03-25 12:00:00,NQR456W,BMT,REGULAR,102.0,109.0
A002,R051,02-00-00,59 ST,2017-03-25 16:00:00,NQR456W,BMT,REGULAR,227.0,69.0
A002,R051,02-00-00,59 ST,2017-03-25 20:00:00,NQR456W,BMT,REGULAR,294.0,45.0


In [None]:
# Check uniqueness of rows/indexes by getting counts.
df4 = (df3
       .groupby(['C_A', 'Unit', 'SCP', 'Station', 'Datetime'])
       .count() #.Entries.count()
       .reset_index()
       .sort_values('Entries', ascending = False)
       .iloc[:5] #.head()
      )


In [98]:
# Challenge 3 - Total Daily Entries


[nan, 0.0, 0.0, 0.0, 0.0]


In [103]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198368 entries, 0 to 198367
Data columns (total 4 columns):
Entries    198368 non-null int64
Exits      198368 non-null int64
Entries    198367 non-null float64
Exits      198367 non-null float64
dtypes: float64(2), int64(2)
memory usage: 6.1 MB


In [109]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 198367 entries, (A002, R051, 02-00-00, 59 ST, 2017-03-25 04:00:00) to (TRAM2, R469, 00-05-01, RIT-ROOSEVELT, 2017-03-31 21:00:00)
Data columns (total 5 columns):
Linename    198367 non-null object
Division    198367 non-null object
Desc        198367 non-null object
Entries     198367 non-null float64
Exits       198367 non-null float64
dtypes: float64(2), object(3)
memory usage: 9.6+ MB
