# Load data

In [52]:
import os

import pandas as pd
import numpy as np
import matplotlib as plt

def get_data(folder):
    """
    Reads in turnstile data from a specified folder in Data
    
    Input: turnstile data file i.e. 2016-2017_turnstile_data
    Output: a DataFrame with all rows from all files in folder
    """
    
    col_names = ['C/A',
                 'UNIT',
                 'SCP',
                 'STATION',
                 'LINENAME',
                 'DIVISION',
                 'DATE',
                 'TIME',
                 'DESC',
                 'ENTRIES',
                 'EXITS                                                               ']

    ## absolute path to Data folder
    data_dir = os.getcwd()+"/Data/" 
    
    return_df = pd.DataFrame(columns=col_names)
    for file in os.listdir(data_dir+folder):
        if not file.startswith('.'):
            file_path = "Data/"+folder+'/'+file
            return_df = pd.concat([return_df, pd.read_csv(file_path)],axis=0)
            
    return_df.rename(columns={return_df.columns[10]:'EXITS'},inplace=True)
    
    return(return_df)

In [53]:
df = get_data("2016-2019_turnstile_data")

In [54]:
df['STATION'].nunique()

381

In [55]:
df['ENTRIES'] = df['ENTRIES'].astype(np.int)
df['EXITS'] = df['EXITS'].astype(np.int)

In [56]:
df.to_csv("2016-2019_turnstile_data.csv")

# Chronologically sort DataFrame

# Develop method to count turnstile

In [57]:
sample_df = df[df['STATION'] == '59 ST']

In [63]:
sample_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [65]:
sample_df[['DATE','TIME','ENTRIES']]

Unnamed: 0,DATE,TIME,ENTRIES
0,03/23/2019,00:00:00,6989774
1,03/23/2019,04:00:00,6989795
2,03/23/2019,08:00:00,6989813
3,03/23/2019,12:00:00,6989924
4,03/23/2019,16:00:00,6990200
...,...,...,...
157647,03/09/2018,11:00:00,232801
157648,03/09/2018,12:12:19,232807
157649,03/09/2018,15:00:00,232818
157650,03/09/2018,19:00:00,232889


In [75]:
sample_df['ENTRIES']

0         6989774
1         6989795
2         6989813
3         6989924
4         6990200
           ...   
157647     232801
157648     232807
157649     232818
157650     232889
157651     232905
Name: ENTRIES, Length: 115306, dtype: int64

In [42]:
df.groupby(['STATION'])['ENTRIES'].mean()

STATION
59 ST    3.803133e+07
Name: ENTRIES, dtype: float64

In [None]:
df.shape

In [8]:
# No missing values
df.isnull().sum()

C/A                                                                     0
UNIT                                                                    0
SCP                                                                     0
STATION                                                                 0
LINENAME                                                                0
DIVISION                                                                0
DATE                                                                    0
TIME                                                                    0
DESC                                                                    0
ENTRIES                                                                 0
EXITS                                                                   0
dtype: int64

In [76]:
df['C/A'].nunique()

751

## Reduce scope by plotting