In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option("display.max_columns", 500)
df = pd.read_csv('mta/turnstile_210102.txt')
df


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/26/2020,03:00:00,REGULAR,7508509,2557569
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/26/2020,07:00:00,REGULAR,7508511,2557581
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/26/2020,11:00:00,REGULAR,7508531,2557636
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/26/2020,15:00:00,REGULAR,7508610,2557667
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/26/2020,19:00:00,REGULAR,7508767,2557689
...,...,...,...,...,...,...,...,...,...,...,...
210428,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,01/01/2021,04:00:00,REGULAR,5554,541
210429,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,01/01/2021,08:00:00,REGULAR,5554,541
210430,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,01/01/2021,12:00:00,REGULAR,5554,541
210431,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,01/01/2021,16:00:00,REGULAR,5554,541


In [2]:
def mta_to_df():
    filenames = os.listdir('mta')
    data = pd.DataFrame()

    for file in filenames:
        df = pd.read_csv('mta/' + file, parse_dates=[7])
        data = pd.concat([data, df])
    return data

def clean_mta_df(df):
    new_df = df.copy()
    new_df.dropna(axis=1, how='all', inplace=True)
    col_names = ['area', 'unit', 'scp', 'station', 'linename', 'division', 'date', 'time', 'desc', 'entries', 'exits']
    new_df.columns = col_names
    new_df.date = pd.to_datetime(new_df.date)
    
    return new_df

In [3]:
df = mta_to_df()
df = clean_mta_df(df)

df.sample(10, random_state=0)

Unnamed: 0,area,unit,scp,station,linename,division,date,time,desc,entries,exits
76438,N207,R104,00-00-03,167 ST,BD,IND,2020-10-28,2021-01-05 12:00:00,REGULAR,10587318.0,4267207.0
26771,D008,R392,00-00-00,18 AV,N,BMT,2020-12-18,2021-01-05 04:00:00,REGULAR,8033945.0,7882499.0
14151,B010,R412,00-03-02,BOTANIC GARDEN,S2345,BMT,2020-12-15,2021-01-05 04:00:00,REGULAR,119826.0,764137.0
175822,R294,R052,00-00-00,WOODLAWN,4,IRT,2020-10-31,2021-01-05 00:00:00,REGULAR,8801165.0,7183987.0
165642,R238A,R046,02-00-02,GRD CNTRL-42 ST,4567S,IRT,2020-12-24,2021-01-05 11:00:00,REGULAR,3407731.0,1028747.0
148585,R169,R168,01-00-02,96 ST,123,IRT,2020-10-31,2021-01-05 09:00:00,REGULAR,657537.0,486918.0
126789,PTH16,R550,01-00-05,LACKAWANNA,1,PTH,2020-11-05,2021-01-05 21:08:03,REGULAR,90633.0,125560.0
20728,C008,R099,00-03-03,DEKALB AV,BDNQR,BMT,2020-12-31,2021-01-05 19:00:00,REGULAR,7625536.0,4324729.0
35364,H015,R250,01-00-01,GRAND ST,L,BMT,2020-11-03,2021-01-05 03:00:00,REGULAR,506636.0,123577.0
209228,S101,R070,00-00-01,ST. GEORGE,1,SRT,2020-12-13,2021-01-05 11:00:00,REGULAR,1374006.0,296.0


### List of things to do:
* convert entries/exits from cumulative to number per hour
* make histograms to get an idea of which stations are the most promising at which times

In [4]:
entries = df.entries.sort_values()
entries.min(), entries.max()

(0.0, 2128735323.0)

In [5]:
df.sort_values(['station','entries'], inplace=True)
df.head(1000).tail(950)

Unnamed: 0,area,unit,scp,station,linename,division,date,time,desc,entries,exits
33240,H007A,R248,02-01-01,1 AV,L,BMT,2020-12-13,2021-01-05 11:00:00,REGULAR,1.0,21.0
33241,H007A,R248,02-01-01,1 AV,L,BMT,2020-12-13,2021-01-05 15:00:00,REGULAR,1.0,21.0
33242,H007A,R248,02-01-01,1 AV,L,BMT,2020-12-13,2021-01-05 19:00:00,REGULAR,1.0,21.0
33243,H007A,R248,02-01-01,1 AV,L,BMT,2020-12-13,2021-01-05 23:00:00,REGULAR,1.0,21.0
33244,H007A,R248,02-01-01,1 AV,L,BMT,2020-12-14,2021-01-05 03:00:00,REGULAR,1.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...
33117,H007A,R248,02-03-00,1 AV,L,BMT,2020-11-09,2021-01-05 19:00:00,REGULAR,39963.0,646578.0
33118,H007A,R248,02-03-00,1 AV,L,BMT,2020-11-09,2021-01-05 23:00:00,REGULAR,40024.0,646779.0
33119,H007A,R248,02-03-00,1 AV,L,BMT,2020-11-10,2021-01-05 03:00:00,REGULAR,40031.0,646805.0
33120,H007A,R248,02-03-00,1 AV,L,BMT,2020-11-10,2021-01-05 07:00:00,REGULAR,40035.0,646859.0


In [6]:
test_df = df.sort_values(by=['station', 'date'], ascending=[True, False])
filt = test_df.station = '1 AV'
df.head()

Unnamed: 0,area,unit,scp,station,linename,division,date,time,desc,entries,exits
33587,H007A,R248,02-01-01,1 AV,L,BMT,2020-11-14,2021-01-05 03:00:00,REGULAR,1.0,18.0
33588,H007A,R248,02-01-01,1 AV,L,BMT,2020-11-14,2021-01-05 07:00:00,REGULAR,1.0,18.0
33589,H007A,R248,02-01-01,1 AV,L,BMT,2020-11-14,2021-01-05 11:00:00,REGULAR,1.0,18.0
33590,H007A,R248,02-01-01,1 AV,L,BMT,2020-11-14,2021-01-05 15:00:00,REGULAR,1.0,18.0
33591,H007A,R248,02-01-01,1 AV,L,BMT,2020-11-14,2021-01-05 19:00:00,REGULAR,1.0,18.0


In [7]:
def get_steps(df):

    station_names = list(df.station.unique())
    groups = df.groupby('station')
    print(len(groups))

    step_entries = {}
    step_exits = {}

    for station, group in groups:
        if station == '135 ST':
            step_entries[station] = list(group.entries.diff())
            step_exits[station] = list(group.exits.diff())
            continue

    return step_entries, step_exits

step_entries, step_exits = get_steps(df)

def step_cols(step_entries, step_exits):
    """Takes step_entries and exits (dicts{station_name: values})"""
    pass
    



378


In [8]:
station_names = list(df.station.unique())

step_entries = {}
step_exits = {}

dfs = []

for station in station_names:
    filt = (df.station == station)
    temp_df = df[filt]
    
    temp_df.sort_values(by=['station', 'date', 'time'], ascending=[True, False, False], inplace=True)
    Got i

dfs

SyntaxError: invalid syntax (<ipython-input-8-26ad83add238>, line 13)

In [None]:
station_names = list(df.station.unique())

step_entries = {}
step_exits = {}

dfs = []

for station in station_names:
    print(station)

In [None]:
df.entries.diff()

In [None]:
step_entries.keys()

In [None]:
df['step_entries'] = pd.Series(step_entries)

In [None]:
df

In [None]:
df.step_entries.value_counts()