In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as mat
import datetime as dt

In [2]:
# Read the datafiles
data_path = '../data/'
data_file_list = ['turnstile_191102.txt',
                 'turnstile_191109.txt',
                 'turnstile_191116.txt',
                 'turnstile_191130.txt',
                 'turnstile_191207.txt',
                 'turnstile_191214.txt',
                 'turnstile_191221.txt',
                 'turnstile_191228.txt'
                 'turnstile_200104.txt']

booth_key_filename = 'Remote-Booth-Station.csv'

In [3]:
turnstiles = pd.read_csv(data_path + data_file_list[0])
booth_key = pd.read_csv(data_path + booth_key_filename)
turnstiles, booth_key

(          C/A  UNIT       SCP        STATION LINENAME DIVISION        DATE  \
 0        A002  R051  02-00-00          59 ST  NQR456W      BMT  10/26/2019   
 1        A002  R051  02-00-00          59 ST  NQR456W      BMT  10/26/2019   
 2        A002  R051  02-00-00          59 ST  NQR456W      BMT  10/26/2019   
 3        A002  R051  02-00-00          59 ST  NQR456W      BMT  10/26/2019   
 4        A002  R051  02-00-00          59 ST  NQR456W      BMT  10/26/2019   
 ...       ...   ...       ...            ...      ...      ...         ...   
 206041  TRAM2  R469  00-05-01  RIT-ROOSEVELT        R      RIT  11/01/2019   
 206042  TRAM2  R469  00-05-01  RIT-ROOSEVELT        R      RIT  11/01/2019   
 206043  TRAM2  R469  00-05-01  RIT-ROOSEVELT        R      RIT  11/01/2019   
 206044  TRAM2  R469  00-05-01  RIT-ROOSEVELT        R      RIT  11/01/2019   
 206045  TRAM2  R469  00-05-01  RIT-ROOSEVELT        R      RIT  11/01/2019   
 
             TIME     DESC  ENTRIES  \
 0       00

In [4]:
turnstiles.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,7247322,2455491
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,04:00:00,REGULAR,7247336,2455499
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,08:00:00,REGULAR,7247351,2455532
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,12:00:00,REGULAR,7247463,2455623
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,16:00:00,REGULAR,7247755,2455679


In [5]:
turnstiles.columns = [x.strip() for x in turnstiles.columns]
turnstiles.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [6]:
turnstiles.rename(columns=lambda x: x.lower().replace(' ', '_'), inplace=True)

In [8]:
# Initialize the variables we're using for cleaning and summarizing
delta_entry = [0]
delta_exit = [0]
turnstiles['date'] = pd.to_datetime(turnstiles['date'])
turnstiles['time'] = pd.to_datetime(turnstiles['time'])
turnstiles['time'] = turnstiles['time'].dt.time
turnstiles['weekday'] = turnstiles['date'].dt.weekday_name

# Loop through the rows of the dataframe
for row in range(1, len(turnstiles)) :
    # Assume we're going to have a 0 change for each row
    entry_update = 0
    exit_update = 0
    
    # If we're still dealing with the same station
    if((turnstiles['station'][row] == turnstiles['station'][row - 1]) &
       (turnstiles['desc'][row] == 'REGULAR') &
       (turnstiles['date'][row] == turnstiles['date'][row - 1])) :
        
        # And we havn't changed a date
        if(turnstiles['entries'][row] > turnstiles['entries'][row - 1]) :
            entry_update = (turnstiles['entries'][row] - turnstiles['entries'][row - 1])
        
        if(turnstiles['exits'][row] > turnstiles['exits'][row - 1]) :
            exit_update = (turnstiles['exits'][row] - turnstiles['exits'][row - 1])
        
        # Check for 'outliers'
        if(entry_update > 20000) :
            entry_update = 0
            
        if(exit_update > 20000) :
            exit_update = 0
    
    # Update the lists
    delta_entry.append(entry_update)
    delta_exit.append(exit_update)


# Add new columns to the data frame for the newly calculated data
turnstiles['deltaEntry'] = delta_entry
turnstiles['deltaExit'] = delta_exit


In [9]:
turnstiles.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,weekday,deltaEntry,deltaExit
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-10-26,00:00:00,REGULAR,7247322,2455491,Saturday,0,0
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-10-26,04:00:00,REGULAR,7247336,2455499,Saturday,14,8
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-10-26,08:00:00,REGULAR,7247351,2455532,Saturday,15,33
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-10-26,12:00:00,REGULAR,7247463,2455623,Saturday,112,91
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-10-26,16:00:00,REGULAR,7247755,2455679,Saturday,292,56


In [12]:
turnstiles.groupby(['station', 'time'])[['deltaEntry', 'deltaExit']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,deltaEntry,deltaExit
station,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AV,00:00:00,0,0
1 AV,04:00:00,3043,2543
1 AV,08:00:00,7455,15606
1 AV,12:00:00,25108,28776
1 AV,16:00:00,25183,22008
...,...,...,...
ZEREGA AV,05:00:00,426,579
ZEREGA AV,09:00:00,6879,2805
ZEREGA AV,13:00:00,3206,2116
ZEREGA AV,17:00:00,3793,3802


In [13]:
import folium

In [14]:
def generateBaseMap(default_location=[40.693943, -73.985880], default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [15]:
basemap = generateBaseMap()
basemap

In [16]:
station_data = 'NYC_Transit_Subway_Entrance_And_Exit_Data.csv'
stations = pd.read_csv(data_path+station_data)

In [17]:
BBox = (stations['Entrance Longitude'].min(),   stations['Entrance Longitude'].max(),      
         stations['Entrance Latitude'].min(), stations['Entrance Latitude'].max())
BBox

(-74.03138299999999, 73.99332700000001, 40.575499, 40.903597)

In [18]:
stations.describe()

Unnamed: 0,Station Latitude,Station Longitude,Route8,Route9,Route10,Route11,Entrance Latitude,Entrance Longitude
count,1868.0,1868.0,48.0,28.0,23.0,23.0,1868.0,1868.0
mean,40.734065,-73.943609,2.979167,2.535714,3.0,7.0,40.734063,-73.86437
std,0.070376,0.05723,1.940493,1.170063,0.0,0.0,0.070382,3.423331
min,40.576127,-74.030876,1.0,2.0,3.0,7.0,40.575499,-74.031383
25%,40.689941,-73.987495,1.0,2.0,3.0,7.0,40.689808,-73.987482
50%,40.733422,-73.958145,4.0,2.0,3.0,7.0,40.733535,-73.958385
75%,40.768247,-73.911794,5.0,2.0,3.0,7.0,40.767655,-73.911159
max,40.903125,-73.755405,5.0,5.0,3.0,7.0,40.903597,73.993327


In [19]:
turnstiles.describe()

Unnamed: 0,entries,exits,deltaEntry,deltaExit
count,206046.0,206046.0,206046.0,206046.0
mean,41013410.0,33952140.0,161.158644,127.94856
std,211337500.0,195836000.0,261.822016,244.203327
min,0.0,0.0,0.0,0.0
25%,298731.2,138708.2,0.0,0.0
50%,1969911.0,1157290.0,40.0,31.0
75%,6621930.0,4505968.0,220.0,148.0
max,2129010000.0,2123659000.0,12526.0,10727.0


In [20]:
stations.head()

Unnamed: 0,Division,Line,Station Name,Station Latitude,Station Longitude,Route1,Route2,Route3,Route4,Route5,...,ADA,ADA Notes,Free Crossover,North South Street,East West Street,Corner,Entrance Latitude,Entrance Longitude,Station Location,Entrance Location
0,BMT,4 Avenue,25th St,40.660397,-73.998091,R,,,,,...,False,,False,4th Ave,25th St,SW,40.660489,-73.99822,"(40.660397, -73.998091)","(40.660489, -73.99822)"
1,BMT,4 Avenue,25th St,40.660397,-73.998091,R,,,,,...,False,,False,4th Ave,25th St,SE,40.660323,-73.997952,"(40.660397, -73.998091)","(40.660323, -73.997952)"
2,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NW,40.654676,-74.004306,"(40.655144, -74.003549)","(40.654676, -74.004306)"
3,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NE,40.654365,-74.004113,"(40.655144, -74.003549)","(40.654365, -74.004113)"
4,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NW,40.65449,-74.004499,"(40.655144, -74.003549)","(40.65449, -74.004499)"


In [21]:
stations.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)

In [22]:
stations.station_name.unique()

array(['25th St', '36th St', '45th St', '53rd St', '59th St', '77th St',
       '86th St', '95th St', '9th St', 'Atlantic Av-Barclays Ctr',
       'Bay Ridge Av', 'DeKalb Av', 'Pacific St', 'Prospect Av',
       'Union St', '30 Av-Grand Av', '36 Av-Washington Av',
       '39 Av-Beebe Av', 'Astoria Blvd-Hoyt Av', 'Broadway',
       'Ditmars Blvd', '7th Av', 'Atlantic Av', 'Av H', 'Av J', 'Av M',
       'Av U', 'Beverly Rd', 'Brighton Beach', 'Church Av',
       'Cortelyou Rd', 'Kings Highway', 'Neck Rd', 'Newkirk Av',
       'Ocean Parkway', 'Parkside Av', 'Prospect Park', 'Sheepshead Bay',
       'Stillwell Av', 'West 8th St', '23rd St', '28th St', '34th St',
       '49th St', '57th St', '5th Av', '8th St', 'Canal St (UL)',
       'City Hall', 'Cortlandt St', 'Court St', 'Lawrence St',
       'Lexington Av', 'Prince St', 'Rector St', 'Times Square-42nd St',
       'Union Square', 'Whitehall St', '104th St-102nd St', '111th St',
       '121st St', 'Alabama Av', 'Chauncey St', 'Cleveland

In [23]:
len(stations.station_name)

1868

In [24]:
stations.head()

Unnamed: 0,division,line,station_name,station_latitude,station_longitude,route1,route2,route3,route4,route5,...,ada,ada_notes,free_crossover,north_south_street,east_west_street,corner,entrance_latitude,entrance_longitude,station_location,entrance_location
0,BMT,4 Avenue,25th St,40.660397,-73.998091,R,,,,,...,False,,False,4th Ave,25th St,SW,40.660489,-73.99822,"(40.660397, -73.998091)","(40.660489, -73.99822)"
1,BMT,4 Avenue,25th St,40.660397,-73.998091,R,,,,,...,False,,False,4th Ave,25th St,SE,40.660323,-73.997952,"(40.660397, -73.998091)","(40.660323, -73.997952)"
2,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NW,40.654676,-74.004306,"(40.655144, -74.003549)","(40.654676, -74.004306)"
3,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NE,40.654365,-74.004113,"(40.655144, -74.003549)","(40.654365, -74.004113)"
4,BMT,4 Avenue,36th St,40.655144,-74.003549,N,R,,,,...,False,,True,4th Ave,36th St,NW,40.65449,-74.004499,"(40.655144, -74.003549)","(40.65449, -74.004499)"


In [25]:
stations.division.value_counts()

IND    726
IRT    700
BMT    442
Name: division, dtype: int64

In [26]:
from folium.plugins import HeatMap
stations_copy = stations.copy()
stations_copy['count'] = 1
basemap = generateBaseMap()
HeatMap(data=stations_copy[['station_latitude', 'station_longitude', 'count']].groupby(['station_latitude', 'station_longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(basemap)

<folium.plugins.heat_map.HeatMap at 0x7fbf987a75d0>

In [27]:
basemap

In [28]:
less_stations = stations_copy[stations_copy['division'] == 'BMT']

In [29]:
for lat,lon,station in zip(less_stations['station_latitude'],less_stations['station_longitude'],less_stations['station_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            fill=True,
                            fill_opacity=0.7,
                           ).add_to(basemap)

In [30]:
basemap

## Merge turnstiles and station data

In [31]:
turnstiles_copy = turnstiles.copy()

In [32]:
turnstiles_copy.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,deltaEntry,deltaExit,datetime,weekday
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,7247322,2455491,0,0,2019-10-26,Saturday
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,04:00:00,REGULAR,7247336,2455499,14,8,2019-10-26,Saturday
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,08:00:00,REGULAR,7247351,2455532,15,33,2019-10-26,Saturday
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,12:00:00,REGULAR,7247463,2455623,112,91,2019-10-26,Saturday
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,16:00:00,REGULAR,7247755,2455679,292,56,2019-10-26,Saturday


Clean station names to match `stations` DataFrame

In [33]:
turnstiles_copy['station'] = turnstiles_copy['station'].str.title()

Filter down the DataFrames to only include relevant data

In [34]:
turnstiles_copy = turnstiles_copy.loc[:, ['station', 'division', 'date', 'time', 'entries', 'exits', 'deltaEntry', 'deltaExit', 'datetime', 'weekday']]

In [35]:
turnstiles_copy.head()

Unnamed: 0,station,division,date,time,entries,exits,deltaEntry,deltaExit,datetime,weekday
0,59 St,BMT,10/26/2019,00:00:00,7247322,2455491,0,0,2019-10-26,Saturday
1,59 St,BMT,10/26/2019,04:00:00,7247336,2455499,14,8,2019-10-26,Saturday
2,59 St,BMT,10/26/2019,08:00:00,7247351,2455532,15,33,2019-10-26,Saturday
3,59 St,BMT,10/26/2019,12:00:00,7247463,2455623,112,91,2019-10-26,Saturday
4,59 St,BMT,10/26/2019,16:00:00,7247755,2455679,292,56,2019-10-26,Saturday


Convert stations with numerics to ordinal numbers

In [42]:
ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])
print([ordinal(n) for n in range(1, 32)])

['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th', '11st', '12nd', '13rd', '14th', '15th', '16th', '17th', '18th', '19th', '20th', '21st', '22nd', '23rd', '24th', '25th', '26th', '27th', '28th', '29th', '30th', '31st']


In [43]:
for x in turnstiles_copy.station:
    if any(char.isdigit() for char in x):
        ordinal(x)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

# COME BACK TO ABOVE TOMORROW AM TO GET MERGE RIGHT

In [44]:
stations_copy = stations_copy.loc[:, ['division', 'station_name', 'station_latitude', 'station_longitude']]

In [45]:
stations_copy.head()

Unnamed: 0,division,station_name,station_latitude,station_longitude
0,BMT,25th St,40.660397,-73.998091
1,BMT,25th St,40.660397,-73.998091
2,BMT,36th St,40.655144,-74.003549
3,BMT,36th St,40.655144,-74.003549
4,BMT,36th St,40.655144,-74.003549


In [46]:
stations_grouped = stations_copy.groupby(['station_name', 'division'], as_index=False)['station_latitude', 'station_longitude'].mean()

Merge DataFrames on station

In [47]:
merged_data = pd.merge(turnstiles_copy, stations_grouped, left_on='station', right_on='station_name', how='inner')

In [48]:
merged_data.columns

Index(['station', 'division_x', 'date', 'time', 'entries', 'exits',
       'deltaEntry', 'deltaExit', 'datetime', 'weekday', 'station_name',
       'division_y', 'station_latitude', 'station_longitude'],
      dtype='object')

In [49]:
merged_data.head()

Unnamed: 0,station,division_x,date,time,entries,exits,deltaEntry,deltaExit,datetime,weekday,station_name,division_y,station_latitude,station_longitude
0,Prince St,BMT,10/26/2019,00:00:00,12310132,4648104,0,0,2019-10-26,5,Prince St,BMT,40.724329,-73.997702
1,Prince St,BMT,10/26/2019,04:00:00,12310171,4648108,39,4,2019-10-26,5,Prince St,BMT,40.724329,-73.997702
2,Prince St,BMT,10/26/2019,08:00:00,12310177,4648132,6,24,2019-10-26,5,Prince St,BMT,40.724329,-73.997702
3,Prince St,BMT,10/26/2019,12:00:00,12310292,4648263,115,131,2019-10-26,5,Prince St,BMT,40.724329,-73.997702
4,Prince St,BMT,10/26/2019,16:00:00,12310875,4648594,583,331,2019-10-26,5,Prince St,BMT,40.724329,-73.997702


In [50]:
merged_data.shape

(90469, 14)