In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as mat
import datetime as dt
sys.path.append('../src')
import mta_data_clean as mc

In [2]:
# Read the datafiles
data_path = '../data/'
pickle_file = 'mta_data.pickle'
'''
data_file_list = ['turnstile_191102.txt',
                 'turnstile_191109.txt',
                 'turnstile_191116.txt',
                 'turnstile_191130.txt',
                 'turnstile_191207.txt',
                 'turnstile_191214.txt',
                 'turnstile_191221.txt',
                 'turnstile_191228.txt'
                 'turnstile_200104.txt']
'''
data_file_list = ['turnstile_191102.txt',
                 'turnstile_191005.txt']
booth_key_filename = 'Remote-Booth-Station.csv'

In [3]:
df = mc.mta_data_pickle_read(pickle_file)

In [5]:
df.describe()

AttributeError: 'NoneType' object has no attribute 'describe'

In [None]:
turnstiles = pd.read_csv(data_path + data_file_list[0])
booth_key = pd.read_csv(data_path + booth_key_filename)
turnstiles, booth_key

In [None]:
turnstiles.head()

In [None]:
turnstiles.columns = [x.strip() for x in turnstiles.columns]
turnstiles.columns

In [None]:
turnstiles.rename(columns=lambda x: x.lower().replace(' ', '_'), inplace=True)

In [None]:
# Initialize the variables we're using for cleaning and summarizing
delta_entry = [0]
delta_exit = [0]
turnstiles['date'] = pd.to_datetime(turnstiles['date'])
turnstiles['time'] = pd.to_datetime(turnstiles['time'])
turnstiles['time'] = turnstiles['time'].dt.time
turnstiles['weekday'] = turnstiles['date'].dt.weekday_name

# Loop through the rows of the dataframe
for row in range(1, len(turnstiles)) :
    # Assume we're going to have a 0 change for each row
    entry_update = 0
    exit_update = 0
    
    # If we're still dealing with the same station
    if((turnstiles['station'][row] == turnstiles['station'][row - 1]) &
       (turnstiles['desc'][row] == 'REGULAR') &
       (turnstiles['date'][row] == turnstiles['date'][row - 1])) :
        
        # And we havn't changed a date
        if(turnstiles['entries'][row] > turnstiles['entries'][row - 1]) :
            entry_update = (turnstiles['entries'][row] - turnstiles['entries'][row - 1])
        
        if(turnstiles['exits'][row] > turnstiles['exits'][row - 1]) :
            exit_update = (turnstiles['exits'][row] - turnstiles['exits'][row - 1])
        
        # Check for 'outliers'
        if(entry_update > 20000) :
            entry_update = 0
            
        if(exit_update > 20000) :
            exit_update = 0
    
    # Update the lists
    delta_entry.append(entry_update)
    delta_exit.append(exit_update)


# Add new columns to the data frame for the newly calculated data
turnstiles['deltaEntry'] = delta_entry
turnstiles['deltaExit'] = delta_exit

In [None]:
turnstiles.head()

In [None]:
turnstiles.groupby(['station', 'time'])[['deltaEntry', 'deltaExit']].sum()

In [None]:
import folium

In [None]:
def generateBaseMap(default_location=[40.693943, -73.985880], default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [None]:
basemap = generateBaseMap()
basemap

In [None]:
station_data = 'NYC_Transit_Subway_Entrance_And_Exit_Data.csv'
stations = pd.read_csv(data_path+station_data)

In [None]:
BBox = (stations['Entrance Longitude'].min(),   stations['Entrance Longitude'].max(),      
         stations['Entrance Latitude'].min(), stations['Entrance Latitude'].max())
BBox

In [None]:
stations.describe()

In [None]:
turnstiles.describe()

In [None]:
stations.head()

In [None]:
stations.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)

In [None]:
stations.station_name.unique()

In [None]:
len(stations.station_name)

In [None]:
stations.head()

In [None]:
stations.division.value_counts()

In [None]:
from folium.plugins import HeatMap
stations_copy = stations.copy()
stations_copy['count'] = 1
basemap = generateBaseMap()
HeatMap(data=stations_copy[['station_latitude', 'station_longitude', 'count']].groupby(['station_latitude', 'station_longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(basemap);

In [None]:
basemap

In [None]:
less_stations = stations_copy[stations_copy['division'] == 'BMT']

In [None]:
for lat,lon,station in zip(less_stations['station_latitude'],less_stations['station_longitude'],less_stations['station_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            fill=True,
                            fill_opacity=0.7,
                           ).add_to(basemap)

In [None]:
basemap

## Merge turnstiles and station data

In [None]:
turnstiles_copy = turnstiles.copy()

In [None]:
turnstiles_copy.head()

Clean station names to match `stations` DataFrame

In [None]:
turnstiles_copy['station'] = turnstiles_copy['station'].str.title()

Filter down the DataFrames to only include relevant data

Convert stations with numerics to ordinal numbers

In [None]:
ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])
print([ordinal(n) for n in range(1, 32)])

In [None]:
for x in turnstiles_copy.station:
    if any(char.isdigit() for char in x):
        ordinal(x)

# COME BACK TO ABOVE TOMORROW AM TO GET MERGE RIGHT

In [None]:
stations_copy = stations_copy.loc[:, ['division', 'station_name', 'station_latitude', 'station_longitude']]

In [None]:
stations_copy.head()

In [None]:
stations_grouped = stations_copy.groupby(['station_name', 'division'], as_index=False)['station_latitude', 'station_longitude'].mean()

Merge DataFrames on station

In [None]:
merged_data = pd.merge(turnstiles_copy, stations_grouped, left_on='station', right_on='station_name', how='inner')

In [None]:
merged_data.columns

In [None]:
merged_data.head()

In [None]:
merged_data.shape

# Only Manhattan Stations

In [None]:
new_stations = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')

In [None]:
new_stations.head()

In [None]:
new_stations.Borough.value_counts()

In [None]:
manhattan_stations = new_stations[new_stations['Borough'] == 'M']
manhattan_stations.shape

In [None]:
manhattan_stations.columns

In [None]:
manhattan_stations['Stop Name'].value_counts()

In [None]:
import folium

In [None]:
def generateBaseMap(default_location=[40.693943, -73.985880], default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [None]:
from folium.plugins import HeatMap
manhattan_copy = manhattan_stations.copy()
manhattan_copy['count'] = 1
basemap = generateBaseMap()
HeatMap(data=manhattan_copy[['GTFS Latitude', 'GTFS Longitude', 'count']].groupby(['GTFS Latitude', 'GTFS Longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(basemap)

In [None]:
basemap

In [None]:
for lat,lon,station in zip(manhattan_copy['GTFS Latitude'],manhattan_copy['GTFS Longitude'],manhattan_copy['Stop Name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            fill=True,
                            fill_opacity=0.7,
                           ).add_to(basemap)

In [None]:
basemap

<font color="red"><H1>Merge Stations and Turnstiles Data</H1></font>

In [None]:
turnstiles_copy.head()

In [None]:
manhattan_copy.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)

In [None]:
manhattan_copy.head()

In [None]:
manhattan_copy = manhattan_copy.loc[:, ['stop_name', 'gtfs_latitude', 'gtfs_longitude']]

In [None]:
combined_data = pd.merge(turnstiles_copy, manhattan_copy, left_on='station', right_on='stop_name')

In [None]:
combined_data.head()

In [None]:
combined_data.dtypes

<font color="red"><H1>Add tech and women's health companies</H1></font>

In [None]:
tech_file = 'tech_ny.csv'
womens_health_file = 'womenshealth_ny.csv'

In [None]:
tech_companies = pd.read_csv(data_path + tech_file)
tech_companies.head()

In [None]:
womens_health_companies = pd.read_csv(data_path + womens_health_file)
womens_health_companies.head()

<font color="red"><H1>Replot using combined data for traffic densities and add tech companies</H1></font>

In [None]:
import folium

In [None]:
def generateBaseMap(default_location=[40.758896, -73.985130], default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [None]:
combined_data.head()

## Subway Stations Map

In [None]:
stations_map = generateBaseMap()
for lat,lon,station in zip(manhattan_copy['gtfs_latitude'],manhattan_copy['gtfs_longitude'],manhattan_copy['stop_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            radius=2.5,
                            fill=True,
                            fill_opacity=0.8,
                           ).add_to(stations_map)

In [None]:
stations_map

## Company Locations Map

Tech Companies

In [None]:
tech_map = generateBaseMap(default_zoom_start=12.5)
for lat,lon,company in zip(tech_companies['latitude'],tech_companies['longitude'],tech_companies['name']):
     folium.Marker([lat, lon],
                        popup=company,
                        icon=folium.Icon(color='darkblue'),
                        radius=2.5,
                        fill=True,
                        fill_opacity=0.8,
                       ).add_to(tech_map)

In [None]:
tech_map

Women's Health Map

In [None]:
womens_health_map = generateBaseMap()
for lat,lon,company in zip(womens_health_companies['latitude'],womens_health_companies['longitude'],womens_health_companies['name']):
     folium.Marker([lat, lon],
                        popup=company,
                        icon=folium.Icon(color='red'),
                        radius=2.5,
                        fill=True,
                        fill_opacity=0.8,
                       ).add_to(womens_health_map)

In [None]:
womens_health_map

## Combined tech and womens health maps

In [None]:
for lat,lon,company in zip(womens_health_companies['latitude'],womens_health_companies['longitude'],womens_health_companies['name']):
     folium.Marker([lat, lon],
                        popup=company,
                        icon=folium.Icon(color='red'),
                        radius=2.5,
                        fill=True,
                        fill_opacity=0.8,
                       ).add_to(tech_map)

In [None]:
tech_map

## Entries Map

In [None]:
from folium.plugins import HeatMap
combined_copy = combined_data.copy()
entries_map = generateBaseMap()
HeatMap(data=combined_copy[['gtfs_latitude', 'gtfs_longitude', 'deltaEntry']].groupby(['gtfs_latitude', 'gtfs_longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(entries_map);

In [None]:
entries_map

In [None]:
for lat,lon,station in zip(manhattan_copy['gtfs_latitude'],manhattan_copy['gtfs_longitude'],manhattan_copy['stop_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            radius=2.5,
                            fill=True,
                            fill_opacity=0.8,
                           ).add_to(entries_map)

In [None]:
entries_map

## Exits Map

In [None]:
from folium.plugins import HeatMap
combined_copy = combined_data.copy()
exits_map = generateBaseMap()
HeatMap(data=combined_copy[['gtfs_latitude', 'gtfs_longitude', 'deltaExit']].groupby(['gtfs_latitude', 'gtfs_longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(exits_map);

In [None]:
exits_map

In [None]:
for lat,lon,station in zip(manhattan_copy['gtfs_latitude'],manhattan_copy['gtfs_longitude'],manhattan_copy['stop_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            radius=2.5,
                            fill=True,
                            fill_opacity=0.8,
                           ).add_to(exits_map)

In [None]:
exits_map

## Combined Traffic Map + Layers

In [None]:
from folium.plugins import HeatMap
combined_copy = combined_data.copy()
traffic_map = generateBaseMap()
combined_copy['total_traffic'] = combined_copy['deltaExit'] + combined_copy['deltaEntry']
HeatMap(data=combined_copy[['gtfs_latitude', 'gtfs_longitude', 'total_traffic']].groupby(['gtfs_latitude', 'gtfs_longitude']).sum().reset_index().values.tolist(), radius=10, max_zoom=13).add_to(traffic_map);

In [None]:
traffic_map

### Add stations

In [None]:
for lat,lon,station in zip(manhattan_copy['gtfs_latitude'],manhattan_copy['gtfs_longitude'],manhattan_copy['stop_name']):
     folium.CircleMarker([lat, lon],
                            popup=station,
                            color='b',
                            radius=2.5,
                            fill=True,
                            fill_opacity=0.8,
                           ).add_to(traffic_map)

In [None]:
traffic_map

## Add companies

In [None]:
tech_map = generateBaseMap(default_zoom_start=12.5)
for lat,lon,company in zip(tech_companies['latitude'],tech_companies['longitude'],tech_companies['name']):
     folium.Marker([lat, lon],
                        popup=company,
                        icon=folium.Icon(color='darkblue'),
                        radius=2.5,
                        fill=True,
                        fill_opacity=0.8,
                       ).add_to(tech_map)

In [None]:
for lat,lon,company in zip(womens_health_companies['latitude'],womens_health_companies['longitude'],womens_health_companies['name']):
     folium.Marker([lat, lon],
                        popup=company,
                        icon=folium.Icon(color='red'),
                        radius=2.5,
                        fill=True,
                        fill_opacity=0.8,
                       ).add_to(tech_map)

In [None]:
tech_map