# Feature Engineering Part I: Features Depending on Station

#### The station name is one of the few supplied features relating to external information of the outside world that can impact turnstile counts. We can gather more information about it to engineer features that will improve our predictive power, starting from its location.

#### With GeoCoder, we can extract its map coordinates by searching the station name on Google Maps.

In [62]:
import pandas as pd

In [187]:
stationInfo = pd.DataFrame(fullTimeTable[['STATION','LINENAME']].drop_duplicates().values.astype(str),columns=['STATION','Trains'])
stationInfo=stationInfo.drop_duplicates(subset='STATION').reset_index(drop=True)
#stationInfo.to_pickle(path=saveToThisPath+'stationInfo.pickle')
#stationInfo=pd.read_pickle(path=saveToThisPath+'stationInfo.pickle')

# LATITUDE and LONGITUDE
from pygeocoder import Geocoder

stationInfo.to_pickle('stationInfo.csv')

stationInfo['Latitude']=""
stationInfo['Longitude']=""
stationInfo['Coordinates'] = ""

          STATION        Trains Latitude Longitude                Coordinates
0           59 ST       NQR456W  40.7617   -73.967  (40.7617367, -73.9669511)
1      5 AV/59 ST          NQRW  40.7643   -73.973  (40.7643054, -73.9730051)
2      57 ST-7 AV          NQRW  40.7656  -73.9803   (40.765564, -73.9803309)
3           49 ST          NQRW  40.7621  -73.9877  (40.7621246, -73.9876635)
4  TIMES SQ-42 ST  ACENQRS1237W  40.7559  -73.9871   (40.7558611, -73.987061)


In [273]:
for i in range(0,len(stationInfo)):
    coordinates=Geocoder().geocode(stationInfo.loc[i,'STATION']+", New York, NY")[0].coordinates
    stationInfo.loc[i,'Coordinates']=str(coordinates)
    stationInfo.loc[i,'Latitude']=coordinates[0]
    stationInfo.loc[i,'Longitude']=coordinates[1]
    print(len(stationInfo[stationInfo['Coordinates']!=""]))
    
stationInfo.to_pickle('stationInfo.csv')

print(stationInfo.head())

479
479
479
479
479
479
479
          STATION        Trains Latitude Longitude                Coordinates
0           59 ST       NQR456W  40.7617   -73.967  (40.7617367, -73.9669511)
1      5 AV/59 ST          NQRW  40.7643   -73.973  (40.7643054, -73.9730051)
2      57 ST-7 AV          NQRW  40.7656  -73.9803   (40.765564, -73.9803309)
3           49 ST          NQRW  40.7621  -73.9877  (40.7621246, -73.9876635)
4  TIMES SQ-42 ST  ACENQRS1237W  40.7559  -73.9871   (40.7558611, -73.987061)


#### Tracking the borough or part of a borough the station is in can help us narrow what we're looking at, as places in New York City just a mile apart can vary so much.

In [28]:
# BOROUGH
from geopy.geocoders import Nominatim
geolocator = Nominatim()
coordList = stationInfo[['Latitude','Longitude']].values.tolist()

def coordInfo(c):
    borough = str(geolocator.reverse(c)).split(', ')
    return borough

geolocator.reverse(coordList[0])



Location(Bloomingdale's, East 59th Street, Upper East Side, Midtown East, Manhattan, Manhattan Community Board 6, New York County, NYC, New York, 10022, United States of America, (40.76221035, -73.9671743455906, 0.0))

In [31]:
list1=   list(map(coordInfo,coordList[0:50]))

In [33]:
list2 =  list(map(coordInfo,coordList[51:100]))

In [35]:
list3 =  list(map(coordInfo,coordList[101:150]))

In [36]:
list4 =  list(map(coordInfo,coordList[151:200]))

In [38]:
list5 =  list(map(coordInfo,coordList[201:250]))

In [39]:
list6 =  list(map(coordInfo,coordList[251:300]))

In [40]:
list7 =  list(map(coordInfo,coordList[301:350]))

In [41]:
list8 =  list(map(coordInfo,coordList[351:400]))

In [42]:
list9 =  list(map(coordInfo,coordList[401:450]))

In [43]:
list10 =  list(map(coordInfo,coordList[451:479]))

In [None]:
bigList=list1+list2+list3+list4+list5+list6+list7+list8+list9+list10

In [None]:
stationInfo['Borough'] = pd.DataFrame(bigList).loc[:,3]

In [51]:
stationInfo.to_pickle('D:/MTA/stationInfo.pickle')

In [52]:
stationInfo.head()

Unnamed: 0,STATION,Trains,Latitude,Longitude,Coordinates,Borough
0,59 ST,NQR456W,40.7617,-73.967,"(40.7617367, -73.9669511)",Midtown East
1,5 AV/59 ST,NQRW,40.7643,-73.973,"(40.7643054, -73.9730051)",Upper East Side
2,57 ST-7 AV,NQRW,40.7656,-73.9803,"(40.765564, -73.9803309)",Manhattan
3,49 ST,NQRW,40.7621,-73.9877,"(40.7621246, -73.9876635)",Manhattan
4,TIMES SQ-42 ST,ACENQRS1237W,40.7559,-73.9871,"(40.7558611, -73.987061)",Manhattan


#### The distance to the closest station may impact whether a passenger will choose to take a train to get somewhere, rather than walk or take a taxi/Uber/Lyft instead. 

In [347]:
stationInfo=stationInfo.drop_duplicates(subset='STATION').reset_index(drop=True)

In [348]:
from math import sin, cos, sqrt, atan2
import pandas as pd
import numpy as np
import geopy.distance

def dist(lat1, long1, lat2, long2):
    coords_1=[lat1,long1]
    coords_2=[lat2,long2]
    return geopy.distance.vincenty(coords_1, coords_2).m
    
def find_closest_station(lat, long):
    distances = stationInfo.apply(lambda row: dist(lat, long, row['Latitude'], row['Longitude']), axis=1)
    return stationInfo.loc[distances.nsmallest(2).index.values[1], 'STATION']

def dist_to_closest(lat, long):
    distances = stationInfo.apply(lambda row: dist(lat, long, row['Latitude'], row['Longitude']), axis=1)
    return distances.nsmallest(2).values[1]

stationInfo['ClosestStation']=stationInfo[['Latitude','Longitude']].apply(lambda row: find_closest_station(row['Latitude'], row['Longitude']), axis=1)
stationInfo['distToClosestStation']=stationInfo[['Latitude','Longitude']].apply(lambda row: dist_to_closest(row['Latitude'], row['Longitude']), axis=1)



In [351]:
stationInfo['distToClosestStation'].sort_values(ascending=False).head()

373    98074.283935
248    98074.283935
205    19674.772636
237    13026.489971
113     4114.183269
Name: distToClosestStation, dtype: float64

In [352]:
stationInfo.loc[[373,248,205,237],:]

Unnamed: 0,STATION,Trains,Latitude,Longitude,Coordinates,Borough,ClosestStation,distToClosestStation,1,2,...,F,G,J,L,M,N,Q,R,S,W
373,RIT-MANHATTAN,R,43.0846,-77.6743,"(43.0845894, -77.67434449999999)",Manhattan,LACKAWANNA,98074.283935,0,0,...,0,0,0,0,0,0,0,1,0,0
248,LACKAWANNA,1,42.8256,-78.8234,"(42.8256141, -78.8233664)",Financial District,RIT-MANHATTAN,98074.283935,1,0,...,0,0,0,0,0,0,0,0,0,0
205,BROADWAY,G,41.1115,-73.8584,"(41.1115472, -73.85838059999999)",BK,HARRISON,19674.772636,0,0,...,0,1,0,0,0,0,0,0,0,0
237,HARRISON,1,40.97,-73.7176,"(40.9700171, -73.7176337)",Manhattan,ORCHARD BEACH,13026.489971,1,0,...,0,0,0,0,0,0,0,0,0,0
113,FOREST AVE,M,40.6256,-74.135,"(40.6255926, -74.13504069999999)",Middle Village,TOMPKINSVILLE,4114.183269,0,0,...,0,0,0,0,1,0,0,0,0,0


In [353]:
stationInfo.loc[[373,248,205,237],:]=np.nan

#### The availiability of a train one needs to take to get to their destination may impact if the rider chooses to enter that station over another. We will One Hot Encode a binary feature for the availability of each of the 22 trains in a row's station.

In [54]:
# Add takeable-Trains in Station? OneHotEncode.
trainList = ['1','2','3','4','5','6','7','A','B','C','D','E','F','G','J','L','M','N','Q','R','S','W']
for t in trainList:
    stationInfo[t]= stationInfo['Trains'].apply(lambda x: 1 if t in x else 0)

In [None]:
# OTHER POTENTIAL FEATURES WORTH OF ENGINEERING:

#['On E Line','On F Line',''On F Line']
# Num Stops to Penn
# Num Stops to GC
# Num Stops to TimesSq
# WHETHER ONE OF THESE IS REACHABLE BY DIRECTLY TAKING THIS STATION

#### Finally, we merge this onto the original table by the corresponding station.

In [357]:
len(stationInfo)

379

In [354]:
stationInfo.to_pickle('D:/MTA/stationInfo.pickle')

In [4]:
stationInfo=pd.read_pickle('D:/MTA/stationInfo.pickle')

In [3]:
import pandas as pd
fullTimeTable=pd.read_pickle('D:/MTA/fullTimeTable.pickle')

In [5]:
fullTimeTable=fullTimeTable.drop(['distToClosestStation','ClosestStation'],axis=1)

In [6]:
fullTimeTable=fullTimeTable.reset_index(drop=True).merge(stationInfo)
len(fullTimeTable)

8994256

# Feature Engineering Part II: Features Depending on Day

#### Of course, we also intend to see how time features relate to entry.

In [None]:
from datetime import datetime
from dateutil.parser import parse
dailyTable=pd.DataFrame(fullTimeTable['DATE'].drop_duplicates().values,columns=['Date'])
dailyTable['datetime']=[datetime.strptime(x, '%m/%d/%Y') for x in dailyTable['Date']]

#Add Weekday Column
dailyTable['Weekday'] = dailyTable['datetime'].dt.weekday_name

#Add Month Column
dailyTable['Mo'] = dailyTable['datetime'].dt.month

#Add Holiday Or Not Column
holidays = calendar().holidays(start=dailyTable['datetime'].min(), end=dailyTable['datetime'].max())
dailyTable['Holiday'] = dailyTable['datetime'].isin(holidays)

#### How hot or cold it is and whether its raining or snowing a lot can impact the turnout at a station, or even in the subway as a whole.

In [None]:
#Scrape the Mean Daily Temperature and Mean Precipitation
from WunderWeather import weather
from pprint import pprint
import arrow
extractor = weather.Extract('ce9f876ace16df4b') # API KEY

def getWeather(d):
    date = arrow.get(d,"MM/DD/YYYY")
    location = "NY/New York"
    response = extractor.date(location,date.format('YYYYMMDD'))
    return response.data['dailysummary'][0]['meantempi'],response.data['dailysummary'][0]['precipi']

dailyTable['Avg Temperature']=dailyTable['Date'].apply(lambda d: getWeather(d)[0])
dailyTable['Precipitation']=dailyTable['Date'].apply(lambda d: getWeather(d)[1])

In [None]:
saveToThisPath = "D:/MTA/"
dailyTable.to_pickle(path=saveToThisPath+'dailyTable.pickle')

In [30]:
dailyTable.head()

Unnamed: 0,Date,datetime,Weekday,Holiday,Avg Temperature,Precipitation
0,08/04/2018,2018-08-04,Saturday,False,78,0.74
1,08/05/2018,2018-08-05,Sunday,False,82,0.0
2,08/06/2018,2018-08-06,Monday,False,85,0.0
3,08/07/2018,2018-08-07,Tuesday,False,82,0.05
4,08/08/2018,2018-08-08,Wednesday,False,83,0.0


#### Finally, we merge this onto the original table by the corresponding date.

In [107]:
dailyTable=pd.read_pickle('D:/MTA/dailyTable.pickle')
fullTimeTable=pd.merge(fullTimeTable,dailyTable, left_on='DATE',right_on='datetime')

In [179]:
len(fullTimeTable)

9087106

In [11]:
fullTimeTable.to_pickle('D:/MTA/fullTimeTable.pickle')