In [62]:
import pandas as pd
import numpy as np
import math
%matplotlib inline
import matplotlib.pyplot as plt
#default size for every figure drawn in this notebook (matplotlib reads it as width, height in inches)
plt.rcParams['figure.figsize'] = [20, 10]
#switch off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
#NOTE(review): this is global, so it also hides genuine assign-to-a-copy mistakes in later cells
pd.options.mode.chained_assignment = None  # default='warn'

In [63]:
#load the raw trip records; each row carries the start and end station of one trip
raw_trips_path = '../data/raw/201501-201611-hubway-tripdata.csv'
trips = pd.read_csv(raw_trips_path)
trips.head()

Unnamed: 0,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,Porter Square Station,42.387995,-71.119084,96,Cambridge Main Library at Broadway / Trowbridg...,42.373379,-71.111075
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,MIT Stata Center at Vassar St / Main St,42.361962,-71.092053,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,One Kendall Square at Hampshire St / Portland St,42.366277,-71.09169,68,Central Square at Mass Ave / Essex St,42.36507,-71.1031
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,Porter Square Station,42.387995,-71.119084,96,Cambridge Main Library at Broadway / Trowbridg...,42.373379,-71.111075
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,Lower Cambridgeport at Magazine St/Riverside Rd,42.356954,-71.113687,88,Inman Square at Vellucci Plaza / Hampshire St,42.374035,-71.101427


In [64]:
#convert the timestamp columns from strings to datetime64[ns].
#pd.to_datetime parses the whole column at once instead of calling
#pd.Timestamp once per row, which matters on a trip table of this size
for timestamp_column in ('starttime', 'stoptime'):
    trips[timestamp_column] = pd.to_datetime(trips[timestamp_column])
trips.dtypes

starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start_station_id                    int64
start_station_name                 object
start_station_latitude            float64
start_station_longitude           float64
end_station_id                      int64
end_station_name                   object
end_station_latitude              float64
end_station_longitude             float64
dtype: object

In [65]:
#split the trip columns into a departure-side and an arrival-side frame
#and rename both to a common format so they can be stacked
start_stations = trips[['start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'starttime']]
start_stations = start_stations.rename(columns={
    'start_station_id': 'station_id',
    'start_station_name': 'station_name',
    'start_station_latitude': 'latitude',
    'start_station_longitude': 'longitude',
    'starttime': 'timestamp'
})

end_stations = trips[['end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'stoptime']]
end_stations = end_stations.rename(columns={
    'end_station_id': 'station_id',
    'end_station_name': 'station_name',
    'end_station_latitude': 'latitude',
    'end_station_longitude': 'longitude',
    'stoptime': 'timestamp'
})

#merge both sets into one. pd.concat is used because DataFrame.append was
#deprecated in pandas 1.4 and removed in pandas 2.0; like append, it keeps
#the original row labels (no ignore_index)
start_end_stations = pd.concat([start_stations, end_stations])

#save the station data: drop the timestamp, keep one row per station id
#and index the result by station id
stations = (
    start_end_stations[['station_id', 'station_name', 'latitude', 'longitude']]
    .sort_values(by=['station_id'], ascending=True)
    .drop_duplicates(subset=['station_id'], keep='first')
    .set_index('station_id')
)
stations.head()

Unnamed: 0_level_0,station_name,latitude,longitude
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,Colleges of the Fenway,42.340021,-71.100812
4,Tremont St. at Berkeley St.,42.345392,-71.069616
5,Northeastern U / North Parking Lot,42.341814,-71.090179
6,Cambridge St. at Joy St.,42.361174,-71.065142
7,Fan Pier,42.353287,-71.044389


In [66]:
#reduce the full set of start and end data to station id and timestamp.
#sort by timestamp, with older timestamps on top.
#start_end_stations stays sorted because the last-used cell relies on this order
start_end_stations = start_end_stations[['station_id', 'timestamp']]
start_end_stations = start_end_stations.sort_values(by=['timestamp'], ascending=True)

#the first row per station is its oldest departure or arrival.
#the date column is added with .assign on a fresh frame instead of writing
#into the result of drop_duplicates, so no SettingWithCopyWarning is involved
first_used = (
    start_end_stations
    .drop_duplicates(subset=['station_id'], keep='first')
    .assign(first_used=lambda frame: frame['timestamp'].dt.date)  #keep the date only
    .drop(columns=['timestamp'])
    .sort_values(by=['station_id'], ascending=True)
    .set_index('station_id')
)
first_used.head()

Unnamed: 0_level_0,first_used
station_id,Unnamed: 1_level_1
3,2015-04-17
4,2015-01-02
5,2015-04-17
6,2015-01-01
7,2015-04-17


In [67]:
#most recent arrival or departure per station.
#relies on start_end_stations already being sorted by timestamp (oldest first),
#so the last row per station id is the newest one.
#the date column is added with .assign on a fresh frame instead of writing
#into the result of drop_duplicates, so no SettingWithCopyWarning is involved
last_used = (
    start_end_stations
    .drop_duplicates(subset=['station_id'], keep='last')
    .assign(last_used=lambda frame: frame['timestamp'].dt.date)  #keep the date only
    .drop(columns=['timestamp'])
    .sort_values(by=['station_id'], ascending=True)
    .set_index('station_id')
)
last_used.head()

Unnamed: 0_level_0,last_used
station_id,Unnamed: 1_level_1
3,2016-11-30
4,2016-11-30
5,2016-11-30
6,2016-11-30
7,2016-11-30


In [68]:
#join the station data with the first and last usage dates on the station id index.
#the three base columns are selected first so that re-running this cell does not
#fail with a column-overlap error once first_used / last_used are already attached
stations = (
    stations[['station_name', 'latitude', 'longitude']]
    .join(first_used, how='outer')
    .join(last_used, how='outer')
)
stations.head()

Unnamed: 0_level_0,station_name,latitude,longitude,first_used,last_used
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,Colleges of the Fenway,42.340021,-71.100812,2015-04-17,2016-11-30
4,Tremont St. at Berkeley St.,42.345392,-71.069616,2015-01-02,2016-11-30
5,Northeastern U / North Parking Lot,42.341814,-71.090179,2015-04-17,2016-11-30
6,Cambridge St. at Joy St.,42.361174,-71.065142,2015-01-01,2016-11-30
7,Fan Pier,42.353287,-71.044389,2015-04-17,2016-11-30


In [69]:
#one row per station id, so the row count is the number of stations
station_count = len(stations)
print("Number of stations: ", station_count)

Number of stations:  189


In [89]:
#write the station table (indexed by station id) to the model input folder
stations.to_csv(path_or_buf='../data/modelInput/stations.csv')