In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle

import json
import plotly.plotly as py
from plotly.graph_objs import *

from fuzzywuzzy import fuzz

### Load Trip Data

In [125]:
### Load Trip Data

# Load old df
with open('old_clean.pkl', 'rb') as f:
    old_df = pickle.load(f)

# Load new df
with open('new_clean.pkl', 'rb') as f:
    new_df = pickle.load(f)

# Add Epoch to each DF to make it easy to split later
old_df['Epoch'] = 'Old'
new_df['Epoch'] = 'New'

#
def remap_usertype(df):
    usertype_map = {'Subscriber': 'Member', 'Customer': 'Casual'}
    df.UserType = df.UserType.map(usertype_map)
    return df

old_df = remap_usertype(old_df)
new_df = remap_usertype(new_df)

In [120]:
old_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StopID,StopName,BikeID,UserType,ZipCode,Gender,Epoch
0,397,2014-12-31 23:58:00,2015-01-01 00:05:00,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Member,2148.0,Male,Old
1,543,2014-12-31 23:51:00,2015-01-01 00:00:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Member,2143.0,Male,Old
2,928,2014-12-31 23:30:00,2014-12-31 23:46:00,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Member,2215.0,Male,Old
3,270,2014-12-31 23:26:00,2014-12-31 23:31:00,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Member,2043.0,Male,Old
4,960,2014-12-31 23:07:00,2014-12-31 23:23:00,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Member,1945.0,Male,Old


In [98]:
new_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StartLat,StartLon,StopID,StopName,StopLat,StopLon,BikeID,UserType,BirthYear,Gender,Epoch
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,78,Union Square - Somerville,42.379637,-71.095319,1380,Member,1983.0,Female,New
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,23,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,190,Nashua Street at Red Auerbach Way,42.365598,-71.064248,866,Member,1980.0,Male,New
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,178,MIT Pacific St at Purrington St,42.359573,-71.101295,184,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,35,Member,1990.0,Male,New
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,25,Tremont St / W Newton St,42.341332,-71.076847,21,Prudential Center / Belvidere,42.345959,-71.082578,1917,Casual,,Unknown,New
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,76,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,139,Dana Park,42.361589,-71.107437,603,Member,1990.0,Male,New


### Load Stations Data

In [6]:
station_info_path = '/home/cneiderer/Metis/boston_bike_data/station_data'
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [7]:
stations1_df = pd.read_csv(os.path.join(station_info_path, stations1_file))
stations2_df = pd.read_csv(os.path.join(station_info_path, stations2_file))

In [8]:
stations1_df = stations1_df[stations2_df.columns]
stations1_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations1_df = stations1_df.sort_values('StationID', axis=0)
stations1_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.1065,Boston,15


In [9]:
stations2_df = stations2_df.sort_values('Station ID', axis=0)
stations2_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations2_df = stations2_df.sort_values('StationID', axis=0)
stations2_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
3,A32000,Fan Pier,42.353287,-71.044389,Boston,15
4,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
5,A32002,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
6,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
7,A32004,Longwood Ave at Binney St,42.338629,-71.1065,Boston,15


### Combine Stations Data

In [85]:
# Combine station data into single DF
stations_df = pd.concat([stations1_df, stations2_df], axis=0)
# Remove duplicate rows
stations_df = stations_df[~stations_df.duplicated()]

# Compile all the names used to refer to each station ID
station_name_map = {}
for sid in set(stations_df.StationID):
    station_name_map[sid] = list(set(stations_df[stations_df.StationID == sid].StationName))
    
    # Designate a single name to be used to refer to each station ID
station_names = {}
for key, val in station_name_map.items():
    if len(val) > 1:
        val = val[1]
    else:
        val = val[0]
    station_names[key] = val
# Convert to dataframe    
station_names = pd.DataFrame.from_dict(d, 'index').reset_index()    
station_names.columns = ['StationID', 'StationName']
# Merge designated names back into combined stations DF
stations_df = station_names.merge(stations_df[['StationID', 'Lat', 'Lon', 'City', 'NumDocks']], on='StationID', how='left')
stations_df = stations_df[~stations_df.duplicated()].reset_index(drop=True)

In [87]:
stations_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,S32009,Packard Ave / Powderhouse Blvd,42.40449,-71.123413,Somerville,15
1,M32007,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,Cambridge,15
2,A32029,Piers Park,42.36489,-71.035042,Boston,19
3,D32003,TD Garden - Causeway at Portal Park #2,42.366222,-71.059914,Boston,27
4,M32048,Third at Binney,42.365445,-71.082771,Cambridge,15


In [127]:
# Plot bike stations on interactive map
credentials = json.load(open('/home/cneiderer/.mapbox/.credentials'))
mapbox_access_token = credentials['public_token']

data = Data([
    Scattermapbox(
        lat=stations_df.Lat,
        lon=stations_df.Lon,
        mode='markers',
        marker=Marker(
            color='red',
            size=10,
            opacity=0.7
        ),
        text=stations_df.StationName,
    )    
])
layout = Layout(
    autosize=True,
    hovermode='closest',
    width=1200,
    height=800,
    margin=Margin(
        l=25,
        r=25,
        b=25,
        t=25,
        pad=4
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=42.361769, 
            lon=-71.078249 #-71.0589
        ),
        pitch=0,
        zoom=11.5
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Multiple Mapbox')

### Merge Lat, Lon, City into Old Data

In [99]:
# Map start station info
old_df = old_df.merge(stations_df, how='left', left_on='StartID', right_on='StationID')
old_df = old_df.drop(columns=['StartName', 'StationID'])
old_df = old_df.rename(columns={'StationName': 'StartName', 'Lat': 'StartLat', 'Lon': 'StartLon'})
old_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StopID,StopName,BikeID,UserType,ZipCode,Gender,Epoch,StartName,StartLat,StartLon,City,NumDocks
0,397,2014-12-31 23:58:00,2015-01-01 00:05:00,B32004,D32022,TD Garden - Causeway at Portal Park #1,T01335,Member,2148.0,Male,Old,Aquarium T Stop - 200 Atlantic Ave,42.35977,-71.051601,Boston,19.0
1,543,2014-12-31 23:51:00,2015-01-01 00:00:00,M32006,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Member,2143.0,Male,Old,MIT at Mass Ave / Amherst St,42.3581,-71.093198,Cambridge,27.0
2,928,2014-12-31 23:30:00,2014-12-31 23:46:00,D32005,A32008,Buswell St. at Park Dr.,B01570,Member,2215.0,Male,Old,Copley Square - Dartmouth St at Boylston St,42.349673,-71.077303,Boston,25.0
3,270,2014-12-31 23:26:00,2014-12-31 23:31:00,B32004,A32010,South Station - 700 Atlantic Ave.,T01205,Member,2043.0,Male,Old,Aquarium T Stop - 200 Atlantic Ave,42.35977,-71.051601,Boston,19.0
4,960,2014-12-31 23:07:00,2014-12-31 23:23:00,D32010,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Member,1945.0,Male,Old,Cross St at Hanover St,42.362811,-71.056067,Boston,18.0


In [None]:
# # Map stop station info
# old_df = old_df.merge(stations_df, how='left', left_on='StopID', right_on='StationID', suffixes=('', 'Junk_'))
# old_df = old_df.drop(columns=['StopName', 'StationID'])
# old_df = old_df.rename(columns={'StationName': 'StopName', 'Lat': 'StopLat', 'Lon': 'StopLon'})
# old_df.head()

In [126]:
def map_station_info_to_old_data(data_df, stations_df, loc):
    data_df = data_df.merge(stations_df, how='left', left_on=loc+'ID', right_on='StationID', suffixes=('', '_Junk'))
    for col in data_df.columns:
        if col[-5:] == '_Junk':
            data_df.drop(columns=col)
    data_df = data_df.drop(columns=[loc+'Name', 'StationID'])
    rename_map = {'StationName': loc+'Name', 
                  'City': loc+'City',
                  'Lat': loc+'Lat', 
                  'Lon': loc+'Lon', 
                  'NumDocks': loc+'NumDocks'}
    data_df = data_df.rename(columns=rename_map)
    return data_df

old_df = map_station_info_to_old_data(old_df, stations_df, 'Start')
old_df = map_station_info_to_old_data(old_df, stations_df, 'Stop')
old_df = old_df.reindex(sorted(old_df.columns), axis=1)

old_df.head()


'.reindex_axis' is deprecated and will be removed in a future version. Use '.reindex' instead.



Unnamed: 0,BikeID,Duration,Epoch,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartNumDocks,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopNumDocks,StopTime,UserType,ZipCode
0,T01335,397,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,2014-12-31 23:58:00,Boston,D32022,42.365885,-71.064548,TD Garden - West End Park (formerly TD Garden ...,34.0,2015-01-01 00:05:00,Member,2148.0
1,T01426,543,Old,Male,Cambridge,M32006,42.3581,-71.093198,MIT at Mass Ave / Amherst St,27.0,2014-12-31 23:51:00,Cambridge,M32007,42.372969,-71.094445,Cambridge St - at Columbia St / Webster Ave,15.0,2015-01-01 00:00:00,Member,2143.0
2,B01570,928,Old,Male,Boston,D32005,42.349673,-71.077303,Copley Square - Dartmouth St at Boylston St,25.0,2014-12-31 23:30:00,Boston,A32008,42.347241,-71.105301,Park Dr at Buswell St,15.0,2014-12-31 23:46:00,Member,2215.0
3,T01205,270,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,2014-12-31 23:26:00,Boston,A32010,42.352175,-71.055547,South Station - 700 Atlantic Ave,46.0,2014-12-31 23:31:00,Member,2043.0
4,T01306,960,Old,Male,Boston,D32010,42.362811,-71.056067,Cross St at Hanover St,18.0,2014-12-31 23:07:00,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,2014-12-31 23:23:00,Member,1945.0


### Trip Info

In [None]:
new_df.head()

In [None]:
start_df = new_df[['StartID', 'StartName', 'StartLat', 'StartLon']]
start_df = start_df[~start_df.duplicated()]

In [None]:
stop_df = new_df[['StopID', 'StopName', 'StopLat', 'StopLon']]
stop_df = stop_df[~stop_df.duplicated()]

In [None]:
df = pd.DataFrame(np.concatenate([start_df.as_matrix(), stop_df.as_matrix()], axis=0), 
                  columns=['StationID', 'StationName', 'Lat', 'Lon'])
df = df[~df.duplicated()]
df = df.sort_values('StationID', axis=0)
df = df.reset_index(drop=True)
df.head()

In [None]:
df[(df == 0).any(axis=1)]

In [None]:
# Station mapping
station_map = {}
for sid in set(df.StationID):
    station_map[sid] = set(df[df.StationID == sid].StationName) 

In [None]:
station_map

In [None]:
# Find similarity of station naming
similarity = []
for key1, val1 in stations.items():
    for key2, val2 in station_map.items():
        item1 = list(val1)[0]
        for item2 in val2:
            similarity.append([key1, str(key2), item1, item2, 
                               fuzz.partial_ratio(str(item1), str(item2)),
                               fuzz.partial_token_sort_ratio(str(item1), str(item2)),
                               fuzz.partial_token_set_ratio(str(item1), str(item2))])        

In [None]:
# Convert similarity calculations to df
mapping_df = pd.DataFrame(similarity, columns=['StationID1', 'StationID2', 
                                               'StationName1', 'StationName2', 
                                               'PartialSimilarity', 'PartialTokenSort', 'PartialTokenSet'])
mapping_df

In [None]:
mapping_df.info()

In [None]:
# Find maximum similarity match
mapping1 = pd.DataFrame()
mapping2 = pd.DataFrame()
mapping3 = pd.DataFrame()
for ID in set(mapping_df.StationID1):
    tmp = mapping_df[mapping_df.StationID1 == ID]
    mapping1 = pd.concat([mapping1, pd.DataFrame(tmp.iloc[tmp['PartialSimilarity'].values.argmax(), :]).T], axis=0)
    mapping2 = pd.concat([mapping2, pd.DataFrame(tmp.iloc[tmp['PartialTokenSort'].values.argmax(), :]).T], axis=0)
    mapping3 = pd.concat([mapping3, pd.DataFrame(tmp.iloc[tmp['PartialTokenSet'].values.argmax(), :]).T], axis=0)

In [None]:
mapping1

In [None]:
# Spot check any similarity scores below 100
mapping1[mapping1.PartialSimilarity < 100]

In [None]:
mapping1 = mapping1.reset_index(drop=True).sort_values('StationID1')

In [None]:
mapping1

#### Map Station Info to Trip Data

In [None]:
# Map alpha-numeric ID to starting station
mapping1.StationID2 = mapping1.StationID2.astype(int)
tmp_df = new_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StartID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StartID': 'StartNum', 'StationID1': 'StartID'})
tmp_df = tmp_df.drop(columns='StationID2')
# Map alpha-numeric ID to stopping station
tmp_df = tmp_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StopID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StopID': 'StopNum', 'StationID1': 'StopID'})
tmp_df = tmp_df.drop(columns=['StationID2', 'StartNum', 'StopNum'])
# Map start city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City', 'NumDocks']], how='left', left_on='StartID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StartCity'})
tmp_df = tmp_df.drop(columns=['StationID'])
# Map stop city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StopCity'})
tmp_df = tmp_df.drop(columns=['StationID'])

In [None]:
tmp_df = tmp_df[sorted(tmp_df.columns)]
tmp_df.head()

In [None]:
tmp_df.shape

In [None]:
# Save new df
with open('new_mapped.pkl', 'wb') as f:
    pickle.dump(tmp_df, f)

In [None]:
# Map Start Lat, Lon, and City
tmp2_df = old_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City', 'NumDocks']], how='left', left_on='StartID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StartLat', 'Lon': 'StartLon', 'City': 'StartCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])
# Map Stop Lat, Lon, and City
tmp2_df = tmp2_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StopLat', 'Lon': 'StopLon', 'City': 'StopCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])

In [None]:
tmp_df2 = tmp2_df[sorted(tmp2_df.columns)]
tmp_df2.head()

In [None]:
tmp2_df.shape

In [None]:
# Save old df
with open('old_mapped.pkl', 'wb') as f:
    pickle.dump(tmp2_df, f)

#### Combine Old Data with New Data

In [None]:
set(tmp_df.columns).symmetric_difference(set(tmp2_df.columns))

In [None]:
combo = pd.concat([tmp_df, tmp2_df], axis=0)
combo = combo.drop(columns=['UserType', 'ZipCode'])

In [None]:
combo.head()

In [None]:
# Save combo df
with open('combo.pkl', 'wb') as f:
    pickle.dump(combo, f)