In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle

import json
import plotly.plotly as py
from plotly.graph_objs import *

from fuzzywuzzy import fuzz

### Load Trip Data

In [3]:
# Load old df
with open('old_clean.pkl', 'rb') as f:
    old_df = pickle.load(f)

# Load new df
with open('new_clean.pkl', 'rb') as f:
    new_df = pickle.load(f)

# Add Epoch to each DF to make it easy to split later
old_df['Epoch'] = 'Old'
new_df['Epoch'] = 'New'

#
def remap_usertype(df):
    usertype_map = {'Subscriber': 'Member', 'Customer': 'Casual'}
    df.UserType = df.UserType.map(usertype_map)
    return df

old_df = remap_usertype(old_df)
new_df = remap_usertype(new_df)

### Load Stations Data

In [4]:
station_info_path = '/home/cneiderer/Metis/boston_bike_data/station_data'
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [5]:
stations1_df = pd.read_csv(os.path.join(station_info_path, stations1_file))
stations2_df = pd.read_csv(os.path.join(station_info_path, stations2_file))

In [6]:
stations1_df = stations1_df[stations2_df.columns]
stations1_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations1_df = stations1_df.sort_values('StationID', axis=0)
stations1_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.1065,Boston,15


In [7]:
stations2_df = stations2_df.sort_values('Station ID', axis=0)
stations2_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations2_df = stations2_df.sort_values('StationID', axis=0)
stations2_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
3,A32000,Fan Pier,42.353287,-71.044389,Boston,15
4,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
5,A32002,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
6,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
7,A32004,Longwood Ave at Binney St,42.338629,-71.1065,Boston,15


### Combine Stations Data

In [8]:
# Combine station data into single DF
stations_df = pd.concat([stations1_df, stations2_df], axis=0)
# Remove duplicate rows
stations_df = stations_df[~stations_df.duplicated()]

# Compile all the names used to refer to each station ID
station_name_map = {}
for sid in set(stations_df.StationID):
    station_name_map[sid] = list(set(stations_df[stations_df.StationID == sid].StationName))
    
    # Designate a single name to be used to refer to each station ID
station_names = {}
for key, val in station_name_map.items():
    if len(val) > 1:
        val = val[1]
    else:
        val = val[0]
    station_names[key] = val
# Convert to dataframe    
station_names = pd.DataFrame.from_dict(station_names, 'index').reset_index()    
station_names.columns = ['StationID', 'StationName']
# Merge designated names back into combined stations DF
stations_df = station_names.merge(stations_df[['StationID', 'Lat', 'Lon', 'City', 'NumDocks']], on='StationID', how='left')
stations_df = stations_df[~stations_df.duplicated()].reset_index(drop=True)

In [9]:
stations_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,M32049,Brian P. Murphy Staircase at Child Street,42.37206,-71.072026,Cambridge,23
1,M32047,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,Cambridge,19
2,C32006,Washington St at Rutland St,42.338493,-71.074019,Boston,19
3,A32019,Harvard University Transportation Services - 1...,42.363796,-71.129164,Boston,18
4,M32032,Kendall Street,42.36356,-71.082168,Cambridge,15


In [10]:
# Plot bike stations on interactive map
credentials = json.load(open('/home/cneiderer/.mapbox/.credentials'))
mapbox_access_token = credentials['public_token']

data = Data([
    Scattermapbox(
        lat=stations_df.Lat,
        lon=stations_df.Lon,
        mode='markers',
        marker=Marker(
            color='red',
            size=10,
            opacity=0.7
        ),
        text=stations_df.StationName,
    )    
])
layout = Layout(
    autosize=True,
    hovermode='closest',
    width=1200,
    height=800,
    margin=Margin(
        l=25,
        r=25,
        b=25,
        t=25,
        pad=4
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=42.361769, 
            lon=-71.078249 #-71.0589
        ),
        pitch=0,
        zoom=11.5
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Multiple Mapbox')

### Map Lat, Lon, City into Old Data

In [11]:
def map_station_info_to_old_data(data_df, stations_df, loc):
    data_df = data_df.merge(stations_df, how='left', left_on=loc+'ID', right_on='StationID', suffixes=('', '_Junk'))
    for col in data_df.columns:
        if col[-5:] == '_Junk':
            data_df.drop(columns=col)
    data_df[loc+'OrigName'] = data_df[loc+'Name']
    data_df = data_df.drop(columns=[loc+'Name', 'StationID'])
    rename_map = {'StationName': loc+'Name', 
                  'City': loc+'City',
                  'Lat': loc+'Lat', 
                  'Lon': loc+'Lon', 
                  'NumDocks': loc+'NumDocks'}
    data_df = data_df.rename(columns=rename_map)
    return data_df

wold_df = map_station_info_to_old_data(old_df, stations_df, 'Start')
wold_df = map_station_info_to_old_data(wold_df, stations_df, 'Stop')

# Remove rows that have undefined station info
wold_df = wold_df[~(wold_df.StartCity.isna() | wold_df.StopCity.isna())]

wold_df = wold_df[sorted(wold_df.columns)]
wold_df.head()

Unnamed: 0,BikeID,Duration,Epoch,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartNumDocks,StartOrigName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopNumDocks,StopOrigName,StopTime,UserType,ZipCode
0,T01335,397,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium Station - 200 Atlantic Ave.,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:58:00,Boston,D32022,42.365885,-71.064548,TD Garden - West End Park (formerly TD Garden ...,34.0,TD Garden - Causeway at Portal Park #1,2015-01-01 00:05:00,Member,2148.0
1,T01426,543,Old,Male,Cambridge,M32006,42.3581,-71.093198,MIT at Mass Ave / Amherst St,27.0,MIT at Mass Ave / Amherst St,2014-12-31 23:51:00,Cambridge,M32007,42.372969,-71.094445,Cambridge St - at Columbia St / Webster Ave,15.0,Cambridge St - at Columbia St / Webster Ave,2015-01-01 00:00:00,Member,2143.0
2,B01570,928,Old,Male,Boston,D32005,42.349673,-71.077303,Copley Square - Dartmouth St at Boylston St,25.0,Boston Public Library - 700 Boylston St.,2014-12-31 23:30:00,Boston,A32008,42.347241,-71.105301,Park Dr at Buswell St,15.0,Buswell St. at Park Dr.,2014-12-31 23:46:00,Member,2215.0
3,T01205,270,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium Station - 200 Atlantic Ave.,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:26:00,Boston,A32010,42.352175,-71.055547,South Station - 700 Atlantic Ave.,46.0,South Station - 700 Atlantic Ave.,2014-12-31 23:31:00,Member,2043.0
4,T01306,960,Old,Male,Boston,D32010,42.362811,-71.056067,Cross St at Hanover St,18.0,Cross St. at Hanover St.,2014-12-31 23:07:00,Boston,B32004,42.35977,-71.051601,Aquarium Station - 200 Atlantic Ave.,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:23:00,Member,1945.0


In [12]:
# Find Stations that existed at start of Hubway but no longer exist in system
wold_df[wold_df.StartCity.isna() | wold_df.StopCity.isna()]

Unnamed: 0,BikeID,Duration,Epoch,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartNumDocks,StartOrigName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopNumDocks,StopOrigName,StopTime,UserType,ZipCode


In [None]:
# missing_data = [['A32007', 'Harvard Real Estate - North Harvard St at Western Ave', 'Boston', 42.3631911, -71.1319868],
#                 ['B32009', 'Overland St at Brookline Ave', 'Boston', 42.3460641, -71.1019295],
#                 ['B32019', 'Longwood Ave/Riverway', 'Boston', 42.3403202, -71.112122],
#                 ['C32011', 'South Bay Plaza', 'Boston', 42.3341162, -71.0698326],
#                 ['D32025', 'Milk St at India St', 'Boston', 42.3584621, -71.0554896],
#                 ['K32003', 'Brookline Village - Station Street @ MBTA', 'Brookline', 42.3327674, -71.1191912],
#                 ['S32007', 'Ball Sq', 'Somerville', 42.399747, -71.1132117],
#                 ['S32012', 'Summer St at Cutter St', 'Somerville', 42.3940975, -71.1228071]]

# missing_df = pd.DataFrame(missing_data, columns=['StationID', 'StationName', 'City', 'Lat', 'Lon'])
# missing_df

In [None]:
# Save old df
with open('old.pkl', 'wb') as f:
    pickle.dump(wold_df, f)

### Compile Station Info from New Trip Data

In [None]:
new_df.head()

In [None]:
# Compile start stations
start_df = new_df[['StartID', 'StartName', 'StartLat', 'StartLon']]
start_df = start_df[~start_df.duplicated()]

# Compile stop stations
stop_df = new_df[['StopID', 'StopName', 'StopLat', 'StopLon']]
stop_df = stop_df[~stop_df.duplicated()]

# Combine start and stop stations
trip_stations_df = pd.DataFrame(np.concatenate([start_df.as_matrix(), stop_df.as_matrix()], axis=0), 
                                columns=['StationID', 'StationName', 'Lat', 'Lon'])
trip_stations_df = trip_stations_df[~trip_stations_df.duplicated()]
trip_stations_df = trip_stations_df.sort_values('StationID', axis=0)
trip_stations_df = trip_stations_df.reset_index(drop=True)
trip_stations_df.head()

In [None]:
# Station mapping
trip_station_map = {}
for sid in set(trip_stations_df.StationID):
    trip_station_map[sid] = list(set(trip_stations_df[trip_stations_df.StationID == sid].StationName) )

In [None]:
# Find similarity of station naming
similarity = []
for key1, val1 in station_name_map.items():
    for key2, val2 in trip_station_map.items():
        item1 = list(val1)[0]
        for item2 in val2:
            similarity.append([key1, str(key2), item1, item2, 
                               fuzz.partial_ratio(str(item1), str(item2)),
                               fuzz.partial_token_sort_ratio(str(item1), str(item2)),
                               fuzz.partial_token_set_ratio(str(item1), str(item2))])        

In [None]:
# Convert similarity calculations to df
mapping_df = pd.DataFrame(similarity, columns=['StationID1', 'StationID2', 
                                               'StationName1', 'StationName2', 
                                               'PartialSimilarity', 'PartialTokenSort', 'PartialTokenSet'])
mapping_df.head()

In [None]:
mapping_df.shape

In [None]:
mapping_df[mapping_df.StationID1 == 'C32019'].shape

In [None]:
# Find maximum similarity match
mapping1 = pd.DataFrame()
mapping2 = pd.DataFrame()
mapping3 = pd.DataFrame()
for ID in set(mapping_df.StationID1):
    tmp = mapping_df[mapping_df.StationID1 == ID]
    mapping1 = pd.concat([mapping1, pd.DataFrame(tmp.iloc[tmp['PartialSimilarity'].values.argmax(), :]).T], axis=0)
    mapping2 = pd.concat([mapping2, pd.DataFrame(tmp.iloc[tmp['PartialTokenSort'].values.argmax(), :]).T], axis=0)
    mapping3 = pd.concat([mapping3, pd.DataFrame(tmp.iloc[tmp['PartialTokenSet'].values.argmax(), :]).T], axis=0)
    
mapping1 = mapping1.reset_index(drop=True).sort_values('StationID1')
mapping2 = mapping2.reset_index(drop=True).sort_values('StationID1')
mapping3 = mapping3.reset_index(drop=True).sort_values('StationID1')

In [None]:
# Spot check any similarity scores below 100
mapping1[mapping1.PartialSimilarity < 100]

In [None]:
mapping1

#### Map Station Info to Trip Data

In [None]:
# Map alpha-numeric ID to starting station
mapping1.StationID2 = mapping1.StationID2.astype(int)
tmp_df = new_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StartID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StartID': 'StartNum', 'StationID1': 'StartID'})
tmp_df = tmp_df.drop(columns='StationID2')
# Map alpha-numeric ID to stopping station
tmp_df = tmp_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StopID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StopID': 'StopNum', 'StationID1': 'StopID'})
tmp_df = tmp_df.drop(columns=['StationID2', 'StartNum', 'StopNum'])

# Map start city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City', 'NumDocks']], how='left', left_on='StartID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StartCity'})
tmp_df = tmp_df.drop(columns=['StationID'])
# Map stop city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StopCity'})
tmp_df = tmp_df.drop(columns=['StationID'])

In [None]:
tmp_df = tmp_df[sorted(tmp_df.columns)]
tmp_df.head()

In [None]:
tmp_df.shape

In [None]:
tmp_df[tmp_df.StartCity.isna() | tmp_df.StopCity.isna()]

In [None]:
missing = sorted(list(set(sorted(tmp_df[tmp_df.StartCity.isna()]['StartName'].unique()) + sorted(tmp_df[tmp_df.StopCity.isna()]['StopName'].unique()))))

In [None]:
{x:sum(tmp_df.StartName == x) for x in missing}

In [None]:
# Save new df
with open('new.pkl', 'wb') as f:
    pickle.dump(tmp_df, f)

#### Combine Old Data with New Data

In [None]:
set(tmp_df.columns).symmetric_difference(set(tmp2_df.columns))

In [None]:
combo = pd.concat([tmp_df, tmp2_df], axis=0)
combo = combo.drop(columns=['UserType', 'ZipCode'])

In [None]:
combo.head()

In [None]:
# Save combo df
with open('combo.pkl', 'wb') as f:
    pickle.dump(combo, f)