In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle

import json
import plotly.plotly as py
from plotly.graph_objs import *

from fuzzywuzzy import fuzz

### Load Trip Data

In [2]:
# Load old df
# NOTE(review): pickle.load can execute arbitrary code; only use on files you produced yourself
with open('old_clean.pkl', 'rb') as old_pickle:
    old_df = pickle.load(old_pickle)

In [3]:
# Load new df
# NOTE(review): pickle.load can execute arbitrary code; only use on files you produced yourself
with open('new_clean.pkl', 'rb') as new_pickle:
    new_df = pickle.load(new_pickle)

### Clean Data

In [4]:
# Add Epoch to each DF to make it easy to split later
# Each frame is tagged in place with a constant label
for trips, epoch_label in ((old_df, 'Old'), (new_df, 'New')):
    trips['Epoch'] = epoch_label

In [5]:
def remap_usertype(df):
    """Return a copy of ``df`` with ``UserType`` relabelled to Member/Casual.

    'Subscriber' becomes 'Member' and 'Customer' becomes 'Casual'. Values that
    are already 'Member' or 'Casual' are kept, so re-running the cell on an
    already-remapped frame does not turn every label into NaN. Any other
    value still maps to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``UserType`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the remapped column; the input is left unmodified.
    """
    usertype_map = {'Subscriber': 'Member', 'Customer': 'Casual'}
    # Identity entries for the target labels make the mapping idempotent
    usertype_map.update({label: label for label in list(usertype_map.values())})
    return df.assign(UserType=df.UserType.map(usertype_map))
# Harmonize the user-type labels of both trip tables
old_df, new_df = [remap_usertype(trips) for trips in (old_df, new_df)]

### Load Stations Data

In [6]:
# Directory holding the station CSV files. Set the HUBWAY_STATION_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used.
station_info_path = os.environ.get(
    'HUBWAY_STATION_DATA_DIR',
    '/home/cneiderer/Metis/boston_bike_data/station_data',
)
# Station listings for the two periods
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [7]:
# Read both station listings from the station data directory
stations1_path = os.path.join(station_info_path, stations1_file)
stations2_path = os.path.join(station_info_path, stations2_file)
stations1_df = pd.read_csv(stations1_path)
stations2_df = pd.read_csv(stations2_path)

In [8]:
# Restrict the older listing to the column labels of stations2_df, then give
# those columns the short names used in the rest of the notebook.
# NOTE(review): this reads stations2_df.columns as currently set, so it appears
# to require running before stations2_df's columns are renamed -- confirm run order.
columns_of_2017_file = stations2_df.columns
stations1_df = stations1_df[columns_of_2017_file]
stations1_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations1_df = stations1_df.sort_values(by='StationID')
stations1_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.1065,Boston,15


In [9]:
# Give the 2017 listing the short column names (assigned by position), then
# order it by station ID.
# A single sort after the rename is enough: an earlier sort on the pre-rename
# 'Station ID' label only duplicated this one and raised KeyError when the cell
# was re-run after the rename had already been applied.
stations2_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations2_df = stations2_df.sort_values('StationID', axis=0)
stations2_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
3,A32000,Fan Pier,42.353287,-71.044389,Boston,15
4,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
5,A32002,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
6,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
7,A32004,Longwood Ave at Binney St,42.338629,-71.1065,Boston,15


In [12]:
# Plot bike stations on interactive map
# Read the Mapbox public token from the local credentials file; the context
# manager closes the file handle, which a bare json.load(open(...)) left open.
with open('/home/cneiderer/.mapbox/.credentials') as credentials_file:
    credentials = json.load(credentials_file)
mapbox_access_token = credentials['public_token']

# One marker trace per station listing; hover text is the station name
stations_2017_trace = Scattermapbox(
    lat=stations2_df.Lat,
    lon=stations2_df.Lon,
    name='2017 Stations',
    mode='markers',
    marker=Marker(color='red', size=10, opacity=1),
    text=stations2_df.StationName,
)
# Smaller blue markers for the older listing, listed second in the figure data
stations_older_trace = Scattermapbox(
    lat=stations1_df.Lat,
    lon=stations1_df.Lon,
    name='2011-2017 Stations',
    mode='markers',
    marker=Marker(color='blue', size=5, opacity=1),
    text=stations1_df.StationName,
)
data = Data([stations_2017_trace, stations_older_trace])
# Figure margins (pixels) around the map
figure_margin = Margin(l=25, r=25, b=25, t=25, pad=4)
# Mapbox view: centre point, orientation and zoom level
mapbox_view = dict(
    accesstoken=mapbox_access_token,
    bearing=0,
    center=dict(lat=42.361769, lon=-71.078249),  #-71.0589
    pitch=0,
    zoom=11.5,
)
layout = Layout(
    autosize=True,
    hovermode='closest',
    width=1200,
    height=800,
    margin=figure_margin,
    mapbox=mapbox_view,
)

# Bundle traces and layout, then render the figure through plotly
fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='Multiple Mapbox')

### Combine Stations Data

In [15]:
# Outer-join the two listings on StationID (overlapping columns get _x / _y
# suffixes) and replace missing values with empty strings
stations_df = (
    stations1_df
    .merge(stations2_df, on='StationID', how='outer')
    .fillna('')
)

In [16]:
# Show stations whose latitude or longitude differs between the two listings
lat_differs = stations_df.Lat_x != stations_df.Lat_y
lon_differs = stations_df.Lon_x != stations_df.Lon_y
stations_df[lat_differs | lon_differs]

Unnamed: 0,StationID,StationName_x,Lat_x,Lon_x,City_x,NumDocks_x,StationName_y,Lat_y,Lon_y,City_y,NumDocks_y
187,K32006,,,,,,Commonwealth Ave At Babcock St,42.351547,-71.121263,Brookline,19
188,M32045,,,,,,Museum of Science,42.36769,-71.071163,Cambridge,19
189,M32050,,,,,,Verizon Innovation Hub 10 Ware Street,42.372509,-71.113054,Cambridge,19
190,M32051,,,,,,Fresh Pond Reservation,42.382678,-71.143479,Cambridge,19
191,M32052,,,,,,Cambridge Dept. of Public Works -147 Hampshire...,42.371197,-71.097599,Cambridge,19
192,Silber Way,,,,,,Silber Way,42.349496,-71.100576,Brookline,19


In [17]:
# Collect all station names for each station ID
station_names = {}
for idx in range(stations_df.shape[0]):
    id_info = stations_df.iloc[idx]
    station_names[id_info.StationID] = set(id_info[['StationName_x', 'StationName_y']])

### Trip Info

In [None]:
# Preview the first rows of the new trip table
new_df.head()

In [None]:
# Unique start-station records (ID, name, coordinates) seen in the new trips
start_columns = ['StartID', 'StartName', 'StartLat', 'StartLon']
start_df = new_df[start_columns].drop_duplicates()

In [None]:
# Unique stop-station records (ID, name, coordinates) seen in the new trips
stop_columns = ['StopID', 'StopName', 'StopLat', 'StopLon']
stop_df = new_df[stop_columns].drop_duplicates()

In [None]:
# Stack the start and stop station records into one table of unique stations.
# .values replaces DataFrame.as_matrix(), which is deprecated and absent from
# newer pandas releases; both return the same ndarray.
station_records = np.concatenate([start_df.values, stop_df.values], axis=0)
df = pd.DataFrame(station_records,
                  columns=['StationID', 'StationName', 'Lat', 'Lon'])
# A station used as both start and stop appears twice after stacking
df = df[~df.duplicated()]
df = df.sort_values('StationID', axis=0)
df = df.reset_index(drop=True)
df.head()

In [None]:
# Show station records in which any field equals 0
has_zero_field = (df == 0).any(axis=1)
df[has_zero_field]

In [None]:
# Station mapping
station_map = {}
for sid in set(df.StationID):
    station_map[sid] = set(df[df.StationID == sid].StationName) 

In [None]:
# Display the station ID -> set-of-names mapping built from the trip data
station_map

In [None]:
# Find similarity of station naming
similarity = []
for key1, val1 in stations.items():
    for key2, val2 in station_map.items():
        item1 = list(val1)[0]
        for item2 in val2:
            similarity.append([key1, str(key2), item1, item2, 
                               fuzz.partial_ratio(str(item1), str(item2)),
                               fuzz.partial_token_sort_ratio(str(item1), str(item2)),
                               fuzz.partial_token_set_ratio(str(item1), str(item2))])        

In [None]:
# Convert similarity calculations to df
mapping_df = pd.DataFrame(similarity, columns=['StationID1', 'StationID2', 
                                               'StationName1', 'StationName2', 
                                               'PartialSimilarity', 'PartialTokenSort', 'PartialTokenSet'])
mapping_df

In [None]:
# Print the column summary of the similarity table
mapping_df.info()

In [None]:
# Find maximum similarity match
mapping1 = pd.DataFrame()
mapping2 = pd.DataFrame()
mapping3 = pd.DataFrame()
for ID in set(mapping_df.StationID1):
    tmp = mapping_df[mapping_df.StationID1 == ID]
    mapping1 = pd.concat([mapping1, pd.DataFrame(tmp.iloc[tmp['PartialSimilarity'].values.argmax(), :]).T], axis=0)
    mapping2 = pd.concat([mapping2, pd.DataFrame(tmp.iloc[tmp['PartialTokenSort'].values.argmax(), :]).T], axis=0)
    mapping3 = pd.concat([mapping3, pd.DataFrame(tmp.iloc[tmp['PartialTokenSet'].values.argmax(), :]).T], axis=0)

In [None]:
# Display the best match per StationID1 under the PartialSimilarity metric
mapping1

In [None]:
# Spot check any similarity scores below 100
mapping1[mapping1.PartialSimilarity < 100]

In [None]:
# Renumber the rows first, then order by StationID1 (the index is not reset again after sorting)
mapping1 = mapping1.reset_index(drop=True)
mapping1 = mapping1.sort_values('StationID1')

In [None]:
# Display the best-match table again
mapping1

#### Map Station Info to Trip Data

In [None]:
# StationID2 holds the trip-data IDs as strings; cast to int so they join
# against the trip table's StartID / StopID columns
mapping1.StationID2 = mapping1.StationID2.astype(int)

# NOTE(review): these are left joins; if a StationID2 value occurs more than
# once in mapping1, the matching trip rows would be duplicated -- confirm.
tmp_df = (
    new_df
    # Map alpha-numeric ID to starting station
    .merge(mapping1[['StationID1', 'StationID2']], how='left',
           left_on='StartID', right_on='StationID2')
    .rename(columns={'StartID': 'StartNum', 'StationID1': 'StartID'})
    .drop(columns='StationID2')
    # Map alpha-numeric ID to stopping station
    .merge(mapping1[['StationID1', 'StationID2']], how='left',
           left_on='StopID', right_on='StationID2')
    .rename(columns={'StopID': 'StopNum', 'StationID1': 'StopID'})
    .drop(columns=['StationID2', 'StartNum', 'StopNum'])
)

tmp_df = (
    tmp_df
    # Map start city (NumDocks comes along from the start station)
    .merge(stations2_df[['StationID', 'City', 'NumDocks']], how='left',
           left_on='StartID', right_on='StationID')
    .rename(columns={'City': 'StartCity'})
    .drop(columns=['StationID'])
    # Map stop city
    .merge(stations2_df[['StationID', 'City']], how='left',
           left_on='StopID', right_on='StationID')
    .rename(columns={'City': 'StopCity'})
    .drop(columns=['StationID'])
)

In [None]:
# Put the columns in alphabetical order and preview the result
alphabetical_columns = sorted(tmp_df.columns)
tmp_df = tmp_df[alphabetical_columns]
tmp_df.head()

In [None]:
# Row and column counts of the mapped new trip table
tmp_df.shape

In [None]:
# Save new df
with open('new_mapped.pkl', 'wb') as f:
    pickle.dump(tmp_df, f)

In [None]:
# Map Start Lat, Lon, and City
tmp2_df = old_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City', 'NumDocks']], how='left', left_on='StartID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StartLat', 'Lon': 'StartLon', 'City': 'StartCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])
# Map Stop Lat, Lon, and City
tmp2_df = tmp2_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StopLat', 'Lon': 'StopLon', 'City': 'StopCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])

In [None]:
# Put the columns in alphabetical order, as is done for tmp_df.
# The result is assigned back to tmp2_df, the name every later cell reads, so
# the ordering also reaches the saved pickle; the sorted frame used to be bound
# only to tmp_df2 and never used again.
tmp2_df = tmp2_df[sorted(tmp2_df.columns)]
# Keep the earlier name bound in case anything still refers to it
tmp_df2 = tmp2_df
tmp2_df.head()

In [None]:
# Row and column counts of the mapped old trip table
tmp2_df.shape

In [None]:
# Save old df
with open('old_mapped.pkl', 'wb') as f:
    pickle.dump(tmp2_df, f)

#### Combine Old Data with New Data

In [None]:
# Columns present in only one of the two mapped trip tables
new_columns = set(tmp_df.columns)
old_columns = set(tmp2_df.columns)
new_columns ^ old_columns

In [None]:
# Stack the new and old trips row-wise (original index labels are kept), then
# drop the UserType and ZipCode columns
combo = (
    pd.concat([tmp_df, tmp2_df], axis=0)
    .drop(columns=['UserType', 'ZipCode'])
)

In [None]:
# Preview the first rows of the combined trip table
combo.head()

In [None]:
# Save combo df
with open('combo.pkl', 'wb') as f:
    pickle.dump(combo, f)