In [98]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle

import json
import plotly.plotly as py
from plotly.graph_objs import *

from fuzzywuzzy import fuzz

### Load Trip Data

In [99]:
# Load old df
with open('old_clean.pkl', 'rb') as f:
    old_df = pickle.load(f)

# Load new df
with open('new_clean.pkl', 'rb') as f:
    new_df = pickle.load(f)

# Add Epoch to each DF to make it easy to split later
old_df['Epoch'] = 'Old'
new_df['Epoch'] = 'New'

#
def remap_usertype(df):
    usertype_map = {'Subscriber': 'Member', 'Customer': 'Casual'}
    df.UserType = df.UserType.map(usertype_map)
    return df

old_df = remap_usertype(old_df)
new_df = remap_usertype(new_df)

### Load Stations Data

In [89]:
station_info_path = '/home/cneiderer/Metis/boston_bike_data/station_data'
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [90]:
stations1_df = pd.read_csv(os.path.join(station_info_path, stations1_file))
stations2_df = pd.read_csv(os.path.join(station_info_path, stations2_file))

In [91]:
stations1_df = stations1_df[stations2_df.columns]
stations1_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations1_df = stations1_df.sort_values('StationID', axis=0)
stations1_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.1065,Boston,15


In [92]:
stations2_df = stations2_df.sort_values('Station ID', axis=0)
stations2_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations2_df = stations2_df.sort_values('StationID', axis=0)
stations2_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
3,A32000,Fan Pier,42.353287,-71.044389,Boston,15
4,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
5,A32002,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
6,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
7,A32004,Longwood Ave at Binney St,42.338629,-71.1065,Boston,15


### Combine Stations Data

In [93]:
# Combine station data into single DF
stations_df = pd.concat([stations1_df, stations2_df], axis=0)
# Remove duplicate rows
stations_df = stations_df[~stations_df.duplicated()]

# Compile all the names used to refer to each station ID
station_name_map = {}
for sid in set(stations_df.StationID):
    station_name_map[sid] = list(set(stations_df[stations_df.StationID == sid].StationName))
    
    # Designate a single name to be used to refer to each station ID
station_names = {}
for key, val in station_name_map.items():
    if len(val) > 1:
        val = val[1]
    else:
        val = val[0]
    station_names[key] = val
# Convert to dataframe    
station_names = pd.DataFrame.from_dict(station_names, 'index').reset_index()    
station_names.columns = ['StationID', 'StationName']
# Merge designated names back into combined stations DF
stations_df = station_names.merge(stations_df[['StationID', 'Lat', 'Lon', 'City', 'NumDocks']], on='StationID', how='left')
stations_df = stations_df[~stations_df.duplicated()].reset_index(drop=True)

In [94]:
stations_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32029,Piers Park,42.36489,-71.035042,Boston,19
1,D32003,TD Garden - Causeway at Portal Park #2,42.366222,-71.059914,Boston,27
2,B32003,HMS/HSPH - Avenue Louis Pasteur at Longwood Ave,42.337171,-71.102797,Boston,21
3,D32026,Hayes Square - Vine St at Moulton St,42.377022,-71.056605,Boston,19
4,M32041,MIT Pacific St at Purrington St,42.359573,-71.101295,Cambridge,19


In [95]:
# Plot bike stations on interactive map
credentials = json.load(open('/home/cneiderer/.mapbox/.credentials'))
mapbox_access_token = credentials['public_token']

data = Data([
    Scattermapbox(
        lat=stations_df.Lat,
        lon=stations_df.Lon,
        mode='markers',
        marker=Marker(
            color='red',
            size=10,
            opacity=0.7
        ),
        text=stations_df.StationName,
    )    
])
layout = Layout(
    autosize=True,
    hovermode='closest',
    width=1200,
    height=800,
    margin=Margin(
        l=25,
        r=25,
        b=25,
        t=25,
        pad=4
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=42.361769, 
            lon=-71.078249 #-71.0589
        ),
        pitch=0,
        zoom=11.5
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Multiple Mapbox')

### Map Lat, Lon, City into Old Data

In [103]:
def map_station_info_to_old_data(data_df, stations_df, loc):
    data_df = data_df.merge(stations_df, how='left', left_on=loc+'ID', right_on='StationID', suffixes=('', '_Junk'))
    for col in data_df.columns:
        if col[-5:] == '_Junk':
            data_df.drop(columns=col)
    data_df[loc+'OrigName'] = data_df[loc+'Name']
    data_df = data_df.drop(columns=[loc+'Name', 'StationID'])
    rename_map = {'StationName': loc+'Name', 
                  'City': loc+'City',
                  'Lat': loc+'Lat', 
                  'Lon': loc+'Lon', 
                  'NumDocks': loc+'NumDocks'}
    data_df = data_df.rename(columns=rename_map)
    return data_df

wold_df = map_station_info_to_old_data(old_df, stations_df, 'Start')
wold_df = map_station_info_to_old_data(wold_df, stations_df, 'Stop')
wold_df = wold_df[sorted(wold_df.columns)]

wold_df.head()

Unnamed: 0,BikeID,Duration,Epoch,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartNumDocks,StartOrigName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopNumDocks,StopOrigName,StopTime,UserType,ZipCode
0,T01335,397,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:58:00,Boston,D32022,42.365885,-71.064548,TD Garden - West End Park (formerly TD Garden ...,34.0,TD Garden - Causeway at Portal Park #1,2015-01-01 00:05:00,Member,2148.0
1,T01426,543,Old,Male,Cambridge,M32006,42.3581,-71.093198,MIT at Mass Ave / Amherst St,27.0,MIT at Mass Ave / Amherst St,2014-12-31 23:51:00,Cambridge,M32007,42.372969,-71.094445,Cambridge St - at Columbia St / Webster Ave,15.0,Cambridge St - at Columbia St / Webster Ave,2015-01-01 00:00:00,Member,2143.0
2,B01570,928,Old,Male,Boston,D32005,42.349673,-71.077303,Copley Square - Dartmouth St at Boylston St,25.0,Boston Public Library - 700 Boylston St.,2014-12-31 23:30:00,Boston,A32008,42.347241,-71.105301,Buswell St. at Park Dr.,15.0,Buswell St. at Park Dr.,2014-12-31 23:46:00,Member,2215.0
3,T01205,270,Old,Male,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:26:00,Boston,A32010,42.352175,-71.055547,South Station - 700 Atlantic Ave,46.0,South Station - 700 Atlantic Ave.,2014-12-31 23:31:00,Member,2043.0
4,T01306,960,Old,Male,Boston,D32010,42.362811,-71.056067,Cross St at Hanover St,18.0,Cross St. at Hanover St.,2014-12-31 23:07:00,Boston,B32004,42.35977,-71.051601,Aquarium T Stop - 200 Atlantic Ave,19.0,Aquarium Station - 200 Atlantic Ave.,2014-12-31 23:23:00,Member,1945.0


In [104]:
# Find Stations that existed at start of Hubway but no longer exist in system
wold_df[wold_df.StartCity.isna() | wold_df.StopCity.isna()]

Unnamed: 0,BikeID,Duration,Epoch,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartNumDocks,StartOrigName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopNumDocks,StopOrigName,StopTime,UserType,ZipCode
32,B00103,185,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 20:40:00,Boston,A32010,42.352175,-71.055547,South Station - 700 Atlantic Ave,46.0,South Station - 700 Atlantic Ave.,2014-12-31 20:43:00,Member,2081.0
99,B00037,306,Old,Female,,D32025,,,,,Milk St at India St,2014-12-31 17:22:00,Boston,D32022,42.365885,-71.064548,TD Garden - West End Park (formerly TD Garden ...,34.0,TD Garden - Causeway at Portal Park #1,2014-12-31 17:28:00,Member,1907.0
145,T01421,328,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 16:29:00,Boston,D32022,42.365885,-71.064548,TD Garden - West End Park (formerly TD Garden ...,34.0,TD Garden - Causeway at Portal Park #1,2014-12-31 16:35:00,Member,2110.0
171,B00037,1263,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 16:00:00,,D32025,,,,,Milk St at India St,2014-12-31 16:21:00,Member,2128.0
186,B00222,391,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 15:45:00,Boston,D32014,42.354979,-71.063348,Tremont St at West St,15.0,Tremont St / West St,2014-12-31 15:51:00,Member,2446.0
203,T01149,925,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 15:30:00,Boston,C32003,42.348074,-71.076570,Back Bay / South End Station,19.0,Back Bay / South End Station,2014-12-31 15:45:00,Member,2116.0
295,B00222,317,Old,Male,Boston,D32015,42.351356,-71.059367,Chinatown Gate Plaza,15.0,Chinatown Gate Plaza - Surface Rd. at Beach St.,2014-12-31 13:33:00,,D32025,,,,,Milk St at India St,2014-12-31 13:38:00,Member,2446.0
318,T01421,200,Old,Male,Boston,D32010,42.362811,-71.056067,Cross St at Hanover St,18.0,Cross St. at Hanover St.,2014-12-31 13:05:00,,D32025,,,,,Milk St at India St,2014-12-31 13:08:00,Member,2113.0
340,T01185,171,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 12:25:00,Boston,D32010,42.362811,-71.056067,Cross St at Hanover St,18.0,Cross St. at Hanover St.,2014-12-31 12:27:00,Member,2113.0
367,B00198,362,Old,Male,,D32025,,,,,Milk St at India St,2014-12-31 11:50:00,Boston,D32015,42.351356,-71.059367,Chinatown Gate Plaza,15.0,Chinatown Gate Plaza - Surface Rd. at Beach St.,2014-12-31 11:56:00,Member,2446.0


In [127]:
missing = sorted(list(set(sorted(wold_df[wold_df.StartCity.isna()]['StartID'].unique()) + sorted(wold_df[wold_df.StopCity.isna()]['StopID'].unique()))))

['A32007',
 'B32009',
 'B32019',
 'C32011',
 'D32025',
 'K32003',
 'S32007',
 'S32012',
 'X32000']

In [83]:
for idx in range(len(missing)):
    stationID = missing[idx]
    try:
        print(stationID, old_df[old_df.StartID == stationID].iloc[0]['StartName'], ':', sum(old_df.StartID == stationID))        
    except:
        print(stationID, 'Unknown', ':', sum(old_df.StartID == stationID))

A32007 Harvard Real Estate - North Harvard St at Western Ave : 6647
B32009 Overland St at Brookline Ave : 5036
B32019 Longwood Ave/Riverway : 10583
C32011 South Bay Plaza : 1468
D32025 Milk St at India St : 7425
K32003 Brookline Village - Station Street @ MBTA : 8929
S32007 Ball Sq : 1530
S32012 Summer St at Cutter St : 2789
X32000 Unknown : 0


In [137]:
# missing_data = [['A32007', 'Harvard Real Estate - North Harvard St at Western Ave', 42.3631911, -71.1319868],
#                 ['B32009', 'Overland St at Brookline Ave', 42.3460641, -71.1019295],
#                 ['B32019', 'Longwood Ave/Riverway', 42.3403202, -71.112122],
#                 ['C32011', 'South Bay Plaza', 42.3341162, -71.0698326],
#                 ['D32025', 'Milk St at India St', 42.3584621, -71.0554896],
#                 ['K32003', 'Brookline Village - Station Street @ MBTA', 42.3327674, -71.1191912],
#                 ['S32007', 'Ball Sq', 42.399747, -71.1132117],
#                 ['S32012', 'Summer St at Cutter St', 'Somerville', 42.3940975, -71.1228071]]

# missing_df = pd.DataFrame(missing_data, columns=['StationID', 'StationName', 'Lat', 'Lon'])
# missing_df

In [48]:
# Save old df
with open('old.pkl', 'wb') as f:
    pickle.dump(old_df, f)

### Compile Station Info from New Trip Data

In [14]:
new_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StartLat,StartLon,StopID,StopName,StopLat,StopLon,BikeID,UserType,BirthYear,Gender,Epoch
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,78,Union Square - Somerville,42.379637,-71.095319,1380,Member,1983.0,Female,New
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,23,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,190,Nashua Street at Red Auerbach Way,42.365598,-71.064248,866,Member,1980.0,Male,New
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,178,MIT Pacific St at Purrington St,42.359573,-71.101295,184,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,35,Member,1990.0,Male,New
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,25,Tremont St / W Newton St,42.341332,-71.076847,21,Prudential Center / Belvidere,42.345959,-71.082578,1917,Casual,,Unknown,New
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,76,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,139,Dana Park,42.361589,-71.107437,603,Member,1990.0,Male,New


In [17]:
# Compile start stations
start_df = new_df[['StartID', 'StartName', 'StartLat', 'StartLon']]
start_df = start_df[~start_df.duplicated()]

# Compile stop stations
stop_df = new_df[['StopID', 'StopName', 'StopLat', 'StopLon']]
stop_df = stop_df[~stop_df.duplicated()]

# Combine start and stop stations
trip_stations_df = pd.DataFrame(np.concatenate([start_df.as_matrix(), stop_df.as_matrix()], axis=0), 
                                columns=['StationID', 'StationName', 'Lat', 'Lon'])
trip_stations_df = trip_stations_df[~trip_stations_df.duplicated()]
trip_stations_df = trip_stations_df.sort_values('StationID', axis=0)
trip_stations_df = trip_stations_df.reset_index(drop=True)
trip_stations_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon
0,1,18 Dorrance Warehouse,42.3872,-71.076
1,3,Colleges of the Fenway,42.34,-71.1008
2,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.3401,-71.1006
3,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.34,-71.1008
4,4,Tremont St. at Berkeley St.,42.3454,-71.0696


In [23]:
# Station mapping
trip_station_map = {}
for sid in set(trip_stations_df.StationID):
    trip_station_map[sid] = list(set(trip_stations_df[trip_stations_df.StationID == sid].StationName) )

In [24]:
# Find similarity of station naming
similarity = []
for key1, val1 in station_name_map.items():
    for key2, val2 in trip_station_map.items():
        item1 = list(val1)[0]
        for item2 in val2:
            similarity.append([key1, str(key2), item1, item2, 
                               fuzz.partial_ratio(str(item1), str(item2)),
                               fuzz.partial_token_sort_ratio(str(item1), str(item2)),
                               fuzz.partial_token_set_ratio(str(item1), str(item2))])        

In [25]:
# Convert similarity calculations to df
mapping_df = pd.DataFrame(similarity, columns=['StationID1', 'StationID2', 
                                               'StationName1', 'StationName2', 
                                               'PartialSimilarity', 'PartialTokenSort', 'PartialTokenSet'])
mapping_df.head()

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
0,A32029,1,Piers Park- Marginal St at East Boston Shipyard,18 Dorrance Warehouse,29,33,33
1,A32029,3,Piers Park- Marginal St at East Boston Shipyard,Colleges of the Fenway - Fenway at Avenue Loui...,36,43,100
2,A32029,3,Piers Park- Marginal St at East Boston Shipyard,Colleges of the Fenway,27,32,32
3,A32029,4,Piers Park- Marginal St at East Boston Shipyard,Tremont St. at Berkeley St.,44,48,100
4,A32029,4,Piers Park- Marginal St at East Boston Shipyard,Tremont St at E Berkeley St,52,44,100


In [85]:
mapping_df.shape

(61953, 7)

In [87]:
mapping_df[mapping_df.StationID1 == 'C32019'].shape

(321, 7)

In [27]:
# Find maximum similarity match
mapping1 = pd.DataFrame()
mapping2 = pd.DataFrame()
mapping3 = pd.DataFrame()
for ID in set(mapping_df.StationID1):
    tmp = mapping_df[mapping_df.StationID1 == ID]
    mapping1 = pd.concat([mapping1, pd.DataFrame(tmp.iloc[tmp['PartialSimilarity'].values.argmax(), :]).T], axis=0)
    mapping2 = pd.concat([mapping2, pd.DataFrame(tmp.iloc[tmp['PartialTokenSort'].values.argmax(), :]).T], axis=0)
    mapping3 = pd.concat([mapping3, pd.DataFrame(tmp.iloc[tmp['PartialTokenSet'].values.argmax(), :]).T], axis=0)
    
mapping1 = mapping1.reset_index(drop=True).sort_values('StationID1')
mapping2 = mapping2.reset_index(drop=True).sort_values('StationID1')
mapping3 = mapping3.reset_index(drop=True).sort_values('StationID1')

In [29]:
# Spot check any similarity scores below 100
mapping1[mapping1.PartialSimilarity < 100]

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
33869,C32015,106,Dudley Town Common - Mt Pleasant Ave at Blue H...,Mt Pleasant Ave / Dudley Town Common,62,85,100


In [31]:
mapping1

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
120,A32000,7,Fan Pier,Fan Pier,100,100,100
80,A32001,8,Union Square - Brighton Ave at Cambridge St,Union Square - Brighton Ave at Cambridge St,100,100,100
181,A32002,9,Agganis Arena - 925 Comm Ave.,Agganis Arena - 925 Comm Ave.,100,100,100
192,A32003,10,B.U. Central - 725 Comm. Ave.,B.U. Central - 725 Comm. Ave.,100,100,100
68,A32004,11,Longwood Ave / Binney St,Longwood Ave / Binney St,100,100,100
79,A32005,15,Brighton Mills - 370 Western Ave,Brighton Mills - 370 Western Ave,100,100,100
52,A32006,17,Soldiers Field Park - 111 Western Ave,Soldiers Field Park - 111 Western Ave,100,100,100
143,A32008,19,Park Dr at Buswell St,Park Dr at Buswell St,100,100,100
65,A32009,25,Tremont St / W Newton St,Tremont St / W Newton St,100,100,100
121,A32010,22,South Station - 700 Atlantic Ave.,South Station - 700 Atlantic Ave.,100,100,100


#### Map Station Info to Trip Data

In [32]:
# Map alpha-numeric ID to starting station
mapping1.StationID2 = mapping1.StationID2.astype(int)
tmp_df = new_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StartID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StartID': 'StartNum', 'StationID1': 'StartID'})
tmp_df = tmp_df.drop(columns='StationID2')
# Map alpha-numeric ID to stopping station
tmp_df = tmp_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StopID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StopID': 'StopNum', 'StationID1': 'StopID'})
tmp_df = tmp_df.drop(columns=['StationID2', 'StartNum', 'StopNum'])

# Map start city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City', 'NumDocks']], how='left', left_on='StartID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StartCity'})
tmp_df = tmp_df.drop(columns=['StationID'])
# Map stop city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StopCity'})
tmp_df = tmp_df.drop(columns=['StationID'])

In [33]:
tmp_df = tmp_df[sorted(tmp_df.columns)]
tmp_df.head()

Unnamed: 0,BikeID,BirthYear,Duration,Epoch,Gender,NumDocks,StartCity,StartID,StartLat,StartLon,StartName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopTime,UserType
0,1380,1983.0,196,New,Female,15.0,Cambridge,M32007,42.372969,-71.094445,Cambridge St - at Columbia St / Webster Ave,2016-11-01 00:00:35,Somerville,S32002,42.379637,-71.095319,Union Square - Somerville,2016-11-01 00:03:52,Member
1,866,1980.0,222,New,Male,21.0,Boston,B32008,42.35892,-71.057629,Mayor Martin J Walsh - 28 State St,2016-11-01 00:01:23,Boston,A32025,42.365598,-71.064248,Nashua Street at Red Auerbach Way,2016-11-01 00:05:06,Member
2,35,1990.0,483,New,Male,19.0,Cambridge,M32041,42.359573,-71.101295,MIT Pacific St at Purrington St,2016-11-01 00:04:19,Cambridge,M32047,42.357753,-71.103934,Sidney Research Campus/ Erie Street at Waverly,2016-11-01 00:12:22,Member
3,1917,,141,New,Unknown,15.0,Boston,A32009,42.341332,-71.076847,Tremont St / W Newton St,2016-11-01 00:05:31,Boston,C32007,42.345959,-71.082578,Prudential Center / Belvidere,2016-11-01 00:07:53,Casual
4,603,1990.0,154,New,Male,17.0,Cambridge,M32012,42.366426,-71.105495,Central Sq Post Office / Cambridge City Hall a...,2016-11-01 00:05:59,Cambridge,M32030,42.361589,-71.107437,Dana Park,2016-11-01 00:08:34,Member


In [34]:
tmp_df.shape

(3735869, 19)

In [35]:
tmp_df[tmp_df.StartCity.isna() | tmp_df.StopCity.isna()]

Unnamed: 0,BikeID,BirthYear,Duration,Epoch,Gender,NumDocks,StartCity,StartID,StartLat,StartLon,StartName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopTime,UserType
582,1242,1964,1574,New,Male,18.0,Boston,A32019,42.363796,-71.129164,Harvard University Transportation Services - 1...,2016-11-01 08:12:47,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-01 08:39:01,Member
2467,86,1994,938,New,Female,,,,42.318844,-71.069757,Upham's Corner T Stop,2016-11-01 15:25:44,Boston,C32024,42.344137,-71.052608,State Street at Channel Center,2016-11-01 15:41:23,Member
3122,1548,1979,1317,New,Male,19.0,Boston,A32026,42.354686,-71.053292,Purchase St at Pearl St,2016-11-01 17:06:30,,,42.318844,-71.069757,Upham's Corner T Stop,2016-11-01 17:28:28,Member
6847,852,,118153,New,Unknown,19.0,Boston,B32005,42.343864,-71.085918,Christian Science Plaza,2016-11-02 12:10:20,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-03 20:59:33,Casual
8018,413,1986,17254,New,Male,19.0,Boston,C32010,42.351100,-71.049600,Congress / Sleeper,2016-11-02 16:28:34,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-02 21:16:08,Member
10847,198,1964,1663,New,Male,18.0,Boston,A32019,42.363796,-71.129164,Harvard University Transportation Services - 1...,2016-11-03 08:13:27,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-03 08:41:11,Member
11468,198,1993,970,New,Male,,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-03 09:33:16,Cambridge,M32027,42.366095,-71.086388,Binney St / Sixth St,2016-11-03 09:49:27,Member
18762,1105,1978,394,New,Female,15.0,Boston,B32017,42.328654,-71.084198,Dudley Square,2016-11-04 18:47:12,,,42.318844,-71.069757,Upham's Corner T Stop,2016-11-04 18:53:46,Member
18848,1221,1989,408,New,Male,15.0,Boston,B32017,42.328654,-71.084198,Dudley Square,2016-11-04 19:08:32,,,42.318844,-71.069757,Upham's Corner T Stop,2016-11-04 19:15:21,Member
30098,1100,1964,1432,New,Male,18.0,Boston,A32019,42.363796,-71.129164,Harvard University Transportation Services - 1...,2016-11-08 08:08:52,,,42.387151,-71.075978,18 Dorrance Warehouse,2016-11-08 08:32:45,Member


In [84]:
missing = sorted(list(set(sorted(tmp_df[tmp_df.StartCity.isna()]['StartName'].unique()) + sorted(tmp_df[tmp_df.StopCity.isna()]['StopName'].unique()))))

In [78]:
{x:sum(tmp_df.StartName == x) for x in missing}

{'18 Dorrance Warehouse': 213,
 '8D OPS 01': 1,
 '8D OPS 03': 3,
 '8D QC Station 01': 4,
 '8D QC Station 02': 3,
 'Brookline Village - Pearl Street @ MBTA': 4017,
 'Four Corners - 157 Washington St': 10,
 'Huron Ave. At Vassal Lane': 355,
 'Lab PBSC': 0,
 'One Memorial Drive': 2113,
 'Seaport Blvd at Sleeper St': 1802,
 'Summer St at Cutter St': 1565,
 'Test 000e3': 23,
 'Troy Boston': 583,
 "Upham's Corner T Stop": 145,
 "Upham's Corner T Stop - Magnolia St at Dudley St": 370}

In [75]:
# Save new df
with open('new.pkl', 'wb') as f:
    pickle.dump(tmp_df, f)

#### Combine Old Data with New Data

In [None]:
set(tmp_df.columns).symmetric_difference(set(tmp2_df.columns))

In [None]:
combo = pd.concat([tmp_df, tmp2_df], axis=0)
combo = combo.drop(columns=['UserType', 'ZipCode'])

In [None]:
combo.head()

In [None]:
# Save combo df
with open('combo.pkl', 'wb') as f:
    pickle.dump(combo, f)