In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle

import json
import plotly.plotly as py
from plotly.graph_objs import *

from fuzzywuzzy import fuzz

### Load Old Data

In [2]:
# Load old df
with open('old_clean.pkl', 'rb') as f:
    old_df = pickle.load(f)

In [3]:
old_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2762938 entries, 0 to 2762937
Data columns (total 11 columns):
Duration     int64
StartTime    datetime64[ns]
StopTime     datetime64[ns]
StartID      object
StartName    object
StopID       object
StopName     object
BikeID       object
UserType     object
ZipCode      float64
Gender       object
dtypes: datetime64[ns](2), float64(1), int64(1), object(7)
memory usage: 231.9+ MB


In [4]:
old_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StopID,StopName,BikeID,UserType,ZipCode,Gender
0,397,2014-12-31 23:58:00,2015-01-01 00:05:00,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Subscriber,2148.0,Male
1,543,2014-12-31 23:51:00,2015-01-01 00:00:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Subscriber,2143.0,Male
2,928,2014-12-31 23:30:00,2014-12-31 23:46:00,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Subscriber,2215.0,Male
3,270,2014-12-31 23:26:00,2014-12-31 23:31:00,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Subscriber,2043.0,Male
4,960,2014-12-31 23:07:00,2014-12-31 23:23:00,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Subscriber,1945.0,Male


### Load New Data

In [5]:
# Load new df
with open('new_clean.pkl', 'rb') as f:
    new_df = pickle.load(f)

In [6]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3731291 entries, 0 to 3731291
Data columns (total 15 columns):
Duration     int64
StartTime    datetime64[ns]
StopTime     datetime64[ns]
StartID      int64
StartName    object
StartLat     float64
StartLon     float64
StopID       int64
StopName     object
StopLat      float64
StopLon      float64
BikeID       int64
UserType     object
BirthYear    object
Gender       object
dtypes: datetime64[ns](2), float64(4), int64(4), object(5)
memory usage: 455.5+ MB


In [7]:
new_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StartLat,StartLon,StopID,StopName,StopLat,StopLon,BikeID,UserType,BirthYear,Gender
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,78,Union Square - Somerville,42.379637,-71.095319,1380,Subscriber,1983.0,Female
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,23,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,190,Nashua Street at Red Auerbach Way,42.365598,-71.064248,866,Subscriber,1980.0,Male
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,178,MIT Pacific St at Purrington St,42.359573,-71.101295,184,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,35,Subscriber,1990.0,Male
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,25,Tremont St / W Newton St,42.341332,-71.076847,21,Prudential Center / Belvidere,42.345959,-71.082578,1917,Customer,,Unknown
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,76,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,139,Dana Park,42.361589,-71.107437,603,Subscriber,1990.0,Male


### Stations Info

In [8]:
station_info_path = '/home/cneiderer/Metis/boston_bike_data/station_data'
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [9]:
stations1_df = pd.read_csv(os.path.join(station_info_path, stations1_file))
stations2_df = pd.read_csv(os.path.join(station_info_path, stations2_file))

In [10]:
stations1_df = stations1_df[stations2_df.columns]
stations1_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations1_df = stations1_df.sort_values('StationID', axis=0)
stations1_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.1065,Boston,15


In [11]:
stations2_df = stations2_df.sort_values('Station ID', axis=0)
stations2_df.columns = ['StationID', 'StationName', 'Lat', 'Lon', 'City', 'NumDocks']
stations2_df = stations2_df.sort_values('StationID', axis=0)
stations2_df.head()

Unnamed: 0,StationID,StationName,Lat,Lon,City,NumDocks
3,A32000,Fan Pier,42.353287,-71.044389,Boston,15
4,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
5,A32002,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
6,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
7,A32004,Longwood Ave at Binney St,42.338629,-71.1065,Boston,15


In [15]:
# Plot bike stations on interactive map
credentials = json.load(open('/home/cneiderer/.mapbox/.credentials'))
mapbox_access_token = credentials['public_token']

data = Data([
    Scattermapbox(
        lat=stations1_df.Latitude,
        lon=stations1_df.Longitude,
        mode='markers',
        marker=Marker(
            color='blue',
            size=9,
            opacity=0.4
        ),
        text=stations1_df.Station,
    ),
    Scattermapbox(
        lat=stations2_df.Latitude,
        lon=stations2_df.Longitude,
        mode='markers',
        marker=Marker(
            color='red',
            size=9,
            opacity=0.4
        ),
        text=stations2_df.Station,
    )
])
layout = Layout(
    autosize=True,
    hovermode='closest',
    width=750,
    height=750,
    margin=Margin(
        l=25,
        r=25,
        b=25,
        t=25,
        pad=4
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=42.361769, 
            lon=-71.078249 #-71.0589
        ),
        pitch=0,
        zoom=11.5
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Multiple Mapbox')

### Station Info

In [13]:
stations_df = stations1_df.merge(stations2_df, on='StationID', how='outer')
stations_df

Unnamed: 0,StationID,StationName_x,Lat_x,Lon_x,City_x,NumDocks_x,StationName_y,Lat_y,Lon_y,City_y,NumDocks_y
0,A32000,Fan Pier,42.353287,-71.044389,Boston,15.0,Fan Pier,42.353287,-71.044389,Boston,15
1,A32001,Union Square - Brighton Ave. at Cambridge St.,42.353334,-71.137313,Boston,15.0,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,Boston,15
2,A32002,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,Boston,19.0,Commonwealth Ave at Buick St,42.351246,-71.115639,Boston,19
3,A32003,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11.0,B.U. Central - 725 Comm. Ave.,42.350406,-71.108279,Boston,11
4,A32004,Longwood Ave / Binney St,42.338629,-71.106500,Boston,15.0,Longwood Ave at Binney St,42.338629,-71.106500,Boston,15
5,A32005,Harvard Real Estate - Brighton Mills - 370 Wes...,42.361667,-71.138020,Boston,15.0,Brighton Mills - 370 Western Ave,42.361667,-71.138020,Boston,15
6,A32006,Harvard University Housing - 111 Western Ave. ...,42.365064,-71.119233,Boston,15.0,Soldiers Field Park - 111 Western Ave,42.365064,-71.119233,Boston,15
7,A32008,Buswell St. at Park Dr.,42.347241,-71.105301,Boston,15.0,Park Dr at Buswell St,42.347241,-71.105301,Boston,15
8,A32009,Tremont St / W Newton St,42.341332,-71.076847,Boston,15.0,South End Library - Tremont St at W Newton St,42.341332,-71.076847,Boston,15
9,A32010,South Station - 700 Atlantic Ave.,42.352175,-71.055547,Boston,46.0,South Station - 700 Atlantic Ave,42.352175,-71.055547,Boston,46


In [56]:
stations_df[(stations_df.Lat_x != stations_df.Lat_y) | (stations_df.Lon_x != stations_df.Lon_y)]

Unnamed: 0,StationID,StationName_x,Lat_x,Lon_x,City_x,NumDocks_x,StationName_y,Lat_y,Lon_y,City_y,NumDocks_y
187,K32006,,,,,,Commonwealth Ave At Babcock St,42.351547,-71.121263,Brookline,19
188,M32045,,,,,,Museum of Science,42.36769,-71.071163,Cambridge,19
189,M32050,,,,,,Verizon Innovation Hub 10 Ware Street,42.372509,-71.113054,Cambridge,19
190,M32051,,,,,,Fresh Pond Reservation,42.382678,-71.143479,Cambridge,19
191,M32052,,,,,,Cambridge Dept. of Public Works -147 Hampshire...,42.371197,-71.097599,Cambridge,19
192,Silber Way,,,,,,Silber Way,42.349496,-71.100576,Brookline,19


In [16]:
# Station mapping
stations = {}
for idx in range(stations_df.shape[0]):
    stationID = stations_df.StationID.iloc[idx]
    x = list(set([stations_df.StationName_x.iloc[idx]]))
    x = set([xi for xi in x if str(xi) != 'nan'])
    if x == np.nan:
        x = set([])
    y = list(set([stations_df.StationName_y.iloc[idx]]))
    y = set([yi for yi in y if str(yi) != 'nan'])
    if y == np.nan:
        y = set([])
    
    stations[stationID] = x.union(y)

In [17]:
stations

{'A32000': {'Fan Pier'},
 'A32001': {'Union Square - Brighton Ave at Cambridge St',
  'Union Square - Brighton Ave. at Cambridge St.'},
 'A32002': {'Agganis Arena - 925 Comm Ave.', 'Commonwealth Ave at Buick St'},
 'A32003': {'B.U. Central - 725 Comm. Ave.'},
 'A32004': {'Longwood Ave / Binney St', 'Longwood Ave at Binney St'},
 'A32005': {'Brighton Mills - 370 Western Ave',
  'Harvard Real Estate - Brighton Mills - 370 Western Ave'},
 'A32006': {'Harvard University Housing - 111 Western Ave. at Soldiers Field Park ',
  'Soldiers Field Park - 111 Western Ave'},
 'A32008': {'Buswell St. at Park Dr.', 'Park Dr at Buswell St'},
 'A32009': {'South End Library - Tremont St at W Newton St',
  'Tremont St / W Newton St'},
 'A32010': {'South Station - 700 Atlantic Ave',
  'South Station - 700 Atlantic Ave.'},
 'A32011': {'Innovation Lab - 125 Western Ave at Batten Way',
  'Innovation Lab - 125 Western Ave. at Batten Way'},
 'A32012': {"Packard's Corner - Comm. Ave. at Brighton Ave.",
  "Packar

### Trip Info

In [18]:
new_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StartLat,StartLon,StopID,StopName,StopLat,StopLon,BikeID,UserType,BirthYear,Gender
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,78,Union Square - Somerville,42.379637,-71.095319,1380,Subscriber,1983.0,Female
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,23,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,190,Nashua Street at Red Auerbach Way,42.365598,-71.064248,866,Subscriber,1980.0,Male
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,178,MIT Pacific St at Purrington St,42.359573,-71.101295,184,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,35,Subscriber,1990.0,Male
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,25,Tremont St / W Newton St,42.341332,-71.076847,21,Prudential Center / Belvidere,42.345959,-71.082578,1917,Customer,,Unknown
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,76,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,139,Dana Park,42.361589,-71.107437,603,Subscriber,1990.0,Male


In [19]:
start_df = new_df[['StartID', 'StartName', 'StartLat', 'StartLon']]
start_df = start_df[~start_df.duplicated()]

In [20]:
stop_df = new_df[['StopID', 'StopName', 'StopLat', 'StopLon']]
stop_df = stop_df[~stop_df.duplicated()]

In [21]:
df = pd.DataFrame(np.concatenate([start_df.as_matrix(), stop_df.as_matrix()], axis=0), 
                  columns=['StationID', 'StationName', 'Lat', 'Lon'])
df = df[~df.duplicated()]
df = df.sort_values('StationID', axis=0)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,StationID,StationName,Lat,Lon
0,1,18 Dorrance Warehouse,42.3872,-71.076
1,3,Colleges of the Fenway,42.34,-71.1008
2,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.3401,-71.1006
3,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.34,-71.1008
4,4,Tremont St. at Berkeley St.,42.3454,-71.0696


In [22]:
df[(df == 0).any(axis=1)]

Unnamed: 0,StationID,StationName,Lat,Lon
301,158,8D OPS 01,0,0
314,164,Lab PBSC,0,0
315,164,Test 000e3,0,0
407,229,8D QC Station 01,0,0
408,230,8D QC Station 02,0,0


In [23]:
# Station mapping
station_map = {}
for sid in set(df.StationID):
    station_map[sid] = set(df[df.StationID == sid].StationName) 

In [25]:
station_map

{1: {'18 Dorrance Warehouse'},
 3: {'Colleges of the Fenway',
  'Colleges of the Fenway - Fenway at Avenue Louis Pasteur'},
 4: {'Tremont St at E Berkeley St', 'Tremont St. at Berkeley St.'},
 5: {'Northeastern U / North Parking Lot',
  'Northeastern University - North Parking Lot'},
 6: {'Cambridge St at Joy St', 'Cambridge St. at Joy St.'},
 7: {'Fan Pier'},
 8: {'Union Square - Brighton Ave at Cambridge St',
  'Union Square - Brighton Ave. at Cambridge St.'},
 9: {'Agganis Arena - 925 Comm Ave.', 'Commonwealth Ave at Buick St'},
 10: {'B.U. Central - 725 Comm. Ave.'},
 11: {'Longwood Ave / Binney St', 'Longwood Ave at Binney St'},
 12: {'Ruggles Station / Columbus Ave.',
  'Ruggles T Stop - Columbus Ave at Melnea Cass Blvd'},
 13: {'Boston Medical Center -  East Concord at Harrison Ave',
  'Boston Medical Center - E Concord St at Harrison Ave'},
 14: {'HMS / HSPH - Ave. Louis Pasteur at Longwood Ave.',
  'HMS/HSPH - Avenue Louis Pasteur at Longwood Ave'},
 15: {'Brighton Mills - 370

In [28]:
# Find similarity of station naming
similarity = []
for key1, val1 in stations.items():
    for key2, val2 in station_map.items():
        item1 = list(val1)[0]
        for item2 in val2:
            similarity.append([key1, str(key2), item1, item2, 
                               fuzz.partial_ratio(str(item1), str(item2)),
                               fuzz.partial_token_sort_ratio(str(item1), str(item2)),
                               fuzz.partial_token_set_ratio(str(item1), str(item2))])        

In [29]:
# Convert similarity calculations to df
mapping_df = pd.DataFrame(similarity, columns=['StationID1', 'StationID2', 
                                               'StationName1', 'StationName2', 
                                               'PartialSimilarity', 'PartialTokenSort', 'PartialTokenSet'])
mapping_df

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
0,A32000,1,Fan Pier,18 Dorrance Warehouse,38,38,38
1,A32000,3,Fan Pier,Colleges of the Fenway,25,38,38
2,A32000,3,Fan Pier,Colleges of the Fenway - Fenway at Avenue Loui...,50,50,50
3,A32000,4,Fan Pier,Tremont St. at Berkeley St.,50,50,50
4,A32000,4,Fan Pier,Tremont St at E Berkeley St,50,50,50
5,A32000,5,Fan Pier,Northeastern U / North Parking Lot,38,50,50
6,A32000,5,Fan Pier,Northeastern University - North Parking Lot,62,50,50
7,A32000,6,Fan Pier,Cambridge St. at Joy St.,25,25,25
8,A32000,6,Fan Pier,Cambridge St at Joy St,25,25,25
9,A32000,7,Fan Pier,Fan Pier,100,100,100


In [30]:
mapping_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61953 entries, 0 to 61952
Data columns (total 7 columns):
StationID1           61953 non-null object
StationID2           61953 non-null object
StationName1         61953 non-null object
StationName2         61953 non-null object
PartialSimilarity    61953 non-null int64
PartialTokenSort     61953 non-null int64
PartialTokenSet      61953 non-null int64
dtypes: int64(3), object(4)
memory usage: 3.3+ MB


In [31]:
# Find maximum similarity match
mapping1 = pd.DataFrame()
mapping2 = pd.DataFrame()
mapping3 = pd.DataFrame()
for ID in set(mapping_df.StationID1):
    tmp = mapping_df[mapping_df.StationID1 == ID]
    mapping1 = pd.concat([mapping1, pd.DataFrame(tmp.iloc[tmp['PartialSimilarity'].values.argmax(), :]).T], axis=0)
    mapping2 = pd.concat([mapping2, pd.DataFrame(tmp.iloc[tmp['PartialTokenSort'].values.argmax(), :]).T], axis=0)
    mapping3 = pd.concat([mapping3, pd.DataFrame(tmp.iloc[tmp['PartialTokenSet'].values.argmax(), :]).T], axis=0)

In [32]:
mapping1

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
10303,B32004,20,Aquarium T Stop - 200 Atlantic Ave,Aquarium T Stop - 200 Atlantic Ave,100,100,100
44097,M32006,67,MIT at Mass Ave / Amherst St,MIT at Mass Ave / Amherst St,100,100,100
45382,M32011,68,Central Square at Mass Ave / Essex St,Central Square at Mass Ave / Essex St,100,100,100
25906,C32024,150,State Street at Channel Center,State Street at Channel Center,100,100,100
52809,M32037,107,Ames St at Main St,Ames St at Main St,100,100,100
17621,B32029,203,Columbia Rd at Ceylon St,Columbia Rd at Ceylon St,100,100,100
39351,E32002,123,JP Center - Centre Street at Myrtle Street,JP Center - Centre Street at Myrtle Street,100,100,100
43789,M32005,80,MIT Stata Center at Vassar St / Main St,MIT Stata Center at Vassar St / Main St,100,100,100
42324,K32005,193,Brookline Village - Station Street @ MBTA,Brookline Village - Station Street @ MBTA TEMP...,100,76,100
32180,D32010,47,Cross St. at Hanover St.,Cross St. at Hanover St.,100,100,100


In [33]:
# Spot check any similarity scores below 100
mapping1[mapping1.PartialSimilarity < 100]

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
22955,C32015,106,Dudley Town Common - Mt Pleasant Ave at Blue H...,Mt Pleasant Ave / Dudley Town Common,62,85,100


In [34]:
mapping1 = mapping1.reset_index(drop=True).sort_values('StationID1')

In [35]:
mapping1

Unnamed: 0,StationID1,StationID2,StationName1,StationName2,PartialSimilarity,PartialTokenSort,PartialTokenSet
179,A32000,7,Fan Pier,Fan Pier,100,100,100
13,A32001,8,Union Square - Brighton Ave. at Cambridge St.,Union Square - Brighton Ave. at Cambridge St.,100,100,100
40,A32002,9,Commonwealth Ave at Buick St,Commonwealth Ave at Buick St,100,100,100
144,A32003,10,B.U. Central - 725 Comm. Ave.,B.U. Central - 725 Comm. Ave.,100,100,100
54,A32004,11,Longwood Ave / Binney St,Longwood Ave / Binney St,100,100,100
39,A32005,15,Harvard Real Estate - Brighton Mills - 370 Wes...,Harvard Real Estate - Brighton Mills - 370 Wes...,100,100,100
33,A32006,17,Harvard University Housing - 111 Western Ave. ...,Harvard University Housing - 111 Western Ave. ...,100,100,100
155,A32008,19,Park Dr at Buswell St,Park Dr at Buswell St,100,100,100
143,A32009,25,Tremont St / W Newton St,Tremont St / W Newton St,100,100,100
177,A32010,22,South Station - 700 Atlantic Ave.,South Station - 700 Atlantic Ave.,100,100,100


#### Map Station Info to Trip Data

In [93]:
# Map alpha-numeric ID to starting station
mapping1.StationID2 = mapping1.StationID2.astype(int)
tmp_df = new_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StartID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StartID': 'StartNum', 'StationID1': 'StartID'})
tmp_df = tmp_df.drop(columns='StationID2')
# Map alpha-numeric ID to stopping station
tmp_df = tmp_df.merge(mapping1[['StationID1', 'StationID2']], how='left', left_on='StopID', right_on='StationID2')
tmp_df = tmp_df.rename(columns={'StopID': 'StopNum', 'StationID1': 'StopID'})
tmp_df = tmp_df.drop(columns=['StationID2', 'StartNum', 'StopNum'])
# Map start city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City']], how='left', left_on='StartID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StartCity'})
tmp_df = tmp_df.drop(columns=['StationID'])
# Map stop city
tmp_df = tmp_df.merge(stations2_df[['StationID', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp_df = tmp_df.rename(columns={'City': 'StopCity'})
tmp_df = tmp_df.drop(columns=['StationID'])

In [94]:
tmp_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartName,StartLat,StartLon,StopName,StopLat,StopLon,BikeID,UserType,BirthYear,Gender,StartID,StopID,StartCity,StopCity
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,Union Square - Somerville,42.379637,-71.095319,1380,Subscriber,1983.0,Female,M32007,S32002,Cambridge,Somerville
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,Nashua Street at Red Auerbach Way,42.365598,-71.064248,866,Subscriber,1980.0,Male,B32008,A32025,Boston,Boston
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,MIT Pacific St at Purrington St,42.359573,-71.101295,Sidney Research Campus/ Erie Street at Waverly,42.357753,-71.103934,35,Subscriber,1990.0,Male,M32041,M32047,Cambridge,Cambridge
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,Tremont St / W Newton St,42.341332,-71.076847,Prudential Center / Belvidere,42.345959,-71.082578,1917,Customer,,Unknown,A32009,C32007,Boston,Boston
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,Dana Park,42.361589,-71.107437,603,Subscriber,1990.0,Male,M32012,M32030,Cambridge,Cambridge


In [95]:
tmp_df.shape

(3735869, 17)

In [96]:
# Map Start Lat, Lon, and City
tmp2_df = old_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City']], how='left', left_on='StartID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StartLat', 'Lon': 'StartLon', 'City': 'StartCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])
# Map Stop Lat, Lon, and City
tmp2_df = tmp2_df.merge(stations2_df[['StationID', 'Lat', 'Lon', 'City']], how='left', left_on='StopID', right_on='StationID')
tmp2_df = tmp2_df.rename(columns={'Lat': 'StopLat', 'Lon': 'StopLon', 'City': 'StopCity'})
tmp2_df = tmp2_df.drop(columns=['StationID'])

In [97]:
tmp2_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StopID,StopName,BikeID,UserType,ZipCode,Gender,StartLat,StartLon,StartCity,StopLat,StopLon,StopCity
0,397,2014-12-31 23:58:00,2015-01-01 00:05:00,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Subscriber,2148.0,Male,42.35977,-71.051601,Boston,42.365885,-71.064548,Boston
1,543,2014-12-31 23:51:00,2015-01-01 00:00:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Subscriber,2143.0,Male,42.3581,-71.093198,Cambridge,42.372969,-71.094445,Cambridge
2,928,2014-12-31 23:30:00,2014-12-31 23:46:00,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Subscriber,2215.0,Male,42.349673,-71.077303,Boston,42.347241,-71.105301,Boston
3,270,2014-12-31 23:26:00,2014-12-31 23:31:00,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Subscriber,2043.0,Male,42.35977,-71.051601,Boston,42.352175,-71.055547,Boston
4,960,2014-12-31 23:07:00,2014-12-31 23:23:00,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Subscriber,1945.0,Male,42.362811,-71.056067,Boston,42.35977,-71.051601,Boston


In [98]:
tmp2_df.shape

(2762938, 17)

#### Combine Old Data with New Data

In [99]:
set(tmp_df.columns).symmetric_difference(set(tmp2_df.columns))

{'BirthYear', 'ZipCode'}

In [120]:
combo = pd.concat([tmp_df, tmp2_df], axis=0)

In [121]:
combo.head()

Unnamed: 0,BikeID,BirthYear,Duration,Gender,StartCity,StartID,StartLat,StartLon,StartName,StartTime,StopCity,StopID,StopLat,StopLon,StopName,StopTime,UserType,ZipCode
0,1380,1983.0,196,Female,Cambridge,M32007,42.372969,-71.094445,Cambridge St - at Columbia St / Webster Ave,2016-11-01 00:00:35,Somerville,S32002,42.379637,-71.095319,Union Square - Somerville,2016-11-01 00:03:52,Subscriber,
1,866,1980.0,222,Male,Boston,B32008,42.35892,-71.057629,Mayor Martin J Walsh - 28 State St,2016-11-01 00:01:23,Boston,A32025,42.365598,-71.064248,Nashua Street at Red Auerbach Way,2016-11-01 00:05:06,Subscriber,
2,35,1990.0,483,Male,Cambridge,M32041,42.359573,-71.101295,MIT Pacific St at Purrington St,2016-11-01 00:04:19,Cambridge,M32047,42.357753,-71.103934,Sidney Research Campus/ Erie Street at Waverly,2016-11-01 00:12:22,Subscriber,
3,1917,,141,Unknown,Boston,A32009,42.341332,-71.076847,Tremont St / W Newton St,2016-11-01 00:05:31,Boston,C32007,42.345959,-71.082578,Prudential Center / Belvidere,2016-11-01 00:07:53,Customer,
4,603,1990.0,154,Male,Cambridge,M32012,42.366426,-71.105495,Central Sq Post Office / Cambridge City Hall a...,2016-11-01 00:05:59,Cambridge,M32030,42.361589,-71.107437,Dana Park,2016-11-01 00:08:34,Subscriber,


### Save Data

In [126]:
tmp_df = tmp_df[combo.columns[:-1]]

In [None]:
# Save new df
with open('new_mapped.pkl', 'wb') as f:
    pickle.dump(tmp_df, f)

In [140]:
y = list(combo.columns[:-2])
y.append('ZipCode')
tmp2_df.columns = y

In [141]:
# Save old df
with open('old_mapped.pkl', 'wb') as f:
    pickle.dump(tmp2_df, f)

In [144]:
# Save combo df
combo = combo.drop(columns=['UserType', 'ZipCode'])
with open('combo_mapped.pkl', 'wb') as f:
    pickle.dump(combo, f)