In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pickle

### Load Data

In [None]:
# Define trip data path.
# Defaults to the original author's local directory; set the BIKE_TRIP_DATA_PATH
# environment variable to point the notebook at a different copy of the data.
trip_data_path = os.environ.get(
    'BIKE_TRIP_DATA_PATH',
    '/home/cneiderer/Metis/boston_bike_data/trip_data',
)

In [None]:
# Build dataframes from old data (pre General Bikeshare Feed Specification (GBFS))
# and new data (post General Bikeshare Feed Specification (GBFS)).
# Files whose name starts with 'hubway' feed the old frame; every other file feeds the new one.
old_frames = []
new_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order stable between runs
for f in sorted(os.listdir(trip_data_path)):
    file_path = os.path.join(trip_data_path, f)
    if not os.path.isfile(file_path):
        # Sub-directories (e.g. notebook checkpoints) cannot be read as CSV
        continue
    tmp_df = pd.read_csv(file_path)
    if f.startswith('hubway'):
        old_frames.append(tmp_df)
    else:
        new_frames.append(tmp_df)
# Concatenate once per group: concatenating inside the loop re-copies every
# previously loaded row for each additional file.
old_df = pd.concat(old_frames, axis=0, ignore_index=True) if old_frames else pd.DataFrame()
new_df = pd.concat(new_frames, axis=0, ignore_index=True) if new_frames else pd.DataFrame()

In [None]:
# Persist the raw old-format frame so it can be reloaded without re-reading the CSVs
with open('old_raw.pkl', mode='wb') as raw_file:
    pickle.dump(old_df, raw_file)

In [None]:
# Persist the raw new-format frame so it can be reloaded without re-reading the CSVs
with open('new_raw.pkl', mode='wb') as raw_file:
    pickle.dump(new_df, raw_file)

### Clean-Up Old Data

In [50]:
# Reload the raw old-format frame from its pickle.
# NOTE(review): unpickling executes code stored in the file; only load pickles written by this notebook.
with open('old_raw.pkl', mode='rb') as raw_file:
    old_df = pickle.load(raw_file)

In [51]:
# Summarize column names, dtypes and memory use of the freshly loaded old-format frame
old_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2762938 entries, 0 to 2762937
Data columns (total 11 columns):
Duration                int64
Start date              object
End date                object
Start station number    object
Start station name      object
End station number      object
End station name        object
Bike number             object
Member type             object
Zip code                object
Gender                  object
dtypes: int64(1), object(10)
memory usage: 231.9+ MB


In [52]:
# Preview the first rows of the raw old-format frame
old_df.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station name,End station number,End station name,Bike number,Member type,Zip code,Gender
0,397101,12/31/2014 23:58,1/1/2015 0:05,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Member,2148,Male
1,542806,12/31/2014 23:51,1/1/2015 0:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Member,2143,Male
2,927775,12/31/2014 23:30,12/31/2014 23:46,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Member,2215,Male
3,269771,12/31/2014 23:26,12/31/2014 23:31,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Member,2043,Male
4,959775,12/31/2014 23:07,12/31/2014 23:23,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Member,1945,Male


In [54]:
# Swap the raw headers for short CamelCase names.
# Assignment is positional, so the list order must match the raw column order.
old_df.columns = [
    'Duration',
    'StartTime',
    'StopTime',
    'StartID',
    'StartName',
    'StopID',
    'StopName',
    'BikeID',
    'UserType',
    'ZipCode',
    'Gender',
]

In [55]:
# Preview the old-format frame after the column rename
old_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StopID,StopName,BikeID,UserType,ZipCode,Gender
0,397101,12/31/2014 23:58,1/1/2015 0:05,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Member,2148,Male
1,542806,12/31/2014 23:51,1/1/2015 0:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Member,2143,Male
2,927775,12/31/2014 23:30,12/31/2014 23:46,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Member,2215,Male
3,269771,12/31/2014 23:26,12/31/2014 23:31,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Member,2043,Male
4,959775,12/31/2014 23:07,12/31/2014 23:23,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Member,1945,Male


In [81]:
# Parse the 'month/day/year hour:minute' strings into datetime columns
for time_col in ('StartTime', 'StopTime'):
    old_df[time_col] = pd.to_datetime(old_df[time_col], format='%m/%d/%Y %H:%M')

In [82]:
# Raw durations are divided by 1000 and rounded to whole seconds.
# The column is overwritten in place, so re-running this cell divides by 1000 again.
duration_sec = old_df['Duration'].astype(int) / 1000
old_df['Duration'] = np.round(duration_sec).astype(int)

In [83]:
# Remap user types to the labels used by the new-format data ('Subscriber' / 'Customer').
# replace() is used rather than map(): map() turns every value missing from the
# dict into NaN, so an unexpected label - or simply re-running this cell on the
# already remapped column - would silently blank it out.
old_df.UserType = old_df.UserType.replace({'Member': 'Subscriber', 'Casual': 'Customer'})
# Label missing gender values explicitly (existing values are left untouched)
old_df.Gender = old_df.Gender.fillna('Unknown')

In [84]:
# Zip code conversion helper function
def convert_zipcode(z):
    """Coerce a raw zip-code entry to an integer where possible.

    Non-string inputs (numbers, NaN, None) are returned unchanged. Strings are
    first parsed with ``int``. If that fails, the first six characters are
    retried after trimming '-' from both ends and removing spaces, which
    recovers the leading part of entries such as '02139-4307'. Strings that
    still cannot be parsed become ``np.nan``.

    Parameters
    ----------
    z : object
        Raw zip-code value.

    Returns
    -------
    int, float or the original object
        Parsed integer, ``np.nan`` for unparseable strings, or ``z`` itself
        when it is not a string.
    """
    if not isinstance(z, str):
        return z
    try:
        return int(z)
    except ValueError:
        pass
    # The fallback used to wrap the cleaned text in a tuple together with type(z),
    # so int() always failed and every such entry ended up as NaN.
    candidate = z[:6].strip('-').replace(' ', '')
    try:
        return int(candidate)
    except ValueError:
        return np.nan
# Run every zip-code entry through the helper, element by element
old_df['ZipCode'] = old_df['ZipCode'].map(convert_zipcode)

In [85]:
# Preview the old-format frame after the type conversions
old_df.head()

Unnamed: 0,Duration,StartTime,StopTime,StartID,StartName,StopID,StopName,BikeID,UserType,ZipCode,Gender
0,397,2014-12-31 23:58:00,2015-01-01 00:05:00,B32004,Aquarium Station - 200 Atlantic Ave.,D32022,TD Garden - Causeway at Portal Park #1,T01335,Subscriber,2148.0,Male
1,543,2014-12-31 23:51:00,2015-01-01 00:00:00,M32006,MIT at Mass Ave / Amherst St,M32007,Cambridge St - at Columbia St / Webster Ave,T01426,Subscriber,2143.0,Male
2,928,2014-12-31 23:30:00,2014-12-31 23:46:00,D32005,Boston Public Library - 700 Boylston St.,A32008,Buswell St. at Park Dr.,B01570,Subscriber,2215.0,Male
3,270,2014-12-31 23:26:00,2014-12-31 23:31:00,B32004,Aquarium Station - 200 Atlantic Ave.,A32010,South Station - 700 Atlantic Ave.,T01205,Subscriber,2043.0,Male
4,960,2014-12-31 23:07:00,2014-12-31 23:23:00,D32010,Cross St. at Hanover St.,B32004,Aquarium Station - 200 Atlantic Ave.,T01306,Subscriber,1945.0,Male


In [86]:
# Confirm the dtypes of the old-format frame after the conversions
old_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2762938 entries, 0 to 2762937
Data columns (total 11 columns):
Duration     int64
StartTime    datetime64[ns]
StopTime     datetime64[ns]
StartID      object
StartName    object
StopID       object
StopName     object
BikeID       object
UserType     object
ZipCode      float64
Gender       object
dtypes: datetime64[ns](2), float64(1), int64(1), object(7)
memory usage: 231.9+ MB


In [None]:
# Persist the cleaned old-format frame
with open('old_clean.pkl', mode='wb') as clean_file:
    pickle.dump(old_df, clean_file)

### Clean-Up New Data

In [5]:
# Load new df (the raw new-format frame).
# NOTE(review): unpickling executes code stored in the file; only load pickles written by this notebook.
with open('new_raw.pkl', mode='rb') as raw_file:
    new_df = pickle.load(raw_file)

In [6]:
# Summarize column names, dtypes and memory use of the freshly loaded new-format frame
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3731292 entries, 0 to 3731291
Data columns (total 15 columns):
tripduration               int64
starttime                  object
stoptime                   object
start station id           int64
start station name         object
start station latitude     float64
start station longitude    float64
end station id             object
end station name           object
end station latitude       object
end station longitude      object
bikeid                     int64
usertype                   object
birth year                 object
gender                     int64
dtypes: float64(2), int64(4), object(9)
memory usage: 427.0+ MB


In [7]:
# Preview the first rows of the raw new-format frame
new_df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,196,2016-11-01 00:00:35,2016-11-01 00:03:52,95,Cambridge St - at Columbia St / Webster Ave,42.372969,-71.094445,78,Union Square - Somerville,42.3796,-71.0953,1380,Subscriber,1983,2
1,222,2016-11-01 00:01:23,2016-11-01 00:05:06,23,Mayor Martin J Walsh - 28 State St,42.35892,-71.057629,190,Nashua Street at Red Auerbach Way,42.3656,-71.0642,866,Subscriber,1980,1
2,483,2016-11-01 00:04:19,2016-11-01 00:12:22,178,MIT Pacific St at Purrington St,42.359573,-71.101295,184,Sidney Research Campus/ Erie Street at Waverly,42.3578,-71.1039,35,Subscriber,1990,1
3,141,2016-11-01 00:05:31,2016-11-01 00:07:53,25,Tremont St / W Newton St,42.341332,-71.076847,21,Prudential Center / Belvidere,42.346,-71.0826,1917,Customer,\N,0
4,154,2016-11-01 00:05:59,2016-11-01 00:08:34,76,Central Sq Post Office / Cambridge City Hall a...,42.366426,-71.105495,139,Dana Park,42.3616,-71.1074,603,Subscriber,1990,1


In [None]:
# Swap the raw headers for the same CamelCase scheme used for the old data.
# Assignment is positional, so the list order must match the raw column order.
new_df.columns = [
    'Duration',
    'StartTime',
    'StopTime',
    'StartID',
    'StartName',
    'StartLat',
    'StartLon',
    'StopID',
    'StopName',
    'StopLat',
    'StopLon',
    'BikeID',
    'UserType',
    'BirthYear',
    'Gender',
]

In [None]:
# Preview the new-format frame after the column rename
new_df.head()

In [None]:
# Convert timestamps to datetime objects
new_df.StartTime = pd.to_datetime(new_df.StartTime)
new_df.StopTime = pd.to_datetime(new_df.StopTime)
# Convert duration to integer
new_df.Duration = new_df.Duration.astype(int)
# Convert lat and lon to float.
# The stop coordinates load as object dtype, so some entries are not plain numbers.
# NOTE(review): presumably they carry the same '\N' missing-value marker that shows
# up in birth year - confirm. The marker is blanked to NaN first so that
# astype(float) does not raise on it; any other non-numeric text still raises.
for coord_col in ['StartLat', 'StartLon', 'StopLat', 'StopLon']:
    coord = new_df[coord_col]
    new_df[coord_col] = coord.mask(coord == '\\N').astype(float)
# Remap birth year: replace the '\N' marker with NaN (vectorized; other values are kept as-is)
new_df.BirthYear = new_df.BirthYear.mask(new_df.BirthYear == '\\N')
# Remap gender codes to labels; map() yields NaN for any value outside 0/1/2,
# so this line must not be re-run on the already remapped column
new_df.Gender = new_df.Gender.map({0: 'Unknown', 1: 'Male', 2: 'Female'})

In [None]:
# Preview the new-format frame after the type conversions
new_df.head()

In [None]:
# Swap the '\N' missing-value marker in birth year for NaN, keeping every other value as-is
new_df['BirthYear'] = [
    year if year != '\\N' else np.nan
    for year in new_df['BirthYear']
]

In [None]:
# Convert the four coordinate columns to float.
# NOTE(review): the stop coordinates load as object dtype, presumably because of the
# '\N' missing-value marker seen in birth year - confirm. The marker is blanked to
# NaN first so that astype(float) does not raise on it; other non-numeric text still raises.
for coord_col in ['StartLat', 'StartLon', 'StopLat', 'StopLon']:
    coord = new_df[coord_col]
    new_df[coord_col] = coord.mask(coord == '\\N').astype(float)

In [None]:
# `tmp1` is not assigned anywhere in the visible notebook, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): presumably it was the (start station id, start station name) slice of
# the new-format frame, mirroring how tmp2 is sliced from old_df - confirm.
# It is rebuilt here from the renamed columns under the raw header names used below.
tmp1 = new_df[['StartID', 'StartName']].rename(
    columns={'StartID': 'start station id', 'StartName': 'start station name'}
)
# Map each start-station id to the set of distinct names recorded for it.
# A single grouped pass replaces one full-frame boolean scan per station id.
id_map1 = (
    tmp1.groupby('start station id', sort=False)['start station name']
    .apply(set)
    .to_dict()
)
id_map1

In [None]:
from difflib import SequenceMatcher

In [None]:
# difflib similarity between the first two names recorded for station 151
names_151 = list(id_map1[151])
SequenceMatcher(None, names_151[0], names_151[1]).ratio()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Score the first two names recorded for station 151 with each fuzzywuzzy scorer
station_names = list(id_map1[151])
print(station_names)
first_name, second_name = station_names[0], station_names[1]
for label, scorer in (
    ('Ratio:', fuzz.ratio),
    ('Partial Ratio:', fuzz.partial_ratio),
    ('Token Sort Ratio:', fuzz.token_sort_ratio),
    ('Token Set Ratio:', fuzz.token_set_ratio),
):
    print(label, scorer(first_name, second_name))


In [None]:
import itertools

In [None]:
# Every unordered pair of names recorded for station 130
[name_pair for name_pair in itertools.combinations(id_map1[130], 2)]

In [None]:
# For every station id, score each pair of recorded names with fuzz.partial_ratio.
# Each value is a list of [name_pair, score] entries.
similarity = {}
for station_id, names in id_map1.items():
    name_list = list(names)
    if len(name_list) > 1:
        similarity[station_id] = [
            [pair, fuzz.partial_ratio(pair[0], pair[1])]
            for pair in itertools.combinations(name_list, 2)
        ]
    else:
        # A station with a single name used to be stored as [set_of_names, 1]:
        # a different shape from the multi-name entries and a score of 1, which
        # reads as a near-total mismatch on fuzzywuzzy's 0-100 scale. Store it
        # in the same layout as a perfect self-match instead.
        similarity[station_id] = [[(name, name), 100] for name in name_list]

In [None]:
# Display the per-station name-similarity results
similarity

In [None]:
# Slice the start-station id/name pair out of the old-format frame.
# old_df's columns were renamed during clean-up, so selecting the raw header names
# raised a KeyError when the notebook is run top to bottom. Select the renamed
# columns and restore the raw names that the following cells look up.
tmp2 = old_df[['StartID', 'StartName']].rename(
    columns={'StartID': 'Start station number', 'StartName': 'Start station name'}
)

In [None]:
# Map each old-format start-station number to the set of distinct names recorded for it.
# A single grouped pass replaces one full-frame boolean scan per station; groupby
# also skips rows with a missing station number, which would otherwise show up as
# a NaN key holding an empty set.
id_map2 = (
    tmp2.groupby('Start station number', sort=False)['Start station name']
    .apply(set)
    .to_dict()
)
id_map2

In [None]:
# Station metadata directory and file names.
# The directory defaults to the original author's local path; set the
# BIKE_STATION_DATA_PATH environment variable to use a different copy of the data.
station_info_path = os.environ.get(
    'BIKE_STATION_DATA_PATH',
    '/home/cneiderer/Metis/boston_bike_data/station_data',
)
stations1_file = 'Hubway_Stations_2011_2016.csv'
stations2_file = 'Hubway_Stations_as_of_July_2017.csv'

In [None]:
# Read both station tables from the station data directory
stations1_path = os.path.join(station_info_path, stations1_file)
stations2_path = os.path.join(station_info_path, stations2_file)
stations1_df = pd.read_csv(stations1_path)
stations2_df = pd.read_csv(stations2_path)

In [None]:
# Preview the first station table
stations1_df.head()

In [None]:
# Show the first station table ordered by station ID (the frame itself is not modified)
stations1_df.sort_values(by='Station ID')

In [None]:
# Preview the second station table
stations2_df.head()

In [None]:
# Keep a copy of the second station table ordered by station ID and display it
sorted_df = stations2_df.sort_values(by='Station ID')
sorted_df

In [None]:
# Rows of the sorted station table whose ID is 'D32007'
sorted_df.loc[sorted_df['Station ID'].eq('D32007')]

In [None]:
# Restrict and reorder the first station table to the columns of the second one
stations1_df = stations1_df.loc[:, stations2_df.columns]

In [None]:
# Stack the two station tables row-wise (original row labels are kept)
df = pd.concat((stations1_df, stations2_df))

In [None]:
# Drop rows that repeat an earlier row exactly, keeping the first occurrence
df = df.drop_duplicates()

In [None]:
# Entries for station 'D32007' in the second station table
t4 = stations2_df.loc[stations2_df['Station ID'].eq('D32007')]
t4.head(5)

In [None]:
# Entries for station 'D32007' in the first station table
t5 = stations1_df.loc[stations1_df['Station ID'].eq('D32007')]
t5.head(5)

In [None]:
# Outer-join the two station tables on station ID so stations present in either table are kept
stations = pd.merge(stations1_df, stations2_df, on='Station ID', how='outer')
stations

In [None]:
# Preview the old-format trip frame before joining it to the station table
old_df.head()

In [None]:
# Right-join the old trips' start-station ids onto the second station table's coordinates.
# old_df's columns were renamed during clean-up, so selecting the raw header
# 'Start station number' raised a KeyError when the notebook is run top to bottom.
# Select the renamed column and restore the raw name that the following cells look up.
xx = (
    old_df[['StartID']]
    .rename(columns={'StartID': 'Start station number'})
    .merge(
        stations2_df[['Station ID', 'Latitude', 'Longitude']],
        how='right',
        left_on='Start station number',
        right_on='Station ID',
    )
)

In [None]:
# Distinct start-station numbers among joined rows that contain any missing value
rows_with_gaps = xx.isna().any(axis=1)
set(xx.loc[rows_with_gaps, 'Start station number'])

In [None]:
# Joined rows whose start-station number is 'A32007'
xx.loc[xx['Start station number'].eq('A32007')]

In [None]:
# Distinct station-table IDs among joined rows that contain any missing value
rows_with_gaps = xx.isna().any(axis=1)
set(xx.loc[rows_with_gaps, 'Station ID'])