In [37]:
import pandas as pd
import numpy as np
import feather
%matplotlib inline

### Part I. Get the StopID location from googletransit data

In [38]:
# Read and prepare data
stops = pd.read_csv('./googletransit20121129-1547/stops.csv')
stops['StopID'] = stops['stop_id'].str[-4:]
stops = stops.rename(columns = {'stop_lat':'Lat'})
stops = stops.rename(columns = {'stop_lon':'Lon'})
googletransit_stops = stops.loc[:,['StopID', 'Lat', 'Lon']]

### Part II. For those StopID not in googletransit, use the maximun occurance in gps data as its location

In [39]:
res = pd.read_feather('DBus_stage_two_clean_v4.feather')

In [40]:
# Get the StopID list in gps data
gb_gps_stops = res.groupby('StopID')
gb_gps_stops_c = gb_gps_stops.count()
gps_stops = gb_gps_stops_c.reset_index('StopID')
gps_stops = gps_stops.loc[:,['StopID', 'AtStop']]
gps_stops = gps_stops.rename(columns = {'AtStop':'Count'})
gps_stops

Unnamed: 0,StopID,Count
0,0010,65360
1,0100,8643
2,1000,5076
3,1001,14737
4,1002,7186
5,1003,5369
6,1004,4372
7,1005,10585
8,1006,3828
9,1007,7803


In [41]:
# Compare stops in gps data and googletransit data
compare_stops = pd.merge(gps_stops, stops, on='StopID', how='outer')
compare_stops

Unnamed: 0,StopID,Count,stop_id,stop_name,Lat,Lon
0,0010,65360.0,8220DB000010,"Parnell Square, Rotunda",53.353383,-6.265389
1,0100,8643.0,8230DB000100,"Wellington Lane, Templeogue",53.303083,-6.322361
2,1000,5076.0,8220DB001000,"Oscar Traynor Rd, Coolock",53.390974,-6.201794
3,1001,14737.0,8220DB001001,"Oscar Traynor Rd, Coolock",53.390081,-6.199200
4,1002,7186.0,8220DB001002,"Tonlegee Rd, Coolock",53.389219,-6.195117
5,1003,5369.0,8220DB001003,"Tonlegee Rd, Coolock",53.390325,-6.190965
6,1004,4372.0,8220DB001004,"Tonlegee Rd, Edenmore",53.390719,-6.186107
7,1005,10585.0,8220DB001005,"Tonlegee Rd, Edenmore",53.390997,-6.181930
8,1006,3828.0,8220DB001006,"Tonlegee Rd, Edenmore",53.391170,-6.176811
9,1007,7803.0,8220DB001007,"Tonlegee Rd, Edenmore",53.391220,-6.172433


In [42]:
# Get StopID not in googletransit
not_in_googletransit = compare_stops[compare_stops.Lat.isnull()]['StopID']

In [43]:
# Get the subset of whole rows that StopID is not in googletransit
res_no_LonLat_in_googletransit = res[res.StopID.isin(not_in_googletransit)]

In [44]:
# Start to find the Lon/Lat by using the max co-ordinate occurance in gps data
# Here mark off select AtStop == True is some StopID even only have no AtStop == True

res_no_LonLat_in_googletransit = res_no_LonLat_in_googletransit[res_no_LonLat_in_googletransit.AtStop == True]
res_no_LonLat_in_googletransit['Lat/Lon'] = res_no_LonLat_in_googletransit.loc[:,['Lat','Lon']].apply(lambda x: str(x[0]) + "," + str(x[1]), axis=1)

In [45]:
# Get the count of each occurance of Lat and Lon
gb = res_no_LonLat_in_googletransit.groupby(['StopID', 'Lat/Lon'])
count = gb.count()
gb_count = count[count.AtStop > 0]
gb_count = gb_count.reset_index(['StopID', 'Lat/Lon'])
gb_count = gb_count.loc[:,['StopID', 'Lat/Lon','AtStop']]

In [46]:
# Get the maximun occurance of Lat/Lon of each StopID
max_count = gb_count[gb_count.groupby(['StopID'])['AtStop'].transform('max') == gb_count['AtStop']]

In [47]:
# Separate Lat/Lon to Lat and Lon
max_count['Lat'] = max_count['Lat/Lon'].apply(lambda x: x[:x.index(',')])
max_count['Lon'] = max_count['Lat/Lon'].apply(lambda x: x[x.index(',')+1:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
# Change the data type first
max_count['Lat'] = max_count['Lat'].convert_objects(convert_numeric=True)
max_count['Lon'] = max_count['Lon'].convert_objects(convert_numeric=True)

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
gb = max_count.groupby('StopID').mean()
stopID_lat_lon = gb[gb.AtStop>0]
stopID_lat_lon = stopID_lat_lon.reset_index('StopID')
gps_stops = stopID_lat_lon.loc[:,['StopID', 'Lat', 'Lon']]

In [50]:
# For those only exists AtStop == False


all_require_stops = set(not_in_googletransit)
calaulated_stops = set(gps_stops.StopID.unique())

need_to_calculate = all_require_stops - calaulated_stops
need_to_calculate

{'0920',
 '0921',
 '0922',
 '0932',
 '0954',
 '0955',
 '7485',
 '7497',
 '7503',
 '7504',
 '7541'}

In [51]:
res_no_LonLat_in_googletransit_2 = res[res.StopID.isin(need_to_calculate)]
res_no_LonLat_in_googletransit_2

Unnamed: 0,level_0,index,AtStop,Day,Delay,JourneyPatternID,Lat,Lon,Route,StopID,TimeFrame,Timestamp,VehicleID,VehicleJourneyID,DateTime,Rain_Avg,Wind_Speed_Avg,SchoolHoliday
57227138,57227138,737411,False,Wednesday,-504,00390002,53.394142,-6.391830,0039,7497,2013-01-30,2013-01-30 23:58:25,36060,3256,2013-01-30 23,0.000000,15.5,False
57227139,57227139,737412,False,Saturday,73,00390002,53.394196,-6.391732,0039,7497,2013-01-26,2013-01-27 00:02:42,36059,10172,2013-01-27 00,0.533333,23.0,True
57227140,57227140,737413,False,Friday,-406,00390002,53.394341,-6.391468,0039,7497,2013-01-25,2013-01-25 23:59:49,36059,3256,2013-01-25 23,0.000000,18.0,False
57227141,57227141,737414,False,Thursday,58,00390002,53.394142,-6.391830,0039,7497,2013-01-24,2013-01-25 00:07:49,36069,3256,2013-01-25 00,0.000000,11.5,False
57227142,57227142,737415,False,Wednesday,-262,00390002,53.394398,-6.391368,0039,7497,2013-01-23,2013-01-24 00:02:05,36060,3256,2013-01-24 00,0.000000,5.0,False
57227143,57227143,737416,False,Tuesday,-442,00390002,53.394073,-6.391952,0039,7497,2013-01-22,2013-01-22 23:59:23,36060,3256,2013-01-22 23,0.000000,3.0,False
57227144,57227144,737417,False,Saturday,222,00390002,53.394123,-6.391864,0039,7497,2013-01-19,2013-01-20 00:05:27,33558,10172,2013-01-20 00,0.000000,9.0,True
57227145,57227145,737418,False,Friday,-122,00390002,53.394264,-6.391611,0039,7497,2013-01-18,2013-01-19 00:04:22,36066,3256,2013-01-19 00,0.200000,10.0,False
57227146,57227146,737419,False,Thursday,-475,00390002,53.394073,-6.391952,0039,7497,2013-01-17,2013-01-17 23:58:45,36049,3256,2013-01-17 23,0.100000,19.0,False
57227147,57227147,737420,False,Thursday,-454,00390002,53.394073,-6.391952,0039,7497,2013-01-17,2013-01-17 23:59:03,36049,3256,2013-01-17 23,0.100000,19.0,False


In [52]:
res_no_LonLat_in_googletransit_2['Lat/Lon'] = res_no_LonLat_in_googletransit_2.loc[:,['Lat','Lon']].apply(lambda x: str(x[0]) + "," + str(x[1]), axis=1)


# Get the count of each occurance of Lat and Lon
gb = res_no_LonLat_in_googletransit_2.groupby(['StopID', 'Lat/Lon'])
count = gb.count()
gb_count = count[count.AtStop > 0]
gb_count = gb_count.reset_index(['StopID', 'Lat/Lon'])
gb_count = gb_count.loc[:,['StopID', 'Lat/Lon','AtStop']]

# Get the maximun occurance of Lat/Lon of each StopID
max_count = gb_count[gb_count.groupby(['StopID'])['AtStop'].transform('max') == gb_count['AtStop']]

# Separate Lat/Lon to Lat and Lon
max_count['Lat'] = max_count['Lat/Lon'].apply(lambda x: x[:x.index(',')])
max_count['Lon'] = max_count['Lat/Lon'].apply(lambda x: x[x.index(',')+1:])

# Change the data type first
max_count['Lat'] = max_count['Lat'].convert_objects(convert_numeric=True)
max_count['Lon'] = max_count['Lon'].convert_objects(convert_numeric=True)


gb = max_count.groupby('StopID').mean()
stopID_lat_lon = gb[gb.AtStop>0]
stopID_lat_lon = stopID_lat_lon.reset_index('StopID')
gps_stops_2 = stopID_lat_lon.loc[:,['StopID', 'Lat', 'Lon']]

gps_stops_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

Unnamed: 0,StopID,Lat,Lon
0,7485,53.4186,-6.25419
1,7497,53.3941,-6.39195
2,7503,53.4204,-6.30685
3,7504,53.4183,-6.30691
4,7541,53.4546,-6.17676


In [55]:
# Concate all
stops = pd.concat([googletransit_stops, gps_stops, gps_stops_2], axis=0)

In [57]:
stops.to_csv('input1_StopID_LonLat.csv', index=False)