# NY CitiBike Analysis: Geospatial Plot

## Import Libraries and Load Data

In [1]:
# Import libraries
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Load data
df = pd.read_csv('new_york_data.csv', index_col = 0)

  df = pd.read_csv('new_york_data.csv', index_col = 0)


In [3]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,value,bike_rides_daily
0,DF208007EE5F9D79,classic_bike,2022-08-26 15:21:44,2022-08-26 15:34:20,Berkeley Pl & 6 Ave,4134.06,Windsor Pl & 8 Ave,3620.02,40.67653,-73.978469,40.660906,-73.983074,member,2022-08-26,27.4,1,111955
1,4BF322F1E0D74152,classic_bike,2022-08-26 16:08:53,2022-08-26 16:17:48,W 44 St & 11 Ave,6756.05,8 Ave & W 33 St,6450.12,40.762009,-73.996975,40.751551,-73.993934,member,2022-08-26,27.4,1,111955
2,301E3D811B0D5219,classic_bike,2022-08-26 02:34:33,2022-08-26 08:29:41,9 Ave & W 45 St,6717.06,8 Ave & W 33 St,6450.12,40.760193,-73.991255,40.751551,-73.993934,member,2022-08-26,27.4,1,111955
3,7D4E20D0A43FCE1F,classic_bike,2022-08-26 11:25:26,2022-08-26 11:52:19,Central Ave & Covert St,4550.05,DeKalb Ave & Hudson Ave,4513.06,40.68929,-73.90951,40.689888,-73.981013,casual,2022-08-26,27.4,1,111955
4,BB560C341D35EB7D,electric_bike,2022-08-26 19:46:08,2022-08-26 19:48:11,21 St & 43 Ave,6395.01,9 St & 44 Rd,6361.03,40.750525,-73.945948,40.74966,-73.9521,member,2022-08-26,27.4,1,111955


## Data Preprocessing

In [4]:
# Take random sample of df
percentage = 0.001
subset_size = int(len(df) * percentage)

In [5]:
df_subset = df.sample(n = subset_size)

In [6]:
df_subset.shape

(30689, 17)

In [7]:
df_subset.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,value,bike_rides_daily
29824686,DDCE235CF727D34F,classic_bike,2022-12-07 19:13:05,2022-12-07 19:34:58,W 27 St & 7 Ave,6247.06,E 45 St & 3 Ave,6464.08,40.746647,-73.993915,40.752554,-73.972826,casual,2022-12-07,13.8,1,76862
16486601,3EC677B6728E518D,electric_bike,2022-06-12 14:55:39,2022-06-12 15:08:37,E 20 St & FDR Dr,5886.13,E 7 St & Ave B,5584.05,40.733209,-73.975681,40.725129,-73.981317,member,2022-06-12,20.5,1,92658
11574941,18B25B4F643BEFFE,electric_bike,2022-11-02 18:30:34,2022-11-02 19:12:19,Central Park West & W 72 St,7141.07,Washington Square E,5755.09,40.775794,-73.976206,40.730494,-73.995721,casual,2022-11-02,17.6,1,121039
10744025,99F4CC47B9BD5FF5,electric_bike,2022-04-11 16:45:02,2022-04-11 16:49:55,E 16 St & 5 Ave,6022.04,W 21 St & 6 Ave,6140.05,40.737262,-73.99239,40.74174,-73.994156,casual,2022-04-11,8.8,1,77371
17186439,9540001D16D850C5,classic_bike,2022-06-05 21:54:17,2022-06-05 22:17:04,Hudson St & W 13 St,6115.06,8 Ave & W 52 St,6816.07,40.740057,-74.005274,40.763707,-73.985162,member,2022-06-05,20.0,1,118762


In [8]:
# Group by start and end station
df_group = df_subset.groupby(['start_station_name', 'end_station_name'])['value'].count().reset_index()

In [9]:
df_group.rename(columns = {'from_station_name': 'start_station_name', 'to_station_name': 'end_station_name',
                          'value': 'trips'}, inplace = True)

In [10]:
df_group.shape

(26088, 3)

In [11]:
df_group.head()

Unnamed: 0,start_station_name,end_station_name,trips
0,1 Ave & E 110 St,1 Ave & E 110 St,1
1,1 Ave & E 110 St,3 Ave & E 112 St,1
2,1 Ave & E 110 St,E 102 St & Park Ave,1
3,1 Ave & E 110 St,E 141 St & Jackson Ave,1
4,1 Ave & E 110 St,E 72 St & York Ave,1


In [12]:
# Subset of start stations
start_stations = df_subset[['start_station_name', 'start_lat', 'start_lng']]

In [13]:
start_stations.rename(columns = {'start_station_name': 'station', 'start_lat': 'lat', 'start_lng': 'lng'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  start_stations.rename(columns = {'start_station_name': 'station', 'start_lat': 'lat', 'start_lng': 'lng'}, inplace = True)


In [14]:
start_stations.shape

(30689, 3)

In [15]:
start_stations.head()

Unnamed: 0,station,lat,lng
29824686,W 27 St & 7 Ave,40.746647,-73.993915
16486601,E 20 St & FDR Dr,40.733209,-73.975681
11574941,Central Park West & W 72 St,40.775794,-73.976206
10744025,E 16 St & 5 Ave,40.737262,-73.99239
17186439,Hudson St & W 13 St,40.740057,-74.005274


In [16]:
# Subset of end stations
end_stations = df_subset[['end_station_name', 'end_lat', 'end_lng']]

In [17]:
end_stations.rename(columns = {'end_station_name': 'station', 'end_lat': 'lat', 'end_lng': 'lng'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  end_stations.rename(columns = {'end_station_name': 'station', 'end_lat': 'lat', 'end_lng': 'lng'}, inplace = True)


In [18]:
end_stations.shape

(30689, 3)

In [19]:
end_stations.head()

Unnamed: 0,station,lat,lng
29824686,E 45 St & 3 Ave,40.752554,-73.972826
16486601,E 7 St & Ave B,40.725129,-73.981317
11574941,Washington Square E,40.730494,-73.995721
10744025,W 21 St & 6 Ave,40.74174,-73.994156
17186439,8 Ave & W 52 St,40.763707,-73.985162


In [20]:
# Combine to create df of stations and their coordinates
df_stations = pd.concat([start_stations, end_stations])

In [21]:
df_stations['start_station_name'] = df_stations['station']

In [22]:
df_stations['end_station_name'] = df_stations['station']

In [23]:
df_stations.drop('station', axis = 1, inplace = True)

In [24]:
df_stations.reset_index(drop = True, inplace = True)

In [25]:
df_stations = df_stations.drop_duplicates()

In [26]:
df_stations.shape

(6000, 4)

In [27]:
df_stations.head()

Unnamed: 0,lat,lng,start_station_name,end_station_name
0,40.746647,-73.993915,W 27 St & 7 Ave,W 27 St & 7 Ave
1,40.733209,-73.975681,E 20 St & FDR Dr,E 20 St & FDR Dr
2,40.775794,-73.976206,Central Park West & W 72 St,Central Park West & W 72 St
3,40.737262,-73.99239,E 16 St & 5 Ave,E 16 St & 5 Ave
4,40.740057,-74.005274,Hudson St & W 13 St,Hudson St & W 13 St


In [28]:
# Merge df_group with df_stations on start_station_name
df_m = df_group.merge(df_stations, how = 'outer', on = "start_station_name", indicator = 'merge_flag')

In [29]:
df_group.shape

(26088, 3)

In [30]:
df_stations.shape

(6000, 4)

In [31]:
df_m.shape

(156847, 7)

In [32]:
df_m.head()

Unnamed: 0,start_station_name,end_station_name_x,trips,lat,lng,end_station_name_y,merge_flag
0,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,1 Ave & E 110 St,both
1,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792347,-73.937864,1 Ave & E 110 St,both
2,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.79241,-73.93812,1 Ave & E 110 St,both
3,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792336,-73.937981,1 Ave & E 110 St,both
4,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792433,-73.938261,1 Ave & E 110 St,both


In [33]:
df_m['merge_flag'].value_counts(dropna = False)

merge_flag
both          156749
right_only        98
left_only          0
Name: count, dtype: int64

In [34]:
df_m = df_m[df_m['merge_flag'] =='both']

In [35]:
df_m.shape

(156749, 7)

In [36]:
df_m.head()

Unnamed: 0,start_station_name,end_station_name_x,trips,lat,lng,end_station_name_y,merge_flag
0,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,1 Ave & E 110 St,both
1,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792347,-73.937864,1 Ave & E 110 St,both
2,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.79241,-73.93812,1 Ave & E 110 St,both
3,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792336,-73.937981,1 Ave & E 110 St,both
4,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792433,-73.938261,1 Ave & E 110 St,both


In [37]:
df_m.drop(columns = {'end_station_name_y'}, inplace = True)

In [38]:
df_m.rename(columns = {'end_station_name_x' : 'end_station_name'}, inplace = True)

In [39]:
df_m.head()

Unnamed: 0,start_station_name,end_station_name,trips,lat,lng,merge_flag
0,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both
1,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792347,-73.937864,both
2,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.79241,-73.93812,both
3,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792336,-73.937981,both
4,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792433,-73.938261,both


In [40]:
# Merge df_m with df_stations on end_station_name
df_final = df_m.merge(df_stations, how = 'outer', on = 'end_station_name', indicator = 'merge_flag_2')

In [41]:
df_final['merge_flag_2'].value_counts(dropna = False)

merge_flag_2
both          1009279
right_only        106
left_only           0
Name: count, dtype: int64

In [42]:
df_final.head()

Unnamed: 0,start_station_name_x,end_station_name,trips,lat_x,lng_x,merge_flag,lat_y,lng_y,start_station_name_y,merge_flag_2
0,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both,40.792327,-73.9383,1 Ave & E 110 St,both
1,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both,40.792347,-73.937864,1 Ave & E 110 St,both
2,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both,40.79241,-73.93812,1 Ave & E 110 St,both
3,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both,40.792336,-73.937981,1 Ave & E 110 St,both
4,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,both,40.792433,-73.938261,1 Ave & E 110 St,both


In [43]:
df_final = df_final[df_final['merge_flag_2'] =='both']

In [44]:
df_final.drop(columns = {'start_station_name_y', 'merge_flag', 'merge_flag_2'}, inplace = True)

In [45]:
df_final.rename(columns = {'start_station_name_x' : 'start_station_name'}, inplace = True)

In [46]:
df_final.rename(columns = {'start_station_name_x' : 'start_station_name', 'lat_x' : 'start_lat',
                           'lng_x' : 'start_lon', 'lat_y' : 'end_lat', 'lng_y' : 'end_lon',}, inplace = True)

In [47]:
import gc # this is garbage collector - speeds up performance
gc.collect()

0

In [48]:
df_final.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lon,end_lat,end_lon
0,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,40.792327,-73.9383
1,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,40.792347,-73.937864
2,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,40.79241,-73.93812
3,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,40.792336,-73.937981
4,1 Ave & E 110 St,1 Ave & E 110 St,1.0,40.792327,-73.9383,40.792433,-73.938261


In [49]:
# Export df_final as csv
df_final.to_csv('df_final_locations_for_map.csv')

In [2]:
# Load df_final
df_final = pd.read_csv('df_final_locations_for_map.csv', index_col = 0)

## Initialize kepler.gl

In [3]:
# Create KeplerGl instance
m = KeplerGl(height = 700, data = {'data_1': df_final})
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':             start_station_name       end_station_name  trips  start_lat  \
0         â€¦

#### I changed the following settings:

#### Under layers I made the start points and end points visible and green because that follows the color scheme of my other plots. Also under layers, I made the start to end arcs visible and set them to be green at the source and yellow at the target in order to identify the starting and ending stations on a path.

#### Under filters I applied a trips filter and set it to show paths with a minimum of 5 trips. This was in order to make the map more readable, since there were too many overlapping paths to clearly see any trends or draw any information from the map.


#### Also noteworthy:

#### My first impression was that Manhattan is clearly the busiest area of the city for the bikes. The data suggests that some of the longer distance trips are between stations located close to the water. This body of water is Newark Bay, and there also appear to be several parks along the water side. The increased amount of trips in this area could be due to it being a more scenic spot to ride a bike, attracting not just commuters but also sight-seers. Also noteworthy are the stations that do not have an arc connecting to them, indicating that trips both started and ended at the same location. Several of these are on the edges of parks, for example, which likely indicates the bikes were used to ride around the park.

In [4]:
# Create a config object and save your map with it
config = m.config

In [5]:
import json
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

In [6]:
m.save_to_html(file_name = 'CitiBike Trips Aggregated.html', read_only = False, config = config)

Map saved to CitiBike Trips Aggregated.html!
