In [None]:
import os
import pandas as pd
import geopandas
import geodatasets
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
import folium

In [None]:
# Load the processed station-level files for the three most recent months pulled
# (April, May and June 2023). The first CSV column is used as the row index.
april_df, may_df, june_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'processed_data/PROCESSED_202304-citibike-tripdata.csv',
        'processed_data/PROCESSED_202305-citibike-tripdata.csv',
        'processed_data/PROCESSED_202306-citibike-tripdata.csv',
    )
)

After processing the data, I found that some of the stations with the lowest ride counts were in New Jersey, even though New Jersey has its own separate data files. I decided to filter out the New Jersey stations (those whose station id starts with JC or HB).

In [None]:
# Remove New Jersey stations because of possible incomplete information.
# New Jersey station ids start with 'JC' or 'HB', so the pattern is anchored to the
# start of the id; an unanchored match would also drop any id that merely contains
# those letters somewhere in the middle.
nj_id_pattern = r'^(?:JC|HB)'

# na=False treats missing / non-string ids as "not New Jersey" so the mask is
# strictly boolean and can be negated safely with ~.
april_df = april_df[~april_df['station_id'].str.contains(nj_id_pattern, na=False)]
may_df = may_df[~may_df['station_id'].str.contains(nj_id_pattern, na=False)]
june_df = june_df[~june_df['station_id'].str.contains(nj_id_pattern, na=False)]

To get a sense of the ride counts stations had in a month, I wanted to see, for each month, where the stations fell between the minimum and maximum ride counts. So I used a histogram with 20 bins to get a broad idea of the distribution of the stations' ride counts.

In [None]:
# Histograms of how many stations fall into each ride-count range, one panel per
# month for the last 3 months of Citibike data.
fig, (ax1, ax2, ax3) = plt.subplots(3,
                                    1,
                                    figsize=(10,18))

fig.suptitle('Majority of Citibike Stations had under 1,000 Rides',
             fontsize = 20,
             fontweight = 'bold',
             y = 0.92)

# One entry per panel: (axes, that month's ride counts, panel title, bar colour).
monthly_panels = [
    (ax1, april_df['ride_count'], 'April 2023', 'lightskyblue'),
    (ax2, may_df['ride_count'], 'May 2023', 'steelblue'),
    (ax3, june_df['ride_count'], 'June 2023', 'navy'),
]

for ax, ride_counts, month_title, bar_color in monthly_panels:
    # ax.hist returns (counts, bin edges, bar container); only the bars are
    # needed afterwards, to label each bar with its station count.
    counts, bin_edges, bars = ax.hist(ride_counts,
                                      bins = 20,
                                      color = bar_color,
                                      ec = 'gray')
    ax.bar_label(bars, fontsize = 12)

    ax.set_title(month_title, fontsize = 16)
    ax.set_xlabel('Number of Rides', fontsize = 14)
    ax.set_ylabel('Count of Stations',
                  fontsize = 14,
                  rotation = 0,
                  ha = 'right')

    # Tick every 2,000 rides, relative to this month's maximum. The +1 keeps the
    # final tick when the maximum is an exact multiple of 2,000, and int() lets
    # range() accept the maximum even if the column was read as floats.
    ax.set_xticks(range(0, int(ride_counts.max()) + 1, 2000))

In [None]:
# Sort each month ascending by ride_count so the 50 lowest-ride stations can be
# selected later with a simple positional slice.
# Reassigning (rather than inplace=True) keeps each frame's lineage explicit, and
# kind='stable' keeps stations with equal ride counts in their existing order, so
# which tied stations land inside the 50-row cutoff does not depend on the sort
# algorithm's internals.
april_df = april_df.sort_values('ride_count', kind='stable')
may_df = may_df.sort_values('ride_count', kind='stable')
june_df = june_df.sort_values('ride_count', kind='stable')

In [None]:
# Create histograms of the stations with the fewest rides in each month.

# Number of lowest-ride stations shown per month; used for both the title and
# the selection so the two cannot drift apart.
n_lowest = 50

# Set up 3 subplots, one for each month.
fig, (ax1, ax2, ax3) = plt.subplots(3,
                                    1,
                                    figsize=(10,18))

fig.suptitle(f"Ride Count of the {n_lowest} Stations with the Month's Lowest Ride Count",
             fontsize = 20,
             fontweight = 'bold',
             y = 0.92)

# One entry per panel: (axes, that month's ride counts, panel title, bar colour).
monthly_panels = [
    (ax1, april_df['ride_count'], 'April 2023', 'lightskyblue'),
    (ax2, may_df['ride_count'], 'May 2023', 'steelblue'),
    (ax3, june_df['ride_count'], 'June 2023', 'navy'),
]

for ax, ride_counts, month_title, bar_color in monthly_panels:
    # nsmallest picks the lowest values regardless of the frame's current row
    # order, so this cell gives the right answer even if the frames have not
    # been sorted beforehand.
    lowest_counts = ride_counts.nsmallest(n_lowest)

    # ax.hist returns (counts, bin edges, bar container); only the bars are
    # needed afterwards, to label each bar with its station count.
    counts, bin_edges, bars = ax.hist(lowest_counts,
                                      color = bar_color,
                                      ec = 'gray')
    ax.bar_label(bars, fontsize = 12)

    ax.set_title(month_title, fontsize = 16)
    ax.set_xlabel('Number of Rides', fontsize = 14)
    ax.set_ylabel('Count of Stations',
                  fontsize = 14,
                  rotation = 0,
                  ha = 'right')

After drilling down to the lowest 50 stations each month, I now want to map them.

In [None]:
# Besides the per-month selections, build a frame of the stations that appear among
# the first 50 rows of every month (the frames are sorted ascending by ride_count
# in an earlier cell, so these are each month's lowest-ride stations).
june_lowest = june_df[:50]
may_lowest_ids = may_df['station_id'][:50]
april_lowest_ids = april_df['station_id'][:50]

# Successive inner joins on station_id keep only ids present in all three
# selections; every other column comes from the June rows.
merged_df = (
    june_lowest
    .merge(right=may_lowest_ids, how='inner', on='station_id')
    .merge(right=april_lowest_ids, how='inner', on='station_id')
)

In [None]:
# Build a boundary polygon for the map from the locations of all June stations.
june_gdf = gpd.GeoDataFrame(june_df,
                            geometry=gpd.points_from_xy(june_df['lng'], june_df['lat']),
                            crs="EPSG:4326")

# Join all station points into a single geometry. GeoPandas 1.0 deprecated
# `unary_union` in favour of `union_all()`, so use the new method when it exists
# and fall back to the old attribute on earlier releases.
station_points = june_gdf.geometry
if hasattr(station_points, 'union_all'):
    all_stations_geom = station_points.union_all()
else:
    all_stations_geom = station_points.unary_union

# The convex hull of the joined points is the outline drawn on the map.
polygon_geom = all_stations_geom.convex_hull

# Wrap the hull in a one-row GeoDataFrame.
polygon = gpd.GeoDataFrame(index=[0], crs='epsg:4326', geometry=[polygon_geom])

In [None]:
# Set map to New York City.
m = folium.Map(location=(40.730610, -73.935242), zoom_start = 11)

# Create one FeatureGroup per layer so LayerControl can toggle them separately.
fg_april = folium.FeatureGroup(name='April')
fg_may = folium.FeatureGroup(name='May')
fg_june = folium.FeatureGroup(name='June')
fg_merged = folium.FeatureGroup(name='Merged')
fg_polygon = folium.FeatureGroup(name='Boundaries')

# One entry per marker layer:
#   (feature group, stations to plot, marker colour, include ride count in tooltip?)
# The monthly layers take the first 50 rows by position (.iloc), which are the
# lowest-ride stations because the frames were sorted ascending by ride_count.
# The merged layer shows only the station id in its tooltip.
marker_layers = [
    (fg_april, april_df.iloc[:50], 'lightblue', True),
    (fg_may, may_df.iloc[:50], 'blue', True),
    (fg_june, june_df.iloc[:50], 'darkblue', True),
    (fg_merged, merged_df, 'purple', False),
]

for layer, stations, icon_color, show_ride_count in marker_layers:
    for _idx, row in stations.iterrows():
        tooltip = 'Station ID: ' + str(row['station_id'])
        if show_ride_count:
            tooltip += '. Ride Count: ' + str(row['ride_count'])
        folium.Marker(location = [row['lat'], row['lng']],
                      tooltip = tooltip,
                      icon = folium.Icon(color=icon_color)).add_to(layer)
    layer.add_to(m)

# Add the station boundary polygon as its own layer.
folium.GeoJson(data=polygon).add_to(fg_polygon)
fg_polygon.add_to(m)

# Add LayerControl to toggle between the dataframes that are now separate layers.
folium.LayerControl(position='topright').add_to(m)

# Show the map (last expression in the cell so the notebook renders it).
m