In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import folium

# Analyze member vs non member usage for New York City Bikes
data = pd.read_csv('data/202409-citibike-tripdata_1.csv', low_memory = False)
data['started_at'] = pd.to_datetime(data['started_at'])
data['day_of_week'] = data['started_at'].dt.day_name()

# Group data using members and day of the week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data['day_of_week'] = pd.Categorical(data['day_of_week'], categories = day_order, ordered = True)
member_casual = data.groupby(['member_casual', 'day_of_week']).size().unstack().T

# Plot bar graph for member and casual usages for mon to sun
member_casual.plot(kind = 'bar')
plt.title('Member vs Non-Member Usages')
plt.xlabel('Weekday')
plt.ylabel('Number of Bike Usages')
plt.legend()
plt.show()

# Analyze weather and create linear regression model
weather_data = pd.read_csv('data/202409-noaa-nyc-weatherdata.csv')

# Match different data format to merge with bike data
weather_data['date'] = pd.to_datetime(weather_data['Date'])
data['date'] = data['started_at'].dt.date
data['date'] = pd.to_datetime(data['date'])
daily_bike_usage = data.groupby('date').size().reset_index(name = 'bike_usage')

# Merge data 
merged_data = pd.merge(daily_bike_usage, weather_data, left_on='date', right_on='date')
x = merged_data[['Average']]
y = merged_data['bike_usage']

# Train Linear Regresssion model
model = LinearRegression()
model.fit(x, y)
predictions = model.predict(x)

# Plot data and predictions from model
plt.scatter(x['Average'], y, color='blue', label='Actual')
plt.plot(x['Average'], predictions, color='red', label='Predicted')
plt.xlabel('Average Temperature')
plt.ylabel('Bike Usage')
plt.legend()
plt.show()

# Find top three routes used and save as html
def find_top_routes(bike_data):
    routes = bike_data.groupby(['start_station_name', 'end_station_name', 'start_lat', 
                       'start_lng', 'end_lat', 'end_lng']).size().reset_index(name = 'total_trips')
    top_3_routes = routes.sort_values(by = 'total_trips', ascending = False).head(3)
    print(top_3_routes)

    nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=12)
    for index, row in top_3_routes.iterrows():
        folium.PolyLine([(row['start_lat'], row['start_lng']), (row['end_lat'], row['end_lng'])],
                        color = "blue", weight = 2, opacity = 1).add_to(nyc_map)
    nyc_map.save('nyc_bike_routes.html')

find_top_routes(data)


KeyError: 'started_at'