In [None]:
import os
import zipfile
import pandas as pd

# Input folder holding the downloaded archives and output folder for their contents.
RAW_DATA_DIR = "raw_data"
DATA_DIR = "data"

# Create the extraction target up front; a pre-existing folder is not an error.
os.makedirs(DATA_DIR, exist_ok=True)

# Extract every .zip archive found directly inside RAW_DATA_DIR into DATA_DIR.
zip_names = [name for name in os.listdir(RAW_DATA_DIR) if name.endswith(".zip")]
for zip_name in zip_names:
    archive_path = os.path.join(RAW_DATA_DIR, zip_name)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(DATA_DIR)

print("All files unzipped.")


In [None]:
# List the CSV files that sit directly inside the data folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the load order (and therefore the row order of the combined frame) reproducible.
csv_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".csv"))
csv_files

In [None]:
# Read each discovered CSV into its own frame, in the order listed in csv_files.
df_list = [pd.read_csv(os.path.join(DATA_DIR, csv_name)) for csv_name in csv_files]

# Stack all frames row-wise and renumber the index from 0.
all_trips = pd.concat(df_list, ignore_index=True)

print("Total rows:", len(all_trips))
all_trips.head()


In [None]:
# Parse both timestamp columns into datetimes.
for ts_col in ('started_at', 'ended_at'):
    all_trips[ts_col] = pd.to_datetime(all_trips[ts_col])

# Ride duration in minutes (elapsed seconds divided by 60).
trip_duration = all_trips['ended_at'] - all_trips['started_at']
all_trips['ride_length'] = trip_duration.dt.total_seconds().div(60)

# pandas numbers weekdays from Monday=0, so adding 1 gives 1=Monday ... 7=Sunday.
all_trips['day_of_week'] = all_trips['started_at'].dt.dayofweek.add(1)


In [None]:
# Keep rides with a strictly positive duration of at most 1440 minutes (24 hours).
ride_minutes = all_trips['ride_length']
has_valid_length = (ride_minutes > 0) & (ride_minutes <= 1440)
all_trips = all_trips[has_valid_length]
len(all_trips)

In [None]:
# Columns that must be present for a trip to be used in the analysis.
required_cols = [
    'ride_id',
    'started_at',
    'ended_at',
    'rideable_type',
    'member_casual',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
]

# Keep only the rows in which every required column is non-null.
is_complete = all_trips[required_cols].notna().all(axis=1)
all_trips = all_trips[is_complete]
print("After dropping rows with null critical fields:", len(all_trips))


In [None]:
# Preview the first rows of the cleaned frame.
all_trips.head()

In [None]:
import numpy as np

# Headline statistics on ride duration (minutes) and the most common weekday.
ride_minutes = all_trips['ride_length']
weekday_numbers = all_trips['day_of_week']

print("Mean ride length:", round(ride_minutes.mean(), 2))
print("Max ride length:", round(ride_minutes.max(), 2))
# mode() returns a Series of the most frequent value(s); label 0 is its first entry.
print("Mode ride length:", round(ride_minutes.mode()[0], 2))
print("Mode day of week:", weekday_numbers.mode()[0])

# Mean ride duration for each rider type.
rides_by_rider_type = all_trips.groupby("member_casual")
avg_by_user = rides_by_rider_type["ride_length"].mean()
print("\nAverage ride length by rider type:")
print(avg_by_user)

In [None]:
# Mean ride length with one row per rider type and one column per day of week.
avg_by_day_user = pd.pivot_table(
    all_trips,
    values="ride_length",
    index="member_casual",
    columns="day_of_week",
    aggfunc="mean",
)

print("\nAverage ride length by rider type AND day of week:")
print(avg_by_day_user)


In [None]:
# Total rides on each weekday, ordered by day number.
weekday_numbers = all_trips["day_of_week"]
rides_by_day = weekday_numbers.value_counts().sort_index()
print("\nNumber of rides per day of week:")
print(rides_by_day)

# The same count split by rider type within each weekday.
by_day_and_rider = all_trips.groupby(["day_of_week", "member_casual"])
rides_by_day_user = by_day_and_rider["ride_id"].count()
print("\nRides per day by user type:")
print(rides_by_day_user)


In [None]:
# Number of rides for each (rider type, bike type) combination.
by_rider_and_bike = all_trips.groupby(["member_casual", "rideable_type"])
rides_by_bike_type = by_rider_and_bike["ride_id"].count()
print(rides_by_bike_type)


In [None]:
def _mean_ride_length(group_keys):
    """Return the mean `ride_length` of `all_trips` grouped by `group_keys`."""
    return all_trips.groupby(group_keys)["ride_length"].mean()


# Mean ride length per bike type.
avg_length_by_bike = _mean_ride_length("rideable_type")
print("Average ride length by bike type:")
print(avg_length_by_bike)

# Mean ride length per rider type.
avg_length_by_rider = _mean_ride_length("member_casual")
print("\nAverage ride length by rider type:")
print(avg_length_by_rider)

# Mean ride length for every rider type / bike type combination.
avg_length_interaction = _mean_ride_length(["member_casual", "rideable_type"])
print("\nAverage ride length by rider type AND bike type:")
print(avg_length_interaction)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn after this call.
sns.set(style="whitegrid")

In [None]:
# Mean ride length per rider type, drawn as one bar per type.
avg_by_rider = all_trips.groupby("member_casual")["ride_length"].mean()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=avg_by_rider.index, y=avg_by_rider.values, ax=ax)
ax.set_title("Average Ride Length by Rider Type")
ax.set_ylabel("Ride Length (minutes)")
ax.set_xlabel("Rider Type")
plt.show()


In [None]:
# Mean ride length per bike type, drawn as one bar per type.
avg_by_bike = all_trips.groupby("rideable_type")["ride_length"].mean()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=avg_by_bike.index, y=avg_by_bike.values, ax=ax)
ax.set_title("Average Ride Length by Bike Type")
ax.set_ylabel("Ride Length (minutes)")
ax.set_xlabel("Bike Type")
plt.show()


In [None]:
# Mean ride length as a rider-type (rows) by bike-type (columns) grid.
mean_by_rider_and_bike = all_trips.groupby(["member_casual", "rideable_type"])["ride_length"].mean()
avg_interaction = mean_by_rider_and_bike.unstack()

fig, ax = plt.subplots(figsize=(8, 5))
# Annotate every cell with its value, formatted to one decimal place.
sns.heatmap(avg_interaction, annot=True, cmap="Blues", fmt=".1f", ax=ax)
ax.set_title("Average Ride Length by Rider Type AND Bike Type")
ax.set_ylabel("Rider Type")
ax.set_xlabel("Bike Type")
plt.show()


In [None]:
# Ride count for each weekday number, in ascending day order.
weekday_numbers = all_trips["day_of_week"]
rides_per_day = weekday_numbers.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x=rides_per_day.index,
    y=rides_per_day.values,
    palette="viridis",
    ax=ax,
)
ax.set_title("Number of Rides by Day of Week")
ax.set_xlabel("Day of Week (1=Monday ... 7=Sunday)")
ax.set_ylabel("Number of Rides")
plt.show()


In [None]:
# Ride counts with one row per weekday and one column per rider type.
rides_day_user = all_trips.groupby(["day_of_week", "member_casual"])["ride_id"].count().unstack()

# Create the axes explicitly and hand them to pandas. Calling DataFrame.plot()
# without `ax` makes pandas open its own figure, which left the figure created
# by plt.figure() empty and rendered as a blank extra output.
fig, ax = plt.subplots(figsize=(10, 5))
rides_day_user.plot(kind="bar", ax=ax, rot=0)  # rot=0 keeps the day labels horizontal
ax.set_title("Rides per Day of Week by Rider Type")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Number of Rides")
plt.show()


In [None]:
# Tag each trip with the calendar month (1-12) in which it started.
all_trips["month"] = all_trips["started_at"].dt.month

# Ride count per month number, in ascending month order.
rides_per_month = all_trips["month"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(
    x=rides_per_month.index,
    y=rides_per_month.values,
    marker="o",
    ax=ax,
)
ax.set_title("Monthly Ride Volume")
ax.set_xlabel("Month (1 = January)")
ax.set_ylabel("Number of Rides")
plt.show()


In [None]:
# Tag each trip with the hour of day (0-23) in which it started.
all_trips["hour"] = all_trips["started_at"].dt.hour

# Ride counts with one row per hour and one column per rider type.
hourly_usage = all_trips.groupby(["hour", "member_casual"])["ride_id"].count().unstack()

# Create the axes explicitly and hand them to pandas. Calling DataFrame.plot()
# without `ax` makes pandas open its own default-sized figure, so the 12x5 figure
# from plt.figure() stayed empty and the requested size was never applied.
fig, ax = plt.subplots(figsize=(12, 5))
hourly_usage.plot(ax=ax)
ax.set_title("Hourly Ride Distribution by Rider Type")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Ride Count")
plt.show()

In [None]:
!pip install folium


In [None]:
import folium
from folium.plugins import HeatMap


In [None]:
# Empty base map centred on Chicago, used as a visual sanity check.
chicago_center = [41.8781, -87.6298]  # [latitude, longitude]
chicago_map = folium.Map(location=chicago_center, zoom_start=12)
chicago_map


In [None]:
# [lat, lng] pairs for every trip start that has both coordinates.
start_coords = all_trips[['start_lat', 'start_lng']].dropna()
heat_data = start_coords.values.tolist()

# Heat map of all start locations on a fresh base map.
map_all = folium.Map(location=[41.8781, -87.6298], zoom_start=12)
start_heat_layer = HeatMap(heat_data, radius=6)
start_heat_layer.add_to(map_all)

map_all


In [None]:
# [lat, lng] pairs for every trip start that has both coordinates.
start_coords = all_trips[['start_lat', 'start_lng']].dropna()
heat_data = start_coords.values.tolist()

# Same heat map on the light "CartoDB positron" tiles, with tuned layer settings.
map_all = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=12,
    tiles="CartoDB positron",
)

heat_options = dict(
    radius=4,         # smaller radius = better detail
    blur=8,           # cleaner edges
    min_opacity=0.4,
    max_zoom=15,
)
HeatMap(heat_data, **heat_options).add_to(map_all)

map_all


In [None]:
from folium.plugins import MarkerCluster

# Keep only trips whose start point lies strictly inside a latitude/longitude
# bounding box around the Chicago map centre used below.
in_bbox = (
    (all_trips['start_lat'] > 41.64) &
    (all_trips['start_lat'] < 42.05) &
    (all_trips['start_lng'] > -87.94) &
    (all_trips['start_lng'] < -87.52)
)
filtered = all_trips[in_bbox]

marker_map = folium.Map(location=[41.8781, -87.6298], zoom_start=12)

marker_cluster = MarkerCluster().add_to(marker_map)

# Plot at most 20,000 randomly chosen starts. Capping n at len(filtered) avoids the
# ValueError that sample() raises when asked for more rows than exist, and the fixed
# seed makes the sample (and therefore the map) reproducible across runs.
sample_all = filtered.sample(min(20000, len(filtered)), random_state=42)

# Iterating over the two coordinate columns directly avoids building a Series
# for every row, which is what iterrows() does.
for lat, lng in zip(sample_all['start_lat'], sample_all['start_lng']):
    folium.Marker([lat, lng]).add_to(marker_cluster)

marker_map


In [None]:
# Trips inside the bounding box that were made by casual riders.
casual = filtered[filtered['member_casual'] == 'casual']
print(len(casual))

# center on Chicago
casual_map = folium.Map(location=[41.8781, -87.6298], zoom_start=12)

marker_cluster = MarkerCluster().add_to(casual_map)

# Sample up to 20,000 points for performance. Capping n at len(casual) makes
# "up to" true: sample() raises ValueError when asked for more rows than exist.
sample_casual = casual.sample(min(20000, len(casual)), random_state=42)

# Iterating over the two coordinate columns directly avoids building a Series
# for every row, which is what iterrows() does.
for lat, lng in zip(sample_casual['start_lat'], sample_casual['start_lng']):
    folium.Marker([lat, lng]).add_to(marker_cluster)

casual_map

In [None]:
from folium.plugins import MarkerCluster

# `members` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. Build it from the bounding-box-filtered trips,
# mirroring how the casual subset is selected.
# NOTE(review): assumes annual members are labelled 'member' in `member_casual` — confirm against the data.
members = filtered[filtered['member_casual'] == 'member']
print(len(members))

# center on Chicago
member_map = folium.Map(location=[41.8781, -87.6298], zoom_start=12)

marker_cluster = MarkerCluster().add_to(member_map)

# Sample up to 20,000 points for performance. Capping n at len(members) makes
# "up to" true: sample() raises ValueError when asked for more rows than exist.
sample_members = members.sample(min(20000, len(members)), random_state=42)

# Iterating over the two coordinate columns directly avoids building a Series
# for every row, which is what iterrows() does.
for lat, lng in zip(sample_members['start_lat'], sample_members['start_lng']):
    folium.Marker([lat, lng]).add_to(marker_cluster)

member_map



In [None]:
# Derive the hour of day and a weekend flag for every trip.
all_trips["hour"] = all_trips["started_at"].dt.hour
weekend_days = [6, 7]  # 6=Sat, 7=Sun
all_trips["is_weekend"] = all_trips["day_of_week"].isin(weekend_days)

# Step 1 — rides per (rider type, weekend flag, hour) on each calendar date.
hourly_keys = ["member_casual", "is_weekend", "hour"]
ride_date = all_trips["started_at"].dt.date
daily_counts = (
    all_trips
    .groupby(hourly_keys + [ride_date])["ride_id"]
    .count()
    .reset_index(name="daily_ride_count")
)

# Step 2 — average those per-date counts over all dates in each group.
# Only (date, hour) combinations that appear in daily_counts enter the mean.
avg_hourly = (
    daily_counts
    .groupby(hourly_keys)["daily_ride_count"]
    .mean()
    .reset_index()
)

# Human-readable label for the weekend flag.
day_type_names = {True: "Weekend", False: "Weekday"}
avg_hourly["day_type"] = avg_hourly["is_weekend"].map(day_type_names)

# 24-hour numbers -> 12-hour AM/PM labels (0 -> "12 AM", 12 -> "12 PM", 13 -> "1 PM").
hour_labels = {
    h: f"{h % 12 or 12} {'AM' if h < 12 else 'PM'}"
    for h in range(24)
}
avg_hourly["hour_label"] = avg_hourly["hour"].map(hour_labels)

# One line per rider type (colour) and day type (line style).
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=avg_hourly,
    x="hour_label",
    y="daily_ride_count",
    hue="member_casual",
    style="day_type",
    ax=ax,
)

ax.set_title("Average Rides per Hour (Correct Daily Average – Weekday vs Weekend)")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Average Ride Count Per Day")
plt.xticks(rotation=45)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()



In [None]:
# Mean ride length for each (rider type, weekday) pair, as a flat table for plotting.
avg_length_day = (
    all_trips
    .groupby(["member_casual", "day_of_week"])["ride_length"]
    .mean()
    .reset_index()
)

# Day numbers in calendar order (1=Monday ... 7=Sunday, from dt.weekday + 1).
weekday_order = [1, 2, 3, 4, 5, 6, 7]
weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
avg_length_day["day_of_week"] = pd.Categorical(
    avg_length_day["day_of_week"],
    categories=weekday_order,
    ordered=True,
)

# One line per rider type, with a marker at every weekday.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=avg_length_day,
    x="day_of_week",
    y="ride_length",
    hue="member_casual",
    marker="o",
    ax=ax,
)

ax.set_title("Average Ride Length by Day of Week")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Average Ride Length (Minutes)")
# Replace the numeric day ticks with weekday abbreviations.
plt.xticks(weekday_order, weekday_names)
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Show the first 20 rows, including the columns derived in earlier cells.
all_trips.head(20)