

## Retrieve data

First, let's get our info

In [None]:
!pip install pandas numpy seaborn matplotlib geopandas shapely

In [None]:
!wget https://divvy-tripdata.s3.amazonaws.com/202307-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202308-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202309-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202310-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202311-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202312-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202401-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202402-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202403-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202404-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202405-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202406-divvy-tripdata.zip

In [None]:
!unzip 202307-divvy-tripdata.zip
!unzip 202308-divvy-tripdata.zip
!unzip 202309-divvy-tripdata.zip
!unzip 202310-divvy-tripdata.zip
!unzip 202311-divvy-tripdata.zip
!unzip 202312-divvy-tripdata.zip
!unzip 202401-divvy-tripdata.zip
!unzip 202402-divvy-tripdata.zip
!unzip 202403-divvy-tripdata.zip
!unzip 202404-divvy-tripdata.zip
!unzip 202405-divvy-tripdata.zip
!unzip 202406-divvy-tripdata.zip

In [None]:
# Download Parameters
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import sqlite3
import zipfile
from pathlib import Path
from tqdm import tqdm

Now that we have the data downloaded, let's put it into a SQLITE db for easier manipulation.

In [None]:
trip_data_202307 = pd.read_csv('202307-divvy-tripdata.csv')
trip_data_202308 = pd.read_csv('202308-divvy-tripdata.csv')
trip_data_202309 = pd.read_csv('202309-divvy-tripdata.csv')
trip_data_202310 = pd.read_csv('202310-divvy-tripdata.csv')
trip_data_202311 = pd.read_csv('202311-divvy-tripdata.csv')
trip_data_202312 = pd.read_csv('202312-divvy-tripdata.csv')
trip_data_202401 = pd.read_csv('202401-divvy-tripdata.csv')
trip_data_202402 = pd.read_csv('202402-divvy-tripdata.csv')
trip_data_202403 = pd.read_csv('202403-divvy-tripdata.csv')
trip_data_202404 = pd.read_csv('202404-divvy-tripdata.csv')
trip_data_202405 = pd.read_csv('202405-divvy-tripdata.csv')
trip_data_202406 = pd.read_csv('202406-divvy-tripdata.csv')

Now that we have everything downloaded, we are going to read the dataframe from cyclistic table in cyclistic db.

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd


def calculate_distances_geopandas(start_lat, start_long, end_lat, end_long):
    """
    Calculates the distances between pairs of start and end coordinates using GeoPandas.

    Args:
        start_lat (pd.Series): A Series containing start latitudes.
        start_long (pd.Series): A Series containing start longitudes.
        end_lat (pd.Series): A Series containing end latitudes.
        end_long (pd.Series): A Series containing end longitudes.

    Returns:
        pd.Series: A Series containing the distances between corresponding start-end coordinate pairs in kilometers.
    """

    # Input validation (same length for all series)
    if not len({len(start_lat), len(start_long), len(end_lat), len(end_long)}) == 1:
        raise ValueError("All input Series must have the same length.")

    # Create Point geometries from start and end coordinates
    start_points = gpd.GeoSeries(
        [Point(lon, lat) for lat, lon in zip(start_lat, start_long)], crs="EPSG:4326")
    end_points = gpd.GeoSeries(
        [Point(lon, lat) for lat, lon in zip(end_lat, end_long)], crs="EPSG:4326")

    # Calculate distances (in meters) and convert to kilometers
    distance = start_points.distance(end_points, align=True)

    return distance

df = pd.concat(
    [trip_data_202307, trip_data_202308, trip_data_202309, trip_data_202310, trip_data_202311, trip_data_202312,
     trip_data_202401, trip_data_202402, trip_data_202403, trip_data_202404, trip_data_202405, trip_data_202406])

df.reset_index(inplace=True)

df.dropna()
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce')
df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce')
df['member_casual'] = df['member_casual'].astype('category')
df['ride_id'] = df['ride_id'].astype('string')
df['rideable_type'] = df['ride_id'].astype('string')
df['start_station_name'] = df['start_station_name'].astype('string')
df['start_station_id'] = df['start_station_id'].astype('string')
df['end_station_name'] = df['end_station_name'].astype('string')
df['end_station_id'] = df['end_station_id'].astype('string')

df['ride_length'] = df['ended_at'] - df['started_at']

df['distance'] = calculate_distances_geopandas(
    df['start_lat'],
    df['start_lng'],
    df['end_lat'],
    df['end_lng']
)

print(df.info())
print(df.describe())
df.head()

In [32]:
del trip_data_202307
del trip_data_202308
del trip_data_202309
del trip_data_202310
del trip_data_202311
del trip_data_202312
del trip_data_202401
del trip_data_202402
del trip_data_202403
del trip_data_202404
del trip_data_202405
del trip_data_202406

Now, let's dig in. What should we look at first? Let's look at the relationship of casual users vs member users and what type of bike they use.

In [35]:
import seaborn as sns
import matplotlib.pyplot as plt

df.count()

index                 5734381
ride_id               5734381
rideable_type         5734381
started_at            5023660
ended_at              5023660
start_station_name    4801378
start_station_id      4801378
end_station_name      4753825
end_station_id        4753825
start_lat             5734381
start_lng             5734381
end_lat               5726462
end_lng               5726462
member_casual         5734381
ride_length           5023660
distance              5726462
dtype: int64

In [None]:
# calculate the length of ride and add it to DF


In [None]:
casual_mean = df.where(df.member_casual == 'casual').ride_length.mean()
member_mean = df.where(df.member_casual == 'member').ride_length.mean()
print(f'casual mean is {casual_mean}')
print(f'member mean is {member_mean}')

delta_mean = casual_mean - member_mean
print(f'delta mean is {delta_mean}')

Our observation shows that casual users have longer ride lengths than member users.

In [None]:

print(df.info())
print(df.describe()) 
df.head()

Now to find the mean distance between casual users and member users.

In [None]:
casual_mean = df.where(df.member_casual == 'casual').distance.mean()
member_mean = df.where(df.member_casual == 'member').distance.mean()
print(f'casual distance mean is {casual_mean}')
print(f'member distance mean is {member_mean}')

delta_mean = abs(casual_mean - member_mean)
print(f'delta mean is {delta_mean}')