This is the story of a cycling company, its needs, and how to meet them.

## Libraries Installed
- pandas
- numpy
- seaborn
- matplotlib
- geopandas
- shapely

In [None]:
!pip install pandas numpy seaborn matplotlib geopandas shapely

### Downloading data

In [None]:
!wget https://divvy-tripdata.s3.amazonaws.com/202307-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202308-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202309-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202310-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202311-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202312-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202401-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202402-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202403-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202404-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202405-divvy-tripdata.zip
!wget https://divvy-tripdata.s3.amazonaws.com/202406-divvy-tripdata.zip

In [None]:
!unzip 202307-divvy-tripdata.zip
!unzip 202308-divvy-tripdata.zip
!unzip 202309-divvy-tripdata.zip
!unzip 202310-divvy-tripdata.zip
!unzip 202311-divvy-tripdata.zip
!unzip 202312-divvy-tripdata.zip
!unzip 202401-divvy-tripdata.zip
!unzip 202402-divvy-tripdata.zip
!unzip 202403-divvy-tripdata.zip
!unzip 202404-divvy-tripdata.zip
!unzip 202405-divvy-tripdata.zip
!unzip 202406-divvy-tripdata.zip

Now we can start the actual analysis.

## Data Loading and Cleanup

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read each extracted monthly CSV into its own DataFrame (one name per month),
# in chronological order from July 2023 through June 2024.
(
    trip_data_202307,
    trip_data_202308,
    trip_data_202309,
    trip_data_202310,
    trip_data_202311,
    trip_data_202312,
    trip_data_202401,
    trip_data_202402,
    trip_data_202403,
    trip_data_202404,
    trip_data_202405,
    trip_data_202406,
) = map(
    pd.read_csv,
    (
        '202307-divvy-tripdata.csv',
        '202308-divvy-tripdata.csv',
        '202309-divvy-tripdata.csv',
        '202310-divvy-tripdata.csv',
        '202311-divvy-tripdata.csv',
        '202312-divvy-tripdata.csv',
        '202401-divvy-tripdata.csv',
        '202402-divvy-tripdata.csv',
        '202403-divvy-tripdata.csv',
        '202404-divvy-tripdata.csv',
        '202405-divvy-tripdata.csv',
        '202406-divvy-tripdata.csv',
    ),
)

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd


def calculate_distances_geopandas(start_lat, start_long, end_lat, end_long):
    """
    Calculates the great-circle distance, in kilometers, between pairs of
    start and end coordinates given in decimal degrees (WGS84 lat/lon).

    The distance is computed with the haversine formula, vectorized with NumPy.
    A planar ``GeoSeries.distance`` on EPSG:4326 geometries would return values
    in *degrees*, not meters or kilometers, so it is not used here. The function
    name is kept unchanged for backward compatibility with existing callers.

    Args:
        start_lat (pd.Series): A Series containing start latitudes.
        start_long (pd.Series): A Series containing start longitudes.
        end_lat (pd.Series): A Series containing end latitudes.
        end_long (pd.Series): A Series containing end longitudes.

    Returns:
        pd.Series: Distances in kilometers between corresponding start-end
        coordinate pairs. The result carries the index of ``start_lat`` when it
        is a Series (so it aligns on assignment back to the source frame), and a
        default 0..n-1 index otherwise. Pairs with a missing coordinate yield NaN.

    Raises:
        ValueError: If the four inputs do not all have the same length.
    """
    # Mean Earth radius in kilometers, used to scale the central angle.
    earth_radius_km = 6371.0088

    # Input validation (same length for all series)
    if len({len(start_lat), len(start_long), len(end_lat), len(end_long)}) != 1:
        raise ValueError("All input Series must have the same length.")

    # Remember the caller's index so the result lines up with the input rows.
    result_index = start_lat.index if isinstance(start_lat, pd.Series) else None

    # Work on plain float arrays in radians; positional pairing, no index alignment.
    lat1 = np.radians(np.asarray(start_lat, dtype="float64"))
    lon1 = np.radians(np.asarray(start_long, dtype="float64"))
    lat2 = np.radians(np.asarray(end_lat, dtype="float64"))
    lon2 = np.radians(np.asarray(end_long, dtype="float64"))

    # Haversine formula.
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2

    # Clip guards against rounding pushing the value marginally outside [0, 1];
    # NaN inputs pass through unchanged and stay NaN in the result.
    central_angle = 2.0 * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    return pd.Series(earth_radius_km * central_angle, index=result_index)

# Stack the twelve monthly extracts into a single frame. ignore_index=True builds
# a fresh 0..n-1 index directly, so the old per-file row numbers are not carried
# over as a spurious "index" column (which reset_index() without drop=True adds).
df = pd.concat(
    [trip_data_202307, trip_data_202308, trip_data_202309, trip_data_202310, trip_data_202311, trip_data_202312,
     trip_data_202401, trip_data_202402, trip_data_202403, trip_data_202404, trip_data_202405, trip_data_202406],
    ignore_index=True,
)

# NOTE: rows with missing values are deliberately kept at this stage. The previous
# bare `df.dropna()` discarded its result and therefore never removed anything;
# null handling is done explicitly in the clean-up step further down.

# Parse timestamps; unparseable values become NaT instead of raising.
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce')
df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce')

# Set explicit dtypes: a categorical for the rider type, nullable strings for ids/names.
df = df.astype({
    'member_casual': 'category',
    'ride_id': 'string',
    'rideable_type': 'string',
    'start_station_name': 'string',
    'start_station_id': 'string',
    'end_station_name': 'string',
    'end_station_id': 'string',
})

# Ride duration as a timedelta (NaT when either timestamp is missing).
df['ride_length'] = df['ended_at'] - df['started_at']

# Distance between the start and end coordinates of each ride, as returned by
# calculate_distances_geopandas.
df['distance'] = calculate_distances_geopandas(
    df['start_lat'],
    df['start_lng'],
    df['end_lat'],
    df['end_lng']
)

# df.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()
print(df.describe())
df.head()

In [None]:
# Drop the per-month frames now that their rows live in the combined `df`,
# so the kernel does not keep two copies of the data referenced.
del (
    trip_data_202307,
    trip_data_202308,
    trip_data_202309,
    trip_data_202310,
    trip_data_202311,
    trip_data_202312,
    trip_data_202401,
    trip_data_202402,
    trip_data_202403,
    trip_data_202404,
    trip_data_202405,
    trip_data_202406,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of non-null values in each column of the combined frame.
df.count()

As we can see, there is an anomalous set of data points that throws our scatterplot into disarray. Let's find it and clean it up. First, we check whether these distances belong to rides that have no end (that is, rides whose end time or end station is null).

After that, we will finish by filtering the remaining outliers out of our data using the interquartile range (IQR).

In [None]:
# Keep only rides with a complete, usable record. All conditions are combined
# into one mask over `df`, so every filter actually applies. (Filtering `df`
# afresh on each line would discard the previous filters and keep only the last.)
station_columns = ['start_station_name', 'start_station_id', 'end_station_name', 'end_station_id']
is_valid_ride = (
    df['ended_at'].notnull()
    & df[station_columns].notnull().all(axis=1)
    & df['ride_length'].notnull()
    # ride_length is a timedelta column, so zero is expressed as a Timedelta
    # rather than the integer 0 or the string '00:00:00'.
    & (df['ride_length'] != pd.Timedelta(0))
    & (df['distance'] != 0)
)

# Build a new frame instead of mutating a slice of `df` in place.
df_filtered = df[is_valid_ride].drop_duplicates()

# Non-null counts after the validity filters (printed explicitly because only the
# last expression of a cell is displayed automatically).
print(df_filtered.count())

# Calculate the IQR
Q1 = df_filtered['distance'].quantile(0.25)
Q3 = df_filtered['distance'].quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers; between() is inclusive on both ends, and rows with a
# missing distance compare False and are dropped.
df_filtered = df_filtered[df_filtered['distance'].between(lower_bound, upper_bound)]

df_filtered.count()

Now let's see what our plot looks like after the cleanup.