# Citi Bike 2024 - Data Import & Merge

There is additional Citi Bike data available for 2013-2023 and 2025, but **one year (2024)** will be enough for this project.

In [1]:
# Change working directory to project root
%cd OneDrive/Documents/Portfolio_Projects/project7.1_citi_bike_analysis

C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\project7.1_citi_bike_analysis


In [3]:
# Import libraries
import pandas as pd
import glob
import os
from pathlib import Path

In [5]:
# Path to folder with necessary data (relative to script location)
folder_path = Path.cwd() / "original_data" / "2024-citibike_tripdata"
print(folder_path.exists())

True


In [7]:
# Get list of all CSV files in this folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Preview file names
print(f"Found {len(csv_files)} CSV files")

Found 50 CSV files


In [9]:
# Define fixed data types for key columns
fixed_dtypes = {
    'start_station_name': 'string',
    'end_station_name': 'string'
}

In [11]:
# Set up list for all DataFrames to reside in
df_list = []

# Loop through and import all files into df_list (all CSVs become DataFrames)
for i, csv in enumerate(csv_files):  # i = index of current file
    print(f"Loading {i+1}/{len(csv_files)}: {os.path.basename(csv)}...", end=" ", flush=True)

    df = pd.read_csv(csv, dtype=fixed_dtypes, parse_dates=['started_at', 'ended_at'], low_memory=False)

    df_list.append(df)

    print("Loaded ✅")

Loading 1/50: 202401-citibike-tripdata_1.csv... Loaded ✅
Loading 2/50: 202401-citibike-tripdata_2.csv... Loaded ✅
Loading 3/50: 202402-citibike-tripdata_1.csv... Loaded ✅
Loading 4/50: 202402-citibike-tripdata_2.csv... Loaded ✅
Loading 5/50: 202402-citibike-tripdata_3.csv... Loaded ✅
Loading 6/50: 202403-citibike-tripdata_1.csv... Loaded ✅
Loading 7/50: 202403-citibike-tripdata_2.csv... Loaded ✅
Loading 8/50: 202403-citibike-tripdata_3.csv... Loaded ✅
Loading 9/50: 202404-citibike-tripdata_1.csv... Loaded ✅
Loading 10/50: 202404-citibike-tripdata_2.csv... Loaded ✅
Loading 11/50: 202404-citibike-tripdata_3.csv... Loaded ✅
Loading 12/50: 202404-citibike-tripdata_4.csv... Loaded ✅
Loading 13/50: 202405-citibike-tripdata_1.csv... Loaded ✅
Loading 14/50: 202405-citibike-tripdata_2.csv... Loaded ✅
Loading 15/50: 202405-citibike-tripdata_3.csv... Loaded ✅
Loading 16/50: 202405-citibike-tripdata_4.csv... Loaded ✅
Loading 17/50: 202405-citibike-tripdata_5.csv... Loaded ✅
Loading 18/50: 202406-c

In [13]:
# Concatenate all DataFrames 
citibike_2024 = pd.concat(df_list, ignore_index=True)

# Preview resulting DataFrame
citibike_2024

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8E865410DBDE0CA9,electric_bike,2024-01-01 13:00:04.563,2024-01-01 13:04:04.652,3 St & 3 Ave,4028.03,Carroll St & Smith St,4225.14,40.675070,-73.987752,40.680611,-73.994758,casual
1,0403D0B3FC9CA77D,electric_bike,2024-01-08 19:36:43.520,2024-01-08 19:53:16.266,Franklin Ave & St Marks Ave,4107.05,Bedford Ave & Bergen St,4066.15,40.675832,-73.956168,40.676368,-73.952918,casual
2,F6DE7BB42FF550BE,electric_bike,2024-01-12 15:00:41.580,2024-01-12 15:36:29.622,W 67 St & Broadway,7116.04,Central Park W & W 103 St,7577.27,40.774925,-73.982666,40.795590,-73.961884,casual
3,84A995BFD98030D4,classic_bike,2024-01-12 16:52:19.025,2024-01-12 17:17:29.773,Central Park West & W 68 St,7079.06,E 5 St & Ave C,5545.04,40.773407,-73.977825,40.722992,-73.979955,member
4,7BBEAD4F2B535813,electric_bike,2024-01-05 19:50:19.202,2024-01-05 20:34:42.517,W 67 St & Broadway,7116.04,Ave A & E 14 St,5779.11,40.774925,-73.982666,40.730311,-73.980472,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44303204,B1E3F3787B411A57,classic_bike,2024-12-30 17:04:18.514,2024-12-30 17:26:29.550,8 Ave & W 31 St,6450.05,Riverside Dr & W 82 St,7388.10,40.750585,-73.994685,40.787209,-73.981281,member
44303205,218197DE046FF8FB,electric_bike,2024-12-17 17:57:37.442,2024-12-17 18:21:38.471,11 Ave & W 27 St,6425.04,E 4 St & Ave B,5515.08,40.751396,-74.005226,40.723347,-73.982659,member
44303206,51FC2F035DAF6CCE,electric_bike,2024-12-19 21:58:03.975,2024-12-19 22:04:50.053,E 55 St & 2 Ave,6650.07,E 26 St & 3 Ave,6089.11,40.757973,-73.966033,40.740693,-73.981606,member
44303207,2BFA9D9E61CF5D2C,classic_bike,2024-12-20 16:36:19.789,2024-12-20 16:44:40.298,E 55 St & 2 Ave,6650.07,E 39 St & Lexington Ave,6389.09,40.757973,-73.966033,40.749499,-73.977292,member


~44.3 million rows of data have been uploaded, so it will be necessary to filter for only **a few consecutive months from 2024** to reduce the number of rows to under 10 million for better performance in Python and Tableau.

**July and August** (the two full summer months) from **2024** will be the focus. Bike trips most likely peak during this time of year anyway.

In [15]:
# Filter for only July and August
citibike_summer_2024 = citibike_2024[citibike_2024['started_at'].dt.month.isin([7, 8])]

# Preview results
citibike_summer_2024

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
18807481,6C3563ED9BD36F2B,electric_bike,2024-07-10 09:12:44.192,2024-07-10 09:18:13.145,Front St & Jay St,4895.03,Dock 72 Way & Market St,4804.02,40.702461,-73.986842,40.699850,-73.971410,member
18807482,788C72113A42CACD,classic_bike,2024-07-12 07:35:39.714,2024-07-12 07:37:28.966,W 10 St & Washington St,5847.06,Perry St & Bleecker St,5922.07,40.733424,-74.008515,40.735354,-74.004831,member
18807483,239DEA356066DDF7,classic_bike,2024-07-04 12:59:46.344,2024-07-04 13:02:37.572,E 25 St & 2 Ave,6046.02,E 20 St & 2 Ave,5971.08,40.739126,-73.979738,40.735877,-73.982050,member
18807484,90B5EE27A7CCB271,electric_bike,2024-07-02 18:55:21.450,2024-07-02 18:59:12.023,E 3 St & 1 Ave,5553.03,Forsyth St & Grand St,5382.07,40.724677,-73.987834,40.717798,-73.993161,member
18807485,091DEE951A54DAF4,electric_bike,2024-07-14 05:25:11.691,2024-07-14 05:29:31.193,E 147 St & Bergen Ave,7840.11,E 135 St & St Ann's Ave,7687.05,40.814673,-73.918390,40.805089,-73.918889,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30465660,C1B4F5FA558B3AE9,classic_bike,2024-08-31 23:51:12.993,2024-09-01 00:05:03.788,Kenmare St & Elizabeth St,5453.06,E 11 St & 3 Ave,5788.16,40.720540,-73.994900,40.731270,-73.988490,member
30465896,427E0C643DEB455F,classic_bike,2024-08-31 23:48:41.575,2024-09-01 00:00:38.764,Wythe Ave & Metropolitan Ave,5348.02,Railroad Ave & Kay Ave,4990.02,40.716887,-73.963198,40.705148,-73.970781,member
30469377,68A04BC09519D5D0,classic_bike,2024-08-31 23:53:12.454,2024-09-01 00:17:34.536,Ashland Pl & Dekalb Ave,4513.09,Hanson Pl & Ashland Pl,4395.07,40.690065,-73.978776,40.685068,-73.977908,casual
30473637,35A09E13139B9CAA,electric_bike,2024-08-31 23:59:33.119,2024-09-01 00:34:42.935,W 87 St & West End Ave,7484.05,E Fordham Rd & Webster Ave,8582.09,40.789622,-73.977570,40.861748,-73.891050,member


There are now only ~9.3 million rows of data to work with. They will be cleaned and prepared for analysis in the next notebook.

In [17]:
# Save filtered DataFrame to CSV and Pickle
citibike_summer_2024.to_csv("citibike_summer_2024.csv", index=False)
citibike_summer_2024.to_pickle("citibike_summer_2024.pkl")  # loads much faster in Python