# Citibike Data Retrieval

In [11]:
# Core data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Date/time handling
from datetime import datetime

import gdown
from pathlib import Path

# Display settings: never truncate columns, show at most 100 rows
display_options = {
    "display.max_columns": None,
    "display.max_rows": 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [12]:
# Directory (relative to the notebook's working directory) that receives the
# download. The later cells read from "data/raw_data" and write into "data",
# so the folder created here must be the same one gdown downloads into;
# previously "../data" was created while the files were written to "data".
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

folder_url = "https://drive.google.com/drive/folders/1NhBgnlArKS2kISV44Cl-JOexjGWTcllE"

# Download the Google Drive folder INTO data/.
# The call is the cell's last expression, so its return value (the list of
# downloaded file paths in the recorded run) is shown as the cell output.
gdown.download_folder(
    folder_url,
    output=data_dir,
    quiet=False,
    use_cookies=False
)

Retrieving folder contents
 47%|████▋     | 90.7M/195M [01:23<01:35, 1.09MB/s]
 99%|█████████▉| 192M/195M [00:40<00:00, 4.79MB/s]


Retrieving folder 1jVOpmbqFMiJbOy9i3ysy_7Luif6UVbtH cleaned_samples
Processing file 1dQfvcvmF0Wcqgd_Q187lpRGIvCRsNkQq 202409_citibike_tripdata_cleaned_sample_20000_random.csv
Retrieving folder 17RXa6dEaXKKwsRWkCxq7fI4Z7LFTp015 raw_data
Processing file 18hvTODjbRukhryuXvBMoGlIacwRBiXqT 202409-citibike-tripdata_1.csv
Processing file 1Y_TDcB-6h8CVNm2B0Z3ieZU6Z5Xr9xl7 202409-citibike-tripdata_2.csv
Processing file 154Q2KqbGqO3sQUzoD7CyWhdDQFALQPIK 202409-citibike-tripdata_3.csv
Processing file 12wKBAUO7q1LzdOathro1zuiZz5idj5B2 202409-citibike-tripdata_4.csv
Processing file 1h6s9V-hvdRoe52vB1EUUza2psszEEkrB 202409-citibike-tripdata_5.csv
Processing file 1hAo7cQQP2rZ6nVzpUakXJSWH_de0a3QG 202409_citibike_tripdata_cleaned.csv


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1dQfvcvmF0Wcqgd_Q187lpRGIvCRsNkQq
To: /Users/erhan/Desktop/citibike/notebooks/data/cleaned_samples/202409_citibike_tripdata_cleaned_sample_20000_random.csv
100%|██████████| 4.37M/4.37M [00:00<00:00, 30.3MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=18hvTODjbRukhryuXvBMoGlIacwRBiXqT
From (redirected): https://drive.google.com/uc?id=18hvTODjbRukhryuXvBMoGlIacwRBiXqT&confirm=t&uuid=96655af7-b458-4e4a-99a2-8a38f550af54
To: /Users/erhan/Desktop/citibike/notebooks/data/raw_data/202409-citibike-tripdata_1.csv
100%|██████████| 195M/195M [00:03<00:00, 49.8MB/s] 
Downloading...
From (original): https://drive.google.com/uc?id=1Y_TDcB-6h8CVNm2B0Z3ieZU6Z5Xr9xl7
From (redirected): https://drive.google.com/uc?id=1Y_TDcB-6h8CVNm2B0Z3ieZU6Z5Xr9xl7&confirm=t&uuid=3d966939-3460-44f6-a295-7c35a8564bdc
To: /Users/erhan/Desktop/ci

['data/cleaned_samples/202409_citibike_tripdata_cleaned_sample_20000_random.csv',
 'data/raw_data/202409-citibike-tripdata_1.csv',
 'data/raw_data/202409-citibike-tripdata_2.csv',
 'data/raw_data/202409-citibike-tripdata_3.csv',
 'data/raw_data/202409-citibike-tripdata_4.csv',
 'data/raw_data/202409-citibike-tripdata_5.csv',
 'data/202409_citibike_tripdata_cleaned.csv']

In [13]:
# Folder holding the raw monthly CSV parts fetched by the download cell
folder_name = 'data/raw_data'

# os.listdir() returns entries in arbitrary order, so sort the matches to keep
# the row order of the combined frame identical from run to run.
csv_files = sorted(
    os.path.join(folder_name, f)
    for f in os.listdir(folder_name)
    if f.startswith("202409-citibike-tripdata") and f.endswith(".csv")
)

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when no matching file is present.
if not csv_files:
    raise FileNotFoundError(
        f"No '202409-citibike-tripdata*.csv' files found in {folder_name!r}; "
        "run the download cell first."
    )

# Read and concatenate.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded run emitted one warning per file on the read_csv
# line, presumably DtypeWarning (mixed types in a column) -- confirm, and
# consider passing an explicit dtype= for the affected columns.
df = pd.concat(
    [pd.read_csv(f, low_memory=False) for f in csv_files],
    ignore_index=True
)

print(df.shape)
df.head()

  [pd.read_csv(f) for f in csv_files],
  [pd.read_csv(f) for f in csv_files],
  [pd.read_csv(f) for f in csv_files],
  [pd.read_csv(f) for f in csv_files],
  [pd.read_csv(f) for f in csv_files],


(4997898, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F58AB98A78A80FED,classic_bike,2024-09-06 17:38:01.429,2024-09-06 17:49:21.529,Columbus Ave & W 59 St,6986.07,W 29 St & 9 Ave,6416.06,40.76931,-73.98464,40.750073,-73.998393,member
1,6FC53627176E4527,electric_bike,2024-09-24 01:08:57.722,2024-09-24 01:33:14.766,Columbus Ave & W 59 St,6986.07,Meserole Ave & Manhattan Ave,5666.04,40.76931,-73.98464,40.727086,-73.952991,member
2,59F909611C4B9A41,electric_bike,2024-09-14 22:32:37.247,2024-09-14 22:42:06.814,Division St & Bowery,5311.08,Pearl St & Hanover Square,4993.02,40.71419,-73.99673,40.704718,-74.00926,member
3,0825C91A785D1787,electric_bike,2024-09-08 17:35:46.831,2024-09-08 17:45:38.688,E 2 St & Ave A,5553.1,E 1 St & Bowery,5636.13,40.723077,-73.985836,40.724861,-73.992131,member
4,5E6B3ED5913ACB28,electric_bike,2024-09-14 14:51:04.508,2024-09-14 15:04:13.372,W 22 St & 10 Ave,6306.06,E 1 St & Bowery,5636.13,40.74692,-74.004519,40.724861,-73.992131,member


In [14]:
# Write the combined frame to one CSV (without the index column), creating
# the parent folder first if it does not exist yet.
combined_csv_path = "data/202409_citibike_tripdata_combined.csv"
os.makedirs(os.path.dirname(combined_csv_path), exist_ok=True)
df.to_csv(combined_csv_path, index=False)