In [1]:
# Import Libraries

import pandas as pd 
import numpy as np
import os 
import requests
import json
from datetime import datetime

In [2]:
# Define the folder path where the CityBike data files are stored

folderpath = r"CB_Data/Original Data"

In [3]:
# Create full file paths for all files inside the folder

filepaths  = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]

In [4]:
# Display the list of full file paths found in the folder

filepaths

['CB_Data/Original Data/202208-citibike-tripdata_3.csv',
 'CB_Data/Original Data/202207-citibike-tripdata_2.csv',
 'CB_Data/Original Data/202207-citibike-tripdata_3.csv',
 'CB_Data/Original Data/202208-citibike-tripdata_2.csv',
 'CB_Data/Original Data/202207-citibike-tripdata_1.csv',
 'CB_Data/Original Data/202208-citibike-tripdata_1.csv',
 'CB_Data/Original Data/202210-citibike-tripdata_1.csv',
 'CB_Data/Original Data/202207-citibike-tripdata_4.csv',
 'CB_Data/Original Data/202208-citibike-tripdata_4.csv',
 'CB_Data/Original Data/202203-citibike-tripdata_2.csv',
 'CB_Data/Original Data/202210-citibike-tripdata_2.csv',
 'CB_Data/Original Data/202203-citibike-tripdata_1.csv',
 'CB_Data/Original Data/202210-citibike-tripdata_3.csv',
 'CB_Data/Original Data/202204-citibike-tripdata_1.csv',
 'CB_Data/Original Data/202204-citibike-tripdata_2.csv',
 'CB_Data/Original Data/202204-citibike-tripdata_3.csv',
 'CB_Data/Original Data/202211-citibike-tripdata_3.csv',
 'CB_Data/Original Data/202202-

In [6]:
# Load and combine all CSV files into one dataframe (as strings)

df = pd.concat(
    [pd.read_csv(f, dtype=str) for f in filepaths],
    ignore_index=True
)

In [7]:
# Read each CSV separately into a list of dataframes

dfs = [pd.read_csv(f, dtype=str) for f in filepaths]

In [8]:
# Print number of rows in each CSV to verify consistency across files

print("Row counts per file:")
for path, d in zip(filepaths, dfs):
    print(path, "->", d.shape)

Row counts per file:
CB_Data/Original Data/202208-citibike-tripdata_3.csv -> (1000000, 13)
CB_Data/Original Data/202207-citibike-tripdata_2.csv -> (1000000, 13)
CB_Data/Original Data/202207-citibike-tripdata_3.csv -> (1000000, 13)
CB_Data/Original Data/202208-citibike-tripdata_2.csv -> (1000000, 13)
CB_Data/Original Data/202207-citibike-tripdata_1.csv -> (1000000, 13)
CB_Data/Original Data/202208-citibike-tripdata_1.csv -> (1000000, 13)
CB_Data/Original Data/202210-citibike-tripdata_1.csv -> (1000000, 13)
CB_Data/Original Data/202207-citibike-tripdata_4.csv -> (397932, 13)
CB_Data/Original Data/202208-citibike-tripdata_4.csv -> (576020, 13)
CB_Data/Original Data/202203-citibike-tripdata_2.csv -> (845965, 13)
CB_Data/Original Data/202210-citibike-tripdata_2.csv -> (1000000, 13)
CB_Data/Original Data/202203-citibike-tripdata_1.csv -> (1000000, 13)
CB_Data/Original Data/202210-citibike-tripdata_3.csv -> (936584, 13)
CB_Data/Original Data/202204-citibike-tripdata_1.csv -> (1000000, 13)
CB_

In [9]:
# Concatenate all individual dataframes into one combined dataframe

df = pd.concat(dfs, ignore_index=True)

In [10]:
# Verify final row count matches the sum of all input CSVs

print("\nFinal DF shape:", df.shape)
print("Total rows expected:", sum(len(d) for d in dfs))
print("Match:", len(df) == sum(len(d) for d in dfs))


Final DF shape: (29838806, 13)
Total rows expected: 29838806
Match: True


In [11]:
# Check that all CSV files contain the same column names

for path, d in zip(filepaths, dfs):
    print("\n", path)
    print(sorted(d.columns))


 CB_Data/Original Data/202208-citibike-tripdata_3.csv
['end_lat', 'end_lng', 'end_station_id', 'end_station_name', 'ended_at', 'member_casual', 'ride_id', 'rideable_type', 'start_lat', 'start_lng', 'start_station_id', 'start_station_name', 'started_at']

 CB_Data/Original Data/202207-citibike-tripdata_2.csv
['end_lat', 'end_lng', 'end_station_id', 'end_station_name', 'ended_at', 'member_casual', 'ride_id', 'rideable_type', 'start_lat', 'start_lng', 'start_station_id', 'start_station_name', 'started_at']

 CB_Data/Original Data/202207-citibike-tripdata_3.csv
['end_lat', 'end_lng', 'end_station_id', 'end_station_name', 'ended_at', 'member_casual', 'ride_id', 'rideable_type', 'start_lat', 'start_lng', 'start_station_id', 'start_station_name', 'started_at']

 CB_Data/Original Data/202208-citibike-tripdata_2.csv
['end_lat', 'end_lng', 'end_station_id', 'end_station_name', 'ended_at', 'member_casual', 'ride_id', 'rideable_type', 'start_lat', 'start_lng', 'start_station_id', 'start_station_n

In [12]:
# Inspect dataframe structure and preview first/last rows

df.info()
df.head()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29838806 entries, 0 to 29838805
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   ride_id             object
 1   rideable_type       object
 2   started_at          object
 3   ended_at            object
 4   start_station_name  object
 5   start_station_id    object
 6   end_station_name    object
 7   end_station_id      object
 8   start_lat           object
 9   start_lng           object
 10  end_lat             object
 11  end_lng             object
 12  member_casual       object
dtypes: object(13)
memory usage: 2.9+ GB


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
29838801,1F223EDAFF420AE3,electric_bike,2022-12-01 20:26:45.847,2022-12-01 20:30:46.012,Avenue D & E 3 St,5436.09,Stanton St & Chrystie St,5523.02,40.720701456,-73.977939487,40.72229346,-73.99147535,member
29838802,CFA5C560ACB73B8E,classic_bike,2022-12-26 13:46:34.237,2022-12-26 13:52:43.900,43 Ave & 47 St,6209.05,39 Ave & 45 St,6401.03,40.7448062877224,-73.91728967428207,40.74947822183477,-73.91826465725897,member
29838803,11C8C5E0DB947B07,classic_bike,2022-12-01 05:56:14.903,2022-12-01 06:06:10.357,Avenue D & E 3 St,5436.09,Bleecker St & Crosby St,5679.08,40.72082834,-73.97793172,40.72615604980408,-73.99510189890862,member
29838804,5B9B083C534A5964,classic_bike,2022-12-02 11:54:15.871,2022-12-02 12:01:00.747,Montague St & Clinton St,4677.06,Sands St & Jay St,4821.03,40.6942715,-73.9923272,40.700119,-73.9862,member
29838805,91C286C462F89A50,classic_bike,2022-12-18 13:35:22.574,2022-12-18 13:37:27.193,Montague St & Clinton St,4677.06,Cadman Plaza E & Tillary St,4677.01,40.6942715,-73.9923272,40.69597683,-73.99014892,member


In [13]:
# Assigning Path

path = r'/Users/elia/Desktop/New_York_City_Bike/CB_Data/Prepared Data'

In [14]:
# Export Dataframe to Csv

df.to_csv(os.path.join(path, 'citybike_test.csv'))