In [32]:
# Importing Libraries
import pandas as pd
import glob ##For reading multiple files
import numpy as np
import dask.dataframe as dd


In [75]:
folder_path = 'NY-citibike-tripdata/*.csv' ##Path to the folder containing the data files
csv_files = glob.glob(folder_path) ##Reading all the files in the folder
csv_files = csv_files[:6] ##Reading only the first :5 files
data_df = pd.concat(
    [pd.read_csv(file, dtype={5: str, 7: str,}) for file in csv_files]
)  ##Concatenating all the files to form a


In [76]:
data_df.count()

ride_id               11889944
rideable_type         11889944
started_at            11889944
ended_at              11889944
start_station_name    11880688
start_station_id      11880688
end_station_name      11860275
end_station_id        11859022
start_lat             11889944
start_lng             11889944
end_lat               11886416
end_lng               11886416
member_casual         11889944
dtype: int64

###Data Exploration & Pre-Processing


In [77]:
data_df.columns = data_df.columns.str.upper() ##Converting column names to uppercase


In [78]:

data_df.isnull().sum() ##Checking for missing values
data_df = data_df.dropna() ##Dropping missing values
data_df = data_df.drop_duplicates() ##Dropping duplicates


In [79]:
##Renaming columns
data_df = data_df.rename(columns={'RIDEABLE_TYPE': 'BIKE_TYPE', 'START_STATION_NAME': 'START_STATION',
                                  'END_STATION_NAME': 'END_STATION','STARTED_AT':'START_DT','ENDED_AT':'END_DT'}) 
data_df["END_DT"] = pd.to_datetime(data_df["END_DT"], format="ISO8601")
data_df["START_DT"] = pd.to_datetime(data_df["START_DT"], format="ISO8601")

In [80]:

# Created function to calculate distance using the Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    r = 3958.8  # Radius of Earth in miles
    return r * c

# Apply the function to each row in the DataFrame
data_df['DISTANCE_MILES'] = data_df.apply(
    lambda row: haversine(row['START_LAT'], row['START_LNG'],
                          row['END_LAT'], row['END_LNG']), axis=1
)

data_df['DISTANCE_MILES'] = data_df['DISTANCE_MILES'].round(2) ##Rounding distance to 2 decimal places

In [81]:
# Calculating trips total duration in  mins & seconds
total_seconds = (data_df['END_DT'] - data_df['START_DT']).dt.total_seconds()
data_df['DURATION_MM_SS'] = (total_seconds // 60).astype(int).astype(str).str.zfill(2) + ':' + \
                            (total_seconds % 60).astype(int).astype(str).str.zfill(2)

# Finding AVG MPH
duration_hours = total_seconds / 3600
# Estimate distance travelled for round trips -- 8.5 mph is the average speed for a city bike
#data_df = data_df.reset_index(drop=True)

# data_df.loc[
#     (data_df["START_STATION"] == data_df["END_STATION"]),
#     "DISTANCE_MILES",
# ] = (
#     duration_hours * 8.5
# )
# Calculate average speed
data_df['AVG_MPH'] = data_df['DISTANCE_MILES'] / duration_hours



In [82]:
data_df[data_df['START_STATION'] == data_df['END_STATION']] ##Checking for round trips  

Unnamed: 0,RIDE_ID,BIKE_TYPE,START_DT,END_DT,START_STATION,START_STATION_ID,END_STATION,END_STATION_ID,START_LAT,START_LNG,END_LAT,END_LNG,MEMBER_CASUAL,DISTANCE_MILES,DURATION_MM_SS,AVG_MPH
790,40A08DF64E1C4FA4,classic_bike,2024-01-17 22:36:12.807,2024-01-17 22:37:17.680,Lawrence St & Willoughby St,4596.09,Lawrence St & Willoughby St,4596.09,40.692362,-73.986317,40.692362,-73.986317,member,0.00,01:04,0.000000
791,1E629C3BC68C6CFB,electric_bike,2024-01-03 18:55:54.634,2024-01-03 19:01:02.918,Palmetto St & Traffic Ave,5129.01,Palmetto St & Traffic Ave,5129.01,40.709284,-73.894472,40.709290,-73.894410,member,0.00,05:08,0.000000
792,3D87E18E0C79C07F,electric_bike,2024-01-23 16:46:19.796,2024-01-23 16:47:23.699,E 31 St & 3 Ave,6239.08,E 31 St & 3 Ave,6239.08,40.743996,-73.979786,40.743943,-73.979661,member,0.01,01:03,0.563354
793,A90B6796E56D04B7,classic_bike,2024-01-01 13:54:38.608,2024-01-01 15:25:55.380,Grand Ave & 55 St,5369.01,Grand Ave & 55 St,5369.01,40.718880,-73.914270,40.718880,-73.914270,casual,0.00,91:16,0.000000
794,5BD9FC0C5CDEC318,classic_bike,2024-01-31 08:30:57.963,2024-01-31 08:57:50.231,79 St & Woodside Ave,6174.03,79 St & Woodside Ave,6174.03,40.743640,-73.886220,40.743640,-73.886220,casual,0.00,26:52,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986409,E5B98CFC2EC57F4D,electric_bike,2024-06-04 13:09:14.200,2024-06-04 13:17:04.864,E 77 St & 3 Ave,7092.06,E 77 St & 3 Ave,7092.06,40.772950,-73.958724,40.773142,-73.958562,member,0.02,07:50,0.152975
986410,A6484BD7E5DDEC7A,classic_bike,2024-06-08 16:22:45.312,2024-06-08 19:48:39.801,Vesey Pl & River Terrace,5297.02,Vesey Pl & River Terrace,5297.02,40.715338,-74.016584,40.715338,-74.016584,casual,0.00,205:54,0.000000
986411,85F999C756B20290,classic_bike,2024-06-07 15:24:27.862,2024-06-07 15:41:16.970,Broadway & W 58 St,6948.10,Broadway & W 58 St,6948.10,40.766953,-73.981693,40.766953,-73.981693,member,0.00,16:49,0.000000
986412,028404BD57A0283A,classic_bike,2024-06-13 12:09:11.623,2024-06-13 12:22:53.261,Lexington Ave & Classon Ave,4452.03,Lexington Ave & Classon Ave,4452.03,40.686768,-73.959282,40.686768,-73.959282,member,0.00,13:41,0.000000


In [40]:
#data_df['DAY_OF_WEEK'] = data_df['START_DT'].dt.day_name() ##Extracting day of the week 

In [83]:
#Creating a function to get the time of the day
def time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning(5am-12pm)'
    elif 12 <= hour < 17:
        return 'Afternoon(12pm-5pm)'
    elif 17 <= hour < 21:
        return 'Evening(5pm-9pm)'
    else:
        return 'Night(9pm-5am)'
data_df['TIME_OF_DAY'] = data_df['START_DT'].dt.hour.apply(time_of_day) ##Applying the function to get the time of the day

In [84]:
data_df['START_DT'] = data_df['START_DT'].dt.strftime('%Y-%m-%d') ##Converting start date to string
data_df['END_DT'] = data_df['END_DT'].dt.strftime('%Y-%m-%d') ##Converting end date to string
data_df = data_df.drop(
    ["START_STATION_ID", "END_STATION_ID", "END_LAT", "END_LNG"], axis=1
)  ##Dropping columns
#Formating the bike type for visualization
data_df["BIKE_TYPE"] = data_df["BIKE_TYPE"].str[:-5]

data_df

Unnamed: 0,RIDE_ID,BIKE_TYPE,START_DT,END_DT,START_STATION,END_STATION,START_LAT,START_LNG,MEMBER_CASUAL,DISTANCE_MILES,DURATION_MM_SS,AVG_MPH,TIME_OF_DAY
0,5078F3D302000BD2,electric,2024-01-22,2024-01-22,Frederick Douglass Blvd & W 145 St,St Nicholas Ave & W 126 St,40.823072,-73.941738,member,0.96,04:51,11.847951,Evening(5pm-9pm)
1,814337105D37302A,electric,2024-01-11,2024-01-11,W 54 St & 6 Ave,E 74 St & 1 Ave,40.761822,-73.977036,member,1.26,28:17,2.672502,Evening(5pm-9pm)
2,A33A920E2B10710C,electric,2024-01-30,2024-01-30,E 11 St & Ave B,W 10 St & Washington St,40.727592,-73.979751,casual,1.56,15:08,6.183905,Evening(5pm-9pm)
3,A3A5FC0DD7D34D74,electric,2024-01-27,2024-01-27,W 54 St & 6 Ave,E 74 St & 1 Ave,40.761779,-73.977144,member,1.27,10:59,6.933008,Morning(5am-12pm)
4,6F96728ECEFBDAA4,electric,2024-01-16,2024-01-16,Madison Ave & E 99 St,E 74 St & 1 Ave,40.789808,-73.952214,member,1.45,13:45,6.326077,Afternoon(12pm-5pm)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,FA32293146DB0F75,electric,2024-06-11,2024-06-11,Somers St & Rockaway Ave,Greenpoint Ave & West St,40.678737,-73.911099,casual,4.33,28:23,9.149305,Evening(5pm-9pm)
999996,7FE88C2FF7C428FB,electric,2024-06-03,2024-06-03,E 97 St & Madison Ave,E 85 St & 3 Ave,40.787961,-73.953355,casual,0.69,07:16,5.687295,Afternoon(12pm-5pm)
999997,2BC0191D8D596C7E,electric,2024-06-02,2024-06-02,Garfield Pl & 8 Ave,Dahill Rd & 12 Ave,40.671201,-73.974907,casual,1.77,13:08,8.085638,Afternoon(12pm-5pm)
999998,6901EBC452881D3E,electric,2024-06-13,2024-06-13,2 Ave & E 29 St,E 30 St & Park Ave S,40.741917,-73.978125,casual,0.31,04:19,4.303231,Afternoon(12pm-5pm)


In [43]:
print(data_df['DISTANCE_MILES'].sum()) ##Total distance travelled
data_df[(data_df['START_DT'] != data_df['END_DT']) & (data_df['DISTANCE_MILES'] == 0)] ##Checking for trips that started and ended on different days


0.0


Unnamed: 0,RIDE_ID,BIKE_TYPE,START_DT,END_DT,START_STATION,END_STATION,START_LAT,START_LNG,MEMBER_CASUAL,UNNAMED: 0,RIDEABLE_TYPE_DUPLICATE_COLUMN_NAME_1,DISTANCE_MILES,DURATION_MM_SS,AVG_MPH,TIME_OF_DAY


In [44]:
# aggregating the data
# aggregated_df = data_df.groupby(['BIKE_TYPE', 'START_DT', 'START_STATION', 'MEMBER_CASUAL', 'TIME_OF_DAY']).agg({
#     'RIDE_ID': 'count',
#     'DISTANCE_MILES': 'sum',
#     'DURATION_MM_SS': 'sum',
#     'AVG_MPH': 'mean'
# }).reset_index()

# aggregated_df.columns = ['BIKE_TYPE', 'START_DT', 'START_STATION', 'MEMBER_CASUAL', 'TIME_OF_DAY', 'RIDE_COUNT', 'TOTAL_DISTANCE', 'TOTAL_DURATION', 'AVG_MPH']

# aggregated_df


In [88]:
# data_df_sample = data_df#.sample(n=900000, random_state=42)
# data_df_sample.to_csv('citibike_data.csv', index=False) ##Saving the sampled data to a csv file

chunk_size = 1000000  # Adjust as needed
for i, start in enumerate(range(0, data_df.shape[0], chunk_size)):
    chunk = data_df.iloc[start : start + chunk_size]
    chunk.to_csv(f"citibike_data{i}.csv", index=False)