In [111]:
# Importing Libraries
import pandas as pd
import glob ##For reading multiple files
import numpy as np
import dask.dataframe as dd


In [112]:
# Glob pattern matching every trip-data CSV in the data folder
folder_path = 'NY-citibike-tripdata/*.csv'

# glob returns paths in arbitrary, OS-dependent order; sort them so that
# "the first N files" is the same subset on every run and every machine.
csv_files = sorted(glob.glob(folder_path))
if not csv_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found matching {folder_path!r}")

csv_files = csv_files[:3]  ##Reading only the first 3 files

# Concatenate the selected files into a single DataFrame.
# - The columns at positions 5 and 7 are forced to str so pandas does not
#   have to guess a dtype for them.
# - ignore_index=True gives the combined frame one unique RangeIndex; without
#   it every file contributes its own 0..n-1 labels and the duplicated labels
#   make later label-based (.loc) selection/assignment ambiguous.
data_df = pd.concat(
    [pd.read_csv(file, dtype={5: str, 7: str}) for file in csv_files],
    ignore_index=True,
)


### Data Exploration & Pre-Processing


In [113]:
# Normalise every column label to upper case so the rest of the notebook can
# refer to columns with one consistent spelling.
data_df.columns = [label.upper() for label in data_df.columns]


In [114]:

# Count missing values per column *before* cleaning and keep the result in a
# variable: a bare expression that is not the last line of a cell is evaluated
# and then silently thrown away, so the check would otherwise show nothing.
missing_per_column = data_df.isnull().sum()

data_df = data_df.dropna()  ##Dropping rows that contain any missing value
data_df = data_df.drop_duplicates()  ##Dropping fully duplicated rows

# Last expression of the cell -> rendered as the cell output
missing_per_column


In [115]:
# Shorter column names used throughout the rest of the notebook
column_aliases = {
    'RIDEABLE_TYPE': 'BIKE_TYPE',
    'START_STATION_NAME': 'START_STATION',
    'END_STATION_NAME': 'END_STATION',
    'STARTED_AT': 'START_DT',
    'ENDED_AT': 'END_DT',
}
data_df = data_df.rename(columns=column_aliases)

# Parse the trip end/start timestamps into datetime values so that durations
# and hour-of-day can be derived from them later.
for timestamp_col in ('END_DT', 'START_DT'):
    data_df[timestamp_col] = pd.to_datetime(data_df[timestamp_col])

In [116]:
# Great-circle distance between two coordinates using the Haversine formula
def haversine(lat1, lon1, lat2, lon2, radius=3958.8):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point, in decimal degrees.
    radius : float, default 3958.8
        Radius of the sphere. The result is expressed in the same unit;
        the default is the Earth's radius in miles.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface. Inputs are combined
        element-wise, so passing whole arrays/columns returns one distance
        per element.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it
    # marginally outside that range (e.g. near-antipodal points), which would
    # make sqrt(1 - a) return NaN. Clamp before taking the square roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

# Compute the start->end distance of every trip in a single vectorised call.
# haversine is built from NumPy ufuncs, so it accepts whole columns directly;
# a row-wise DataFrame.apply(axis=1) would instead make one Python-level call
# per row, which is needlessly slow on a large trip table.
data_df['DISTANCE_MILES'] = haversine(
    data_df['START_LAT'], data_df['START_LNG'],
    data_df['END_LAT'], data_df['END_LNG'],
).round(2)  ##Rounding distance to 2 decimal places

In [117]:
# Calculating trips total duration in mins & seconds
total_seconds = (data_df['END_DT'] - data_df['START_DT']).dt.total_seconds()

# Build a "MM:SS" label: whole minutes (not capped at 59, so long trips read
# e.g. "484:37") and the remaining seconds, each zero-padded to two digits.
minutes_part = (total_seconds // 60).astype(int).astype(str).str.zfill(2)
seconds_part = (total_seconds % 60).astype(int).astype(str).str.zfill(2)
data_df['DURATION_MM_SS'] = minutes_part + ':' + seconds_part

# Finding AVG MPH
duration_hours = total_seconds / 3600

# NOTE: round trips (same start and end station) have a start->end distance of
# ~0, so their AVG_MPH is ~0. Estimating their distance as
# duration_hours * 8.5 (8.5 mph assumed as the average city-bike speed) was
# tried previously and is intentionally left disabled.

# A trip whose end time is not after its start time has no meaningful speed:
# dividing by 0 would give inf (or NaN for 0/0) and a negative duration would
# give a negative speed. Mask those durations so AVG_MPH is NaN for them.
positive_hours = duration_hours.where(duration_hours > 0)

# Calculate average speed
data_df['AVG_MPH'] = data_df['DISTANCE_MILES'] / positive_hours



In [118]:
# Round trips: rides that end at the same station they started from
is_round_trip = data_df['START_STATION'] == data_df['END_STATION']
data_df[is_round_trip]

Unnamed: 0,RIDE_ID,BIKE_TYPE,START_DT,END_DT,START_STATION,START_STATION_ID,END_STATION,END_STATION_ID,START_LAT,START_LNG,END_LAT,END_LNG,MEMBER_CASUAL,DISTANCE_MILES,DURATION_MM_SS,AVG_MPH
42727,09DDEB87C8EF7C31,electric_bike,2023-01-26 16:00:49.969,2023-01-26 17:00:50.809,Tinton Ave & E 165 St,7991.01,Tinton Ave & E 165 St,7991.01,40.824839,-73.902387,40.824796,-73.902420,member,0.00,60:00,0.000000
43135,7C23B602A0B087F8,classic_bike,2023-01-21 20:04:31.732,2023-01-21 20:39:35.800,5 Ave & E 29 St,6248.06,5 Ave & E 29 St,6248.06,40.745168,-73.986831,40.745168,-73.986831,member,0.00,35:04,0.000000
43137,482F399CB6281EC4,electric_bike,2023-01-09 02:11:18.250,2023-01-09 02:12:50.118,Franklin St & Dupont St,5944.01,Franklin St & Dupont St,5944.01,40.735814,-73.958714,40.735640,-73.958660,member,0.01,01:31,0.391867
43138,6FE47DCBE9A450A2,electric_bike,2023-01-31 12:47:19.129,2023-01-31 13:04:07.223,5 Ave & E 29 St,6248.06,5 Ave & E 29 St,6248.06,40.745266,-73.987224,40.745168,-73.986831,member,0.02,16:48,0.071422
43143,609A7A019A791293,classic_bike,2023-01-24 21:43:52.285,2023-01-24 22:28:13.587,E 10 St & 2 Ave,5746.02,E 10 St & 2 Ave,5746.02,40.729708,-73.986598,40.729708,-73.986598,member,0.00,44:21,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998905,E14E59DA0BEAB238,electric_bike,2023-02-03 16:09:20.067,2023-02-03 16:42:05.662,Gerard Ave & McClellan St,8145.03,Gerard Ave & McClellan St,8145.03,40.833960,-73.921203,40.833909,-73.921120,member,0.01,32:45,0.018315
998911,2426D8A3993F9D12,classic_bike,2023-02-10 19:22:58.007,2023-02-10 19:31:59.916,E 7 St & Ave B,5584.05,E 7 St & Ave B,5584.05,40.725129,-73.981317,40.725129,-73.981317,member,0.00,09:01,0.000000
998912,0D8D270B5904447D,classic_bike,2023-02-10 18:34:38.995,2023-02-10 18:40:07.353,E 7 St & Ave B,5584.05,E 7 St & Ave B,5584.05,40.725129,-73.981317,40.725129,-73.981317,member,0.00,05:28,0.000000
998913,E79C718DCFFC39C4,classic_bike,2023-02-13 17:32:04.145,2023-02-13 17:33:10.264,Roebling St & N 4 St,5267.09,Roebling St & N 4 St,5267.09,40.714690,-73.957390,40.714690,-73.957390,member,0.00,01:06,0.000000


In [119]:
#data_df['DAY_OF_WEEK'] = data_df['START_DT'].dt.day_name() ##Extracting day of the week 

In [None]:
# Map an hour of the day to a labelled part of the day
def time_of_day(hour):
    """Return the part-of-day label for ``hour``.

    Each bucket includes its lower bound and excludes its upper bound:
    5-12 is morning, 12-17 afternoon, 17-21 evening. Any value that falls in
    none of these ranges is labelled as night.
    """
    buckets = (
        (5, 12, 'Morning(5am-12pm)'),
        (12, 17, 'Afternoon(12pm-5pm)'),
        (17, 21, 'Evening(5pm-9pm)'),
    )
    for lower, upper, label in buckets:
        if lower <= hour < upper:
            return label
    # Everything outside the three daytime buckets
    return 'Night(9pm-5am)'
# Label every trip with the part of the day in which it started
start_hour = data_df['START_DT'].dt.hour
data_df['TIME_OF_DAY'] = start_hour.apply(time_of_day)

In [None]:
# Reduce the start/end timestamps to 'YYYY-MM-DD' text.
# pd.to_datetime leaves datetime columns unchanged and re-parses the text form,
# so running this cell a second time does not raise on the already-converted
# string columns.
for date_col in ('START_DT', 'END_DT'):
    data_df[date_col] = pd.to_datetime(data_df[date_col]).dt.strftime('%Y-%m-%d')

# Dropping columns that are not needed any further.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
data_df = data_df.drop(
    columns=["START_STATION_ID", "END_STATION_ID", "END_LAT", "END_LNG"],
    errors="ignore",
)

# Formatting the bike type for visualization: strip only a literal trailing
# "_bike". Slicing off the last five characters would cut into any value that
# lacks that suffix, and would cut again every time the cell is re-run.
data_df["BIKE_TYPE"] = data_df["BIKE_TYPE"].str.replace(r"_bike$", "", regex=True)

data_df

In [127]:
# Total distance travelled across all trips
total_distance_miles = data_df['DISTANCE_MILES'].sum()
print(total_distance_miles)

# Trips whose start and end dates differ AND whose recorded distance is zero
spans_two_dates = data_df['START_DT'] != data_df['END_DT']
zero_distance = data_df['DISTANCE_MILES'] == 0
data_df[spans_two_dates & zero_distance]


3035179.0799999987


Unnamed: 0,RIDE_ID,BIKE_TYPE,START_DT,END_DT,START_STATION,END_STATION,START_LAT,START_LNG,MEMBER_CASUAL,DISTANCE_MILES,DURATION_MM_SS,AVG_MPH,TIME_OF_DAY
49052,28AF0B8DC3D3ED15,electric,2023-01-08,2023-01-09,S 4 St & Rodney St,S 4 St & Rodney St,40.709342,-73.956092,member,0.0,42:11,0.0,Night(9pm-5am)
49667,21343C341D999C11,classic,2023-01-10,2023-01-11,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,40.764397,-73.973715,member,0.0,484:37,0.0,Evening(5pm-9pm)
49922,1349EF783D91A58C,classic,2023-01-30,2023-01-31,W 204 St & Nagle Ave,W 204 St & Nagle Ave,40.863211,-73.920827,member,0.0,41:41,0.0,Night(9pm-5am)
50300,255E6FB6CE734D58,classic,2023-01-24,2023-01-25,5 Ave & E 78 St,5 Ave & E 78 St,40.776321,-73.964274,casual,0.0,883:27,0.0,Evening(5pm-9pm)
50752,C9555AB9B0178CBC,electric,2023-01-01,2023-01-02,Boston Rd & Prospect Ave,Boston Rd & Prospect Ave,40.833232,-73.896665,member,0.0,34:08,0.0,Night(9pm-5am)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
969375,5E2E74286FAAABBA,electric,2023-02-08,2023-02-09,Clinton St & Cherry St,Clinton St & Cherry St,40.711471,-73.986726,casual,0.0,40:47,0.0,Night(9pm-5am)
969376,35600089FB56B92C,electric,2023-02-08,2023-02-09,Clinton St & Cherry St,Clinton St & Cherry St,40.711471,-73.986726,casual,0.0,41:01,0.0,Night(9pm-5am)
978233,AB26EDE1A14DC4AD,electric,2023-02-08,2023-02-09,Clinton St & Cherry St,Clinton St & Cherry St,40.711471,-73.986726,casual,0.0,51:39,0.0,Night(9pm-5am)
978234,1B4C02FEE0692B16,electric,2023-02-08,2023-02-09,Clinton St & Cherry St,Clinton St & Cherry St,40.711471,-73.986726,casual,0.0,52:04,0.0,Night(9pm-5am)


In [128]:
# Save only the first 4000 rows (a sample, not the whole dataset) to a csv
# file, leaving the DataFrame index out of the output.
export_sample = data_df.iloc[:4000]
export_sample.to_csv('citibike_data.csv', index=False)