In [1]:
import pandas as pd
import os
import numpy as np

# Data Gathering

In [2]:
os.chdir("../Raw Data")

In [None]:
df = pd.concat([pd.read_csv("202301-divvy-tripdata.csv"), pd.read_csv("202302-divvy-tripdata.csv"), pd.read_csv("202303-divvy-tripdata.csv"),
               pd.read_csv("202304-divvy-tripdata.csv"), pd.read_csv("202305-divvy-tripdata.csv"), pd.read_csv("202306-divvy-tripdata.csv"),
               pd.read_csv("202307-divvy-tripdata.csv"), pd.read_csv("202308-divvy-tripdata.csv"), pd.read_csv("202309-divvy-tripdata.csv"),
               pd.read_csv("202310-divvy-tripdata.csv"), pd.read_csv("202311-divvy-tripdata.csv"), pd.read_csv("202312-divvy-tripdata.csv")])

In [None]:
df.to_csv("2023-divvy-tripdata.csv")

In [19]:
df = pd.read_csv("2023-divvy-tripdata.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5719877 entries, 0 to 5719876
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   ride_id             object 
 2   rideable_type       object 
 3   started_at          object 
 4   ended_at            object 
 5   start_station_name  object 
 6   start_station_id    object 
 7   end_station_name    object 
 8   end_station_id      object 
 9   start_lat           float64
 10  start_lng           float64
 11  end_lat             float64
 12  end_lng             float64
 13  member_casual       object 
dtypes: float64(4), int64(1), object(9)
memory usage: 610.9+ MB


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,start_lat,start_lng,end_lat,end_lng
count,5719877.0,5719877.0,5719877.0,5712887.0,5712887.0
mean,288817.6,41.90288,-87.64704,41.90322,-87.6472
std,200950.6,0.04505556,0.02733412,0.05444371,0.06919621
min,0.0,41.63,-87.94,0.0,-88.16
25%,119164.0,41.88096,-87.66,41.88103,-87.66027
50%,250568.0,41.89902,-87.64403,41.9,-87.6441
75%,439550.0,41.93,-87.62991,41.93,-87.63
max,771692.0,42.07,-87.46,42.18,0.0


In [20]:
df.drop("Unnamed: 0", axis = 1, inplace =True)

In [None]:
df[df['start_station_name'].isna()]

# Data Processing

## Cleaning 
- duplicate (ride_id)
- null/missing data (except start_station_id, and end_station_id)
## Validation
- inconsistency (ended_at < started_at)
- check latitude and longitude <br>
  $\quad$ Relevent link: Chicago Maphttps://wikimap.toolforge.org/?lang=en&page=Chicago<br>
  $\quad$ Chicago coordinate (Approximate):<br>
    $\quad$$\quad$lat_max: 42.02296 <br>
    $\quad$$\quad$lat_min: 41.64378 <br>
    $\quad$$\quad$lng_max: -87.52509 <br>
    $\quad$$\quad$lng_min: -87.94014 <br>
        
## Add Attributes
- weekday (Mon-Sun)
- Month (Jan-Dec)
- holiday (e.g.Christimas, Thanksgiving)   <br>
    relevent link: https://www.independent.co.uk/life-style/federal-holidays-2023-dates-december-b2468299.html
- duration in minute
- distance in km
- speed

In [None]:
df[df.duplicated()]
#df[df.duplicated(subset = 'ride_id')]

In [None]:
df.isna().sum()

In [9]:
#929202/5719877
6990/5719877

0.001222054250467274

- Based on above process, there is no duplicate in the dataset.
- About 16% station name and id are missing. This may be caused by user did not start/stop at the designated stations. It could happen when there is no station nearby or the bike is stole.
- 6990 pairs of end coordinates are missing. Based the distribution of end latitude and longtitude, it is in a small range with no outliers. Also, cconsidering it only takes 0.122% dataset, it can be replace with the mean of end latitude and longtitude.
- Station ids and ride id can be removed, because they won't contribute to our analysis.
- Station name should be saved to do further analysis, rename them based on the coordinate.(lat_lng)

In [21]:
#Calculate the means of the columns having missing values
end_lat_mean, end_lng_mean = (df['end_lat'].mean(), df['end_lng'].mean()) 

# Replace NaNs in column end_lat and end_lng with means values
df['end_lat'].fillna(value = end_lat_mean, inplace = True)
df['end_lng'].fillna(value = end_lng_mean, inplace = True)

In [22]:
df.drop(["ride_id","end_station_id", "start_station_id"], axis = 1, inplace = True)

######################### Station Name (Unsaved Change) ################################

In [None]:
df1 = df[df['start_station_name'].isna() | df['end_station_name'].isna()]
#df1['end_station_name'].fillna(value = df1['end_lat'].astype(str) + df1['end_lng'].astype(str), inplace = True)
df1['start_station_name'] = df1['start_station_name'].where(df1['start_station_name'].isna() == False, df1['start_lat'].astype(str)+df['start_lng'].astype(str))
df1['end_station_name'] = df1['end_station_name'].where(df1['end_station_name'].isna() == False, df1['end_lat'].astype(str)+df['end_lng'].astype(str))

## Validation
- inconsistency (ended_at < started_at)
- check latitude and longitude <br>
  $\quad$ Relevent link: Chicago Map https://wikimap.toolforge.org/?lang=en&page=Chicago<br>
  $\quad$ Chicago coordinate (Approximate):<br>
    $\quad$$\quad$lat_max: 42.02296 <br>
    $\quad$$\quad$lat_min: 41.64378 <br>
    $\quad$$\quad$lng_max: -87.52509 <br>
    $\quad$$\quad$lng_min: -87.94014 <br>

In order to validate the start and end time, converting data type from object to Datetime is required.<br>
Considering this analysis only focus on Chicago area, only records of trips that occurred in Chicago should be selected. It will be achieved by using extreme coordinate of Chicago.

In [23]:
# Convert timestamps to Datetime format
df['started_at'] = pd.to_datetime(df['started_at'], format='%Y-%m-%d %H:%M:%S')
df['ended_at'] = pd.to_datetime(df['ended_at'], format='%Y-%m-%d %H:%M:%S')

In [24]:
# Filter out invalide rides
df = df[df['ended_at'] > df['started_at']] #272

In [25]:
df[(df['end_lng']>-87.52) | (df['end_lng']<-87.94014)] #OH Charging Stx - Test & Stony Island Ave & 63rd S

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
843779,electric_bike,2023-04-13 17:09:46,2023-04-13 19:39:45,Tripp Ave & 65th St,,41.77,-87.73,41.66,-88.11,casual
2149865,electric_bike,2023-06-15 12:38:05,2023-06-15 12:38:41,OH Charging Stx - Test,OH Charging Stx - Test,41.863166,-87.679811,0.0,0.0,member
2159176,classic_bike,2023-06-15 09:38:07,2023-06-15 09:42:57,State St & 54th St,OH Charging Stx - Test,41.796642,-87.625923,0.0,0.0,casual
2295693,electric_bike,2023-06-09 18:24:36,2023-06-09 19:01:28,Hegewisch Metra Station,,41.648576,-87.54624,41.69,-87.5,member
2298463,electric_bike,2023-06-09 18:24:26,2023-06-09 19:01:47,Hegewisch Metra Station,,41.648572,-87.546237,41.69,-87.5,casual
2298992,electric_bike,2023-06-15 01:33:07,2023-06-15 03:12:36,State St & Van Buren St,,41.877269,-87.627921,42.07,-88.16,casual
2361075,electric_bike,2023-06-15 16:05:52,2023-06-15 18:08:48,,,41.93,-87.74,41.95,-87.98,casual
3003685,electric_bike,2023-07-30 14:24:32,2023-07-30 14:55:02,Burnham Greenway & 112th St,,41.691728,-87.530582,41.66,-87.51,casual
3168381,electric_bike,2023-08-28 20:40:40,2023-08-28 22:27:09,Indiana Ave & 133rd St,,41.653742,-87.617125,41.65,-87.44,casual
3168387,electric_bike,2023-08-20 14:17:16,2023-08-20 15:25:33,Indiana Ave & 133rd St,,41.65373,-87.61709,41.7,-87.51,casual


In [27]:
# Reset index and print dataframe preview
df.reset_index(drop=True, inplace = True)

In [30]:
df[(df['end_station_name'] == 'OH Charging Stx - Test') | (df['start_station_name'] == 'OH Charging Stx - Test')]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
2019264,electric_bike,2023-06-28 15:32:50,2023-06-28 15:33:07,OH Charging Stx - Test,OH - BONFIRE - TESTING,41.863619,-87.679668,41.863324,-87.679788,member
2148399,electric_bike,2023-06-29 14:29:06,2023-06-29 14:29:13,OH Charging Stx - Test,OH Charging Stx - Test,41.863616,-87.679722,41.86257,-87.679935,member
2148400,electric_bike,2023-06-29 14:41:13,2023-06-29 14:41:19,OH Charging Stx - Test,OH Charging Stx - Test,41.86339,-87.679775,41.86257,-87.679935,member
2148401,classic_bike,2023-06-29 14:36:06,2023-06-29 14:36:13,OH Charging Stx - Test,OH Charging Stx - Test,41.86257,-87.679935,41.86257,-87.679935,member
2149355,classic_bike,2023-06-28 15:44:00,2023-06-28 15:44:06,OH Charging Stx - Test,OH Charging Stx - Test,41.86257,-87.679935,41.86257,-87.679935,member
2149356,classic_bike,2023-06-28 10:56:35,2023-06-28 10:56:40,OH Charging Stx - Test,OH Charging Stx - Test,41.86257,-87.679935,41.86257,-87.679935,member
2149472,classic_bike,2023-06-28 15:43:40,2023-06-28 15:43:44,OH Charging Stx - Test,OH Charging Stx - Test,41.86257,-87.679935,41.86257,-87.679935,member
2149473,electric_bike,2023-06-28 15:32:11,2023-06-28 15:32:27,OH Charging Stx - Test,OH Charging Stx - Test,41.863619,-87.679668,41.86257,-87.679935,member
2149474,electric_bike,2023-06-28 15:34:27,2023-06-28 15:34:33,OH Charging Stx - Test,OH Charging Stx - Test,41.863511,-87.679753,41.86257,-87.679935,member
2149475,electric_bike,2023-06-28 15:38:05,2023-06-28 15:38:13,OH Charging Stx - Test,OH Charging Stx - Test,41.86348,-87.67987,41.86257,-87.679935,member


In [33]:
# Manually correct coordinate at "OH Charging Stx - Test" station

df.iloc[2149693, 7] = 41.86257
df.iloc[2149865, 8] = -87.679935

df.iloc[2159004, 7] = 41.86257
df.iloc[2159004, 8] = -87.679935

In [37]:
df[((df['end_station_name'] == 'Stony Island Ave & 63rd St') | (df['start_station_name'] == 'Stony Island Ave & 63rd St')) & ((df['start_lat'] == 0) | (df['start_lng'] == 0)| (df['end_lat'] == 0) | (df['end_lng'] == 0))]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
3413193,classic_bike,2023-08-21 18:43:22,2023-08-21 22:05:55,Dearborn St & Erie St,Stony Island Ave & 63rd St,41.893992,-87.629318,0.0,0.0,casual


In [39]:
# Manually correct coordinate at "Stony Island Ave & 63rd St" station

df.iloc[3413193, 7] = 41.780506
df.iloc[3413193, 8] = -87.586853

In [50]:
# Filter out rides with out-of-Chicago coordinate
# 555 lines removed

df = df[(df['start_lat'] < 42.064955 ) & (df['start_lat'] > 41.64378)] #225
df = df[(df['end_lat'] < 42.064955 ) & (df['end_lat'] > 41.64378)] #328
df = df[(df['start_lng'] < -87.52) & (df['start_lng'] > -87.94014)] #8
df = df[(df['end_lng'] < -87.52) & (df['end_lng'] > -87.94014)] #34

During checking data quality, we removed total **1826** lines of invalid data, including 272 invalid rides (*end time is ahead of start time*), and 1554 rides that are out of Chicago.<br>
While filtering ride out of the range, there are 3 records contains station name, but no coordinates. This may be caused by unstable Internet or other technical issue. By searching station name in the data set, it has been manually updated.

## Add Attributes
- duration in minute
- distance in km
- speed
- weekday (Mon-Sun)
- Month (Jan-Dec)
- (Optional) holiday (e.g.Christimas, Thanksgiving)   <br>
    relevent link: https://www.independent.co.uk/life-style/federal-holidays-2023-dates-december-b2468299.html


In [53]:
# Calculate ride durations
df['duration'] = df['ended_at'] - df['started_at']

# Convert ride durations into minutes
df['duration_m'] = df['duration'].dt.total_seconds() / 60

# Convert ride durations into hours
df['duration_h'] = df['duration'].dt.total_seconds() / 3600

In [None]:
conda install -c conda-forge pyproj

In [56]:
# Calculate distance in m
from pyproj import Geod

def get_distance(start_lat, start_lng, end_lat, end_lng):
    g = Geod(ellps='WGS84')
    # 2D distance in meters with longitude, latitude of the points
    azimuth1, azimuth2, distance_2d = g.inv(start_lat, start_lng, end_lat, end_lng)
    return distance_2d

In [57]:
df['distance'] = get_distance(df['start_lat'].tolist(), df['start_lng'].tolist(), df['end_lat'].tolist(), df['end_lng'].tolist())

In [58]:
# Convert distance unit from m to km

df['distance'] = df['distance'] / 1000

In [59]:
# Extract day of the week for each ride from the 'started_at_datetime' value (Monday = 0, Tuesday = 1, etc.)
df['day_of_week_num'] = df['started_at'].dt.dayofweek

# Extract date (start) for each ride
df['day_of_month'] = df['started_at'].dt.day

# Extract hour (start) for each ride
df['start_hour'] = df['started_at'].dt.hour

# Map week days values
week_days = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday", 4:"Friday", 5:"Saturday", 6:"Sunday"}
df['day_of_week'] = df['day_of_week_num'].map(week_days)

In [60]:
# Calculate speed
df['ride_speed'] = df['distance'] / df['duration_h']

In [65]:
df

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration,duration_m,duration_h,distance,day_of_week_num,day_of_month,start_hour,day_of_week,ride_speed
0,electric_bike,2023-01-21 20:05:42,2023-01-21 20:16:33,Lincoln Ave & Fullerton Ave,Hampden Ct & Diversey Ave,41.924074,-87.646278,41.930000,-87.640000,member,0 days 00:10:51,10.850000,0.180833,0.701773,5,21,20,Saturday,3.880775
1,classic_bike,2023-01-10 15:37:36,2023-01-10 15:46:05,Kimbark Ave & 53rd St,Greenwood Ave & 47th St,41.799568,-87.594747,41.809835,-87.599383,member,0 days 00:08:29,8.483333,0.141389,0.520031,1,10,15,Tuesday,3.678022
2,electric_bike,2023-01-02 07:51:57,2023-01-02 08:05:11,Western Ave & Lunt Ave,Valli Produce - Evanston Plaza,42.008571,-87.690483,42.039742,-87.699413,casual,0 days 00:13:14,13.233333,0.220556,1.007211,0,2,7,Monday,4.566699
3,classic_bike,2023-01-22 10:52:58,2023-01-22 11:01:44,Kimbark Ave & 53rd St,Greenwood Ave & 47th St,41.799568,-87.594747,41.809835,-87.599383,member,0 days 00:08:46,8.766667,0.146111,0.520031,6,22,10,Sunday,3.559151
4,classic_bike,2023-01-12 13:58:01,2023-01-12 14:13:20,Kimbark Ave & 53rd St,Greenwood Ave & 47th St,41.799568,-87.594747,41.809835,-87.599383,member,0 days 00:15:19,15.316667,0.255278,0.520031,3,12,13,Thursday,2.037120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718048,electric_bike,2023-12-07 13:15:24,2023-12-07 13:17:37,900 W Harrison St,Racine Ave & Congress Pkwy,41.874702,-87.649804,41.874640,-87.657030,casual,0 days 00:02:13,2.216667,0.036944,0.807106,3,7,13,Thursday,21.846470
5718049,classic_bike,2023-12-08 18:42:21,2023-12-08 18:45:56,900 W Harrison St,Racine Ave & Congress Pkwy,41.874754,-87.649807,41.874640,-87.657030,casual,0 days 00:03:35,3.583333,0.059722,0.806752,4,8,18,Friday,13.508407
5718050,classic_bike,2023-12-05 14:09:11,2023-12-05 14:13:01,900 W Harrison St,Racine Ave & Congress Pkwy,41.874754,-87.649807,41.874640,-87.657030,member,0 days 00:03:50,3.833333,0.063889,0.806752,1,5,14,Tuesday,12.627424
5718051,electric_bike,2023-12-02 21:36:07,2023-12-02 21:53:45,Damen Ave & Madison St,Morgan St & Lake St*,41.881396,-87.674984,41.885492,-87.652289,casual,0 days 00:17:38,17.633333,0.293889,2.534872,5,2,21,Saturday,8.625273


In [62]:
# Reset index and print dataframe preview
df.reset_index(drop=True, inplace = True)

In [63]:
df.describe()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,duration,duration_m,duration_h,distance,day_of_week_num,day_of_month,start_hour,ride_speed
count,5718053.0,5718053.0,5718053.0,5718053.0,5718053,5718053.0,5718053.0,5718053.0,5718053.0,5718053.0,5718053.0,5718053.0
mean,41.90287,-87.64703,41.90323,-87.64724,0 days 00:18:11.319629076,18.18866,0.3031443,1.435817,3.030483,15.53902,14.09414,8.28718
std,0.04502873,0.0273293,0.0451314,0.02743402,0 days 03:00:14.269288494,180.2378,3.003964,1.50967,1.950702,8.738677,4.941571,15.07495
min,41.6485,-87.94,41.6485,-87.94,0 days 00:00:01,0.01666667,0.0002777778,0.0,0.0,1.0,0.0,0.0
25%,41.88096,-87.66,41.88103,-87.66027,0 days 00:05:25,5.416667,0.09027778,0.3794326,1.0,8.0,11.0,2.22762
50%,41.89901,-87.64403,41.9,-87.6441,0 days 00:09:32,9.533333,0.1588889,1.055408,3.0,15.0,15.0,6.555815
75%,41.93,-87.62991,41.93,-87.63,0 days 00:16:55,16.91667,0.2819444,1.996474,5.0,23.0,18.0,12.22231
max,42.06488,-87.52823,42.06485,-87.52823,68 days 09:29:04,98489.07,1641.484,32.39102,6.0,31.0,23.0,3497.313


In [69]:
df[(df['distance'] == 0)|(df['ride_speed'] == 0)]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration,duration_m,duration_h,distance,day_of_week_num,day_of_month,start_hour,day_of_week,ride_speed
39,electric_bike,2023-01-19 19:17:46,2023-01-19 19:18:14,Hampden Ct & Diversey Ave,Hampden Ct & Diversey Ave,41.930000,-87.640000,41.930000,-87.640000,casual,0 days 00:00:28,0.466667,0.007778,0.0,3,19,19,Thursday,0.0
40,electric_bike,2023-01-26 12:36:49,2023-01-26 12:37:37,Hampden Ct & Diversey Ave,Hampden Ct & Diversey Ave,41.930000,-87.640000,41.930000,-87.640000,member,0 days 00:00:48,0.800000,0.013333,0.0,3,26,12,Thursday,0.0
56,classic_bike,2023-01-19 17:42:02,2023-01-19 17:42:06,Sacramento Blvd & Franklin Blvd,Sacramento Blvd & Franklin Blvd,41.890469,-87.702608,41.890469,-87.702608,member,0 days 00:00:04,0.066667,0.001111,0.0,3,19,17,Thursday,0.0
57,electric_bike,2023-01-02 13:49:25,2023-01-02 13:49:41,Hampden Ct & Diversey Ave,Hampden Ct & Diversey Ave,41.930000,-87.640000,41.930000,-87.640000,casual,0 days 00:00:16,0.266667,0.004444,0.0,0,2,13,Monday,0.0
58,electric_bike,2023-01-17 07:48:48,2023-01-17 07:49:00,Pulaski Rd & 51st St,Pulaski Rd & 51st St,41.800000,-87.720000,41.800000,-87.720000,casual,0 days 00:00:12,0.200000,0.003333,0.0,1,17,7,Tuesday,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5715308,classic_bike,2023-12-10 16:22:23,2023-12-10 16:22:25,Clark St & Lincoln Ave,Clark St & Lincoln Ave,41.915689,-87.634600,41.915689,-87.634600,member,0 days 00:00:02,0.033333,0.000556,0.0,6,10,16,Sunday,0.0
5715309,classic_bike,2023-12-09 13:35:27,2023-12-09 13:47:04,Fairbanks Ct & Grand Ave,Fairbanks Ct & Grand Ave,41.891847,-87.620580,41.891847,-87.620580,member,0 days 00:11:37,11.616667,0.193611,0.0,5,9,13,Saturday,0.0
5715312,classic_bike,2023-12-27 12:44:15,2023-12-27 12:47:27,Fairbanks Ct & Grand Ave,Fairbanks Ct & Grand Ave,41.891847,-87.620580,41.891847,-87.620580,member,0 days 00:03:12,3.200000,0.053333,0.0,2,27,12,Wednesday,0.0
5715653,classic_bike,2023-12-16 19:26:50,2023-12-16 19:28:22,Fairbanks Ct & Grand Ave,Fairbanks Ct & Grand Ave,41.891847,-87.620580,41.891847,-87.620580,member,0 days 00:01:32,1.533333,0.025556,0.0,5,16,19,Saturday,0.0


In [75]:
#df[(df['duration_m'] < 1)  & (df['distance'] == 0)]#81954
df[(df['duration_m'] > 1)  & (df['distance'] == 0) & (df['start_station_name'] != df['end_station_name'])] #41142

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration,duration_m,duration_h,distance,day_of_week_num,day_of_month,start_hour,day_of_week,ride_speed
96,electric_bike,2023-01-19 20:09:59,2023-01-19 20:12:43,Western Ave & Grace St,Campbell Ave & Irving Park Rd,41.950000,-87.690000,41.950000,-87.690000,member,0 days 00:02:44,2.733333,0.045556,0.0,3,19,20,Thursday,0.0
124,electric_bike,2023-01-18 10:51:17,2023-01-18 10:56:13,,Pulaski Rd & 51st St,41.800000,-87.720000,41.800000,-87.720000,member,0 days 00:04:56,4.933333,0.082222,0.0,2,18,10,Wednesday,0.0
127,electric_bike,2023-01-17 13:50:40,2023-01-17 13:53:04,,Pulaski Rd & 51st St,41.800000,-87.720000,41.800000,-87.720000,casual,0 days 00:02:24,2.400000,0.040000,0.0,1,17,13,Tuesday,0.0
205,electric_bike,2023-01-12 13:24:35,2023-01-12 13:30:15,,Hampden Ct & Diversey Ave,41.930000,-87.640000,41.930000,-87.640000,member,0 days 00:05:40,5.666667,0.094444,0.0,3,12,13,Thursday,0.0
211,electric_bike,2023-01-04 14:13:10,2023-01-04 14:19:44,,Campbell Ave & Irving Park Rd,41.950000,-87.690000,41.950000,-87.690000,casual,0 days 00:06:34,6.566667,0.109444,0.0,2,4,14,Wednesday,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5612462,electric_bike,2023-12-08 15:23:51,2023-12-08 15:46:36,,,41.870000,-87.630000,41.870000,-87.630000,member,0 days 00:22:45,22.750000,0.379167,0.0,4,8,15,Friday,0.0
5612482,electric_bike,2023-12-04 17:46:36,2023-12-04 17:50:41,,,41.900000,-87.660000,41.900000,-87.660000,member,0 days 00:04:05,4.083333,0.068056,0.0,0,4,17,Monday,0.0
5612485,electric_bike,2023-12-26 16:59:44,2023-12-26 17:03:27,,,41.940000,-87.640000,41.940000,-87.640000,member,0 days 00:03:43,3.716667,0.061944,0.0,1,26,16,Tuesday,0.0
5612486,electric_bike,2023-12-25 22:41:33,2023-12-25 23:03:03,,,41.970000,-87.660000,41.970000,-87.660000,member,0 days 00:21:30,21.500000,0.358333,0.0,0,25,22,Monday,0.0


By observing the statistics, the minimum of ride speed is 0. Considering speed is determined by distance and duration in hour. It is likely caused by distance is equal to 0. This hypothesis is proven by filtering trips with same output where distance is 0 or speed is 0. (287767 rows)<br>
When we pull out the data, there are over 287k trips. Based on that, it is unlikely to be technical problems but normal case.<br>

- Senario 1 (137752 rows: Duration > 1 minute & start place = end place):</br>
    The trip has same departure and arrival location. Because of the privacy terms, there is no data about stops and detailed trip data. In senario 1, even though the speed may not accurately reflect the real story, those data are still valid.
- Senario 2 (41142 rows: Duration < 1 minute & start place = end place):</br>
    The trip has same departure and arrival location, and the duration is less than 1 minute. It is likely to be accidental check-ins or check-outs. In senario 2, those data will not contribute to the analysis; thus, removing them to avoid the distortion caused by it.
- Senario 3 (67373 rows: Duration > 1 minute & start place != end place & distance = 0):<br>
    The departure and arrival names are different, and the latitudes and longitudes are same. It is likely cause by technical issue when the station names are recorded and the coordinate are same. For most trips, the reason that the start station is different from the end station is because the name is not recorded, but the geographic coordinate is same. Same to senario 1. <br>
- Senario 4 (40812 rows: Duration < 1 minute & start place != end place & distance = 0) <br>
    Similar to above situation, the reason that the start station is different from the end station is because the name is not recorded. However, the duration is less than 1 min, it makes senario 4 same to senario 2.
    * Special case: Through further investigation, there are several pairs of similar station name inputs.(e.g. start_station_name: Wilton Ave & Diversey Pkwy* & end_station_name: Wilton Ave & Diversey Pkwy; start_station_name: Wilton Ave & Diversey Pkwy (Temp) & end_station_name: Wilton Ave & Diversey Pkwy) Because how the station name is define is unclear, let's assume different station name refers to different place, even if they are similar.<br>


To solve this inconsistency, remove rides less then 1 minute would be reasonable to remve unrealistic trips.

In [79]:
# Filter out rides less than 1 minute or over 24 hours
# 156203 rows removed

df = df[(df['duration_m'] > 1) & (df['duration_h'] < 24)] #5561850

Based on the regulation published in 2017, the speed limit for bicycle is 30 mph(approx.  48 km/h).<br>
Revelent link: https://www.chicago.gov/content/dam/city/depts/bacp/publicvehicleinfo/publicchauffer/TipsforMotorist03072017.pdf

In [80]:
df = df[df['ride_speed'] <= 48]

In [81]:
# Reset index and print dataframe preview
df.reset_index(drop = True, inplace = True)

In [None]:
# Save the processed data
df.to_csv("2023-divvy-tripdata_processed.csv")