In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date, time
import pickle

In [21]:
df_chicago = pd.read_csv('../dataset/chicago_2018.csv') 

df_chicago

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2018-04-01 00:04:44,2018-04-01 00:13:03,22,171,May St & Taylor St,May St & Cullerton St,3819,Subscriber
1,2018-04-01 00:06:42,2018-04-01 00:27:07,157,190,Lake Shore Dr & Wellington Ave,Southport Ave & Wrightwood Ave,5000,Subscriber
2,2018-04-01 00:07:19,2018-04-01 00:23:19,106,106,State St & Pearson St,State St & Pearson St,5165,Customer
3,2018-04-01 00:07:33,2018-04-01 00:14:47,241,171,Morgan St & Polk St,May St & Cullerton St,3851,Subscriber
4,2018-04-01 00:10:23,2018-04-01 00:22:12,228,219,Damen Ave & Melrose Ave,Damen Ave & Cortland St,5065,Subscriber
...,...,...,...,...,...,...,...,...
3603077,2018-03-31 23:46:34,2018-04-01 00:05:24,158,260,Milwaukee Ave & Wabansia Ave,Kedzie Ave & Milwaukee Ave,1935,Subscriber
3603078,2018-03-31 23:47:43,2018-03-31 23:52:05,299,229,Halsted St & Roscoe St,Southport Ave & Roscoe St,5852,Subscriber
3603079,2018-03-31 23:50:18,2018-03-31 23:57:38,327,226,Sheffield Ave & Webster Ave,Racine Ave & Belmont Ave,4414,Subscriber
3603080,2018-03-31 23:52:26,2018-04-01 00:07:13,265,426,Cottage Grove Ave & Oakwood Blvd,Ellis Ave & 60th St,6448,Subscriber


In [22]:
df_chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603082 entries, 0 to 3603081
Data columns (total 8 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   start_time          object
 1   end_time            object
 2   start_station_id    int64 
 3   end_station_id      int64 
 4   start_station_name  object
 5   end_station_name    object
 6   bike_id             int64 
 7   user_type           object
dtypes: int64(3), object(5)
memory usage: 219.9+ MB


In [23]:
#Check number of rows
Lenght = len(df_chicago)
print("The dataset includes " + str(Lenght) + " entries.")

The dataset includes 3603082 entries.


In [25]:
#Check for duplicates
df_chicago[df_chicago.duplicated(keep=False)]

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type


In [26]:
#Drop duplicates and keep the last
df_chicago= df_chicago.drop_duplicates(keep='last')

In [27]:
#Check for null values
df_chicago.isnull().sum()

start_time            0
end_time              0
start_station_id      0
end_station_id        0
start_station_name    0
end_station_name      0
bike_id               0
user_type             0
dtype: int64

In [28]:
#Define conditions if start_time is smaller than end_time
conditions = [df_chicago['end_time'] < df_chicago['start_time'],
              df_chicago['start_time'] < df_chicago['end_time'],
              df_chicago['start_time'] == df_chicago['end_time']]

#Define choices
choices=["Delete","Keep", "Delete"]

#Create new column in DataFrame that displays results of comparisons
df_chicago['result'] = np.select(conditions, choices, default='Tie')

#view result
df_chicago.head(10)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,result
0,2018-04-01 00:04:44,2018-04-01 00:13:03,22,171,May St & Taylor St,May St & Cullerton St,3819,Subscriber,Keep
1,2018-04-01 00:06:42,2018-04-01 00:27:07,157,190,Lake Shore Dr & Wellington Ave,Southport Ave & Wrightwood Ave,5000,Subscriber,Keep
2,2018-04-01 00:07:19,2018-04-01 00:23:19,106,106,State St & Pearson St,State St & Pearson St,5165,Customer,Keep
3,2018-04-01 00:07:33,2018-04-01 00:14:47,241,171,Morgan St & Polk St,May St & Cullerton St,3851,Subscriber,Keep
4,2018-04-01 00:10:23,2018-04-01 00:22:12,228,219,Damen Ave & Melrose Ave,Damen Ave & Cortland St,5065,Subscriber,Keep
5,2018-04-01 00:11:29,2018-04-01 00:22:28,244,325,Ravenswood Ave & Irving Park Rd,Clark St & Winnemac Ave (Temp),5962,Subscriber,Keep
6,2018-04-01 00:15:49,2018-04-01 00:19:47,128,130,Damen Ave & Chicago Ave,Damen Ave & Division St,4570,Subscriber,Keep
7,2018-04-01 00:17:00,2018-04-01 00:22:53,130,69,Damen Ave & Division St,Damen Ave & Pierce Ave,1323,Subscriber,Keep
8,2018-04-01 00:18:24,2018-04-01 00:23:06,130,69,Damen Ave & Division St,Damen Ave & Pierce Ave,1977,Subscriber,Keep
9,2018-04-01 00:20:00,2018-04-01 00:26:22,121,351,Blackstone Ave & Hyde Park Blvd,Cottage Grove Ave & 51st St,2602,Subscriber,Keep


In [29]:
#Check if every station_name is assigned to only one station_id and vice versa

df_id_time = df_chicago.sort_values(by=['start_time','bike_id'], ascending=True)

df_name_unique = df_id_time.groupby('start_station_name')['start_station_id'].nunique().reset_index()

df_id_unique = df_id_time.groupby('start_station_id')['start_station_name'].nunique().reset_index()

print(df_name_unique)
print(df_id_unique)

               start_station_name  start_station_id
0             2112 W Peterson Ave                 1
1                   63rd St Beach                 1
2               900 W Harrison St                 1
3      Aberdeen St & Jackson Blvd                 1
4         Aberdeen St & Monroe St                 1
..                            ...               ...
617           Wood St & Taylor St                 1
618        Woodlawn Ave & 55th St                 1
619        Woodlawn Ave & 75th St                 1
620  Woodlawn Ave & Lake Park Ave                 1
621          Yates Blvd & 75th St                 1

[622 rows x 2 columns]
     start_station_id  start_station_name
0                   2                   1
1                   3                   1
2                   4                   1
3                   5                   1
4                   6                   1
..                ...                 ...
616               660                   1
617             

In [35]:
#Check maximum number of station_ids assigned to one station_name
print(df_id_unique.start_station_name.max())

2


In [36]:
#Check maximum number of station_names assigned to one station_id
print(df_name_unique.start_station_id.max())

1


In [38]:
#Format the columns from datatype object to datatype datetime
df_chicago['start_time'] = pd.to_datetime(df_chicago['start_time'])
df_chicago['end_time'] = pd.to_datetime(df_chicago['end_time'])

In [39]:
#Calculate the duration time of each trip
df_chicago['duration_time_trip'] = df_chicago['end_time'] - df_chicago['start_time']
df_chicago.head(5)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,result,duration_time_trip
0,2018-04-01 00:04:44,2018-04-01 00:13:03,22,171,May St & Taylor St,May St & Cullerton St,3819,Subscriber,Keep,0 days 00:08:19
1,2018-04-01 00:06:42,2018-04-01 00:27:07,157,190,Lake Shore Dr & Wellington Ave,Southport Ave & Wrightwood Ave,5000,Subscriber,Keep,0 days 00:20:25
2,2018-04-01 00:07:19,2018-04-01 00:23:19,106,106,State St & Pearson St,State St & Pearson St,5165,Customer,Keep,0 days 00:16:00
3,2018-04-01 00:07:33,2018-04-01 00:14:47,241,171,Morgan St & Polk St,May St & Cullerton St,3851,Subscriber,Keep,0 days 00:07:14
4,2018-04-01 00:10:23,2018-04-01 00:22:12,228,219,Damen Ave & Melrose Ave,Damen Ave & Cortland St,5065,Subscriber,Keep,0 days 00:11:49
