In [2]:
import pandas as pd
import os

In [3]:
# Build the three CSV locations with os.path.join so the separator matches the host OS
bog_path, mex_path, equ_path = (
    os.path.join("dataset", csv_name)
    for csv_name in ("bog_clean.csv", "mex_clean.csv", "uio_clean.csv")
)

In [4]:
# Load each city's CSV into its own DataFrame (same order as the paths: bog, mex, equ)
bog_df, mex_df, equ_df = (
    pd.read_csv(csv_path) for csv_path in (bog_path, mex_path, equ_path)
)

In [5]:
# Peek at the last three rows of the Bogotá frame
bog_df.tail(n=3)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_meters,wait_sec
3060,3061,Bogotá,2016-10-27 12:28:22,2016-10-27 01:16:58,-74.061556,4.709213,-74.042396,4.708566,N,2917,10994,1401
3061,3062,Bogotá,2016-10-27 07:40:49,2016-10-27 09:08:09,-74.050934,4.752078,-74.050875,4.752123,N,5240,15803,3076
3062,3063,Bogotá,2016-10-26 04:27:39,2016-10-28 06:50:28,-74.052223,4.705252,-74.050725,4.714622,N,138169,5934,354


In [6]:
# Peek at the last three rows of the Mexico frame
mex_df.tail(n=3)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_meters,wait_sec
12691,12692,México DF Taxi Libre,2016-10-27 11:19:44,2016-10-27 11:38:35,-99.170637,19.283637,-99.178194,19.280982,N,1131,6051,228
12692,12693,México DF Taxi de Sitio,2016-10-28 06:49:41,2016-10-28 06:51:25,-99.194384,19.396768,-99.194622,19.396717,N,104,49,96
12693,12694,México DF Radio Taxi,2016-10-27 10:26:38,2016-10-28 07:10:21,-99.180135,19.369919,-99.180551,19.372276,N,31424,3448,217


In [7]:
# Peek at the last three rows of the frame loaded from uio_clean.csv
equ_df.tail(n=3)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_meters,wait_sec
32363,32364,Quito,2016-10-27 12:10:18,2016-10-27 06:33:53,-78.477247,-0.107514,-78.490093,-0.100859,N,66216,23607,958
32364,32365,Quito,2016-10-25 04:58:55,2016-10-25 05:00:25,-78.550264,-0.25673,-78.550306,-0.256756,N,91,43,70
32365,32366,Quito,2016-10-28 06:47:59,2016-10-28 06:58:31,-78.431986,-0.341538,-78.446296,-0.327428,N,633,3296,150


In [8]:
# Report (rows, columns) of each per-city frame, one tuple per line
print(bog_df.shape, mex_df.shape, equ_df.shape, sep="\n")

(3063, 12)
(12694, 12)
(32366, 12)


In [9]:
# Count missing values per column; this is repeated for each city's frame
bog_df.isna().sum(axis="index")

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dist_meters           0
wait_sec              0
dtype: int64

In [10]:
# Count missing values per column in the Mexico frame
mex_df.isna().sum(axis="index")

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dist_meters           0
wait_sec              0
dtype: int64

In [11]:
# Count missing values per column in the frame loaded from uio_clean.csv
equ_df.isna().sum(axis="index")

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dist_meters           0
wait_sec              0
dtype: int64

In [12]:
# The per-city null counts are all zero, so the frames can be stacked as they are.
# Concatenate the Bogotá, Mexico and Quito (Ecuador) frames under a fresh 0..n-1 index,
# then order the rows by `id`.
# NOTE: the tail() previews show each file's ids ending at that file's own row count,
# so `id` values repeat across cities. The default sort kind (quicksort) is not stable
# and would leave rows sharing an id in arbitrary order; mergesort is stable, so tied
# ids keep their concat order (bog, mex, equ) and the result is reproducible.
dataset_df = pd.concat([bog_df, mex_df, equ_df], ignore_index=True)
dataset_df = dataset_df.sort_values(by=["id"], kind="mergesort")
print(dataset_df.shape)

(48123, 12)


In [13]:
# Inspect the dtype and non-null count of every column in the combined frame
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48123 entries, 0 to 48122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  48123 non-null  int64  
 1   vendor_id           48123 non-null  object 
 2   pickup_datetime     48123 non-null  object 
 3   dropoff_datetime    48123 non-null  object 
 4   pickup_longitude    48123 non-null  float64
 5   pickup_latitude     48123 non-null  float64
 6   dropoff_longitude   48123 non-null  float64
 7   dropoff_latitude    48123 non-null  float64
 8   store_and_fwd_flag  48123 non-null  object 
 9   trip_duration       48123 non-null  int64  
 10  dist_meters         48123 non-null  int64  
 11  wait_sec            48123 non-null  int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 4.8+ MB


In [14]:
# List the distinct values in store_and_fwd_flag to see whether the column varies at all
dataset_df.loc[:, "store_and_fwd_flag"].unique()

array(['N'], dtype=object)

In [15]:
# store_and_fwd_flag holds a single unique value, so it carries no information; remove it
dataset_df = dataset_df.drop("store_and_fwd_flag", axis="columns")

In [16]:
# Discard fully duplicated rows (first occurrence is kept) and report the remaining shape
dataset_df = dataset_df.drop_duplicates(keep="first")
print(dataset_df.shape)

(48123, 11)
