## Data Cleaning notebook

In [1]:
import pandas as pd
import sys, os

# Make the project's helper modules in ../utils importable.
# The path is resolved once and only added when missing, so re-running this
# cell does not pile up duplicate sys.path entries.
utils_dir = os.path.abspath('../utils')
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [2]:
from eda_helpers import EDA

In [3]:
# Point the project's EDA helper at the raw-data folder.
# NOTE(review): load_data() presumably populates eda.trips_df and
# eda.requests_df, which the next cell reads — confirm in eda_helpers.
eda = EDA('../data/raw')
eda.load_data()

In [4]:
# Short handles for the two frames held by the EDA helper.
trips_file, requests_file = eda.trips_df, eda.requests_df

In [5]:
# Preview the first rows of the trips data to check column names and value formats.
trips_file.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time
0,391996,"6.508813001668548,3.37740316890347","6.650969799999999,3.3450307",2021-07-01 07:28:04,2021-07-01 07:29:37
1,391997,"6.4316714,3.4555375","6.4280814653326,3.4721885847586",2021-07-01 06:38:04,2021-07-01 07:07:28
2,391998,"6.631679399999999,3.3388976","6.508324099999999,3.3590397",2021-07-01 06:21:02,2021-07-01 07:02:23
3,391999,"6.572757200000001,3.3677082","6.584881099999999,3.3614073",2021-07-01 07:16:07,2021-07-01 07:29:42
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36


In [6]:
# Preview the first rows of the driver-request data.
requests_file.head()

Unnamed: 0,id,order_id,driver_id,driver_action,lat,lng,created_at,updated_at
0,1,392001,243828,accepted,6.602207,3.270465,,
1,2,392001,243588,rejected,6.592097,3.287445,,
2,3,392001,243830,rejected,6.596133,3.281784,,
3,4,392001,243539,rejected,6.596142,3.280526,,
4,5,392001,171653,rejected,6.609232,3.2888,,


In [7]:
# Column dtypes and non-null counts for the trips data
# (info() prints its report itself, so no print() wrapper is needed for it).
print("Trips Data Summary:")
trips_file.info()


Trips Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536020 entries, 0 to 536019
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Trip ID           536020 non-null  int64 
 1   Trip Origin       536020 non-null  object
 2   Trip Destination  536020 non-null  object
 3   Trip Start Time   534369 non-null  object
 4   Trip End Time     536019 non-null  object
dtypes: int64(1), object(4)
memory usage: 20.4+ MB


In [8]:
# Column dtypes and non-null counts for the requests data
# (info() prints its report itself, so no print() wrapper is needed for it).
print("\nRequests Data Summary:")
requests_file.info()


Requests Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557740 entries, 0 to 1557739
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   id             1557740 non-null  int64  
 1   order_id       1557740 non-null  int64  
 2   driver_id      1557740 non-null  int64  
 3   driver_action  1557740 non-null  object 
 4   lat            1557740 non-null  float64
 5   lng            1557740 non-null  float64
 6   created_at     0 non-null        float64
 7   updated_at     0 non-null        float64
dtypes: float64(4), int64(3), object(1)
memory usage: 95.1+ MB


In [9]:
# Missing values per column in the raw trips data.
print("Trips Data Null Values:")
print(trips_file.isna().sum())

Trips Data Null Values:
Trip ID                0
Trip Origin            0
Trip Destination       0
Trip Start Time     1651
Trip End Time          1
dtype: int64


In [10]:
# Work on an independent copy so the loaded frame (trips_file) stays untouched
# while rows are removed from trips_df.
trips_df = trips_file.copy(deep=True)


In [11]:
# (rows, columns) before any cleaning, as a baseline for the drops that follow.
trips_df.shape

(536020, 5)

In [12]:
# Keep only the trips that have a recorded start time; the column itself stays.
trips_df = trips_df.dropna(subset=['Trip Start Time'])
trips_df.shape

(534369, 5)

In [13]:
# Drop the rows that have no end time.
# The result is reassigned instead of using dropna(inplace=True): trips_df was
# itself produced by a row filter, and mutating such a frame in place can
# trigger SettingWithCopyWarning. Reassignment also keeps the cell re-runnable.
trips_df = trips_df.dropna(subset=['Trip End Time'])
trips_df.shape

(534368, 5)

In [14]:

# Confirm that no missing values remain in the cleaned trips data.
trips_df.isna().sum()

Trip ID             0
Trip Origin         0
Trip Destination    0
Trip Start Time     0
Trip End Time       0
dtype: int64

In [15]:
# Use the same column name as the requests data ('order_id').
# NOTE(review): presumably 'Trip ID' and requests 'order_id' are the same key — confirm.
trips_df = trips_df.rename({'Trip ID': 'order_id'}, axis='columns')

In [16]:

# Missing values per column in the raw requests data.
print("\nRequests Data Null Values:")
print(requests_file.isna().sum())


Requests Data Null Values:
id                     0
order_id               0
driver_id              0
driver_action          0
lat                    0
lng                    0
created_at       1557740
updated_at       1557740
dtype: int64


In [17]:
# Work on an independent copy so the loaded frame (requests_file) stays untouched.
requests_df = requests_file.copy(deep=True)

In [18]:
# List the distinct values that appear in driver_action.
requests_df['driver_action'].unique()

array(['accepted', 'rejected'], dtype=object)

In [19]:
# Count the accepted requests by summing a boolean mask rather than
# materialising the filtered frame just to read its length.
accepted_count = int(requests_df['driver_action'].eq('accepted').sum())
print(f"Count of 'accepted' in 'driver_action' column: {accepted_count}")

Count of 'accepted' in 'driver_action' column: 25903


In [20]:
# (rows, columns) of the requests data before the empty columns are removed.
requests_df.shape

(1557740, 8)

In [21]:
# created_at / updated_at hold no values at all (every row is null), so they
# carry no information and are removed.
# requests_df is still an unmodified copy of requests_file at this point, so the
# result is derived from requests_file: that keeps the cell idempotent, whereas
# an in-place drop on requests_df raises KeyError when the cell is run twice.
requests_df = requests_file.drop(columns=['created_at', 'updated_at'])


In [22]:
# Confirm that no missing values remain in the cleaned requests data.
requests_df.isna().sum()

id               0
order_id         0
driver_id        0
driver_action    0
lat              0
lng              0
dtype: int64

### Save cleaned data

In [23]:
# Destination folder for the cleaned CSV files; created if it is missing.
folder_path = '../data/processed'
os.makedirs(folder_path, exist_ok=True)

# Write each cleaned frame to its own CSV, leaving out the index column.
for csv_name, frame in (('trips_df.csv', trips_df), ('requests_df.csv', requests_df)):
    frame.to_csv(os.path.join(folder_path, csv_name), index=False)