In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys, os

#sys.path.append(os.path.abspath(os.path.join("../..")))
sys.path.append(os.path.abspath(os.path.join("../scripts")))

In [3]:
import explore

# Import The Data

In [4]:
# Data that contains information about the completed orders

data_url = explore.get_data_url("data/nb.csv", "v0")
data_completed = pd.read_csv(data_url)

In [18]:
print("completed orders data has {} rows and {} columns".format(data_completed.shape[0], data_completed.shape[1]))

completed orders data has 536020 rows and 5 columns


In [5]:
data_completed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536020 entries, 0 to 536019
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Trip ID           536020 non-null  int64 
 1   Trip Origin       536020 non-null  object
 2   Trip Destination  536020 non-null  object
 3   Trip Start Time   534369 non-null  object
 4   Trip End Time     536019 non-null  object
dtypes: int64(1), object(4)
memory usage: 20.4+ MB


In [6]:
# able that contains delivery requests by clients (completed and unfulfilled) and driver locations during request

data_url = explore.get_data_url("data/driver_locations_during_request.csv", "v0")
data_driversloc = pd.read_csv(data_url)

In [19]:
print("driver locations during request data has {} rows and {} columns".format(data_driversloc.shape[0], data_driversloc.shape[1]))

driver locations during request data has 1557740 rows and 8 columns


In [7]:
data_driversloc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557740 entries, 0 to 1557739
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   id             1557740 non-null  int64  
 1   order_id       1557740 non-null  int64  
 2   driver_id      1557740 non-null  int64  
 3   driver_action  1557740 non-null  object 
 4   lat            1557740 non-null  float64
 5   lng            1557740 non-null  float64
 6   created_at     0 non-null        float64
 7   updated_at     0 non-null        float64
dtypes: float64(4), int64(3), object(1)
memory usage: 95.1+ MB


# Missing Values

In [8]:
explore.missing_values_table(data_completed)

Your selected dataframe has 5 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Dtype
Trip Start Time,1651,0.3,object
Trip End Time,1,0.0,object


In [9]:
# missing values are small percentabe. We drop them

cleanData_completed = data_completed.dropna()
explore.missing_values_table(cleanData_completed)

Your selected dataframe has 5 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Dtype


In [10]:
explore.missing_values_table(data_driversloc)

Your selected dataframe has 8 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Dtype
created_at,1557740,100.0,float64
updated_at,1557740,100.0,float64


In [11]:
# two columns are empty. we drop them

cleanData_driversloc = data_driversloc.dropna(axis=1)
explore.missing_values_table(cleanData_driversloc)

Your selected dataframe has 6 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Dtype


# Calculate Trips' distances and time durations

## Calculate Durations

In [None]:
# Change time data to datetime stamps

cleanData_completed['Trip Start Time'] = pd.to_datetime(cleanData_completed['Trip Start Time'])
cleanData_completed['Trip End Time'] = pd.to_datetime(cleanData_completed['Trip End Time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanData_completed['Trip Start Time'] = pd.to_datetime(cleanData_completed['Trip Start Time'])


In [36]:
cleanData_completed['TripDuration'] = cleanData_completed['Trip End Time']- cleanData_completed['Trip Start Time']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanData_completed['TripDuration'] = cleanData_completed['Trip End Time']-  cleanData_completed['Trip Start Time']


In [52]:
# trip duration in seconds 

cleanData_completed['TripDuration-sec'] = cleanData_completed['TripDuration'].apply(lambda x: x.seconds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanData_completed['TripDuration-sec'] = cleanData_completed['TripDuration'].apply(lambda x: x.seconds)


## Calculate Distances

In [14]:
# Table of completed orders has 'Trip Origin' and 'Trip Destination' (latitude, longitude) as str type.
# We convert these into columns


cleanData_completed['TripOrigin-Lat'] = cleanData_completed['Trip Origin'].apply(lambda x: x.split(',')[0])
cleanData_completed['TripOrigin-Long'] = cleanData_completed['Trip Origin'].apply(lambda x: x.split(',')[1])

cleanData_completed['TripDest-Lat'] = cleanData_completed['Trip Destination'].apply(lambda x: x.split(',')[0])
cleanData_completed['TripDest-Long'] = cleanData_completed['Trip Destination'].apply(lambda x: x.split(',')[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanData_completed['TripOrigin-Lat'] = cleanData_completed['Trip Origin'].apply(lambda x: x.split(',')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanData_completed['TripOrigin-Long'] = cleanData_completed['Trip Origin'].apply(lambda x: x.split(',')[1])


In [None]:
# 