In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import math
from geopy.geocoders import Nominatim

In [144]:
pip install haversine

Note: you may need to restart the kernel to use updated packages.


In [145]:
amazon_del = pd.read_csv("amazon_delivery.csv")
amazon_del.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43739 entries, 0 to 43738
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order_ID         43739 non-null  object 
 1   Agent_Age        43739 non-null  int64  
 2   Agent_Rating     43685 non-null  float64
 3   Store_Latitude   43739 non-null  float64
 4   Store_Longitude  43739 non-null  float64
 5   Drop_Latitude    43739 non-null  float64
 6   Drop_Longitude   43739 non-null  float64
 7   Order_Date       43739 non-null  object 
 8   Order_Time       43739 non-null  object 
 9   Pickup_Time      43739 non-null  object 
 10  Weather          43648 non-null  object 
 11  Traffic          43739 non-null  object 
 12  Vehicle          43739 non-null  object 
 13  Area             43739 non-null  object 
 14  Delivery_Time    43739 non-null  int64  
 15  Category         43739 non-null  object 
dtypes: float64(5), int64(2), object(9)
memory usage: 5.3+ MB


# **Introduction**  


## **Quick look column by column**  

### **Order_ID**  
Should be equivalent to PRIMARY KEY of a SQL table, only unique values. If so it can be removed when building a machine learning.  

In [146]:
duplicates_order = amazon_del.duplicated(subset=["Order_ID"])
print(len(amazon_del[duplicates_order]))

0


All IDs are different.

### **Agent_Age**  
On first sight everything seems ok. Type: integer and no missing values.  


In [147]:
amazon_del["Agent_Age"].describe()

count    43739.000000
mean        29.567137
std          5.815155
min         15.000000
25%         25.000000
50%         30.000000
75%         35.000000
max         50.000000
Name: Agent_Age, dtype: float64

No Age = 0 or other abeerrant numbers.

### **Agent Rating**  
Based on the info() table there are apparent missing values (54).

In [148]:
missing_ratings = amazon_del[amazon_del["Agent_Rating"].isna()]
non_missing_ratings = amazon_del[~(amazon_del["Agent_Rating"].isna())]
len(missing_ratings)/len(amazon_del)

0.0012345961270262237

I could potentially remove the values as they represent less than 1%. But I will see if other values are missing.

In [149]:
missing_ratings["Traffic"].value_counts(dropna=False)

Traffic
Low        23
Jam        15
Medium     14
High        2
Name: count, dtype: int64

In [150]:
missing_ratings["Weather"].value_counts(dropna=False)

Weather
Windy         12
Sunny         11
Cloudy        11
Stormy         8
Sandstorms     7
Fog            5
Name: count, dtype: int64

I can remove them. There are no obvious reason of these missing values.  

In [151]:
amazon_del = amazon_del.drop(missing_ratings.index, axis=0)
amazon_del.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43685 entries, 0 to 43738
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order_ID         43685 non-null  object 
 1   Agent_Age        43685 non-null  int64  
 2   Agent_Rating     43685 non-null  float64
 3   Store_Latitude   43685 non-null  float64
 4   Store_Longitude  43685 non-null  float64
 5   Drop_Latitude    43685 non-null  float64
 6   Drop_Longitude   43685 non-null  float64
 7   Order_Date       43685 non-null  object 
 8   Order_Time       43685 non-null  object 
 9   Pickup_Time      43685 non-null  object 
 10  Weather          43594 non-null  object 
 11  Traffic          43685 non-null  object 
 12  Vehicle          43685 non-null  object 
 13  Area             43685 non-null  object 
 14  Delivery_Time    43685 non-null  int64  
 15  Category         43685 non-null  object 
dtypes: float64(5), int64(2), object(9)
memory usage: 5.7+ MB


### **Store Latitude**  
When the latitude is positive we are above the equator line. When the latitude is negative we are south to the equator line.  

In [152]:
amazon_del["Store_Latitude"].describe()

count    43685.000000
mean        17.214543
std          7.750885
min        -30.902872
25%         12.933298
50%         18.551440
75%         22.732225
max         30.914057
Name: Store_Latitude, dtype: float64

### **Store Longitude**  
No apparent missing values.

In [153]:
amazon_del["Store_Longitude"].describe()

count    43685.000000
mean        70.668593
std         21.459258
min        -88.366217
25%         73.170283
50%         75.898497
75%         78.045359
max         88.433452
Name: Store_Longitude, dtype: float64

In [154]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [155]:

from modules_amazon import Location

latitude_0 = amazon_del.iloc[0,3]
longitude_0 = amazon_del.iloc[0,4]
loc = Location(latitude_0,longitude_0)
print(loc.city())
print(loc.country())


Indore
India


### **Order_Date**  
No apparent missing values. I will switch to date format.

In [156]:
print(amazon_del["Order_Date"].head())
amazon_del["Order_Date"] = pd.to_datetime(amazon_del["Order_Date"], format="%Y-%m-%d")
print(amazon_del["Order_Date"].dtype)


0    2022-03-19
1    2022-03-25
2    2022-03-19
3    2022-04-05
4    2022-03-26
Name: Order_Date, dtype: object
datetime64[ns]


### **Order_Time**  
No apparent missing values.

I don't have any NaN at this position or nearby. 

In [157]:
print(amazon_del["Order_Time"].unique())

['11:30:00' '19:45:00' '08:30:00' '18:00:00' '13:30:00' '21:20:00'
 '19:15:00' '17:25:00' '20:55:00' '21:55:00' '14:55:00' '17:30:00'
 '09:20:00' '19:50:00' '20:25:00' '20:30:00' '20:40:00' '21:15:00'
 '20:20:00' '22:30:00' '08:15:00' '19:30:00' '12:25:00' '18:35:00'
 '20:35:00' '23:20:00' '23:35:00' '22:35:00' '23:25:00' '13:35:00'
 '21:35:00' '18:55:00' '14:15:00' '11:00:00' '09:45:00' '08:40:00'
 '23:00:00' '19:10:00' '10:55:00' '21:40:00' '19:00:00' '16:45:00'
 '15:10:00' '22:45:00' '22:10:00' '20:45:00' '22:50:00' '17:55:00'
 '09:25:00' '20:15:00' '22:25:00' '22:40:00' '23:50:00' '15:25:00'
 '10:20:00' '10:40:00' '15:55:00' '20:10:00' '12:10:00' '15:30:00'
 '10:35:00' '21:10:00' '20:50:00' '12:35:00' '21:00:00' '23:40:00'
 '18:15:00' '18:20:00' '11:45:00' '12:45:00' '23:30:00' '10:50:00'
 '21:25:00' '10:10:00' '17:50:00' '22:20:00' '12:40:00' '23:55:00'
 '10:25:00' '08:45:00' '23:45:00' '19:55:00' '22:15:00' '23:10:00'
 '09:15:00' '18:25:00' '18:45:00' '16:50:00' '00:00:00' '14:20

However, I can see NaN written with a whitespace. That is the reason why it cannot be considered as missing value.

In [158]:
amazon_del["Order_Time"] = amazon_del["Order_Time"].str.strip()
missing_Order_Time = amazon_del[amazon_del["Order_Time"] == "NaN"]
print(missing_Order_Time.info())

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 2286 to 43490
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Order_ID         91 non-null     object        
 1   Agent_Age        91 non-null     int64         
 2   Agent_Rating     91 non-null     float64       
 3   Store_Latitude   91 non-null     float64       
 4   Store_Longitude  91 non-null     float64       
 5   Drop_Latitude    91 non-null     float64       
 6   Drop_Longitude   91 non-null     float64       
 7   Order_Date       91 non-null     datetime64[ns]
 8   Order_Time       91 non-null     object        
 9   Pickup_Time      91 non-null     object        
 10  Weather          0 non-null      object        
 11  Traffic          91 non-null     object        
 12  Vehicle          91 non-null     object        
 13  Area             91 non-null     object        
 14  Delivery_Time    91 non-null     int64     

Interestingly, all the missing values Order_Time correlate with the absence of Weather values.  
I won't remove them right away, I will tr< to understand why is that.

In [159]:
amazon_del["Order_Time"] = pd.to_datetime(amazon_del["Order_Time"], infer_datetime_format=True, errors="coerce")
print(amazon_del["Order_Time"].head())

0   2024-12-03 11:30:00
1   2024-12-03 19:45:00
2   2024-12-03 08:30:00
3   2024-12-03 18:00:00
4   2024-12-03 13:30:00
Name: Order_Time, dtype: datetime64[ns]


  amazon_del["Order_Time"] = pd.to_datetime(amazon_del["Order_Time"], infer_datetime_format=True, errors="coerce")
  amazon_del["Order_Time"] = pd.to_datetime(amazon_del["Order_Time"], infer_datetime_format=True, errors="coerce")


### **Pickup_Time**  
No apparent missing values.

In [160]:
amazon_del["Pickup_Time"] = pd.to_datetime(amazon_del["Pickup_Time"], format="%H:%M:%S")
amazon_del["Pickup_Time"] = amazon_del["Pickup_Time"].dt.time
print(amazon_del["Pickup_Time"].dtype)
print(amazon_del["Pickup_Time"].head())

object
0    11:45:00
1    19:50:00
2    08:45:00
3    18:10:00
4    13:45:00
Name: Pickup_Time, dtype: object


### **Weather**  
Based on the info table there are 91 missing values. As observed above they all contain also missing Order_Time values.  

In [161]:
missing_weather = amazon_del[amazon_del["Weather"].isna()]
non_missing_weather = amazon_del[~(amazon_del["Weather"].isna())]
print(missing_weather.info())

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 2286 to 43490
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Order_ID         91 non-null     object        
 1   Agent_Age        91 non-null     int64         
 2   Agent_Rating     91 non-null     float64       
 3   Store_Latitude   91 non-null     float64       
 4   Store_Longitude  91 non-null     float64       
 5   Drop_Latitude    91 non-null     float64       
 6   Drop_Longitude   91 non-null     float64       
 7   Order_Date       91 non-null     datetime64[ns]
 8   Order_Time       0 non-null      datetime64[ns]
 9   Pickup_Time      91 non-null     object        
 10  Weather          0 non-null      object        
 11  Traffic          91 non-null     object        
 12  Vehicle          91 non-null     object        
 13  Area             91 non-null     object        
 14  Delivery_Time    91 non-null     int64     

In [162]:
missing_weather.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
2286,xige084493792,15,1.0,-26.891191,75.802083,26.981191,75.892083,2022-03-12,NaT,17:20:00,,,motorcycle,Urban,75,Home
2779,oilg311747812,15,1.0,22.75004,75.902847,22.81004,75.962847,2022-04-03,NaT,20:30:00,,,scooter,Metropolitian,145,Kitchen
2825,pbox816153129,15,1.0,21.149569,72.772697,21.209569,72.832697,2022-03-21,NaT,22:10:00,,,bicycle,Metropolitian,100,Clothing
3438,nzsa056960624,50,6.0,-12.970324,-77.645748,13.010324,77.685748,2022-03-13,NaT,12:30:00,,,van,Urban,125,Cosmetics
4514,eids248121351,50,6.0,-17.451976,-78.385883,17.561976,78.495883,2022-04-04,NaT,23:20:00,,,bicycle,Metropolitian,135,Jewelry


In [163]:
amazon_del["Traffic"].unique()

array(['High ', 'Jam ', 'Low ', 'Medium ', 'NaN '], dtype=object)

Actually, Traffic also have NaN values due again to a white space.

In [164]:
amazon_del["Traffic"] = amazon_del["Traffic"].str.strip()
missing_weather["Traffic"] = missing_weather["Traffic"].str.strip()
print(missing_weather["Traffic"].unique())

['NaN']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_weather["Traffic"] = missing_weather["Traffic"].str.strip()


In [165]:
missing_weather["Traffic"] = missing_weather["Traffic"].replace("NaN", np.nan)

  missing_weather["Traffic"] = missing_weather["Traffic"].replace("NaN", np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_weather["Traffic"] = missing_weather["Traffic"].replace("NaN", np.nan)


In [166]:
missing_weather.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 2286 to 43490
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Order_ID         91 non-null     object        
 1   Agent_Age        91 non-null     int64         
 2   Agent_Rating     91 non-null     float64       
 3   Store_Latitude   91 non-null     float64       
 4   Store_Longitude  91 non-null     float64       
 5   Drop_Latitude    91 non-null     float64       
 6   Drop_Longitude   91 non-null     float64       
 7   Order_Date       91 non-null     datetime64[ns]
 8   Order_Time       0 non-null      datetime64[ns]
 9   Pickup_Time      91 non-null     object        
 10  Weather          0 non-null      object        
 11  Traffic          0 non-null      float64       
 12  Vehicle          91 non-null     object        
 13  Area             91 non-null     object        
 14  Delivery_Time    91 non-null     int64     

We can now as well all the missing weather values also contain only Traffic null values. 

### **Vehicle**

In [167]:
amazon_del["Vehicle"].unique()

array(['motorcycle ', 'scooter ', 'van', 'bicycle '], dtype=object)

In [168]:
amazon_del["Vehicle"] = amazon_del["Vehicle"].str.strip()
print(amazon_del["Vehicle"].unique())

['motorcycle' 'scooter' 'van' 'bicycle']


In [169]:
order_vehicle = ["bicycle", "scooter", "motorcycle","van"]
amazon_del["Vehicle"] = amazon_del["Vehicle"].astype("category")
amazon_del["Vehicle"] = amazon_del["Vehicle"].cat.reorder_categories(new_categories=order_vehicle, ordered=True)

### **Area**

In [170]:
amazon_del["Area"].unique()

array(['Urban ', 'Metropolitian ', 'Semi-Urban ', 'Other'], dtype=object)

In [171]:
amazon_del["Area"] = amazon_del["Area"].str.strip()


In [172]:
amazon_del["Area"] = amazon_del["Area"].replace("Metropolitian","Metropolitan")
order_Area = ["Other","Semi-Urban","Urban","Metropolitan"]
amazon_del["Area"] = amazon_del["Area"].astype("category")
amazon_del["Area"] = amazon_del["Area"].cat.reorder_categories(new_categories=order_Area, ordered=True)


### **Delivery_Time**  
No apparent missing values. That will be my dependent variable.

In [173]:
print(amazon_del["Delivery_Time"].describe())

count    43685.000000
mean       124.907588
std         51.924227
min         10.000000
25%         90.000000
50%        125.000000
75%        160.000000
max        270.000000
Name: Delivery_Time, dtype: float64


### **Category**  
No apparent missing values.  

In [174]:
amazon_del["Category"].unique()

array(['Clothing', 'Electronics', 'Sports', 'Cosmetics', 'Toys', 'Snacks',
       'Shoes', 'Apparel', 'Jewelry', 'Outdoors', 'Grocery', 'Books',
       'Kitchen', 'Home', 'Pet Supplies', 'Skincare'], dtype=object)

## **Creating a column for distance between the store point and the drop point**

In [175]:
import haversine as hs
from haversine import Unit

distance = []
for loc1,loc2 in zip(zip(amazon_del["Store_Latitude"],amazon_del["Store_Longitude"]),\
                     zip(amazon_del["Drop_Latitude"],amazon_del["Drop_Longitude"])):
    dist = hs.haversine(loc1,loc2, unit=Unit.KILOMETERS)
    distance.append(dist)
    
amazon_del["Distance_Store_Drop_km"] = distance
print(amazon_del["Distance_Store_Drop_km"].describe())

count    43685.000000
mean        38.077532
std        532.442985
min          1.465069
25%          4.663439
50%          9.220463
75%         13.682398
max      19692.701807
Name: Distance_Store_Drop_km, dtype: float64


The max distance is aberrantly high. I will see how many of these I have and if I can remove them.

In [176]:
iqr_dist = np.quantile(amazon_del["Distance_Store_Drop_km"],0.75) - np.quantile(amazon_del["Distance_Store_Drop_km"],0.25)
sup_outlier_dist = np.quantile(amazon_del["Distance_Store_Drop_km"], 0.75) + 1.5*iqr_dist
dist_outliers_sup = amazon_del[amazon_del["Distance_Store_Drop_km"] > sup_outlier_dist]

print(dist_outliers_sup.info())
print(sup_outlier_dist)

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 90 to 43343
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Order_ID                183 non-null    object        
 1   Agent_Age               183 non-null    int64         
 2   Agent_Rating            183 non-null    float64       
 3   Store_Latitude          183 non-null    float64       
 4   Store_Longitude         183 non-null    float64       
 5   Drop_Latitude           183 non-null    float64       
 6   Drop_Longitude          183 non-null    float64       
 7   Order_Date              183 non-null    datetime64[ns]
 8   Order_Time              151 non-null    datetime64[ns]
 9   Pickup_Time             183 non-null    object        
 10  Weather                 151 non-null    object        
 11  Traffic                 183 non-null    object        
 12  Vehicle                 183 non-null    category    

In [177]:
len(amazon_del[amazon_del["Distance_Store_Drop_km"] >= 19000])

4

In [178]:
print(amazon_del[amazon_del["Distance_Store_Drop_km"] >= 19000])

            Order_ID  Agent_Age  Agent_Rating  Store_Latitude  \
9148   jysr585437673         50           6.0      -23.374989   
13106  giiq483839538         50           6.0      -23.416792   
21446  jthh363774386         15           1.0      -23.351058   
32179  qkrp333211010         50           6.0      -22.533662   

       Store_Longitude  Drop_Latitude  Drop_Longitude Order_Date Order_Time  \
9148        -85.335486      23.444989       85.405486 2022-03-20        NaT   
13106       -85.316842      23.466792       85.366842 2022-03-13        NaT   
21446       -85.325731      23.421058       85.395731 2022-03-27        NaT   
32179       -88.366217      22.663662       88.496217 2022-02-18        NaT   

      Pickup_Time Weather Traffic     Vehicle          Area  Delivery_Time  \
9148     17:55:00     NaN     NaN     bicycle  Metropolitan            160   
13106    20:40:00     NaN     NaN  motorcycle  Metropolitan            140   
21446    21:35:00     NaN     NaN         va

Ah ah! The problem is the Drop_Latitude and Drop_Longitude. The coordinates are at the opposite of the Store coordinates. I can impute them. I will know better when I will know the countries involved in this dataset. 

## **Creating a column for city and a column for country**  
I have created a module allowing to determine the city and country names using the longitude and latitude.

In [179]:

from modules_amazon import Location

aberr_dist = amazon_del[amazon_del["Distance_Store_Drop_km"] >= 100 ]
print(len(aberr_dist))
print(aberr_dist["Distance_Store_Drop_km"].describe())


183
count      183.000000
mean      6775.900325
std       4711.467596
min       2216.068855
25%       4423.349282
50%       5173.323369
75%       6748.175550
max      19692.701807
Name: Distance_Store_Drop_km, dtype: float64


The superior outliers have all >100 km distance. They have actually a median of 5173km distance (the median of our sample is 9km)

In [180]:
print(dist_outliers_sup["Store_Latitude"].describe())
print(dist_outliers_sup["Store_Longitude"].describe())

no_outliers = amazon_del[~(amazon_del["Distance_Store_Drop_km"] >= sup_outlier_dist)]
print(no_outliers["Store_Latitude"].describe())
print(no_outliers["Store_Longitude"].describe())

loc = Location(18.5,75.9)
print(loc.country())


count    183.000000
mean     -21.487587
std        6.184380
min      -30.902872
25%      -26.472849
50%      -22.538999
75%      -15.569600
max       -9.959778
Name: Store_Latitude, dtype: float64
count    183.000000
mean      55.696719
std       56.227809
min      -88.366217
25%       73.950889
50%       76.307589
75%       80.318244
max       88.433452
Name: Store_Longitude, dtype: float64
count    43502.000000
mean        17.377351
std          7.337660
min          0.000000
25%         12.934179
50%         18.554382
75%         22.732225
max         30.914057
Name: Store_Latitude, dtype: float64
count    43502.000000
mean        70.731575
std         21.172214
min          0.000000
25%         73.170283
50%         75.898497
75%         78.044095
max         88.433452
Name: Store_Longitude, dtype: float64
India


Our data outside the outliers is based (or mainly) in India. I will therefore switch to the positive coordinates and recalculate the distance.

In [181]:
amazon_del["abs_Store_Latitude"] = abs(amazon_del["Store_Latitude"])
amazon_del["abs_Store_Longitude"] = abs(amazon_del["Store_Longitude"])


In [182]:
distance = []
for loc1,loc2 in zip(zip(amazon_del["abs_Store_Latitude"],amazon_del["abs_Store_Longitude"]),\
                     zip(amazon_del["Drop_Latitude"],amazon_del["Drop_Longitude"])):
    dist = hs.haversine(loc1,loc2, unit=Unit.KILOMETERS)
    distance.append(dist)
    
amazon_del["Distance_Store_Drop_km"] = distance

print(amazon_del["Distance_Store_Drop_km"].describe())

count    43685.000000
mean         9.732511
std          5.604092
min          1.465069
25%          4.663418
50%          9.220160
75%         13.681435
max         20.969518
Name: Distance_Store_Drop_km, dtype: float64


### **Creating a column city**  
Because the median of the Store coordinates is located in India and that the median distance is 9km, I can confidently conclude that at least most of the data is located in India.  
Hence, I will only make a "city" column to see if there are discrepancies between cities.  
However, the size of the dataset is too large. I will make a sample of 400 rows.

In [183]:
amazon_sample400 = amazon_del.sample(n=400, replace=False, random_state=1984)


In [184]:
City_Stores = []
for lat,long in zip(amazon_sample400["abs_Store_Latitude"],amazon_sample400["abs_Store_Longitude"]) :
    loc = Location(lat,long)
    City_Stores.append(loc.city())



In [185]:
amazon_sample400["City"] = City_Stores
print(amazon_sample400["City"].unique())

['' 'Mysuru' 'Mumbai Suburban' 'Coimbatore' 'Indore' 'Kolkata' 'Jaipur'
 'Ranchi' 'Prayagraj' 'Pune City' 'Bhopal' 'Dehradun' 'Kanpur' 'Surat'
 'Agra' 'Hyderabad' 'Chhatrapati Sambhaji Nagar' 'Bengaluru' 'Mumbai City'
 'Ernakulam' 'Chennai' 'Vadodara' 'Ludhiana'
 'Jaipur Municipal Corporation' 'Thane']


In [186]:
print(amazon_sample400.info())

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 4628 to 16065
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Order_ID                400 non-null    object        
 1   Agent_Age               400 non-null    int64         
 2   Agent_Rating            400 non-null    float64       
 3   Store_Latitude          400 non-null    float64       
 4   Store_Longitude         400 non-null    float64       
 5   Drop_Latitude           400 non-null    float64       
 6   Drop_Longitude          400 non-null    float64       
 7   Order_Date              400 non-null    datetime64[ns]
 8   Order_Time              399 non-null    datetime64[ns]
 9   Pickup_Time             400 non-null    object        
 10  Weather                 399 non-null    object        
 11  Traffic                 400 non-null    object        
 12  Vehicle                 400 non-null    category  

In this sample I have only 1 unknown wheather value, meaning that I also have 1 order and 1 traffic missing data.

# **EDA with the amazon_sample400 new dataset**

I will first remove the missing weather row.

In [187]:
amazon_sample400 = amazon_sample400.dropna(subset=["Weather"])
print(amazon_sample400.info())

<class 'pandas.core.frame.DataFrame'>
Index: 399 entries, 4628 to 16065
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Order_ID                399 non-null    object        
 1   Agent_Age               399 non-null    int64         
 2   Agent_Rating            399 non-null    float64       
 3   Store_Latitude          399 non-null    float64       
 4   Store_Longitude         399 non-null    float64       
 5   Drop_Latitude           399 non-null    float64       
 6   Drop_Longitude          399 non-null    float64       
 7   Order_Date              399 non-null    datetime64[ns]
 8   Order_Time              399 non-null    datetime64[ns]
 9   Pickup_Time             399 non-null    object        
 10  Weather                 399 non-null    object        
 11  Traffic                 399 non-null    object        
 12  Vehicle                 399 non-null    category  

## **Is Delivery time the difference between order time and pickup time?**