# Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read data

In [114]:
data = pd.read_csv("Supply_Chain_Shipment_Pricing_Dataset_20240302.csv",index_col="id")

# Data Cleaning

In [115]:
newcols = [i.strip().replace(" ","_").replace("#","num").replace("/","_").replace("(","").replace(")","") for i in data.columns]
data.columns = newcols

In [116]:
data.shape

(10324, 32)

In [117]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10324 entries, 1 to 86823
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   project_code                  10324 non-null  object 
 1   pq_num                        10324 non-null  object 
 2   po___so_num                   10324 non-null  object 
 3   asn_dn_num                    10324 non-null  object 
 4   country                       10324 non-null  object 
 5   managed_by                    10324 non-null  object 
 6   fulfill_via                   10324 non-null  object 
 7   vendor_inco_term              10324 non-null  object 
 8   shipment_mode                 9964 non-null   object 
 9   pq_first_sent_to_client_date  10324 non-null  object 
 10  po_sent_to_vendor_date        10324 non-null  object 
 11  scheduled_delivery_date       10324 non-null  object 
 12  delivered_to_client_date      10324 non-null  object 
 13  delive

In [118]:
data.head()

Unnamed: 0_level_0,project_code,pq_num,po___so_num,asn_dn_num,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,pq_first_sent_to_client_date,...,unit_of_measure_per_pack,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,line_item_insurance_usd
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,100-CI-T01,Pre-PQ Process,SCMS-4,ASN-8,Côte d'Ivoire,PMO - US,Direct Drop,EXW,Air,Pre-PQ Process,...,30,19,551.0,29.0,0.97,Ranbaxy Fine Chemicals LTD,True,13,780.34,
3,108-VN-T01,Pre-PQ Process,SCMS-13,ASN-85,Vietnam,PMO - US,Direct Drop,EXW,Air,Pre-PQ Process,...,240,1000,6200.0,6.2,0.03,"Aurobindo Unit III, India",True,358,4521.5,
4,100-CI-T01,Pre-PQ Process,SCMS-20,ASN-14,Côte d'Ivoire,PMO - US,Direct Drop,FCA,Air,Pre-PQ Process,...,100,500,40000.0,80.0,0.8,ABBVIE GmbH & Co.KG Wiesbaden,True,171,1653.78,
15,108-VN-T01,Pre-PQ Process,SCMS-78,ASN-50,Vietnam,PMO - US,Direct Drop,EXW,Air,Pre-PQ Process,...,60,31920,127360.8,3.99,0.07,"Ranbaxy, Paonta Shahib, India",True,1855,16007.06,
16,108-VN-T01,Pre-PQ Process,SCMS-81,ASN-55,Vietnam,PMO - US,Direct Drop,EXW,Air,Pre-PQ Process,...,60,38000,121600.0,3.2,0.05,"Aurobindo Unit III, India",True,7590,45450.08,


# Convert data types

## Explain why `po_sent_to_vendor_date`  and `pq_first_sent_to_client_date` has missing data values

In [143]:
data['po_sent_to_vendor_date'] = data['po_sent_to_vendor_date'].replace(['Date Not Captured', 'N/A - From RDC'],np.nan)
data['pq_first_sent_to_client_date'] = data['po_sent_to_vendor_date'].replace(['Date Not Captured', 'Pre-PQ Process'],np.nan)

In [144]:
date_cols = ['po_sent_to_vendor_date',
 'scheduled_delivery_date',
 'delivered_to_client_date',
 'delivery_recorded_date',
 'pq_first_sent_to_client_date'
 ]
display(data[date_cols])
for d in date_cols:
    print(d)
    data[d] = pd.to_datetime(data[d])

Unnamed: 0_level_0,po_sent_to_vendor_date,scheduled_delivery_date,delivered_to_client_date,delivery_recorded_date,pq_first_sent_to_client_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,NaT,2006-06-02,2006-06-02,2006-06-02,NaT
3,NaT,2006-11-14,2006-11-14,2006-11-14,NaT
4,NaT,2006-08-27,2006-08-27,2006-08-27,NaT
15,NaT,2006-09-01,2006-09-01,2006-09-01,NaT
16,NaT,2006-08-11,2006-08-11,2006-08-11,NaT
...,...,...,...,...,...
86818,NaT,2015-07-31,2015-07-15,2015-07-20,NaT
86819,NaT,2015-07-31,2015-08-06,2015-08-07,NaT
86821,NaT,2015-08-31,2015-08-25,2015-09-03,NaT
86822,NaT,2015-09-09,2015-08-04,2015-08-11,NaT


po_sent_to_vendor_date
scheduled_delivery_date
delivered_to_client_date
delivery_recorded_date
pq_first_sent_to_client_date


In [146]:
data[date_cols].isnull().sum()

po_sent_to_vendor_date          5732
scheduled_delivery_date            0
delivered_to_client_date           0
delivery_recorded_date             0
pq_first_sent_to_client_date    5732
dtype: int64

In [141]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10324 entries, 1 to 86823
Data columns (total 34 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   project_code                         10324 non-null  object        
 1   pq_num                               10324 non-null  object        
 2   po___so_num                          10324 non-null  object        
 3   asn_dn_num                           10324 non-null  object        
 4   country                              10324 non-null  object        
 5   managed_by                           10324 non-null  object        
 6   fulfill_via                          10324 non-null  object        
 7   vendor_inco_term                     10324 non-null  object        
 8   shipment_mode                        9964 non-null   object        
 9   po_sent_to_vendor_date               4592 non-null   datetime64[ns]
 10  scheduled_deliv