In [6]:
import pandas as pd

# 1. Read both CSV files. The encoding is passed explicitly (ISO-8859-1) so the
#    load does not fail with a decode error.
csv_options = {'encoding': 'ISO-8859-1'}
supply_df = pd.read_csv('../data/DataCoSupplyChainDataset.csv', **csv_options)
desc_df = pd.read_csv('../data/DescriptionDataCoSupplyChain.csv', **csv_options)

# 2. Preview: first rows of the supply-chain table, then the full
#    column-description table.
for heading, preview in (
    ("=== Supply Chain Data Preview ===", supply_df.head()),
    ("\n=== Column Descriptions Preview ===", desc_df),
):
    print(heading)
    display(preview)

# 3. Column dtypes and non-null counts (info() prints directly to stdout)
print("\n=== DataFrame Info ===")
supply_df.info()

# 4. Null count for every column, laid out as a two-column table
#    (column name, number of missing values).
null_counts = supply_df.isnull().sum()
missing = null_counts.rename_axis('column').reset_index(name='missing_count')
print("\n=== Missing Values ===")
display(missing)

# 5. Number of distinct values for every column, in the same two-column layout
distinct_counts = supply_df.nunique()
unique_counts = distinct_counts.rename_axis('column').reset_index(name='unique_count')
print("\n=== Unique Value Counts ===")
display(unique_counts)

# 6. Convert date strings into datetime.
# The previewed shipping timestamps look like "2/3/2018 22:56"
# (month/day/year hour:minute), so an explicit format is used instead of
# `infer_datetime_format`, which is deprecated since pandas 2.0 and only
# triggers a warning there. The explicit format behaves the same on pandas
# 1.x and 2.x.
# NOTE(review): the order-date column is assumed to share this layout; the
# fallback in the helper below covers the case where it does not.
DATE_FORMAT = '%m/%d/%Y %H:%M'


def _parse_timestamps(raw, fmt=DATE_FORMAT):
    """Parse a Series of date strings into datetime values.

    Values are first parsed with the explicit ``fmt``. Non-null values that
    do not match are re-parsed without a format, so unexpected layouts are
    still converted where possible. Anything that cannot be parsed at all
    becomes NaT, and a notice with the number of such values is printed so
    the coercion is not silent.

    Parameters
    ----------
    raw : pandas.Series
        Column holding the date strings.
    fmt : str
        ``strftime``-style format tried first.

    Returns
    -------
    pandas.Series
        Parsed datetimes, with NaT where parsing failed.
    """
    parsed = pd.to_datetime(raw, format=fmt, errors='coerce')

    # Second chance for non-null values that did not match `fmt`.
    unmatched = parsed.isna() & raw.notna()
    if unmatched.any():
        parsed.loc[unmatched] = pd.to_datetime(raw[unmatched], errors='coerce')

    # Surface values that were coerced to NaT instead of dropping them silently.
    n_unparsed = int((parsed.isna() & raw.notna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} value(s) in '{raw.name}' could not be parsed as dates")
    return parsed


supply_df['Order_Date'] = _parse_timestamps(supply_df['order date (DateOrders)'])
supply_df['Ship_Date'] = _parse_timestamps(supply_df['shipping date (DateOrders)'])

# 7. Compute Scheduled_Ship_Date and Delivery_Delay
# Scheduled ship date = order date + the scheduled number of shipping days.
supply_df['Scheduled_Ship_Date'] = (
    supply_df['Order_Date']
    + pd.to_timedelta(supply_df['Days for shipment (scheduled)'], unit='D')
)
# Delay in whole days: positive means shipped after the scheduled date,
# negative means shipped before it. `.dt.days` keeps only the day component
# of the timedelta (any sub-day remainder is dropped by flooring).
supply_df['Delivery_Delay_Days'] = (
    (supply_df['Ship_Date'] - supply_df['Scheduled_Ship_Date'])
    .dt.days
)

# 8. Report on the derived delay column and on the late-delivery flag
delay_stats = supply_df['Delivery_Delay_Days'].describe()
print("\n=== Delivery Delay (Days) Summary ===")
display(delay_stats)

risk_counts = supply_df['Late_delivery_risk'].value_counts()
print("\n=== Late Delivery Risk Distribution ===")
display(risk_counts)


=== Supply Chain Data Preview ===


Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class



=== Column Descriptions Preview ===


Unnamed: 0,FIELDS,DESCRIPTION
0,Type,: Type of transaction made
1,Days for shipping (real),: Actual shipping days of the purchased product
2,Days for shipment (scheduled),: Days of scheduled delivery of the purchased...
3,Benefit per order,: Earnings per order placed
4,Sales per customer,: Total sales per customer made per customer
5,Delivery Status,: Delivery status of orders: Advance shipping...
6,Late_delivery_risk,: Categorical variable that indicates if send...
7,Category Id,: Product category code
8,Category Name,: Description of the product category
9,Customer City,: City where the customer made the purchase



=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  object 
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Id                    180519 non-null  int64  
 8   Category Name                  180519 non-null  object 
 9   Customer City                  180519 non-null  object 
 10  Customer Country               180519 non-null  object 
 11  Customer Email                 180519 non-null  object 
 12  Custom

Unnamed: 0,column,missing_count
0,Type,0
1,Days for shipping (real),0
2,Days for shipment (scheduled),0
3,Benefit per order,0
4,Sales per customer,0
5,Delivery Status,0
6,Late_delivery_risk,0
7,Category Id,0
8,Category Name,0
9,Customer City,0



=== Unique Value Counts ===


Unnamed: 0,column,unique_count
0,Type,4
1,Days for shipping (real),7
2,Days for shipment (scheduled),4
3,Benefit per order,21998
4,Sales per customer,2927
5,Delivery Status,4
6,Late_delivery_risk,2
7,Category Id,51
8,Category Name,50
9,Customer City,563



=== Delivery Delay (Days) Summary ===


count    180519.000000
mean          0.540010
std           1.491881
min          -2.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           4.000000
Name: Delivery_Delay_Days, dtype: float64


=== Late Delivery Risk Distribution ===


1    98977
0    81542
Name: Late_delivery_risk, dtype: int64