This file is mostly focused on deducing the relationship betweenorder status, payment type, and possibly linking them up with shipment delays

###  Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

#### Read dataframes with index columns

In [2]:
departments_df = pd.read_csv("BigSupplyCo_Data_Files\departments_v2.csv", index_col="Department Id")
orders_df = pd.read_csv("BigSupplyCo_Data_Files\orders_v2.csv", index_col = "Order Id")
products_df = pd.read_csv("BigSupplyCo_Data_Files\products_v2.csv", index_col = "Product Id")
customers_df = pd.read_csv("BigSupplyCo_Data_Files\customers_v2.csv", index_col = "Customer Id")

## 1. Analyze the shipping info in more detail

Trying to deduce relationships between the 4 columns at the end that relate to shippping and spot any errors

Create subset of orders dataframe

In [3]:
shipping_df = orders_df.filter(["Days for shipping (real)","Days for shipment (scheduled)","Delivery Status","Late Delivery Risk"]).copy()
shipping_df.head(3)

Unnamed: 0_level_0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77202,3,4,Advance shipping,0
75939,5,4,Late delivery,1
75938,4,4,Shipping on time,0


Basically I want to deduce the meaning of the possible values in each column and how they combine.
The meaning of scheduled and real shipping days are clear enough. 
It seems that the Delivery status depends on the comparison of these two columns' values. Let's verify that

First want to check the value counts

In [4]:
shipping_df.apply(pd.Series.value_counts)

Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
0,4839.0,9293.0,,73788.0
1,4454.0,26513.0,,98977.0
2,54205.0,33806.0,,
3,27478.0,,,
4,27297.0,103153.0,,
5,27003.0,,,
6,27489.0,,,
Advance shipping,,,41592.0,
Late delivery,,,98977.0,
Shipping on time,,,32196.0,


Interesting that nothing is scheduled to ship in 3, 5, 6 days

### Get rows where shipping is made on schedule

In [5]:
on_time_df = shipping_df.loc[(shipping_df["Days for shipping (real)"] == shipping_df["Days for shipment (scheduled)"])]
on_time_df.head(1)

Unnamed: 0_level_0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
75938,4,4,Shipping on time,0


In [6]:
on_time_df.apply(pd.Series.value_counts)

Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
0,4839.0,4839.0,,32196.0
2,6819.0,6819.0,,
4,20538.0,20538.0,,
Shipping on time,,,32196.0,


Get percentage of orders that are cancelled

In [7]:
percentage_list = [0,0,0]

In [8]:
a = len(on_time_df.index)
p = round(1557/a*100, 2)
print( p, " % of shipments sent on time are cancelled ")
percentage_list[0] = p

4.84  % of shipments sent on time are cancelled 


Interesting that nothing that gets scheduled to ship in 1 day is delivered on time

### Get rows where shipment takes longer than expected

In [9]:
delayed_df = shipping_df.loc[(shipping_df["Days for shipping (real)"] > shipping_df["Days for shipment (scheduled)"])]
delayed_df.head(1)

Unnamed: 0_level_0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
75939,5,4,Late delivery,1


In [10]:
delayed_df.apply(pd.Series.value_counts)

Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
0,,4454.0,,
1,4454.0,26513.0,,98977.0
2,26513.0,26987.0,,
3,6759.0,,,
4,6759.0,41023.0,,
5,27003.0,,,
6,27489.0,,,
Late delivery,,,98977.0,


Interesting that nothing is scheduled to ship in 3, 5, 6 days

Also note that the Late delivery Risk column is assigned the value of 0 (no late delivery) when the delivery is late, but it has been cancelled

Get percentage of orders that are cancelled

In [11]:
a = len(delayed_df.index)
p = round(4423.0/a*100, 2)
print( p, " % of shipments that are delayed are cancelled ")
percentage_list[1] = p

4.47  % of shipments that are delayed are cancelled 


### Get rows where shipment takes shorter than expected

In [12]:
advance_df = shipping_df.loc[(shipping_df["Days for shipping (real)"] < shipping_df["Days for shipment (scheduled)"])]
advance_df

Unnamed: 0_level_0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77202,3,4,Advance shipping,0
75937,3,4,Advance shipping,0
75936,2,4,Advance shipping,0
75912,3,4,Advance shipping,0
75911,2,4,Advance shipping,0
...,...,...,...,...
26071,2,4,Advance shipping,0
26052,3,4,Advance shipping,0
26052,3,4,Advance shipping,0
26047,3,4,Advance shipping,0


In [13]:
advance_df.apply(pd.Series.value_counts)

Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk
0,,,,41592.0
2,20873.0,,,
3,20719.0,,,
4,,41592.0,,
Advance shipping,,,41592.0,


Interesting that only orders that are scheduled to be sent on 4 days are sent in advance, and only in 2 or 3 days

Get percentage of orders that are cancelled

In [14]:
a = len(advance_df.index)
p = round(1774/a*100, 2)
print( p, " % of shipments that are sent in advance are cancelled ")
percentage_list[2] = p

4.27  % of shipments that are sent in advance are cancelled 


### Inspect Cancelled Shipping

In [15]:
print("On time, delayed, advanced")
print(percentage_list)

On time, delayed, advanced
[4.84, 4.47, 4.27]


I am unsure how to interpret the meaning of a shipment cancellation. It may have been cancelled by the customer or by the company. The above percentages suggest the customers don't cancel it because the values are quite similar, and we would expect customers receiving their deliveries late have cancelled


Most likely this has to do with payments having failed etc. I'll investigate this further after I figure out the meaning of order status.

## Deduce the relationship between order status and payment type  and delivery status

In [16]:
order_status_df = orders_df.filter(["Order Status","Type","Delivery Status","Late Delivery Risk"]).copy()
order_status_df.head()

Unnamed: 0_level_0,Order Status,Type,Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77202,COMPLETE,DEBIT,Advance shipping,0
75939,PENDING,TRANSFER,Late delivery,1
75938,CLOSED,CASH,Shipping on time,0
75937,COMPLETE,DEBIT,Advance shipping,0
75936,PENDING_PAYMENT,PAYMENT,Advance shipping,0


In [17]:
order_status_df["Order Status"].value_counts()

COMPLETE           59491
PENDING_PAYMENT    39832
PROCESSING         21902
PENDING            20227
CLOSED             19616
ON_HOLD             9804
PAYMENT_REVIEW      1893
Name: Order Status, dtype: int64

#### Lets inspect these one by one 

##### 1.Start with complete orders

In [18]:
complete_df = order_status_df.loc[(order_status_df["Order Status"] == "COMPLETE")]
complete_df.head()

Unnamed: 0_level_0,Order Status,Type,Delivery Status,Late Delivery Risk
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77202,COMPLETE,DEBIT,Advance shipping,0
75937,COMPLETE,DEBIT,Advance shipping,0
75934,COMPLETE,DEBIT,Late delivery,1
75927,COMPLETE,DEBIT,Late delivery,1
75925,COMPLETE,DEBIT,Late delivery,1


In [19]:
complete_df.apply(pd.Series.value_counts)

Unnamed: 0,Order Status,Type,Delivery Status,Late Delivery Risk
0,,,,25292.0
1,,,,34199.0
Advance shipping,,,14136.0,
COMPLETE,59491.0,,,
DEBIT,,59491.0,,
Late delivery,,,34199.0,
Shipping on time,,,11156.0,


#### All orders with status "Complete" are paid by debit, and their shipping is never cancelled

##### Get the dataframes matching each other order

In [20]:
statuses = order_status_df["Order Status"].unique()
statuses

array(['COMPLETE', 'PENDING', 'CLOSED', 'PENDING_PAYMENT', 'PROCESSING',
       'ON_HOLD', 'PAYMENT_REVIEW'], dtype=object)

In [21]:
pending_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[1])]
closed_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[2])]
pending_payment_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[3])]
canceled_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[4])]
processing_df =order_status_df.loc[(order_status_df["Order Status"] == statuses[5])]
fraud_df =order_status_df.loc[(order_status_df["Order Status"] == statuses[6])]
onhold_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[7])]
paymentreview_df = order_status_df.loc[(order_status_df["Order Status"] == statuses[8])]

IndexError: index 7 is out of bounds for axis 0 with size 7

#### 2. Pending Orders

In [None]:
pending_df.head(1)

In [None]:
pending_df.apply(pd.Series.value_counts)

#### All Pending orders are paid via Transfer and are never cancelled

#### 3. Closed orders

In [None]:
closed_df.head(1)

In [None]:
closed_df.apply(pd.Series.value_counts)

##### All Closed orders are settled with cash and never cancelled

#### 4. Pending Payment orders

In [None]:
pending_payment_df.head(1)

In [None]:
pending_payment_df.apply(pd.Series.value_counts)

#### All "Pending Payment" status orders have payment type = "Payment" and are never cancelled

#### 5. Cancelled Orders

In [None]:
canceled_df.head(1) 

In [None]:
canceled_df.apply(pd.Series.value_counts)

#### All "Canceled" status orders are paid via transfer and have their shipping cancelled

#### 6. Processing Orders

In [None]:
processing_df.head(1)

In [None]:
processing_df.apply(pd.Series.value_counts)

##### All "Processing" orders are paid by transfer and are never cancelled

#### 7. Fraudulent Orders

In [None]:
fraud_df.head(1) 

In [None]:
fraud_df.apply(pd.Series.value_counts) 

##### All fraudulent orders are paid via transfer and have their shipping cancelled

#### 8. On hold orders 

In [None]:
onhold_df.head(1) 

In [None]:
onhold_df.apply(pd.Series.value_counts) 

#####  All ON_Hold orders are paid via debit and are never cancelled

#### 9. Payment Review orders 

In [None]:
paymentreview_df.head(1) 

In [None]:
paymentreview_df.apply(pd.Series.value_counts)

##### All payment_review orders are paid via "Payment" and are never cancelled

# Create new tablet to summarize this info 

Function to get % decomposition of delivery status and payment type for each order status

In [None]:
my_dict = {}
n = orders_df.shape[0]

def get_percentages(df,i):
    global my_dict
    global statuses
    global n
    delivery = df["Delivery Status"].value_counts(normalize=True)
    payment = df.iloc[1]["Type"]
    percent = round(df.shape[0]/n*100,2)
    
    if i ==4 or i == 6:
        my_dict[statuses[i]] = [percent,0,0,0,round(delivery[0]*100,2), payment]
    else:
        my_dict[statuses[i]] = [percent,round(delivery[0]*100,2), round(delivery[1]*100,2), round(delivery[2]*100,2),0, payment]
    

Put all order dfs in a list

In [None]:
df_list = [complete_df,
pending_df,
closed_df,
pending_payment_df,
canceled_df ,
processing_df,
fraud_df ,
onhold_df ,
paymentreview_df ]

##### Call function and populate dict

In [None]:
for i in range(len(df_list)):
    get_percentages(df_list[i],i)

In [None]:
my_dict

#### Convert into dataframe

In [None]:
orders_by_status_df = pd.DataFrame.from_dict(my_dict, orient="index")
orders_by_status_df.set_axis(["% of all orders","Late delivery %","Advance shipping %","Shipping on time %","Shipping Cancelled %","Payment%"],axis=1, inplace=True)
orders_by_status_df

There does not seem to be a relationship between order type and shipping 
I would have expected that those which take longer to processes would experience more delays

##### Save file

In [None]:
orders_by_status_df.to_csv("BigSupplyCo_Data_Files/custom_order_table.csv")

# Get some summary stats for the orders as a whole 

#### Get percentage compositon of orders by payment type as well 

In [None]:
payment_composition = orders_df["Type"].value_counts(normalize=True)*100
payment_composition

In [None]:
payments_delays = orders_df.filter(["Type","Late Delivery Risk"]).copy()
temp = payments_delays.value_counts(normalize=True)*100
temp

Modify above series to show percentages for each category

In [None]:
for i in range(len(payment_composition)):
    temp[2*i] = round(temp[2*i] / payment_composition[i] * 100, 2)
    temp[2*i+1] = round(temp[2*i+1]  / payment_composition[i]  * 100, 2)
    
temp

#### It seems that transfer payments are less likely to lead to delays, but this is because orders with transfer payments are the only ones that end up getting their shipments cancelled, and these show up as having Late Delivery Risk = 0 no matter what

I am going to delete these

## Delete cancelled and fraudulent orders 

In [None]:
orders_df_new = orders_df.loc(orders_df["Delivery Status"] != "Shipping Cancelled")


In [None]:
orders_df_new = orders_df.loc[(orders_df["Delivery Status"] != "Shipping canceled")]
orders_df_new.head(1)

In [None]:
print(orders_df.shape[0]-orders_df_new.shape[0], "Orders have been removed")

In [None]:
orders_df_new.to_csv("BigSupplyCo_Data_Files/orders_v2.csv")

## Redo above percentages 

In [None]:
payment_composition = orders_df_new["Type"].value_counts(normalize=True)*100
payments_delays = orders_df_new.filter(["Type","Late Delivery Risk"]).copy()
temp =  payments_delays.value_counts(normalize=True)*100
temp

In [None]:
for i in range(len(payment_composition)):
    temp[2*i] = round(temp[2*i] / payment_composition[i] * 100, 2)
    temp[2*i+1] = round(temp[2*i+1]  / payment_composition[i]  * 100, 2)
    
temp

# DO CHI SQUARED TEST HERE 

In [None]:
temp = temp.to_frame()

In [None]:
temp

Save file

In [None]:
temp.to_csv("BigSupplyCo_Data_Files/custom_payment_table.csv")