###  Imports

In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

#### Read dataframes with index columns

In [236]:
departments_df = pd.read_csv("BigSupplyCo_Data_Files\departments_v2.csv", index_col="Department Id")
orders_df = pd.read_csv("BigSupplyCo_Data_Files\orders_v2.csv", index_col = "Order Id")
products_df = pd.read_csv("BigSupplyCo_Data_Files\products_v2.csv", index_col = "Product Id")
customers_df = pd.read_csv("BigSupplyCo_Data_Files\customers_v2.csv", index_col = "Customer Id")

# Analyze how the customer segment relates to shipping info 

In [237]:
customers_df.head(1)

Unnamed: 0_level_0,Customer Segment,Customer Country,Customer City,Customer State,Customer Street,Customer Zipcode
Customer Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20755,Consumer,Puerto Rico,Caguas,PR,5365 Noble Nectar Island,725.0


I want to remove the extra location info

I'll consider the country for now

In [238]:
customers_df.drop(["Customer City","Customer State","Customer Street","Customer Zipcode","Customer Country"],axis=1,inplace=True)
customers_df.head(1)

Unnamed: 0_level_0,Customer Segment,Customer Country
Customer Id,Unnamed: 1_level_1,Unnamed: 2_level_1
20755,Consumer,Puerto Rico


In [239]:
customers_df.apply(pd.Series.value_counts)

Unnamed: 0,Customer Segment,Customer Country
Consumer,10695.0,
Corporate,6239.0,
EE. UU.,,12719.0
Home Office,3718.0,
Puerto Rico,,7933.0


## Combine customer and Orders dataset 

In [129]:
orders_df.rename(columns = {"Order Customer Id": "Customer Id"}, inplace=True)

In [130]:
orders_df = pd.merge(orders_df, customers_df, on="Customer Id")

## Need to isolate other factors 

Lets start with geography

In [132]:
orders_df["Market"].value_counts()

LATAM           49309
Europe          48090
Pacific Asia    39585
USCA            24627
Africa          11154
Name: Market, dtype: int64

In [133]:
orders_df.head(1)

Unnamed: 0,Product Id,Customer Id,Department Id,Market,Order City,Order Country,Order Region,Order State,Order Status,Order Zipcode,order date (DateOrders),Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Quantity,Sales,Order Item Total,Order Profit,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk,Customer Segment
0,1360,20755,2,Pacific Asia,Bekasi,Indonesia,Southeast Asia,Java Occidental,COMPLETE,,1/31/2018 22:56,13.11,0.04,180517,1,327.75,314.640015,91.25,DEBIT,3,4,Advance shipping,0,Consumer


In [134]:
orders_df.drop(["Order City","Order Country","Order Region","Order State"],axis=1,inplace=True)

I am going to assume for now that the purchase specific info does not affect shipping and cut those columns

In [135]:
orders_df.head(1)

Unnamed: 0,Product Id,Customer Id,Department Id,Market,Order Status,Order Zipcode,order date (DateOrders),Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Quantity,Sales,Order Item Total,Order Profit,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk,Customer Segment
0,1360,20755,2,Pacific Asia,COMPLETE,,1/31/2018 22:56,13.11,0.04,180517,1,327.75,314.640015,91.25,DEBIT,3,4,Advance shipping,0,Consumer


In [137]:
orders_df.drop(["Order Status","Order Zipcode","Order Item Discount","Order Item Discount Rate","Sales","Order Item Total","Order Profit","Type"],axis=1,inplace=True)
orders_df

KeyError: "['Order Status', 'Order Zipcode', 'Order Item Discount', 'Order Item Discount Rate', 'Sales', 'Order Item Total', 'Order Profit', 'Type'] not found in axis"

Will drop customer id, product id (and keep category id), order data, quantity

In [139]:
orders_df.drop(["Customer Id","Order Item Id","order date (DateOrders)","Order Item Quantity"],axis=1,inplace=True)
orders_df

Unnamed: 0,Product Id,Department Id,Market,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late Delivery Risk,Customer Segment
0,1360,2,Pacific Asia,3,4,Advance shipping,0,Consumer
1,1360,2,Pacific Asia,5,4,Late delivery,1,Consumer
2,1360,2,Pacific Asia,4,4,Shipping on time,0,Consumer
3,1360,2,Pacific Asia,3,4,Advance shipping,0,Home Office
4,1360,2,Pacific Asia,2,4,Advance shipping,0,Corporate
...,...,...,...,...,...,...,...,...
172760,1358,9,Pacific Asia,5,2,Late delivery,1,Home Office
172761,1354,9,Pacific Asia,2,2,Shipping on time,0,Home Office
172762,1358,9,Pacific Asia,2,2,Shipping on time,0,Home Office
172763,403,4,USCA,6,2,Late delivery,1,Home Office


Drop more columns

In [140]:
orders_df.drop(["Days for shipping (real)","Days for shipment (scheduled)","Delivery Status"],axis=1,inplace=True)
orders_df

Unnamed: 0,Product Id,Department Id,Market,Late Delivery Risk,Customer Segment
0,1360,2,Pacific Asia,0,Consumer
1,1360,2,Pacific Asia,1,Consumer
2,1360,2,Pacific Asia,0,Consumer
3,1360,2,Pacific Asia,0,Home Office
4,1360,2,Pacific Asia,0,Corporate
...,...,...,...,...,...
172760,1358,9,Pacific Asia,1,Home Office
172761,1354,9,Pacific Asia,0,Home Office
172762,1358,9,Pacific Asia,0,Home Office
172763,403,4,USCA,1,Home Office


Put market and customer segment in categories:

In [141]:
markets = pd.Series.unique(orders_df["Market"])
print(markets)
segments =  pd.Series.unique(orders_df["Customer Segment"])
print(segments)

['Pacific Asia' 'Africa' 'LATAM' 'USCA' 'Europe']
['Consumer' 'Home Office' 'Corporate']


In [142]:
category_dict = { "Market": {"Pacific Asia": 0, "Africa":1,"LATAM":2,"USCA":3,"Europe":4}, 
                 "Customer Segment": {"Consumer":0, "Home Office":1 , "Corporate":2}
                }
categories_df = pd.DataFrame.from_dict(category_dict)
categories_df

Unnamed: 0,Market,Customer Segment
Pacific Asia,0.0,
Africa,1.0,
LATAM,2.0,
USCA,3.0,
Europe,4.0,
Consumer,,0.0
Home Office,,1.0
Corporate,,2.0


In [143]:
orders_df.replace(category_dict, inplace=True)

In [144]:
orders_df

Unnamed: 0,Product Id,Department Id,Market,Late Delivery Risk,Customer Segment
0,1360,2,0,0,0
1,1360,2,0,1,0
2,1360,2,0,0,0
3,1360,2,0,0,1
4,1360,2,0,0,2
...,...,...,...,...,...
172760,1358,9,0,1,1
172761,1354,9,0,0,1
172762,1358,9,0,0,1
172763,403,4,3,1,1


### Replace product id with category 

In [148]:
orders_df = pd.merge(orders_df,products_df, on="Product Id")


In [151]:
orders_df.drop(["Product Id","Product Name","Product Price","Category Name"],axis=1, inplace=True)
orders_df

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
0,2,0,0,0,73
1,2,0,1,0,73
2,2,0,0,0,73
3,2,0,0,1,73
4,2,0,0,2,73
...,...,...,...,...,...
172760,7,0,1,0,74
172761,7,0,1,0,74
172762,7,0,1,0,74
172763,7,0,0,0,74


s

In [152]:
consumer_orders = orders_df.loc[(orders_df["Customer Segment"] == 0)]
home_orders = orders_df.loc[(orders_df["Customer Segment"] == 1)] 
corporate_orders = orders_df.loc[(orders_df["Customer Segment"] == 2)]

In [153]:
consumer_orders["Late Delivery Risk"].value_counts(normalize=True)*100

1    57.311563
0    42.688437
Name: Late Delivery Risk, dtype: float64

In [154]:
home_orders["Late Delivery Risk"].value_counts(normalize=True)*100

1    57.588344
0    42.411656
Name: Late Delivery Risk, dtype: float64

In [155]:
corporate_orders["Late Delivery Risk"].value_counts(normalize=True)*100

1    57.07813
0    42.92187
Name: Late Delivery Risk, dtype: float64

### How do the customers differ by regions

In [156]:
consumer_orders["Market"].value_counts(normalize=True)*100

2    28.454484
4    27.928875
0    22.768955
3    14.405055
1     6.442630
Name: Market, dtype: float64

In [157]:
home_orders["Market"].value_counts(normalize=True)*100

2    28.805529
4    27.439400
0    23.237174
3    14.125320
1     6.392576
Name: Market, dtype: float64

In [158]:
corporate_orders["Market"].value_counts(normalize=True)*100

2    28.533354
4    27.908925
0    22.966799
3    14.074398
1     6.516525
Name: Market, dtype: float64

These seem very roughly in line

##### Check for department stores 

In [160]:
consumer_orders["Department Id"].value_counts(normalize=True)*100

7     37.080072
4     27.058824
5     18.425408
3      8.104451
6      5.353389
2      1.367703
9      1.126146
10     0.825319
11     0.260568
12     0.203534
8      0.194587
Name: Department Id, dtype: float64

In [162]:
home_orders["Department Id"].value_counts(normalize=True)*100

7     36.794626
4     27.468605
5     18.291852
3      7.937178
6      5.373657
2      1.307720
9      1.265535
10     0.791771
8      0.305026
11     0.288802
12     0.175228
Name: Department Id, dtype: float64

In [163]:
corporate_orders["Department Id"].value_counts(normalize=True)*100

7     37.193497
4     27.076988
5     18.380673
3      7.991928
6      5.376180
2      1.424002
9      1.039446
10     0.788151
11     0.285562
8      0.234161
12     0.209412
Name: Department Id, dtype: float64

##### Check for product categories

In [166]:
(consumer_orders["Product Category Id"].value_counts(normalize=True)*100)[0:10]

17    13.594274
18    12.234399
24    11.622679
46    10.650861
45     9.676806
48     8.664728
43     7.548647
9      6.971595
29     6.170879
37     1.163051
Name: Product Category Id, dtype: float64

In [168]:
(home_orders["Product Category Id"].value_counts(normalize=True)*100)[0:10]

17    14.053931
18    12.291917
24    11.600740
46    10.422819
45     9.523964
48     8.569945
43     7.667846
9      6.752766
29     6.025895
37     1.087062
Name: Product Category Id, dtype: float64

In [169]:
(corporate_orders["Product Category Id"].value_counts(normalize=True)*100)[0:10]

17    13.377627
18    12.440984
24    11.704234
46    10.916083
45     9.532059
48     8.549726
43     7.698751
9      6.916311
29     5.977764
37     1.102269
Name: Product Category Id, dtype: float64

Their top 10 product categories are roughly identical

In [171]:
orders_df.corr()

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
Department Id,1.0,-0.029279,0.000681,0.000232,0.889059
Market,-0.029279,1.0,0.003122,-0.002994,-0.083852
Late Delivery Risk,0.000681,0.003122,1.0,-0.001735,0.001499
Customer Segment,0.000232,-0.002994,-0.001735,1.0,0.001429
Product Category Id,0.889059,-0.083852,0.001499,0.001429,1.0


# Lets compare a given category of products, shipped to a given region, from a single department store, and get an insight

Department store 7 only:

In [200]:
order_test_df = orders_df.loc[(orders_df["Department Id"] == 7)]
order_test_df.head()

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
43930,7,1,1,1,48
43931,7,2,1,1,48
43932,7,2,1,1,48
43933,7,2,0,1,48
43934,7,1,1,1,48


In [201]:
order_test_df["Product Category Id"].value_counts(normalize=True)

46    0.288445
45    0.259163
48    0.232380
43    0.205472
74    0.007918
44    0.006622
Name: Product Category Id, dtype: float64

Lets pick product 46, which is: Lifevests

In [202]:
products_df.loc[products_df["Product Category Id"] == 46]

Unnamed: 0_level_0,Product Category Id,Product Name,Product Price,Category Name
Product Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1014,46,O'Brien Men's Neoprene Life Vest,49.98,Indoor/Outdoor Games


In [203]:
order_test_df = order_test_df.loc[(order_test_df["Product Category Id"] == 46)]
order_test_df.head()

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
58770,7,0,1,1,46
58771,7,0,1,1,46
58772,7,0,0,1,46
58773,7,1,1,1,46
58774,7,0,1,1,46


In [204]:
order_test_df["Market"].value_counts(normalize=True)

2    0.303086
4    0.269518
0    0.209962
3    0.148349
1    0.069085
Name: Market, dtype: float64

Pick region 2, which is LATAM

In [205]:
order_test_df = order_test_df.loc[(order_test_df["Market"] == 2)]
order_test_df.head()

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
58778,7,2,1,1,46
58782,7,2,1,1,46
58794,7,2,0,1,46
58796,7,2,1,1,46
58797,7,2,1,1,46


Everything is equal other than customer segment, so lets see

In [206]:
order_test_df.apply(pd.Series.value_counts)

Unnamed: 0,Department Id,Market,Late Delivery Risk,Customer Segment,Product Category Id
0,,,2352.0,2889.0,
1,,,3246.0,962.0,
2,,5598.0,,1747.0,
7,5598.0,,,,
46,,,,,5598.0


### Drop everything else 

In [208]:
order_test_df.drop(labels=["Department Id","Market","Product Category Id"], axis=1,inplace=True)
order_test_df

Unnamed: 0,Late Delivery Risk,Customer Segment
58778,1,1
58782,1,1
58794,0,1
58796,1,1
58797,1,1
...,...,...
77235,0,1
77236,0,1
77237,0,0
77238,0,0


In [211]:
order_test_df.value_counts()

Late Delivery Risk  Customer Segment
1                   0                   1681
0                   0                   1208
1                   2                   1003
0                   2                    744
1                   1                    562
0                   1                    400
dtype: int64

# Get delay percentage by region

In [219]:
markets_deliveries = orders_df.filter(["Market", "Late Delivery Risk"])
markets_deliveries.value_counts()

Market  Late Delivery Risk
2       1                     28044
4       1                     27743
0       1                     22712
2       0                     21265
4       0                     20347
0       0                     16873
3       1                     14138
        0                     10489
1       1                      6340
        0                      4814
dtype: int64

In [224]:
orders_df["Late Delivery Risk"].value_counts(normalize=True)

1    0.5729
0    0.4271
Name: Late Delivery Risk, dtype: float64