In [2]:
import pandas as pd
import numpy as np

We load the data set (excel)

In [3]:
# Read the sales-funnel workbook into a DataFrame (the path is relative, so it
# is resolved against the notebook's current working directory).
sales_funnel_df = pd.read_excel("sales-funnel.xlsx")
# Bare last expression: the notebook displays the frame as the cell output.
sales_funnel_df

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won
5,218895,Kulas Inc,Daniel Hilton,Debra Henley,CPU,2,40000,pending
6,218895,Kulas Inc,Daniel Hilton,Debra Henley,Software,1,10000,presented
7,412290,Jerde-Hilpert,John Smith,Debra Henley,Maintenance,2,5000,pending
8,740150,Barton LLC,John Smith,Debra Henley,CPU,1,35000,declined
9,141962,Herman LLC,Cedric Moss,Fred Anderson,CPU,2,65000,won


We change the type of Status and Account into Categories (categorical data)

In [28]:
# Re-type the low-cardinality columns as pandas categoricals, one column at a
# time and in place, in the same order as before (Status first, then Account).
for column_name in ("Status", "Account"):
    sales_funnel_df[column_name] = sales_funnel_df[column_name].astype("category")

We start with a simple pivot. Note that the default aggregation function is the mean. For example, Kulas Inc has two rows with Quantity 2 and 1, so its pivoted Quantity is 1.5.

In [30]:
# Mean Price and Quantity per account name.
# `values` is listed explicitly because the default aggregation (mean) cannot
# be applied to the text and categorical columns of the frame; older pandas
# silently dropped those columns, recent releases raise a TypeError instead.
pd.pivot_table(sales_funnel_df, index=["Name"], values=["Price", "Quantity"])

Unnamed: 0_level_0,Price,Quantity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barton LLC,35000.0,1.0
"Fritsch, Russel and Anderson",35000.0,1.0
Herman LLC,65000.0,2.0
Jerde-Hilpert,5000.0,2.0
"Kassulke, Ondricka and Metz",7000.0,3.0
Keeling LLC,100000.0,5.0
Kiehn-Spinka,65000.0,2.0
Koepp Ltd,35000.0,2.0
Kulas Inc,25000.0,1.5
Purdy-Kunde,30000.0,1.0


We now do a pivot using multiple columns for the Index

In [31]:
# Mean Price and Quantity for each (Name, Rep, Manager) combination.
# `values` is listed explicitly because the default aggregation (mean) cannot
# be applied to the remaining text and categorical columns; recent pandas
# releases raise a TypeError rather than silently dropping them.
pd.pivot_table(
    sales_funnel_df,
    index=["Name", "Rep", "Manager"],
    values=["Price", "Quantity"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Price,Quantity
Name,Rep,Manager,Unnamed: 3_level_1,Unnamed: 4_level_1
Barton LLC,John Smith,Debra Henley,35000.0,1.0
"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,35000.0,1.0
Herman LLC,Cedric Moss,Fred Anderson,65000.0,2.0
Jerde-Hilpert,John Smith,Debra Henley,5000.0,2.0
"Kassulke, Ondricka and Metz",Wendy Yule,Fred Anderson,7000.0,3.0
Keeling LLC,Wendy Yule,Fred Anderson,100000.0,5.0
Kiehn-Spinka,Daniel Hilton,Debra Henley,65000.0,2.0
Koepp Ltd,Wendy Yule,Fred Anderson,35000.0,2.0
Kulas Inc,Daniel Hilton,Debra Henley,25000.0,1.5
Purdy-Kunde,Cedric Moss,Fred Anderson,30000.0,1.0


Notice that the order of the columns in the index defines the grouping hierarchy: with Manager first, each manager's reps are grouped together, and each rep's accounts are listed under that rep.

In [32]:
# Same aggregation as before, but with the index ordered Manager -> Rep -> Name
# so the rows are nested by manager, then by rep, then by account.
# `values` is listed explicitly because the default aggregation (mean) cannot
# be applied to the remaining text and categorical columns; recent pandas
# releases raise a TypeError rather than silently dropping them.
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep", "Name"],
    values=["Price", "Quantity"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Price,Quantity
Manager,Rep,Name,Unnamed: 3_level_1,Unnamed: 4_level_1
Debra Henley,Craig Booker,"Fritsch, Russel and Anderson",35000.0,1.0
Debra Henley,Craig Booker,Trantow-Barrows,15000.0,1.333333
Debra Henley,Daniel Hilton,Kiehn-Spinka,65000.0,2.0
Debra Henley,Daniel Hilton,Kulas Inc,25000.0,1.5
Debra Henley,John Smith,Barton LLC,35000.0,1.0
Debra Henley,John Smith,Jerde-Hilpert,5000.0,2.0
Fred Anderson,Cedric Moss,Herman LLC,65000.0,2.0
Fred Anderson,Cedric Moss,Purdy-Kunde,30000.0,1.0
Fred Anderson,Cedric Moss,Stokes LLC,7500.0,1.0
Fred Anderson,Wendy Yule,"Kassulke, Ondricka and Metz",7000.0,3.0


Now, thinking only about understanding the sales funnel for each salesman, we drop the account name from the index

In [34]:
# Mean Price and Quantity per (Manager, Rep), with the account name removed
# from the index so each rep collapses to a single row.
# `values` is listed explicitly because the default aggregation (mean) cannot
# be applied to the remaining text and categorical columns; recent pandas
# releases raise a TypeError rather than silently dropping them.
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price", "Quantity"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Quantity
Manager,Rep,Unnamed: 2_level_1,Unnamed: 3_level_1
Debra Henley,Craig Booker,20000.0,1.25
Debra Henley,Daniel Hilton,38333.333333,1.666667
Debra Henley,John Smith,20000.0,1.5
Fred Anderson,Cedric Moss,27500.0,1.25
Fred Anderson,Wendy Yule,44250.0,3.0


We can also specify the columns to be aggregated. For ex: Price

In [35]:
# Restrict the aggregation to the Price column: mean price per (Manager, Rep).
pd.pivot_table(
    data=sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,20000
Debra Henley,Daniel Hilton,38333
Debra Henley,John Smith,20000
Fred Anderson,Cedric Moss,27500
Fred Anderson,Wendy Yule,44250


So far for the aggregating columns we have been getting the mean value. We can specify another aggregation strategy with the aggfunc parameter

In [36]:
# Total Price per (Manager, Rep).
# The aggregation is named by the string "sum" instead of passing np.sum:
# pandas maps both to the same grouped sum, but passing the NumPy callable
# emits a FutureWarning on recent pandas releases.
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price"],
    aggfunc="sum",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,80000
Debra Henley,Daniel Hilton,115000
Debra Henley,John Smith,40000
Fred Anderson,Cedric Moss,110000
Fred Anderson,Wendy Yule,177000


We can also pass a list of functions to aggfunc; each function is then applied to every column listed in values.

In [37]:
# Total Price and number of rows per (Manager, Rep).
# "sum" replaces np.sum (same result and same "sum" column label, without the
# FutureWarning recent pandas emits for NumPy callables); len counts the rows
# in each group.
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price"],
    aggfunc=["sum", len],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2
Debra Henley,Craig Booker,80000,4
Debra Henley,Daniel Hilton,115000,3
Debra Henley,John Smith,40000,2
Fred Anderson,Cedric Moss,110000,4
Fred Anderson,Wendy Yule,177000,4


We can also define columns to further segment our values. Remember that the aggregations are always done on the values.

In [41]:
# Total Price and row count per (Manager, Rep), split into one column per
# Product. Combinations with no rows come out as NaN.
# "sum" replaces np.sum (same result and label, without the FutureWarning
# recent pandas emits for NumPy callables).
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price"],
    columns=["Product"],
    aggfunc=["sum", len],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,len,len,len,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Debra Henley,Craig Booker,65000.0,5000.0,,10000.0,2.0,1.0,,1.0
Debra Henley,Daniel Hilton,105000.0,,,10000.0,2.0,,,1.0
Debra Henley,John Smith,35000.0,5000.0,,,1.0,1.0,,
Fred Anderson,Cedric Moss,95000.0,5000.0,,10000.0,2.0,1.0,,1.0
Fred Anderson,Wendy Yule,165000.0,7000.0,5000.0,,2.0,1.0,1.0,


You can notice that some values are NaN (more on this in the last lesson of the day).
We can change those NaNs into zeros.

In [42]:
# Same pivot as the previous cell, with fill_value=0 so that empty
# (Manager, Rep, Product) combinations show 0 instead of NaN.
# "sum" replaces np.sum (same result and label, without the FutureWarning
# recent pandas emits for NumPy callables).
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep"],
    values=["Price"],
    columns=["Product"],
    aggfunc=["sum", len],
    fill_value=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,len,len,len,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Debra Henley,Craig Booker,65000,5000,0,10000,2,1,0,1
Debra Henley,Daniel Hilton,105000,0,0,10000,2,0,0,1
Debra Henley,John Smith,35000,5000,0,0,1,1,0,0
Fred Anderson,Cedric Moss,95000,5000,0,10000,2,1,0,1
Fred Anderson,Wendy Yule,165000,7000,5000,0,2,1,1,0


If we move Products into the index, we have another way of summarizing the same info.

In [44]:
# Product moves from the columns into the index: one row per
# (Manager, Rep, Product) with the total Price and the row count.
# "sum" replaces np.sum (same result and label, without the FutureWarning
# recent pandas emits for NumPy callables).
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep", "Product"],
    values=["Price"],
    aggfunc=["sum", len],
    fill_value=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Price,Price
Manager,Rep,Product,Unnamed: 3_level_2,Unnamed: 4_level_2
Debra Henley,Craig Booker,CPU,65000,2
Debra Henley,Craig Booker,Maintenance,5000,1
Debra Henley,Craig Booker,Software,10000,1
Debra Henley,Daniel Hilton,CPU,105000,2
Debra Henley,Daniel Hilton,Software,10000,1
Debra Henley,John Smith,CPU,35000,1
Debra Henley,John Smith,Maintenance,5000,1
Fred Anderson,Cedric Moss,CPU,95000,2
Fred Anderson,Cedric Moss,Maintenance,5000,1
Fred Anderson,Cedric Moss,Software,10000,1


We can also pass margins=True to add an overall row (labelled "All") that applies each aggregation to a value column across all groups.

In [45]:
# Same pivot as the previous cell plus margins=True, which appends an overall
# row computed with each aggregation function.
# "sum" replaces np.sum (same result and label, without the FutureWarning
# recent pandas emits for NumPy callables).
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Rep", "Product"],
    values=["Price"],
    aggfunc=["sum", len],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Price,Price
Manager,Rep,Product,Unnamed: 3_level_2,Unnamed: 4_level_2
Debra Henley,Craig Booker,CPU,65000.0,2.0
Debra Henley,Craig Booker,Maintenance,5000.0,1.0
Debra Henley,Craig Booker,Software,10000.0,1.0
Debra Henley,Daniel Hilton,CPU,105000.0,2.0
Debra Henley,Daniel Hilton,Software,10000.0,1.0
Debra Henley,John Smith,CPU,35000.0,1.0
Debra Henley,John Smith,Maintenance,5000.0,1.0
Fred Anderson,Cedric Moss,CPU,95000.0,2.0
Fred Anderson,Cedric Moss,Maintenance,5000.0,1.0
Fred Anderson,Cedric Moss,Software,10000.0,1.0


We can go up a level and analyse the value of the funnel for each manager, broken down by status.

In [48]:
# Funnel value per manager: total Price and row count for each
# (Manager, Status) pair.
# "sum" replaces np.sum (same result and label, without the FutureWarning
# recent pandas emits for NumPy callables).
pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Status"],
    values=["Price"],
    aggfunc=["sum", len],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price
Manager,Status,Unnamed: 2_level_2,Unnamed: 3_level_2
Debra Henley,declined,70000,2
Debra Henley,pending,50000,3
Debra Henley,presented,50000,3
Debra Henley,won,65000,1
Fred Anderson,declined,65000,1
Fred Anderson,pending,5000,1
Fred Anderson,presented,45000,3
Fred Anderson,won,172000,3


Another trick is to pass a dict to aggfunc so we can specify a different aggregation strategy for each value column. Each value of the dict can also be a list of functions, as before.

In [52]:
# Per-column aggregation via a dict: Quantity is counted with len, while Price
# gets both its total and its row count. Rows are (Manager, Status), columns
# are split by Product, and empty combinations are filled with 0.
#
# The pivot is built from sales_funnel_df, the frame this notebook loads; the
# name `df` is never defined here and only worked through leftover kernel state.
table = pd.pivot_table(
    sales_funnel_df,
    index=["Manager", "Status"],
    columns=["Product"],
    values=["Quantity", "Price"],
    aggfunc={"Quantity": len, "Price": ["sum", len]},
    fill_value=0,
)
# Function-call form of print: valid on Python 3 and still prints the same
# text on Python 2.
print(table)

                        Price                                       \
                          len                                  sum   
Product                   CPU Maintenance Monitor Software     CPU   
Manager       Status                                                 
Debra Henley  declined      2           0       0        0   70000   
              pending       1           2       0        0   40000   
              presented     1           0       0        2   30000   
              won           1           0       0        0   65000   
Fred Anderson declined      1           0       0        0   65000   
              pending       0           1       0        0       0   
              presented     1           0       1        1   30000   
              won           2           1       0        0  165000   

                                                     Quantity              \
                                                          len               
Produ

Finally, once we have a pivot we are happy with, we can also query it.

In [53]:
# Filter the pivot by its Manager index level: keep only Debra Henley's rows.
table.query('Manager == ["Debra Henley"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Price,Price,Price,Price,Price,Price,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,len,len,len,len,sum,sum,sum,sum,len,len,len,len
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Debra Henley,declined,2,0,0,0,70000,0,0,0,2,0,0,0
Debra Henley,pending,1,2,0,0,40000,10000,0,0,1,2,0,0
Debra Henley,presented,1,0,0,2,30000,0,0,20000,1,0,0,2
Debra Henley,won,1,0,0,0,65000,0,0,0,1,0,0,0


In [54]:
# Filter the pivot by its Status index level: drop the "won" rows for every manager.
table.query("Status != ['won']")

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Price,Price,Price,Price,Price,Price,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,len,len,len,len,sum,sum,sum,sum,len,len,len,len
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Debra Henley,declined,2,0,0,0,70000,0,0,0,2,0,0,0
Debra Henley,pending,1,2,0,0,40000,10000,0,0,1,2,0,0
Debra Henley,presented,1,0,0,2,30000,0,0,20000,1,0,0,2
Fred Anderson,declined,1,0,0,0,65000,0,0,0,1,0,0,0
Fred Anderson,pending,0,1,0,0,0,5000,0,0,0,1,0,0
Fred Anderson,presented,1,0,1,1,30000,0,5000,10000,1,0,1,1
