In [14]:
import pandas as pd
import numpy as np

# Assignment 1: Groupby

Return a table containing the top 10 stores by total transactions in the data

Make sure they’re sorted from highest to lowest.

In [15]:
# Load the transactions data. "date" is parsed to datetime at read time so the
# .dt accessor can be used on it in a later cell.
transactions = pd.read_csv(
    "../../Data/transactions.csv",
    parse_dates=["date"],
)

# Preview the first rows
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [16]:
# Total transactions per store, ranked from highest to lowest; keep the top 10

(
    transactions
    .groupby("store_nbr")
    .agg({"transactions": "sum"})  # one row per store with its summed transactions
    .sort_values(by="transactions", ascending=False)  # largest totals first
    .head(10)  # first 10 rows after the descending sort = the 10 highest totals
)

Unnamed: 0_level_0,transactions
store_nbr,Unnamed: 1_level_1
44,7273093
47,6535810
45,6201115
46,5990113
3,5366350
48,5107785
8,4637971
49,4574103
50,4384444
11,3972488


# Assignment 2: Groupby Multiple Columns

Get the total transactions by store and month.

Sort the table from first month to last, then by highest transactions to lowest within each month. 


In [17]:
# Helper code: extract the month date part from the "date" column.
# assign() returns a new frame that is rebound to the same name, so the frame
# loaded earlier is not mutated in place and re-running this cell gives the
# same result.

transactions = transactions.assign(month=transactions["date"].dt.month)

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month
0,2013-01-01,25,770,1
1,2013-01-02,1,2111,1
2,2013-01-02,2,2358,1
3,2013-01-02,3,3487,1
4,2013-01-02,4,1922,1


In [18]:
# Total transactions for every (store, month) pair.
# Ordered by month ascending, then by transactions descending within each month.

(
    transactions
    .groupby(["store_nbr", "month"])
    .agg({"transactions": "sum"})
    .sort_values(by=["month", "transactions"], ascending=[True, False])
)


Unnamed: 0_level_0,Unnamed: 1_level_0,transactions
store_nbr,month,Unnamed: 2_level_1
44,1,628438
47,1,568824
45,1,538370
46,1,522763
3,1,463260
...,...,...
32,12,86167
21,12,84128
42,12,76741
29,12,76627


# Assignment 3: The Agg Method

Calculate the mean of `met_target` for each store (the share of records that met the target), and the sum of bonuses to be paid to each store.

Sort them by highest to lowest bonus payout.

Then, do the same grouped by month instead of store.

In [19]:
# Add per-row target and bonus columns.
# The two thresholds are named once here instead of being repeated inline.
TRANSACTION_TARGET = 2500   # transactions a row must reach to count as meeting the target
BONUS_PER_TARGET_MET = 100  # bonus payable for each row that meets the target

# Each callable receives the frame as built so far, so later columns can reuse
# earlier ones instead of recomputing transactions / target three times.
transactions = transactions.assign(
    # fraction of the target achieved (1.0 means exactly on target)
    target_pct=lambda df: df["transactions"] / TRANSACTION_TARGET,
    # True when the row reached or exceeded the target
    met_target=lambda df: df["target_pct"] >= 1,
    # boolean * int -> BONUS_PER_TARGET_MET where the target was met, else 0
    bonus_payable=lambda df: df["met_target"] * BONUS_PER_TARGET_MET,
    # re-derived here so this cell does not depend on the month helper cell
    month=lambda df: df["date"].dt.month,
    day_of_week=lambda df: df["date"].dt.dayofweek,
)

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week
0,2013-01-01,25,770,1,0.308,False,0,1
1,2013-01-02,1,2111,1,0.8444,False,0,2
2,2013-01-02,2,2358,1,0.9432,False,0,2
3,2013-01-02,3,3487,1,1.3948,True,100,2
4,2013-01-02,4,1922,1,0.7688,False,0,2


In [20]:
# Store-level stats:
#   met_target    -> mean of met_target per store (share of rows that met the target)
#   bonus_payable -> sum of bonus_payable per store
# Sorted by bonus payout, highest first

(
    transactions
    .groupby("store_nbr")
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .sort_values("bonus_payable", ascending=False)
)

Unnamed: 0_level_0,met_target,bonus_payable
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
47,0.999404,167600
44,0.998807,167500
45,0.997615,167300
3,0.99821,167300
46,0.989267,165900
8,0.888425,148900
48,0.690519,115800
49,0.637448,106900
50,0.45319,76000
11,0.296539,49700


In [21]:
# Month-level stats:
#   met_target    -> mean of met_target per month (share of rows that met the target)
#   bonus_payable -> sum of bonus_payable per month
# Sorted by bonus payout, highest first

(
    transactions
    .groupby("month")
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .sort_values("bonus_payable", ascending=False)
)

Unnamed: 0_level_0,met_target,bonus_payable
month,Unnamed: 1_level_1,Unnamed: 2_level_1
12,0.25564,154100
5,0.170792,131800
3,0.169461,130400
4,0.174469,129700
7,0.162486,126300
2,0.17423,121700
6,0.161706,121700
8,0.174189,120800
1,0.163723,119600
11,0.163943,98300
