### Mean and Median

In [1]:
import pandas as pd
import numpy as np

In [3]:
# index_col=0 uses the CSV's first column as the row index rather than as data.
# No parse_dates is requested, so the "date" column is loaded as plain strings.
sales = pd.read_csv("datasets/sales_subset.csv", index_col=0)
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [4]:
sales["weekly_sales"].mean()

23843.950148505668

In [5]:
sales["weekly_sales"].median()

12049.064999999999

### Summarizing Dates

In [7]:
# "date" holds strings here, so max() is a lexicographic comparison; it agrees
# with the latest calendar date only because the values are zero-padded
# YYYY-MM-DD.
sales["date"].max()

'2012-10-26'

In [8]:
# "date" holds strings here, so min() is a lexicographic comparison; it agrees
# with the earliest calendar date only because the values are zero-padded
# YYYY-MM-DD.
sales["date"].min()

'2010-02-05'

### Efficient Summaries

In [9]:
def iqr(column):
    """Return the interquartile range (75th minus 25th percentile) of `column`.

    `column` must provide a `.quantile(q)` method, e.g. a pandas Series.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

sales["temperature_c"].agg(iqr)

16.583333333333336

In [12]:
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr)

temperature_c           16.583333
fuel_price_usd_per_l     0.073176
unemployment             0.565000
dtype: float64

In [14]:
# Pass "median" by name rather than np.median: pandas resolves the string to its
# own NaN-skipping median, and recent releases warn when NumPy callables are
# handed to agg(). The resulting row label is still "median".
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"])

Unnamed: 0,temperature_c,fuel_price_usd_per_l,unemployment
iqr,16.583333,0.073176,0.565
median,16.966667,0.743381,8.099


### Cumulative Statistics

In [18]:
# Running total of weekly_sales in the frame's current row order (the rows are
# not sorted by date). This adds a column to `sales`, so every later cell sees
# the widened frame.
sales["cum_weekly_sales"] = sales["weekly_sales"].cumsum()
sales["cum_weekly_sales"]

0        2.492450e+04
1        4.675240e+04
2        1.040108e+05
3        1.214248e+05
4        1.389829e+05
             ...     
10769    2.568930e+08
10770    2.568934e+08
10771    2.568938e+08
10772    2.568938e+08
10773    2.568947e+08
Name: cum_weekly_sales, Length: 10774, dtype: float64

In [20]:
# Running maximum of weekly_sales in the frame's current row order (the rows are
# not sorted by date). This adds a column to `sales`, so every later cell sees
# the widened frame.
sales["cum_max_sales"] = sales["weekly_sales"].cummax()
sales["cum_max_sales"]

0         24924.50
1         24924.50
2         57258.43
3         57258.43
4         57258.43
           ...    
10769    293966.05
10770    293966.05
10771    293966.05
10772    293966.05
10773    293966.05
Name: cum_max_sales, Length: 10774, dtype: float64

In [22]:
sales[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]]

Unnamed: 0,date,weekly_sales,cum_weekly_sales,cum_max_sales
0,2010-02-05,24924.50,2.492450e+04,24924.50
1,2010-03-05,21827.90,4.675240e+04,24924.50
2,2010-04-02,57258.43,1.040108e+05,57258.43
3,2010-05-07,17413.94,1.214248e+05,57258.43
4,2010-06-04,17558.09,1.389829e+05,57258.43
...,...,...,...,...
10769,2011-12-09,895.00,2.568930e+08,293966.05
10770,2012-02-03,350.00,2.568934e+08,293966.05
10771,2012-06-08,450.00,2.568938e+08,293966.05
10772,2012-07-13,0.06,2.568938e+08,293966.05


### Dropping Duplicates

In [24]:
sales.drop_duplicates(subset=["store", "type"])

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment,cum_weekly_sales,cum_max_sales
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106,24924.5,24924.5
901,2,A,1,2010-02-05,35034.06,False,4.55,0.679451,8.324,18863180.0,140504.41
1798,4,A,1,2010-02-05,38724.42,False,6.533333,0.686319,8.623,42653010.0,178982.89
2699,6,A,1,2010-02-05,25619.0,False,4.683333,0.679451,7.259,66180320.0,178982.89
3593,10,B,1,2010-02-05,40212.84,False,12.411111,0.782478,9.765,85470610.0,178982.89
4495,13,A,1,2010-02-05,46761.9,False,-0.261111,0.704283,8.316,108655600.0,232558.51
5408,14,A,1,2010-02-05,32842.31,False,-2.605556,0.735455,8.992,132073000.0,232558.51
6293,19,A,1,2010-02-05,21500.58,False,-6.133333,0.780365,8.35,158951500.0,293966.05
7199,20,A,1,2010-02-05,46021.21,False,-3.377778,0.735455,8.187,177033400.0,293966.05
8109,27,A,1,2010-02-05,32313.79,False,-2.672222,0.780365,8.237,202848000.0,293966.05


### Counting Categorical Variables

In [30]:
sales["type"].value_counts()

A    9872
B     902
Name: type, dtype: int64

In [31]:
sales["type"].value_counts(normalize=True)

A    0.91628
B    0.08372
Name: type, dtype: float64

In [36]:
sales["department"].value_counts(sort=True)

1     144
55    144
71    144
67    144
60    144
     ... 
50     72
78     56
77     39
39      7
43      2
Name: department, Length: 80, dtype: int64

In [37]:
sales["department"].value_counts(sort=True, normalize=True)

1     0.013366
55    0.013366
71    0.013366
67    0.013366
60    0.013366
        ...   
50    0.006683
78    0.005198
77    0.003620
39    0.000650
43    0.000186
Name: department, Length: 80, dtype: float64

### Grouped Summary Statistics

In [40]:
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
sales_by_type

type
A    2.337163e+08
B    2.317840e+07
Name: weekly_sales, dtype: float64

In [41]:
# Divide by the vectorised Series.sum() -- the same NaN-skipping reduction that
# produced sales_by_type -- instead of Python's built-in sum(), which loops
# element by element and would yield NaN if any weekly_sales value were missing.
sales_by_type / sales["weekly_sales"].sum()

type
A    0.909775
B    0.090225
Name: weekly_sales, dtype: float64

In [42]:
# Name the aggregations as strings: the result columns are then always
# min/max/mean/median (NumPy callables gave version-dependent labels such as
# amin/amax), and pandas does not warn about NumPy functions passed to agg().
sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])

Unnamed: 0_level_0,amin,amax,mean,median
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-1098.0,293966.05,23674.667242,11943.92
B,-798.0,232558.51,25696.67837,13336.08


### Pivot Table
Pivot tables are the standard way of aggregating data in spreadsheets. In pandas, pivot tables are essentially just another way of performing grouped calculations. That is, the `pivot_table()` method is just an alternative to `groupby()`.

In [47]:
sales

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment,cum_weekly_sales,cum_max_sales
0,1,A,1,2010-02-05,24924.50,False,5.727778,0.679451,8.106,2.492450e+04,24924.50
1,1,A,1,2010-03-05,21827.90,False,8.055556,0.693452,8.106,4.675240e+04,24924.50
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808,1.040108e+05,57258.43
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808,1.214248e+05,57258.43
4,1,A,1,2010-06-04,17558.09,False,27.050000,0.714586,7.808,1.389829e+05,57258.43
...,...,...,...,...,...,...,...,...,...,...,...
10769,39,A,99,2011-12-09,895.00,False,9.644444,0.834256,7.716,2.568930e+08,293966.05
10770,39,A,99,2012-02-03,350.00,False,15.938889,0.887619,7.244,2.568934e+08,293966.05
10771,39,A,99,2012-06-08,450.00,False,27.288889,0.911922,6.989,2.568938e+08,293966.05
10772,39,A,99,2012-07-13,0.06,False,25.644444,0.860145,6.623,2.568938e+08,293966.05


In [50]:
# Pivot for mean weekly_sales for each store type
sales.pivot_table(values="weekly_sales", index="type", aggfunc="mean")

Unnamed: 0_level_0,weekly_sales
type,Unnamed: 1_level_1
A,23674.667242
B,25696.67837


In [52]:
# Pivot for mean and median weekly_sales for each store type.
# Aggregations are given by name: passing np.mean/np.median is deprecated in
# recent pandas, and the strings yield the same "mean"/"median" column labels.
sales.pivot_table(values="weekly_sales", index="type", aggfunc=["mean", "median"])

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,weekly_sales,weekly_sales
type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,23674.667242,11943.92
B,25696.67837,13336.08


In [57]:
# Mean weekly_sales for every (store type, is_holiday) combination:
# store types become rows, holiday flags become columns.
sales.pivot_table(
    values="weekly_sales",
    index="type",
    columns="is_holiday",
    aggfunc="mean",  # pivot_table's default, spelled out for readability
)

is_holiday,False,True
type,Unnamed: 1_level_1,Unnamed: 2_level_1
A,23768.583523,590.04525
B,25751.980533,810.705


### Fill in Missing Values and Sum Values with Pivot Tables
The `.pivot_table()` method has several useful arguments, including `fill_value` and `margins`.
- `fill_value` replaces missing values with a real value (known as imputation).


- `margins` is a shortcut for when you pivot by two variables but also want to pivot by each of those variables separately: it adds an `All` row and column containing the summary statistic (the mean by default, not a sum) computed across each row and column of the pivot table.

In [59]:
# Mean weekly_sales by department and type (mean is pivot_table's default
# aggregation). fill_value=0 replaces department/type combinations that have no
# rows, so a 0 in the result means "no data", not "zero average sales".
sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0)

type,A,B
department,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30961.725379,44050.626667
2,67600.158788,112958.526667
3,17160.002955,30580.655000
4,44285.399091,51219.654167
5,34821.011364,63236.875000
...,...,...
95,123933.787121,77082.102500
96,21367.042857,9528.538333
97,28471.266970,5828.873333
98,12875.423182,217.428333


In [61]:
# Mean weekly_sales by department and type; combinations with no rows are
# filled with 0. margins=True adds "All" margins computed with the same
# aggregation (the mean here), so "All" holds overall means rather than sums.
sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True)

type,A,B,All
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,30961.725379,44050.626667,32052.467153
2,67600.158788,112958.526667,71380.022778
3,17160.002955,30580.655000,18278.390625
4,44285.399091,51219.654167,44863.253681
5,34821.011364,63236.875000,37189.000000
...,...,...,...
96,21367.042857,9528.538333,20337.607681
97,28471.266970,5828.873333,26584.400833
98,12875.423182,217.428333,11820.590278
99,379.123659,0.000000,379.123659
