## Setup

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

# Turn off HTML table rendering so DataFrames are shown as plain text
pd.set_option('display.notebook_repr_html', False)
# Echo the installed pandas version for reproducibility
pd.__version__

'0.23.0'

## Data set

In [3]:
# Beer sales DataFrame: one column per brand, five sales figures each
df = pd.DataFrame(
    data={
        'Billy Beer':  [13884, 23008, 17883, 24435, 49938],
        'Lucky Lager': [34565, 83938, 59437, 28843, 48285],
        'Triple Bock': [39987, 35512, 23542, 37729, 36647],
    }
)
df

   Billy Beer  Lucky Lager  Triple Bock
0       13884        34565        39987
1       23008        83938        35512
2       17883        59437        23542
3       24435        28843        37729
4       49938        48285        36647

In [4]:
# Quick insights / descriptive statistics: count, mean, std, min,
# quartiles and max for every column
df.describe()

         Billy Beer   Lucky Lager   Triple Bock
count      5.000000      5.000000      5.000000
mean   25829.600000  51013.600000  34683.400000
std    14115.302841  21934.601587   6443.542294
min    13884.000000  28843.000000  23542.000000
25%    17883.000000  34565.000000  35512.000000
50%    23008.000000  48285.000000  36647.000000
75%    24435.000000  59437.000000  37729.000000
max    49938.000000  83938.000000  39987.000000

## Call functions on a DataFrame

In [5]:
# Compute the mean sales for each brand (one value per column)
df.mean()

Billy Beer     25829.6
Lucky Lager    51013.6
Triple Bock    34683.4
dtype: float64

In [6]:
# Calculate the 75th percentile (third quartile) of each column
df.quantile(q=.75)

Billy Beer     24435.0
Lucky Lager    59437.0
Triple Bock    37729.0
Name: 0.75, dtype: float64

In [7]:
# Calculate the sample standard deviation (pandas defaults to ddof=1,
# i.e. it divides by N - 1)
df.std()

Billy Beer     14115.302841
Lucky Lager    21934.601587
Triple Bock     6443.542294
dtype: float64

In [8]:
# Calculate the population standard deviation (ddof=0 divides by N)
df.std(ddof=0)

Billy Beer     12625.110670
Lucky Lager    19618.904084
Triple Bock     5763.279434
dtype: float64

## Using Apply or Lambda expression

In [9]:
# Apply np.mean to each column; the same as calling .mean() on the DataFrame
df.apply(np.mean)

Billy Beer     25829.6
Lucky Lager    51013.6
Triple Bock    34683.4
dtype: float64

In [10]:
# Same as above. axis=0 is passed explicitly: np.mean forwards axis=None to
# DataFrame.mean, which newer pandas (2.0+) treats as "reduce over both axes"
# and would return a single scalar instead of one mean per column.
np.mean(df, axis=0)

Billy Beer     25829.6
Lucky Lager    51013.6
Triple Bock    34683.4
dtype: float64

In [11]:
# Specify a function to apply to the DataFrame
def zscore(series, ddof=1):
    """Standardize values to z-scores: (value - mean) / standard deviation.

    Parameters
    ----------
    series : pandas.Series
        Values to standardize; must provide ``.mean()`` and ``.std(ddof=...)``.
    ddof : int, default 1
        Delta degrees of freedom forwarded to ``.std()``. The default of 1
        gives the sample standard deviation (the pandas default); pass 0 to
        standardize with the population standard deviation instead.

    Returns
    -------
    pandas.Series
        Each value expressed as its distance from the mean in units of
        standard deviation.
    """
    return (series - series.mean()) / series.std(ddof=ddof)

# Apply the zscore function defined above to each column
df.apply(zscore)

   Billy Beer  Lucky Lager  Triple Bock
0   -0.846287    -0.749893     0.823088
1   -0.199897     1.501026     0.128594
2   -0.562978     0.384023    -1.729080
3   -0.098801    -1.010759     0.472659
4    1.707962    -0.124397     0.304739

In [12]:
# The same values as scipy's stats.zscore when its delta degrees of
# freedom (ddof) is set to 1; scipy's default is ddof=0
stats.zscore(df, ddof=1)

array([[-0.84628719, -0.74989281,  0.82308764],
       [-0.19989653,  1.50102567,  0.12859386],
       [-0.56297765,  0.38402339, -1.72907998],
       [-0.09880057, -1.01075918,  0.47265927],
       [ 1.70796194, -0.12439706,  0.30473921]])

In [13]:
# Interquartile range (IQR) per column: third quartile minus first quartile,
# computed with a lambda expression
df.apply(lambda col: col.quantile(.75) - col.quantile(.25))

Billy Beer      6552.0
Lucky Lager    24872.0
Triple Bock     2217.0
dtype: float64