# DAML 05 - Stats

Michal Grochmal <michal.grochmal@city.ac.uk>

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

#### Mean

$$\bar{x} = \frac{1}{N} \sum_{i=1}^{N} x_i$$

#### Variance

$$\sigma^2 = \frac{1}{N} \sum_{i=1}^{N} (x_i - \bar{x})^2$$

#### Standard Deviation

$$\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (x_i - \bar{x})^2}$$

#### Covariance

$$cov(X, Y) = \sigma_{xy} = \frac{1}{N} \sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})$$

#### Correlation

$$corr(X, Y) = r = \frac{cov(X, Y)}{\sigma_x \sigma_y} = \frac{\sigma_{xy}}{\sigma_x \sigma_y}$$

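Note: $1/N$ often becomes $1/(N-1)$ in bias-corrected calculations.
Bias correction is needed when operating over a sample instead of operating over
the entire population.  All of the `NumPy` functions below (except the correlation
functions, in which the normalisation factor cancels out) accept a `ddof=`
(delta degrees of freedom) argument: the divisor used is $N - \mathrm{ddof}$, so
`ddof=1` performs a sample based calculation.  Beware that `np.cov` uses the
sample form ($N-1$) by default, whereas `std` and `var` use the population form ($N$).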

In [2]:
# Fixed seed so that a fresh "Restart Kernel -> Run All" regenerates exactly the
# same noisy series (the unseeded global RNG gave different numbers on every run).
SEED = 42
rng = np.random.RandomState(SEED)

# `arr` is the clean ramp 0..29.  `acv` and `acr` are that same ramp plus
# independent uniform noise: rand() draws from [0, 1), so after subtracting 1
# every point is shifted by an amount in [-1, 0).
arr = np.arange(30)
acv = np.arange(30) + rng.rand(30) - 1
acr = np.arange(30) + rng.rand(30) - 1
arr, acv, acr

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 array([ -0.18833323,   0.75605901,   1.03884605,   2.85833667,
          3.01055911,   4.87843583,   5.31359384,   6.58122585,
          7.76807574,   8.3057505 ,   9.8443631 ,  10.38004517,
         11.36732447,  12.07538497,  13.13821922,  14.99367959,
         15.89848577,  16.73987686,  17.89608291,  18.82537819,
         19.05326147,  20.5604456 ,  21.28803787,  22.32258604,
         23.51468229,  24.77536548,  25.12516349,  26.08328785,
         27.62128403,  28.58855775]),
 array([ -0.39952892,   0.37582533,   1.56367317,   2.22137946,
          3.51960286,   4.17735475,   5.31802792,   6.71633429,
          7.84282861,   8.07969902,   9.22986308,  10.89834795,
         11.38614135,  12.97500493,  13.5845624 ,  14.53869749,
         15.79740982,  16.75491254,  17.94308249,  18.85272326,
         19.89039093,  20.36970084,  21.17792081,  22.34

In [3]:
# Arithmetic mean: NumPy's built-in first, then the defining formula by hand.
print(np.mean(arr))
print(np.sum(arr) / arr.size)  # sum of the elements divided by their count

14.5
14.5


In [4]:
# Standard deviation: population form first (divide by N), then the
# bias-corrected sample form (divide by N - 1, selected with ddof=1).
print(np.std(arr))
print(arr.std(ddof=1))

8.6554414484
8.80340843083


In [5]:
# Variance: population form first (divide by N), then the bias-corrected
# sample form (divide by N - 1, selected with ddof=1).
print(np.var(arr))
print(np.var(arr, ddof=1))

74.9166666667
77.5


In [6]:
# Covariance matrix of the clean ramp and its noisy copy, printed twice:
# with ddof=0 (divide by N) and then with ddof=1 (divide by N - 1).
# The diagonal holds each series' variance, the off-diagonal their covariance.
series = [arr, acv]
for delta in (0, 1):
    print(np.cov(series, ddof=delta))

[[ 74.91666667  74.61993322]
 [ 74.61993322  74.42015669]]
[[ 77.5         77.19303437]
 [ 77.19303437  76.98636899]]


In [7]:
# Full correlation matrix of the three series in one call ...
print(np.corrcoef([arr, acv, acr]))

# ... followed by the same coefficients pair by pair; scipy's pearsonr also
# reports a p-value alongside each correlation coefficient.
for left, right in ((arr, acv), (acv, acr), (arr, acr)):
    print(stats.pearsonr(left, right))

[[ 1.          0.99935627  0.99952096]
 [ 0.99935627  1.          0.99894446]
 [ 0.99952096  0.99894446  1.        ]]
(0.99935627364748081, 5.1171239494027857e-42)
(0.99894446186467312, 5.1851041196312409e-39)
(0.99952095688598197, 8.1825058817419931e-44)
