In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
# Base sample, plus a copy with a NaN spliced in at index 3 so later cells
# can show how each library treats missing values.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]
print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [5]:
# Mirror both samples as NumPy arrays (y, y_with_nan) and pandas Series
# (z, z_with_nan), then show all four.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)
print(y, z, y_with_nan, z_with_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


In [6]:
# Arithmetic mean in pure Python: total of the values divided by their count.
mean_ = sum(x) / len(x)
mean_

8.7

In [7]:
# Same arithmetic mean, this time from the standard-library statistics module.
mean_ = statistics.mean(x)
print(mean_)

8.7


In [8]:
# With a NaN in the data the result is NaN (see the recorded output below).
mean_ = statistics.mean(x_with_nan)
print(mean_)

nan


In [9]:
# NumPy function form of the mean, applied to the ndarray.
mean_ = np.mean(y)
mean_

8.7

In [10]:
# Equivalent ndarray method form of np.mean(y).
mean_ = y.mean()
mean_

8.7

In [11]:
# Both NumPy forms propagate NaN instead of skipping it (both print nan).
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [12]:
# np.nanmean ignores NaN entries and averages the remaining values.
np.nanmean(y_with_nan)

8.7

In [13]:
# pandas Series method form of the mean.
mean_ = z.mean()
mean_

8.7

In [14]:
# Series.mean() skips NaN by default (skipna=True), unlike np.mean above.
z_with_nan.mean()

8.7

## Weighted Mean

In [15]:
# Weighted mean: sum(w_i * x_i) / sum(w_i), written two equivalent ways.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Variant 1: walk the weights by position and look up the matching value.
wmean = sum(weight * x[pos] for pos, weight in enumerate(w)) / sum(w)
print(wmean)

# Variant 2: pair each value with its weight directly.
wmean = sum(value * weight for value, weight in zip(x, w)) / sum(w)
print(wmean)

6.95
6.95


In [16]:
# Rebind the data as NumPy/pandas objects; note that `w` changes from a
# list to an ndarray here.
y = np.array(x)
z = pd.Series(x)
w = np.array(w)

# np.average accepts the weights directly, for the ndarray ...
print(np.average(y, weights=w))

# ... and for the Series; wmean keeps this last result.
wmean = np.average(z, weights=w)
print(wmean)

6.95
6.95


In [17]:
# Manual weighted mean with elementwise array arithmetic; needs `w` to be the
# ndarray bound in the previous cell (a plain list has no .sum()).
(w * y).sum() / w.sum()

6.95

## Harmonic Mean

In [18]:
# Harmonic mean in pure Python: n divided by the sum of reciprocals.
# A zero in x would raise ZeroDivisionError at 1/item.
hmean = len(x) / sum(1/item for item in x)
hmean

2.7613412228796843

In [19]:
# Standard-library equivalent of the manual harmonic-mean formula.
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [20]:
# SciPy harmonic mean on the ndarray.
scipy.stats.hmean(y)

2.7613412228796843

In [21]:
# SciPy harmonic mean on the pandas Series; same result as for the ndarray.
scipy.stats.hmean(z)

2.7613412228796843

## Geometric Mean

In [22]:
# Geometric mean in pure Python: the n-th root of the product of all n values.
# math.prod multiplies the items left to right, replacing the hand-written
# accumulator loop (and no longer leaving a stray `item` in the namespace).
gmean = math.prod(x) ** (1 / len(x))
gmean

4.677885674856041

In [23]:
# SciPy geometric mean on the ndarray; differs from the manual value only in
# the last printed digit (see recorded outputs).
scipy.stats.gmean(y)

4.67788567485604

In [24]:
# SciPy geometric mean on the pandas Series.
scipy.stats.gmean(z)

4.67788567485604

## Measures of Variability

In [25]:
# Sample variance in pure Python: squared deviations from the mean,
# summed and divided by n - 1.
n = len(x)
mean_ = sum(x) / n
var_ = sum((value - mean_) ** 2 for value in x) / (n - 1)
var_

123.19999999999999

In [26]:
# ddof=1 makes NumPy divide by n - 1 (sample variance) rather than its
# default of n.
var_ = np.var(y, ddof=1)
var_

123.19999999999999

## Standard Deviation

In [27]:
# Standard deviation is the square root of the variance currently in var_.
std_ = var_**0.5
std_

11.099549540409285

In [28]:
# statistics.stdev is the sample standard deviation (n - 1 denominator).
std_ = statistics.stdev(x)
std_

11.099549540409287

In [29]:
# NumPy function form; ddof=1 again selects the n - 1 denominator.
np.std(y, ddof=1)

11.099549540409285

In [30]:
# Equivalent ndarray method form.
y.std(ddof=1)

11.099549540409285

In [31]:
# pandas method form; ddof=1 is already the pandas default and is spelled out
# here for symmetry with the NumPy calls.
z.std(ddof=1)

11.099549540409285

## Skewness

In [33]:
# Sample skewness in pure Python (adjusted Fisher-Pearson coefficient):
#   n / ((n - 1) * (n - 2)) * sum((x_i - mean)**3) / s**3
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean_ = sum(x) / n

# Sample variance and standard deviation (n - 1 denominator).
var_ = sum((value - mean_) ** 2 for value in x) / (n - 1)
std_ = var_ ** 0.5

third_moment_sum = sum((value - mean_) ** 3 for value in x)
skew_ = third_moment_sum * n / ((n - 1) * (n - 2) * std_ ** 3)
skew_

1.9470432273905929

In [34]:
# Rebuild both arrays from the current lists before measuring skewness.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# bias=False applies the sample-size correction; the recorded result agrees
# with the manual formula to within floating-point rounding.
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [35]:
# With NaN in the input the result is NaN (default nan_policy='propagate').
scipy.stats.skew(y_with_nan, bias = False)

nan

In [36]:
# Fresh Series copies of both samples; Series.skew() with no arguments
# reproduces the bias-corrected value from the cells above.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)
z.skew()

1.9470432273905924

## Percentiles

In [43]:
# Nine-point sample used by the percentile examples that follow.
x = [
    -5.0, -1.1, 0.1, 2.0, 8.0,
    12.8, 21.0, 25.8, 41.0,
]

# n=2 asks for a single cut point splitting the data in two: the median.
statistics.quantiles(x, n=2)

[8.0]

In [44]:
# n=4 returns the three quartile cut points. method='inclusive' treats the
# data as the whole population, so min and max act as the 0th and 100th
# percentiles.
statistics.quantiles(x, n=4, method='inclusive')

[0.1, 8.0, 21.0]

In [46]:
# Rebind y to the nine-point sample, then take the 5th percentile
# (np.percentile expects q on a 0-100 scale).
y = np.array(x)
np.percentile(y, 5)

-3.44

In [47]:
# 95th percentile of the same array.
np.percentile(y, 95)

34.919999999999995

In [49]:
# A sequence of percentiles returns an array: here the three quartiles.
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [50]:
# The median equals the 50th percentile computed in the previous cell.
np.median(y)

8.0

In [51]:
# Insert a NaN before index 2; np.insert returns a new array and leaves y
# unchanged.
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [52]:
# NaN-ignoring counterpart of np.percentile; the quartiles match those of y.
np.nanpercentile(y_with_nan, [25,50,75])

array([ 0.1,  8. , 21. ])

In [56]:
# np.quantile takes a fraction in [0, 1] where np.percentile takes 0-100,
# so 0.05 here corresponds to the 5th percentile.
np.quantile(y, 0.05)

-3.44

In [57]:
# 0.95 quantile, i.e. the 95th percentile.
np.quantile(y, 0.95)

34.919999999999995

In [58]:
# Rebuild both Series from the nine-point arrays.
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)
# Series.quantile also takes fractions in [0, 1].
z.quantile(0.95)

34.919999999999995

In [59]:
# 0.05 quantile of the Series; matches the NumPy result.
z.quantile(0.05)

-3.44

In [60]:
# A list of fractions returns a Series indexed by those fractions.
z.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [62]:
# The NaN entry does not affect the result: same quartiles as for z.
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

## Ranges

In [63]:
# np.ptp ("peak to peak") is the range: maximum minus minimum.
np.ptp(y)

46.0

In [64]:
# Same range with the pandas Series as input.
np.ptp(z)

46.0

In [65]:
# NaN propagates through np.ptp, so the result is nan.
np.ptp(y_with_nan)

nan

In [66]:
# The Series containing NaN gives nan as well (see recorded output).
np.ptp(z_with_nan)

nan

In [68]:
# The same range from an explicit maximum and minimum.
np.amax(y) - np.amin(y)

46.0

In [69]:
# NaN-ignoring max and min recover the range despite the missing value.
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

46.0

In [70]:
# ndarray method forms of max and min.
y.max() - y.min()

46.0

## Summary of Descriptive Statistics

In [71]:
# One-call summary of y. ddof=1 selects the sample variance; bias=False
# corrects skewness and kurtosis for statistical bias.
result = scipy.stats.describe(y, ddof=1, bias=False)
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [72]:
# Number of observations.
result.nobs

9

In [73]:
# Minimum: first element of the (min, max) tuple.
result.minmax[0]

-5.0

In [74]:
# Maximum: second element of the (min, max) tuple.
result.minmax[1]

41.0

In [75]:
# Arithmetic mean.
result.mean

11.622222222222222

In [76]:
# Variance, computed with the ddof=1 passed to describe().
result.variance

228.75194444444446

In [77]:
# Skewness, bias-corrected because describe() was called with bias=False.
result.skewness

0.9249043136685094

In [78]:
# Kurtosis (Fisher definition: zero for a normal distribution), also
# bias-corrected.
result.kurtosis

0.14770623629658886

In [81]:
# pandas summary of the Series. Note that this rebinds `result` from the
# SciPy DescribeResult to a pandas Series.
result = z.describe()
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

## Measures of Correlation Between Pairs of Data

In [85]:
# Paired samples in three containers: plain lists (x, y), NumPy arrays
# (x_, y_) and pandas Series (x__, y__).
x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
x_ = np.array(x)
y_ = np.array(y)
x__ = pd.Series(x_)
y__ = pd.Series(y_)

## Covariance

In [86]:
# Sample covariance in pure Python: products of the mean-centered pairs,
# summed and divided by n - 1.
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n
cov_xy = sum((a - mean_x) * (b - mean_y) for a, b in zip(x, y)) / (n - 1)
cov_xy

19.95

In [87]:
# 2x2 covariance matrix: the variances of x and y on the diagonal and
# cov(x, y) off the diagonal. np.cov uses the n - 1 denominator by default.
cov_matrix = np.cov(x_, y_)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [88]:
# Sample variance of x; matches cov_matrix[0, 0].
x_.var(ddof=1)

38.5

In [89]:
# Sample variance of y; matches cov_matrix[1, 1] up to rounding.
y_.var(ddof=1)

13.914285714285711

In [90]:
# The upper off-diagonal entry is the covariance of x and y.
cov_xy = cov_matrix[0,1]
cov_xy

19.95

In [91]:
# The matrix is symmetric, so the lower off-diagonal entry is the same value.
cov_xy = cov_matrix[1,0]
cov_xy

19.95

In [92]:
# pandas form: covariance of one Series with another.
cov_xy = x__.cov(y__)
cov_xy

19.95

In [93]:
# Swapping the operands gives the same covariance.
cov_xy = y__.cov(x__)
cov_xy

19.95

## Correlation Coefficient