### Descriptive statistics with NumPy and SciPy

In [None]:
# import necessary libraries
import numpy as np
import scipy.stats

### Univariate

In [None]:
# Sample data held in np.ndarray objects:
#   y  - five observations, no missing values
#   yn - the same observations with one missing value (nan) before the last
y = np.array([3.5, 1.0, 4.7, 12.0, 33.7])
yn = np.array([3.5, 1.0, 4.7, 12.0, np.nan, 33.7])

In [None]:
# display the array without missing values
y

In [None]:
# display the array that contains a nan
yn

#### Measures of central tendency

##### Median

Data without missing values

In [None]:
# median of the complete data
median_value_y = np.median(y)
median_value_y

Data with missing values (nan)

In [None]:
# np.median() does not skip nan: the nan in yn propagates to the result
median_value_yn = np.median(yn)
median_value_yn

In [None]:
# np.nanmedian() ignores the nan values, so the median is taken over the
# remaining entries of yn
median_value_yn = np.nanmedian(yn)
median_value_yn

##### Mode

In [None]:
# xx has a single most frequent value (7 occurs three times)
xx = [1, 4, 4, 5, 5, 8, 2, 7, 7, 7]
# xxx is xx without the final 7, leaving 4, 5 and 7 tied at two occurrences each
xxx = xx[:-1]

# np.ndarray versions of both lists
yy, yyy = np.array(xx), np.array(xxx)

In [None]:
# unimodal data set - yy has one most frequent value (7 occurs three times)
mode_yy = scipy.stats.mode(yy)
mode_yy

In [None]:
# the modal value and its number of occurrences are available as attributes
mode_yy.mode, mode_yy.count

In [None]:
# multimodal data set (4, 5 and 7 each occur twice in yyy) - only the
# smallest mode value is returned.
mode_yyy = scipy.stats.mode(yyy)
mode_yyy

In [None]:
# When there are nan: here two nan values, while 4, 5 and 7 also occur twice each.
# No nan_policy argument is passed, so SciPy's default nan handling applies.
# NOTE(review): the result for nan input differs between SciPy versions -
# confirm against the installed version.
xyz = np.array([1, 4, 4, 5, 5, 8, 2, 7, 7, np.nan, np.nan])
mode_xyz = scipy.stats.mode(xyz)
mode_xyz

In [None]:
# When there are nan: here three nan values, more than any other single value
# (4, 5 and 7 occur twice each).
# No nan_policy argument is passed, so SciPy's default nan handling applies.
# NOTE(review): the result for nan input differs between SciPy versions -
# confirm against the installed version.
xyz = np.array([1, 4, 4, 5, 5, 8, 2, 7, 7, np.nan, np.nan, np.nan])
mode_xyz = scipy.stats.mode(xyz)
mode_xyz

##### Mean

Data without missing values

In [None]:
# mean computed by hand with Python built-ins: total divided by number of values
mean_value = sum(y) / len(y)
mean_value

In [None]:
# mean via the NumPy function
mean_value_np = np.mean(y)
mean_value_np

In [None]:
# mean via the ndarray method
mean_value_np = y.mean()
mean_value_np

Data with missing values (nan)

In [None]:
# np.mean() does not skip nan: the nan in yn propagates to the result
mean_value_np = np.mean(yn)
mean_value_np

In [None]:
# the ndarray method does not skip nan either
mean_value_np = yn.mean()
mean_value_np

In [None]:
# np.nanmean() ignores the nan values
mean_value_np = np.nanmean(yn)
mean_value_np

##### Weighted mean

In [None]:
# define one positive weight per data point
# (NOTE: the weights need not sum to 1 - they are normalised when the
# weighted mean is calculated)
weights = np.array([0.2, 0.4, 0.6, 0.3, 0.3])

In [None]:
# calculate the weighted mean by hand: sum(w_i * y_i) / sum(w_i)
# Each weight is paired with its data point instead of indexing both arrays.
w_mean_y = 0
for weight, value in zip(weights, y):
    w_mean_y += weight * value
w_mean_y /= sum(weights)  # normalise by the total weight
w_mean_y

In [None]:
# np.average() with weights= divides by the sum of the weights itself
w_mean = np.average(y, weights=weights)
w_mean

#### Measures of spread (variability)

##### Range

In [None]:
# range (largest minus smallest value) using Python's built-in max() and min()
y = np.array([3.5, 1, 4.7, 12, 33.7])
range_y = max(y) - min(y)
range_y

In [None]:
# range using the ndarray methods
range_y = y.max() - y.min()
range_y

In [None]:
# when there are nan values, .max() and .min() both return nan,
# so the range is nan as well
range_yn = yn.max() - yn.min()
range_yn

In [None]:
# np.nanmin() and np.nanmax() ignore nan values
range_yn = np.nanmax(yn) - np.nanmin(yn)
range_yn

In [None]:
# np.ptp() ("peak to peak") returns max - min in a single call.
# It does not skip nan, so the result for yn is nan.
range_y = np.ptp(y)
range_yn = np.ptp(yn)
range_y, range_yn

##### Interquartile range (IQR)

In [None]:
# sample used for the quartiles
y = np.array([-3.5, -1.2, 1.7, 3.8, 9.9, 13.6, 20.5, 29.7, 45.6])
# the three quartiles: 25%, 50% (the median) and 75%
quant_y = np.quantile(y, q=[0.25, 0.5, 0.75])
quant_y

In [None]:
# interquartile range: third quartile (75%) minus first quartile (25%)
IQR_y = quant_y[2] - quant_y[0]
IQR_y

##### Variance

In [None]:
# population variance: ddof=0 divides the sum of squared deviations by N
var_pop_y = np.var(y, ddof=0)
var_pop_y

In [None]:
# sample variance: ddof=1 divides the sum of squared deviations by N - 1
var_sample_y = np.var(y, ddof=1)
var_sample_y

##### Standard deviation

In [None]:
# population standard deviation: ddof=0 uses N as the divisor
std_pop_y = np.std(y, ddof=0)
std_pop_y

In [None]:
# sample standard deviation: ddof=1 uses N - 1 as the divisor
std_sample_y = np.std(y, ddof=1)
std_sample_y

### Bivariate

In [None]:
# paired observations: x holds the integers -10..10, y one value per x
x = np.arange(-10, 11)
y = np.array(
    [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
)

#### Covariance

In [None]:
# 2x2 covariance matrix: the variances of x and y are on the diagonal,
# the covariance between x and y is off the diagonal
cov_matrix = np.cov(x, y)
cov_matrix

#### Correlation

In [None]:
# 2x2 correlation matrix: ones on the diagonal, the correlation
# coefficient between x and y off the diagonal
corr_matrix = np.corrcoef(x, y)
corr_matrix