### Descriptive statistics with pandas

In [None]:
# import necessary libraries
import pandas as pd
import scipy.stats
import numpy as np
import math

#### Univariate

In [None]:
# create data to work with - two pd.Series sharing the same observations:
#   z  - complete data
#   zn - the same data with one missing value (nan) before the last entry
observations = [3.5, 1, 4.7, 12, 33.7]
z = pd.Series(observations)
zn = pd.Series(observations[:4] + [math.nan] + observations[4:])

In [None]:
# display the complete series z
z

In [None]:
# display zn, the series that contains one missing value (nan)
zn

#### Measures of central tendency

##### Median

Data without missing values

In [None]:
# median of the complete series z (no nan present)
median_value_z = z.median()
median_value_z

Data with missing values (nan)

In [None]:
# .median() treats nan according to the parameter skipna.
# By default it is set to True, so the nan in zn is ignored and the
# median is computed from the remaining values.
median_value_zn = zn.median()
median_value_zn

In [None]:
# With skipna=False the nan is no longer ignored: because zn contains
# a nan, the median is itself nan.
median_value_zn = zn.median(skipna=False)
median_value_zn

##### Mode

In [None]:
# shared observations: 4, 5 and 7 each occur twice
base_values = [1, 4, 4, 5, 5, 8, 2, 7, 7]
# zz has one extra 7, so 7 is the single most frequent value
zz = pd.Series(base_values + [7])
# zzz keeps the three-way tie between 4, 5 and 7
zzz = pd.Series(base_values)

In [None]:
# unimodal data set: 7 occurs most often in zz, so the Series returned
# by .mode() holds that single value
mode_zz = zz.mode()
mode_zz

In [None]:
# multimodal data set: 4, 5 and 7 are tied for the highest frequency in zzz,
# and .mode() returns all of the tied values
mode_zzz = zzz.mode()
mode_zzz

In [None]:
# .mode() drops nan by default (dropna=True).
# Here nan occurs twice - as often as 4, 5 and 7 - yet it is not reported as a mode.
xyz = pd.Series([1, 4, 4, 5, 5, 8, 2, 7, 7] + [math.nan] * 2)
mode_xyz = xyz.mode()
mode_xyz

In [None]:
# Even when nan is the most frequent entry (three occurrences here),
# the default dropna=True leaves it out of the result.
xyz = pd.Series([1, 4, 4, 5, 5, 8, 2, 7, 7] + [math.nan] * 3)
mode_xyz = xyz.mode()
mode_xyz

In [None]:
# With dropna=False nan is counted like any other value; it occurs three
# times here, more often than any number, so it becomes the mode.
xyz = pd.Series([1, 4, 4, 5, 5, 8, 2, 7, 7] + [math.nan] * 3)
mode_xyz = xyz.mode(dropna=False)
mode_xyz

##### Mean

Data without missing values

In [None]:
# mean computed by hand: sum of the values divided by the number of values
mean_value = sum(z) / len(z)
mean_value

In [None]:
# the same quantity using the built-in Series method
mean_value_z = z.mean()
mean_value_z

Data with missing values (nan)

In [None]:
# nan are ignored unless skipna is set to False (the default is skipna=True),
# so the mean is taken over the non-missing values of zn only
mean_value_zn = zn.mean()
mean_value_zn

In [None]:
# with skipna=False the nan in zn is not skipped, so the mean is nan
mean_value_zn = zn.mean(skipna=False)
mean_value_zn

##### Weighted mean

In [None]:
# define positive weights, one for each data point in z
# (NOTE: the weights do not sum to 1; we will normalise by their sum when
# calculating the weighted mean)
weights = np.array([0.2, 0.4, 0.6, 0.3, 0.3])

In [None]:
# calculate the weighted mean by hand: sum(w_i * z_i) / sum(w_i)
# zip() pairs every weight with its data point by position, so the result
# does not depend on the index labels of z (z[i] would look values up by label).
w_mean_z = sum(w * value for w, value in zip(weights, z)) / sum(weights)
w_mean_z

In [None]:
# np.average() accepts numpy arrays as well as pandas Series and
# normalises by the sum of the weights itself
w_mean_z = np.average(z, weights=weights)
w_mean_z

#### Measures of spread (variability)

##### Range

In [None]:
# range = largest value minus smallest value
z = pd.Series([3.5, 1, 4.7, 12, 33.7])
smallest, largest = z.min(), z.max()
range_z = largest - smallest
range_z

In [None]:
# nan values are ignored: .max() and .min() skip nan by default (skipna=True),
# so the range is computed from the non-missing values of zn
range_zn = zn.max() - zn.min()
range_zn

In [None]:
# np.ptp ("peak to peak") returns max - min in a single call
# NOTE(review): unlike Series.max()/.min(), np.ptp may not skip nan, so
# range_zn can come out as nan here - verify against the installed
# numpy/pandas versions
range_z = np.ptp(z)
range_zn = np.ptp(zn)
range_z, range_zn

##### Interquartile range (IQR)

In [None]:
# new data set for the remaining measures of spread
z = pd.Series([-3.5, -1.2, 1.7, 3.8, 9.9, 13.6, 20.5, 29.7, 45.6])
# the three quartile levels: 25%, 50% (the median) and 75%
quartile_levels = [0.25, 0.5, 0.75]
quant_z = z.quantile(quartile_levels)
quant_z  # a Series of the quartiles, indexed by the quantile level

In [None]:
# IQR = third quartile (75%) minus first quartile (25%);
# quant_z is indexed by the quantile levels requested above
IQR_z = quant_z[0.75] - quant_z[0.25]
IQR_z

##### Variance

In [None]:
# population variance: ddof=0 divides the sum of squared deviations by N
var_pop_z = z.var(ddof=0)
var_pop_z

In [None]:
# sample variance: ddof=1 divides by N - 1 (this is also the pandas default)
var_sample_z = z.var(ddof=1)
var_sample_z

##### Standard deviation

In [None]:
# for a population: ddof=0, i.e. the square root of the population variance
std_pop_z = z.std(ddof=0)
std_pop_z

In [None]:
# for a sample: ddof=1, i.e. the square root of the sample variance
# (this is also the pandas default)
std_sample_z = z.std(ddof=1)
std_sample_z

#### Bivariate

In [None]:
# x: the 21 integers from -10 to 10; y: one paired observation per x value.
# pd.Series consumes the range directly, so no intermediate list is needed.
x = pd.Series(range(-10, 11))
y = pd.Series([0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14])

#### Covariance

In [None]:
# covariance between x and y; Series.cov() uses ddof=1 by default,
# i.e. it returns the sample covariance
cov_xy = x.cov(y)
cov_xy

#### Correlation

In [None]:
# correlation between x and y; Series.corr() computes the Pearson
# correlation coefficient by default (method="pearson")
corr_xy = x.corr(y)
corr_xy