# Exercise 4 - Mathematical and Statistical Programming with Numpy

In [1]:
# import necessary libraries
import numpy as np

In [2]:
# sample data: the integers 0 through 9 as a 1-D array
x = np.arange(0, 10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Display the first 5 elements.

In [3]:
# the stop index of a slice is exclusive, so :5 selects indices 0 to 4
x[:5]

array([0, 1, 2, 3, 4])

Display every other element.

In [4]:
# start and stop omitted, step 2: take every second element beginning at index 0
x[::2]

array([0, 2, 4, 6, 8])

Display the elements from index 5 back to the start of the array, i.e. in reverse order.

In [5]:
# negative step walks backwards: start at index 5 and continue down to index 0
x[5::-1]

array([5, 4, 3, 2, 1, 0])

In [6]:
# two-dimensional sample array: 3 rows by 4 columns
y = np.array([
    [12, 5, 2, 4],
    [7, 6, 8, 8],
    [1, 6, 7, 7],
])

Display the first 2 rows and the first 3 columns.

In [7]:
# first index slices rows (0-1), second index slices columns (0-2)
y[:2, :3] 

array([[12,  5,  2],
       [ 7,  6,  8]])

Display the first column of y.

In [8]:
# ':' keeps every row; the integer index 0 picks the first column and drops that axis
y[:,0]

array([12,  7,  1])

Display the first row of y.

In [9]:
# integer index 0 picks the first row; ':' keeps all of its columns
y[0,:]

array([12,  5,  2,  4])

Read the contents of file cdc_1.csv, containing heights, weights and ages, into array data.

In [10]:
# comma-separated file; skip_header=1 skips the first line
# (presumably the row of column names - confirm against the file)
data = np.genfromtxt('data/cdc_1.csv', delimiter=',', skip_header=1)

In [11]:
# show the loaded array; NumPy abbreviates long arrays with '...'
data

array([[ 70., 175.,  77.],
       [ 71., 194.,  31.],
       [ 67., 170.,  45.],
       ...,
       [ 69., 224.,  73.],
       [ 73., 200.,  35.],
       [ 69., 170.,  83.]])

Calculate the minimum, maximum and mean height, weight and age.

In [12]:
# reduce down the rows (axis 0) to get one minimum per column
np.min(data, axis=0)

array([49., 78., 18.])

In [13]:
# reduce down the rows (axis 0) to get one maximum per column
np.max(data, axis=0)

array([ 93., 500.,  94.])

In [14]:
# reduce down the rows (axis 0) to get one mean per column
np.mean(data, axis=0)

array([ 70.25164594, 189.32270875,  44.27306929])

### Descriptive statistics with numpy

In [15]:
# import necessary libraries
import numpy as np
import scipy.stats

Read the contents of file cdc.csv, containing heights, weights and ages (with some missing values), into array data.

In [16]:
# re-binds `data` to a second data set, replacing the array loaded earlier
# NOTE(review): missing entries appear to be read as nan (the weights column contains 3 below)
data = np.genfromtxt('data/cdc.csv', delimiter=',', skip_header=1)

Separate the heights (column 0) and the weights (column 1) into their own arrays.

In [17]:
# all rows of column 0, as a 1-D array
heights = data[:,0]

In [18]:
# all rows of column 1, as a 1-D array
weights = data[:,1]

### Univariate

#### Measures of central tendency

##### Median

Calculate the median for the heights and the weights and assign the values to variables.

In [19]:
# middle value of the sorted heights
median_heights = np.median(heights)
median_heights

70.0

In [20]:
# np.median does not skip nan: the result is nan because weights contains missing values
median_weights = np.median(weights)
median_weights

nan

What has happened? Check if the arrays contain missing values.

In [21]:
# element-wise test giving a boolean mask, True where the value is nan
# (the abbreviated display cannot show every element, so the mask is summed in the next cell)
np.isnan(heights)

array([False, False, False, ..., False, False, False])

In [22]:
# True counts as 1, so summing the mask gives the number of nan entries
np.isnan(heights).sum()

0

In [23]:
# boolean mask, True where a weight is nan; the abbreviated display may hide the True entries
np.isnan(weights)

array([False, False, False, ..., False, False, False])

In [24]:
# True counts as 1, so summing the mask gives the number of nan entries
np.isnan(weights).sum()

3

The array weights contains 3 nan values. Find their positions.

In [25]:
# argwhere returns one row of indices per True entry, hence a column of positions for a 1-D mask
np.argwhere(np.isnan(weights))

array([[ 3],
       [11],
       [17]], dtype=int64)

Now calculate the median for the weights ignoring the nan values.

In [26]:
# nan-aware variant: computes the median over the non-nan weights only
# (overwrites the nan result stored in median_weights above)
median_weights = np.nanmedian(weights)
median_weights

185.0

##### Mode

In [27]:
# xx has a single mode (7 occurs three times); xxx drops the last 7,
# leaving 4, 5 and 7 tied with two occurrences each
xx = [1, 4, 4, 5, 5, 8, 2, 7, 7, 7]
xxx = xx[:-1]

# array versions of both samples
yy, yyy = np.array(xx), np.array(xxx)

In [28]:
# unimodal data set: 7 is the only most-frequent value in yy
# the result is a ModeResult holding the mode and its number of occurrences
mode_yy = scipy.stats.mode(yy)
mode_yy

ModeResult(mode=7, count=3)

In [29]:
# the two fields of the result can be read individually by name
mode_yy.mode, mode_yy.count

(7, 3)

In [30]:
# multimodal data set - 4, 5 and 7 each occur twice in yyy, but
# only the smallest mode value is returned.
mode_yyy = scipy.stats.mode(yyy)
mode_yyy

ModeResult(mode=4, count=2)

In [31]:
# When there are nan: here nan occurs twice, the same count as 4, 5 and 7,
# and the reported mode is still the smallest of the tied values, 4.0
# (the nan entries make the array float, hence 4.0 rather than 4).
xyz = np.array([1, 4, 4, 5, 5, 8, 2, 7, 7, np.nan, np.nan])
mode_xyz = scipy.stats.mode(xyz)
mode_xyz

ModeResult(mode=4.0, count=2)

In [32]:
# When nan is the most frequent entry: with three nan values, nan itself is reported as the mode.
# NOTE(review): scipy.stats.mode accepts a nan_policy argument; 'omit' should skip
# nan values - confirm against the installed SciPy version.
xyz = np.array([1, 4, 4, 5, 5, 8, 2, 7, 7, np.nan, np.nan, np.nan])
mode_xyz = scipy.stats.mode(xyz)
mode_xyz

ModeResult(mode=nan, count=3)

##### Mean

Calculate the mean values for heights and weights.

In [33]:
# arithmetic mean of all heights (no nan present, so the plain mean is safe)
mean_heights = heights.mean()
mean_heights

70.25164594001463

In [34]:
# weights contains nan, so use the nan-aware mean, which averages the non-nan values only
mean_weights = np.nanmean(weights)
mean_weights

189.3251097637466

#### Measures of spread (variability)

##### Range

Calculate the range of the heights and the weights. Range is the difference between the maximum and the minimum value.

In [35]:
# range = largest height minus smallest height
range_heights = heights.max() - heights.min()
range_heights

44.0

In [36]:
# np.max / np.min do not skip nan, so the range of weights comes out as nan
# (kept to demonstrate the problem; the next cell uses the nan-aware versions)
range_weights = np.max(weights) - np.min(weights)
range_weights

nan

In [37]:
# nan-aware max and min ignore the missing weights
range_weights = np.nanmax(weights) - np.nanmin(weights)
range_weights

422.0

##### Interquartile Range (IQR)

Find the quartiles for the heights and the weights, and the Interquartile Range (IQR).

In [38]:
# quartiles: Q1 (0.25), median (0.5) and Q3 (0.75), returned in that order
quant_heights = np.quantile(heights, [0.25, 0.5, 0.75])
quant_heights

array([68., 70., 72.])

In [39]:
# IQR = Q3 - Q1; quant_heights is ordered [Q1, median, Q3]
IQR_heights = quant_heights[-1] - quant_heights[0]
IQR_heights

4.0

In [40]:
# np.quantile does not skip nan: every requested quantile of weights comes out as nan
# (kept to demonstrate the problem; the next cell uses np.nanquantile)
quant_weights = np.quantile(weights, [0.25, 0.5, 0.75])
quant_weights

array([nan, nan, nan])

In [41]:
# nan-aware quartiles: Q1, median and Q3 of the non-nan weights
quant_weights = np.nanquantile(weights, [0.25, 0.5, 0.75])
quant_weights

array([165., 185., 210.])

In [42]:
# IQR = Q3 - Q1; quant_weights is ordered [Q1, median, Q3]
IQR_weights = quant_weights[-1] - quant_weights[0]
IQR_weights

45.0

##### Variance

Calculate population and sample variance for heights and weights.

In [43]:
# population variance of heights: ddof=0 divides the squared deviations by N
var_pop_heights = heights.var(ddof=0)
var_pop_heights

9.054450473032162

In [44]:
# sample variance of heights: ddof=1 divides the squared deviations by N - 1
var_sample_heights = heights.var(ddof=1)
var_sample_heights

9.055396799377588

In [45]:
# population variance - weights
# np.var does not skip nan, so the result is nan (the next cell uses np.nanvar)
var_pop_weights = np.var(weights,ddof=0)
var_pop_weights

nan

In [46]:
# population variance - weights, ignoring the nan values (ddof=0)
var_pop_weights = np.nanvar(weights,ddof=0)
var_pop_weights

1336.1876341871994

In [47]:
# sample variance - weights, ignoring the nan values (ddof=1)
var_sample_weights = np.nanvar(weights,ddof=1)
var_sample_weights

1336.3273297056714

##### Standard deviation

Calculate population and sample standard deviation for heights and weights.

In [48]:
# population standard deviation of heights: square root of the ddof=0 variance
std_pop_heights = heights.std(ddof=0)
std_pop_heights

3.009061394028404

In [49]:
# sample standard deviation of heights: square root of the ddof=1 variance
std_sample_heights = heights.std(ddof=1)
std_sample_heights

3.009218636021249

In [50]:
# population standard deviation - weights
# np.std does not skip nan, so the result is nan (the next cell uses np.nanstd)
std_pop_weights = np.std(weights,ddof=0)
std_pop_weights

nan

In [51]:
# population standard deviation - weights, ignoring the nan values (ddof=0)
std_pop_weights = np.nanstd(weights,ddof=0)
std_pop_weights

36.55390039636262

In [52]:
# sample standard deviation - weights, ignoring the nan values (ddof=1)
std_sample_weights = np.nanstd(weights,ddof=1)
std_sample_weights

36.555811161916125

### Bivariate

#### Covariance

Find the covariance between heights and weights and comment on its direction and strength. 

In [53]:
# 2x2 covariance matrix: variances on the diagonal, covariance off the diagonal
# np.cov does not skip nan, so every entry that involves weights is nan
cov_matrix = np.cov(heights,weights)
cov_matrix

array([[9.0553968,       nan],
       [      nan,       nan]])

In [54]:
# Keep only the observations where BOTH measurements are present, so the two
# arrays stay aligned element by element for np.cov / np.corrcoef.
# Only weights has nan in this data set, but masking on both columns keeps
# the result valid if heights ever contains missing values as well.
valid_pairs = ~(np.isnan(heights) | np.isnan(weights))
weights_1 = weights[valid_pairs]
heights_1 = heights[valid_pairs]

In [55]:
# covariance on the nan-free pairs; the off-diagonal entry is cov(height, weight)
# the diagonal holds the sample (N - 1) variances - the weights entry equals var_sample_weights
cov_matrix = np.cov(heights_1,weights_1)
cov_matrix

array([[   9.05811323,   47.78471363],
       [  47.78471363, 1336.32732971]])

Positive covariance - height and weight tend to increase together. The covariance depends on the units of the variables, so we cannot judge the strength of the relationship from it. For that we have to calculate the correlation.

#### Correlation

Find the correlation between heights and weights and comment on its direction and strength. 

In [56]:
# np.corrcoef does not skip nan either: every entry that involves weights is nan
corr_matrix = np.corrcoef(heights,weights)
corr_matrix

array([[ 1., nan],
       [nan, nan]])

In [57]:
# correlation on the nan-free pairs; the off-diagonal entry is the
# height-weight correlation coefficient, and the diagonal is always 1
corr_matrix = np.corrcoef(heights_1,weights_1)
corr_matrix

array([[1.        , 0.43432386],
       [0.43432386, 1.        ]])

Positive correlation (r ≈ 0.43) - height and weight tend to increase together. The correlation is weak (|r| < 0.5).