# Hari 5 – Statistika Deskriptif Untuk Data Science

## Menghitung Statistika Deskriptif Menggunakan Python

In [1]:
import math
import statistics
import numpy as np
import scipy.stats

In [2]:
# Sample data, plus a copy with a NaN inserted at index 3 to demonstrate
# how each routine treats missing values.
x = [8.0, 1, 2.5, 4, 28.0]
x_dgn_nan = [*x[:3], math.nan, *x[3:]]
x

[8.0, 1, 2.5, 4, 28.0]

In [3]:
x_dgn_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [4]:
y, y_dgn_nan = np.array(x), np.array(x_dgn_nan)
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [5]:
y_dgn_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

## Menghitung Ukuran Pemusatan Data

### Mean

In [6]:
mean_ = sum(x) / len(x)
mean_

8.7

In [8]:
# fungsi bawaan dari library statistics python
mean_ = statistics.mean(x)
mean_

8.7

In [9]:
mean_ = statistics.fmean(x)
mean_

8.7

In [10]:
mean_ = statistics.mean(x_dgn_nan)
mean_

nan

In [12]:
mean_ = statistics.fmean(x_dgn_nan)
mean_

nan

In [13]:
np.nanmean(y_dgn_nan)

8.7

### Weighted Mean

In [14]:
# Weighted mean in pure Python: sum of weight * value, divided by the
# total weight. This variant pairs values and weights by index.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
weighted_total = sum(w[i] * x_i for i, x_i in enumerate(x))
wmean = weighted_total / sum(w)
wmean

6.95

In [15]:
wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
wmean

6.95

In [16]:
# penghitungan weighted mean ini menggunakan np.average()
y, w = np.array(x), np.array(w)
wmean = np.average(y, weights=w)
wmean

6.95

In [17]:
w = np.array([0.1, 0.2, 0.3, 0.0, 0.2, 0.1])
np.average(y_dgn_nan, weights=w)

nan

### Harmonic Mean

In [18]:
hmean = len(x) / sum(1 / item for item in x)
hmean

2.7613412228796843

In [19]:
# menggunakan statistics.harmonic_mean()
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [20]:
statistics.harmonic_mean(x_dgn_nan)

nan

In [21]:
statistics.harmonic_mean([1, 0, 2])

0

In [22]:
statistics.harmonic_mean([1, 2, -2]) 

StatisticsError: harmonic mean does not support negative values

In [23]:
#  menggunakan scipy.stats.hmean()
scipy.stats.hmean(y)

2.7613412228796843

### Geometric Mean

In [24]:
# Product of all items in x; the n-th root is taken in the next cell.
gmean = math.prod(x)

In [25]:
gmean **= 1 / len(x)
gmean

4.677885674856041

In [None]:
# menggunakan statistics.geometric_mean()

In [26]:
gmean = statistics.geometric_mean(x)
gmean

4.67788567485604

In [27]:
# menggunakan scipy.stats.gmean()
scipy.stats.gmean(y)

4.67788567485604

### Median

In [32]:
# Compute the median in pure Python.
n = len(x)
x_ord = sorted(x)
mid = n // 2  # integer division gives the index directly, no float + round()
if n % 2:
    # Odd number of items: the median is the single middle element.
    median_ = x_ord[mid]
else:
    # Even number of items: the median is the mean of the two middle elements.
    median_ = 0.5 * (x_ord[mid - 1] + x_ord[mid])

median_

4

In [33]:
# nilai median dengan statistics.median()
median_ = statistics.median(x)
median_

4

In [34]:
# nilai median menggunakan np.median()
median_ = np.median(y)
median_

4.0

### Mode

In [35]:
# Mode in pure Python: count each distinct value, then pick the value with
# the highest count. Ties are broken in favour of the larger value.
u = [2, 3, 2, 8, 12]
counts = {value: u.count(value) for value in set(u)}
mode_ = max(counts, key=lambda value: (counts[value], value))
mode_

2

In [None]:
# mencari modus dari suatu dataset menggunakan statistics.mode()
mode_ = statistics.mode(u)
mode_

In [36]:
# mencari modus dari suatu dataset menggunakan statistics.multimode()
mode_ = statistics.multimode(u)
mode_

[2]

In [37]:
# 12 and 15 both occur three times, so this data set is multimodal.
v = [12, 15, 12, 15, 21, 15, 12]
statistics.mode(v)  # Python 3.8+ returns the first mode encountered (12); older versions raised StatisticsError here

12

In [38]:
statistics.multimode(v)

[12, 15]

In [None]:
# nilai modus jika menggunakan scipy.stats.mode()

In [39]:
u, v = np.array(u), np.array(v)
mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=array([2]), count=array([2]))

In [41]:
mode_ = scipy.stats.mode(v)
mode_

ModeResult(mode=array([12]), count=array([3]))

## Menghitung Ukuran Sebaran Data

### Variance

In [42]:
n = len(x)
mean_ = sum(x) / n
var_ = sum((item - mean_)**2 for item in x) / (n - 1)
var_

123.19999999999999

In [43]:
# menggunakan statistics.variance()
var_ = statistics.variance(x)
var_

123.2

In [44]:
# menggunakan fungsi np.var() atau metode .var()
var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [45]:
var_ = y.var(ddof=1)
var_

123.19999999999999

In [46]:
np.nanvar(y_dgn_nan, ddof=1)

123.19999999999999

### Standar Deviasi

In [47]:
std_ = var_ ** 0.5
std_

11.099549540409285

In [48]:
# menggunakan statistics.stdev()
std_ = statistics.stdev(x)
std_

11.099549540409287

In [49]:
np.std(y, ddof=1)

11.099549540409285

In [50]:
y.std(ddof=1)

11.099549540409285

In [51]:
np.std(y_dgn_nan, ddof=1)

nan

In [52]:
y_dgn_nan.std(ddof=1)

nan

In [53]:
np.nanstd(y_dgn_nan, ddof=1)

11.099549540409285

### Skewness

In [54]:
# Sample skewness in pure Python:
#   skew = n * sum((x_i - mean)^3) / ((n - 1) * (n - 2) * s^3)
# where s is the sample standard deviation (n - 1 in the denominator).
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean_ = sum(x) / n
deviations = [item - mean_ for item in x]
var_ = sum(dev**2 for dev in deviations) / (n - 1)
std_ = var_ ** 0.5
cubed_dev_sum = sum(dev**3 for dev in deviations)
skew_ = cubed_dev_sum * n / ((n - 1) * (n - 2) * std_**3)
skew_

1.9470432273905929

In [None]:
# menggunakan scipy.stats.skew()

In [55]:
y, y_dgn_nan = np.array(x), np.array(x_dgn_nan)
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [56]:
scipy.stats.skew(y_dgn_nan, bias=False)

nan

### Percentiles

In [57]:
# menggunakan np.percentile()
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
y = np.array(x)
np.percentile(y, 5)

-3.44

In [58]:
np.percentile(y, 95)

34.919999999999995

In [59]:
y_dgn_nan = np.insert(y, 2, np.nan)
y_dgn_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [60]:
np.nanpercentile(y_dgn_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

### Ranges

In [61]:
np.ptp(y)

46.0

In [62]:
np.ptp(y_dgn_nan)

nan

In [63]:
np.amax(y) - np.amin(y)

46.0

In [64]:
np.nanmax(y_dgn_nan) - np.nanmin(y_dgn_nan)

46.0

In [65]:
y.max() - y.min()

46.0

## Menghitung Korelasi Antara Sepasang Data

### Covariance

In [66]:
# Sample covariance of x and y in pure Python (n - 1 in the denominator).
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n
cross_products = (
    (x_k - mean_x) * (y[k] - mean_y) for k, x_k in enumerate(x)
)
cov_xy = sum(cross_products) / (n - 1)
cov_xy

228.75194444444446

In [77]:
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
y = np.array(x)

In [79]:
cov_matrix = np.cov(x, y)
cov_matrix

array([[228.75194444, 228.75194444],
       [228.75194444, 228.75194444]])

### Correlation coefficient

In [80]:
# Pearson correlation coefficient in pure Python: the sample covariance
# divided by the product of the two sample standard deviations.
squared_dev_x = [(item - mean_x) ** 2 for item in x]
squared_dev_y = [(item - mean_y) ** 2 for item in y]
var_x = sum(squared_dev_x) / (n - 1)
var_y = sum(squared_dev_y) / (n - 1)
std_x = var_x ** 0.5
std_y = var_y ** 0.5
r = cov_xy / (std_x * std_y)
r

1.0

In [82]:
# fungsi pearsonr()
r, p = scipy.stats.pearsonr(x, y)
r

1.0

In [83]:
p

0.0

In [84]:
# fungsi np.corrcoef()
corr_matrix = np.corrcoef(x, y)
corr_matrix

array([[1., 1.],
       [1., 1.]])

In [85]:
r = corr_matrix[0, 1]
r

1.0

In [86]:
r = corr_matrix[1, 0]
r

1.0

In [88]:
scipy.stats.linregress(x, y)

LinregressResult(slope=1.0, intercept=0.0, rvalue=1.0, pvalue=3.292585384803146e-70, stderr=0.0, intercept_stderr=0.0)

In [89]:
# fungsi scipy.stats.linregress()
result = scipy.stats.linregress(x, y)
r = result.rvalue
r

1.0