In [2]:
import numpy as np

In [3]:
import matplotlib
import matplotlib.pyplot as plt

In [4]:
import scipy

In [5]:
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
import pandas as pd

In [7]:
import seaborn as sns

In [8]:
filepath = './data/'

# Descriptive statistics

In [44]:
speed = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

if there's a sequence $x$ the i-th element in the sequence is denoted $x_i$ and in python that's `x[i-1]` (because indexing starts at 0)

eg. the 1st element of the speed array, $\text{speed}_1$, is accessed as `speed[0]`

In [45]:
speed[0]

99

$x^{(i)}$ denotes the i-th element in the ordered (from lowest value to highest) sequence

eg. the 1st element in the ordered speed array is $\text{speed}^{(1)}$

In [46]:
sorted_speed = sorted(speed)
sorted_speed

[77, 78, 85, 86, 86, 86, 87, 87, 88, 94, 99, 103, 111]

In [47]:
sorted_speed[0]

77

$x^{(-i)}$ denotes the last i-th element in the ordered (from lowest value to highest) sequence

eg. the last element in the ordered speed array is $\text{speed}^{(-1)}$

In [48]:
sorted_speed[-1]

111

for the examples with a numeric attribute from a pandas df

In [18]:
people = {
    'Name': ["John", "Anna", "Peter", "Linda", "John"], # categorical
    'Location' : ["New York", "Paris", "Berlin", "London", "London"], # categorical
    'Age': [25, 30, 22, 35, 29], # numerical discrete
    'Gender': ['Female', 'Male', 'Female', 'Male', 'Female'],  # categorical
    'Education_Level': ['Bachelor', 'Master', 'Bachelor', 'PhD', 'Master'],  # categorical
    'Income': [50000, 60000, 45000, 70000, 55000]  # numerical continuous
}
df = pd.DataFrame(people)
df

Unnamed: 0,Name,Location,Age,Gender,Education_Level,Income
0,John,New York,25,Female,Bachelor,50000
1,Anna,Paris,30,Male,Master,60000
2,Peter,Berlin,22,Female,Bachelor,45000
3,Linda,London,35,Male,PhD,70000
4,John,London,29,Female,Master,55000


In [19]:
df.dtypes

Name               object
Location           object
Age                 int64
Gender             object
Education_Level    object
Income              int64
dtype: object

In [20]:
df['Income'] = df['Income'].astype(float)

In [10]:
nutri = pd.read_csv(filepath + 'nutri.csv')
nutri.head()

Unnamed: 0,gender,situation,tea,coffee,height,weight,age,meat,fish,raw_fruit,...,chocol,fat,meat_label,fish_label,raw_fruit_label,cooked_fruit_veg_label,chocol_label,gender_label,situation_label,fat_label
0,2,1,0,0,151.0,58.0,72,4,3,1,...,5,6,4-6 times a week,2-3 times a week,Less than once a week,4-6 times a week,Everyday,Female,Single,Mix of vegetable oils (eg. Isio4)
1,2,1,1,1,162.0,60.0,68,5,2,5,...,1,4,Everyday,Once a week,Everyday,Everyday,Less than once a week,Female,Single,Sunflower oil
2,2,1,0,4,162.0,75.0,78,3,1,5,...,5,4,2-3 times a week,Less than once a week,Everyday,Once a week,Everyday,Female,Single,Sunflower oil
3,2,1,0,0,154.0,45.0,91,0,4,4,...,3,2,Never,4-6 times a week,4-6 times a week,Never,2-3 times a week,Female,Single,Margarine
4,2,1,2,1,154.0,50.0,65,5,3,5,...,3,2,Everyday,2-3 times a week,Everyday,Everyday,2-3 times a week,Female,Single,Margarine


## extreme values

* maximum
* minimum
* outliers

### max/min

for a vector, $\max(x)=\max_i x_i = x^{(-1)}$ and $\min(x)=\min_i x_i = x^{(1)}$

In [57]:
def my_min(x):
    sorted_x = sorted(x)
    return sorted_x[0]

In [58]:
print(my_min(speed))

77


In [59]:
# using numpy arrays
def my_max(x):
    sorted_x = np.sort(x)
    return sorted_x[0]

In [52]:
print(my_min(speed))

77


In [60]:
def my_max(x):
    sorted_x = sorted(x)
    return sorted_x[len(x)-1]

In [61]:
print(my_max(speed))

111


In [62]:
# using numpy arrays
def my_max(x):
    sorted_x = np.sort(x)
    return sorted_x[-1]

In [63]:
print(my_max(speed))

111


using numpy

In [35]:
np.min(speed)

np.int64(77)

In [34]:
np.max(speed)

np.int64(111)

using dataframes

In [25]:
col = 'Age'

In [26]:
df[col].min()

np.int64(22)

In [27]:
df[col].max()

np.int64(35)

### outliers

this is discussed in other notebooks

## central tendency

These sample statistics give information about the `location` of the data

### arithmetic mean

The sample `mean` of the vector with n elements $x = (x_1, x_2, \ldots, x_n)$ is 
$$\bar{x} = \frac{1}{n}\sum_{i=1}^n x_i = \frac{x_1 + x_2 + \ldots + x_n}{n}$$

In [29]:
# To calculate the mean, find the sum of all values, and divide the sum by the number of values
def my_mean(nums):
    res = 0
    
    # the part inside the parenthesis
    for e in nums:
        res += e
    
    # division by n
    n = len(nums)
    res = res/n

    return res

In [30]:
# same thing but using a index in the loop to make it look more similar to the formula
def my_mean2(nums):
    res = 0
    
    # the part inside the parenthesis
    for i in range(len(nums)): # 1 to n
        res += nums[i]
    
    # division by n
    n = len(nums)
    res = res/n

    return res

In [31]:
print(my_mean(speed))
print(my_mean2(speed))

89.76923076923077
89.76923076923077


In [32]:
np.mean(speed)

np.float64(89.76923076923077)

In [69]:
col = 'Income'

In [70]:
df[col].mean()

np.float64(56000.0)

### median

* The sample `median` is the sample's `0.5-quantile`.
* The `p-quantile` of x is a value x such that at least a fraction $p$ of the data is less than or equal to x and at least a fraction $1 − p$ of the data is greater than or equal to x. The p-sample quantile is also called the $100 \times p$ `percentile`
* The 25, 50, and 75 sample percentiles are called the first, second, and third `quartiles` of x

$
\text{med}(x) =
\Large
\left \{
\begin{matrix}
x^{(\frac{n+1}{2})} & \text{if } n \text{ is odd} \\
\frac{x^{(\frac{n}{2})} + x^{(\frac{n}{2} + 1)}}{2}  & \text{if } n \text{ is even}
\end{matrix}
\right .
$

In [35]:
# The median value is the value in the middle, after you have sorted all the values
# If there are two numbers in the middle, divide the sum of those numbers by two.
def my_median(nums): 
    """ 
    Calculates the median of a list of numbers. 

    Args: 
        numbers (list): A list of numeric values. 

    Returns: 
        float: The median of the input list. 
    """ 
    # Sort the list in ascending order 
    sorted_nums = sorted(nums) 

    # Find the length of the list 
    n = len(sorted_nums) 

    # Calculate the median 
    if n % 2 == 0: 
        # If the length is even, take the average of the two middle elements 
        middle1 = sorted_nums[n//2 - 1] 
        middle2 = sorted_nums[n//2] 
        median_value = (middle1 + middle2) / 2.
    else: 
        # If the length is odd, take the middle element 
        median_value = sorted_nums[n//2] 

    return median_value

In [36]:
my_median(speed)

87

In [37]:
np.median(speed)

np.float64(87.0)

In [38]:
df[col].median()

np.float64(50000.0)

In [64]:
nutri['height'].mean()

np.float64(163.96017699115043)

In [74]:
quantiles = nutri['height'].quantile(q = [0.25, 0.5, 0.75])
quantiles

0.25    157.0
0.50    163.0
0.75    170.0
Name: height, dtype: float64

In [77]:
type(quantiles)

pandas.core.series.Series

In [78]:
# Get the first quantile (25th percentile)
Q1 = quantiles[0.25]
Q1

np.float64(157.0)

### mode

<p>
In <a href="/wiki/Statistics" title="Statistics">statistics</a>, the <b>mode</b> is the value that appears most often in a set of data values.<sup id="cite_ref-1" class="reference"><a href="#cite_note-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> If <span class="texhtml mvar" style="font-style:italic;"><b>X</b></span> is a discrete random variable, the mode is the value <span class="texhtml mvar" style="font-style:italic;">x</span> at which the <a href="/wiki/Probability_mass_function" title="Probability mass function">probability mass function</a> takes its maximum value (i.e., <span class="texhtml"><i>x</i>=argmax<sub><i>x</i><sub><i>i</i></sub></sub> P(<i><b>X</b></i> = <i>x</i><sub><i>i</i></sub>)</span>). In other words, it is the value that is most likely to be sampled.
</p>

In [39]:
# The mode value is the value that appears the most number of times
def my_mode(lst):
    
    # creating a dictionary
    freq = {}
    for i in lst:

        # mapping each value of list to a 
        # dictionary
        freq.setdefault(i, 0)
        freq[i] += 1

    # finding maximum value of dictionary
    hf = max(freq.values())

    # creating an empty list
    hflst = []

    # using for loop we are checking for most 
    # repeated value
    for i, j in freq.items():
        if j == hf:
            hflst.append(i)

    # returning the result
    return hflst

In [40]:
my_mode(speed)

[86]

In NumPy, there's no direct function to compute the mode
* use `np.unique` to get the unique values and their counts.
* use `np.argmax` to get the index of the maximum count, which will give you the mode.

In [41]:
# Step 1: Get the unique values and their counts
values, counts = np.unique(speed, return_counts=True)

# Step 2: Find the index of the maximum count (mode)
mode_index = np.argmax(counts)

# Step 3: Get the mode value
mode = values[mode_index]
mode

np.int64(86)

in ScyPy, there's a function for that

In [42]:
dir(scipy)

['LowLevelCallable',
 '__version__',
 'cluster',
 'constants',
 'datasets',
 'fft',
 'fftpack',
 'integrate',
 'interpolate',
 'io',
 'linalg',
 'misc',
 'ndimage',
 'odr',
 'optimize',
 'show_config',
 'signal',
 'sparse',
 'spatial',
 'special',
 'stats',
 'test']

In [43]:
scipy.stats.mode(speed)

ModeResult(mode=np.int64(86), count=np.int64(3))

In [44]:
df[col].mode()

0    50000
Name: Income, dtype: int64

In [45]:
df[col].mode()[0]

np.int64(50000)

### midrange

In [46]:
min_value = df[col].min()
max_value = df[col].max()
midrange = (max_value - min_value)/2
midrange

np.float64(12500.0)

## dispersion

These sample statistics give information about the `dispersion` (`spread`) of the data

### range

$\text{range}(x) = \max(x) - \min(x)$

In [72]:
df[col].max() - df[col].min()

np.float64(25000.0)

### a distance between sample quantiles

eg. between the 0.1- and 0.9-quantiles

In [79]:
quantiles = df[col].quantile([0.1, 0.9])
quantiles

0.1    47000.0
0.9    66000.0
Name: Income, dtype: float64

In [81]:
quantiles[0.1]-quantiles[0.9]

np.float64(-19000.0)

In [80]:
print('distance between 0.1 and 0.9 quantiles: ', abs(quantiles[0.1]-quantiles[0.9]))

distance between 0.1 and 0.9 quantiles:  19000.0


### IQR

In [83]:
quantiles = df[col].quantile([0.25, 0.5, 0.75])

In [84]:
Q1 = quantiles[0.25]
Q3 = quantiles[0.75]
IQR = Q3 - Q1
IQR

np.float64(10000.0)

### variance and standard deviation

One can find the standard deviation of an entire population in cases (such as standardized testing) where every member of a population is sampled. In cases where that cannot be done, the `standard deviation σ` is estimated by examining a random sample taken from the population and computing a statistic of the sample, which is used as an estimate of the population standard deviation. Such a statistic is called an estimator, and the estimator (or the value of the estimator, namely the estimate) is called a sample standard deviation, and is denoted by s (possibly with modifiers).

Unlike in the case of estimating the population mean of a normal distribution, for which the sample mean is a simple estimator with many desirable properties (unbiased, efficient, maximum likelihood), there is no single estimator for the standard deviation with all these properties, and unbiased estimation of standard deviation is a very technically involved problem. Most often, the standard deviation is estimated using the corrected sample standard deviation (using N − 1), defined below, and this is often referred to as the "sample standard deviation", without qualifiers.

<p>If the <i>biased <a href="/wiki/Sample_variance" class="mw-redirect" title="Sample variance">sample variance</a></i> (the second <a href="/wiki/Central_moment" title="Central moment">central moment</a> of the sample, which is a downward-biased estimate of the population variance) is used to compute an estimate of the population's standard deviation, the result is
<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle s_{N}={\sqrt {{\frac {1}{N}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}}}.}">
  <semantics>
    <mrow class="MJX-TeXAtom-ORD">
      <mstyle displaystyle="true" scriptlevel="0">
        <msub>
          <mi>s</mi>
          <mrow class="MJX-TeXAtom-ORD">
            <mi>N</mi>
          </mrow>
        </msub>
        <mo>=</mo>
        <mrow class="MJX-TeXAtom-ORD">
          <msqrt>
            <mrow class="MJX-TeXAtom-ORD">
              <mfrac>
                <mn>1</mn>
                <mi>N</mi>
              </mfrac>
            </mrow>
            <munderover>
              <mo>∑<!-- ∑ --></mo>
              <mrow class="MJX-TeXAtom-ORD">
                <mi>i</mi>
                <mo>=</mo>
                <mn>1</mn>
              </mrow>
              <mrow class="MJX-TeXAtom-ORD">
                <mi>N</mi>
              </mrow>
            </munderover>
            <msup>
              <mrow>
                <mo>(</mo>
                <mrow>
                  <msub>
                    <mi>x</mi>
                    <mrow class="MJX-TeXAtom-ORD">
                      <mi>i</mi>
                    </mrow>
                  </msub>
                  <mo>−<!-- − --></mo>
                  <mrow class="MJX-TeXAtom-ORD">
                    <mrow class="MJX-TeXAtom-ORD">
                      <mover>
                        <mi>x</mi>
                        <mo stretchy="false">¯<!-- ¯ --></mo>
                      </mover>
                    </mrow>
                  </mrow>
                </mrow>
                <mo>)</mo>
              </mrow>
              <mrow class="MJX-TeXAtom-ORD">
                <mn>2</mn>
              </mrow>
            </msup>
          </msqrt>
        </mrow>
        <mo>.</mo>
      </mstyle>
    </mrow>
    <annotation encoding="application/x-tex">{\displaystyle s_{N}={\sqrt {{\frac {1}{N}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}}}.}</annotation>
  </semantics>
</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/05100833069f1eb35275f27bf59467b30efb7517" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:25.173ex; height:8.009ex;" alt="{\displaystyle s_{N}={\sqrt {{\frac {1}{N}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}}}.}"></span>
</p>

<p>An unbiased estimator for the <i>variance</i> is given by applying <a href="/wiki/Bessel%27s_correction" title="Bessel's correction">Bessel's correction</a>, using <i>N</i>&nbsp;−&nbsp;1 instead of <i>N</i> to yield the <i>unbiased sample variance,</i> denoted <i>s</i><sup>2</sup>:
<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle s^{2}={\frac {1}{N-1}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}.}">
  <semantics>
    <mrow class="MJX-TeXAtom-ORD">
      <mstyle displaystyle="true" scriptlevel="0">
        <msup>
          <mi>s</mi>
          <mrow class="MJX-TeXAtom-ORD">
            <mn>2</mn>
          </mrow>
        </msup>
        <mo>=</mo>
        <mrow class="MJX-TeXAtom-ORD">
          <mfrac>
            <mn>1</mn>
            <mrow>
              <mi>N</mi>
              <mo>−<!-- − --></mo>
              <mn>1</mn>
            </mrow>
          </mfrac>
        </mrow>
        <munderover>
          <mo>∑<!-- ∑ --></mo>
          <mrow class="MJX-TeXAtom-ORD">
            <mi>i</mi>
            <mo>=</mo>
            <mn>1</mn>
          </mrow>
          <mrow class="MJX-TeXAtom-ORD">
            <mi>N</mi>
          </mrow>
        </munderover>
        <msup>
          <mrow>
            <mo>(</mo>
            <mrow>
              <msub>
                <mi>x</mi>
                <mrow class="MJX-TeXAtom-ORD">
                  <mi>i</mi>
                </mrow>
              </msub>
              <mo>−<!-- − --></mo>
              <mrow class="MJX-TeXAtom-ORD">
                <mrow class="MJX-TeXAtom-ORD">
                  <mover>
                    <mi>x</mi>
                    <mo stretchy="false">¯<!-- ¯ --></mo>
                  </mover>
                </mrow>
              </mrow>
            </mrow>
            <mo>)</mo>
          </mrow>
          <mrow class="MJX-TeXAtom-ORD">
            <mn>2</mn>
          </mrow>
        </msup>
        <mo>.</mo>
      </mstyle>
    </mrow>
    <annotation encoding="application/x-tex">{\displaystyle s^{2}={\frac {1}{N-1}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}.}</annotation>
  </semantics>
</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/82217533774779531cf082ea00e0cab8f487c533" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:26.085ex; height:7.343ex;" alt="{\displaystyle s^{2}={\frac {1}{N-1}}\sum _{i=1}^{N}\left(x_{i}-{\bar {x}}\right)^{2}.}"></span>
</p>

<p>This estimator is unbiased if the variance exists and the sample values are drawn independently with replacement. <i>N</i>&nbsp;−&nbsp;1 corresponds to the number of <a href="/wiki/Degrees_of_freedom_(statistics)" title="Degrees of freedom (statistics)">degrees of freedom</a> in the vector of deviations from the mean, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \textstyle (x_{1}-{\bar {x}},\;\dots ,\;x_{n}-{\bar {x}}).}">
  <semantics>
    <mrow class="MJX-TeXAtom-ORD">
      <mstyle displaystyle="true" scriptlevel="0">
        <mstyle displaystyle="false" scriptlevel="0">
          <mo stretchy="false">(</mo>
          <msub>
            <mi>x</mi>
            <mrow class="MJX-TeXAtom-ORD">
              <mn>1</mn>
            </mrow>
          </msub>
          <mo>−<!-- − --></mo>
          <mrow class="MJX-TeXAtom-ORD">
            <mrow class="MJX-TeXAtom-ORD">
              <mover>
                <mi>x</mi>
                <mo stretchy="false">¯<!-- ¯ --></mo>
              </mover>
            </mrow>
          </mrow>
          <mo>,</mo>
          <mspace width="thickmathspace"></mspace>
          <mo>…<!-- … --></mo>
          <mo>,</mo>
          <mspace width="thickmathspace"></mspace>
          <msub>
            <mi>x</mi>
            <mrow class="MJX-TeXAtom-ORD">
              <mi>n</mi>
            </mrow>
          </msub>
          <mo>−<!-- − --></mo>
          <mrow class="MJX-TeXAtom-ORD">
            <mrow class="MJX-TeXAtom-ORD">
              <mover>
                <mi>x</mi>
                <mo stretchy="false">¯<!-- ¯ --></mo>
              </mover>
            </mrow>
          </mrow>
          <mo stretchy="false">)</mo>
          <mo>.</mo>
        </mstyle>
      </mstyle>
    </mrow>
    <annotation encoding="application/x-tex">{\displaystyle \textstyle (x_{1}-{\bar {x}},\;\dots ,\;x_{n}-{\bar {x}}).}</annotation>
  </semantics>
</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f486a63ff6f56b50fc4a2634494c8dfd154040de" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:22.197ex; height:2.843ex;" alt="{\displaystyle \textstyle (x_{1}-{\bar {x}},\;\dots ,\;x_{n}-{\bar {x}}).}"></span>
</p>

In [50]:
def my_unbiased_std(nums):
    
    # Step 1: Calculate the mean
    mean = np.mean(nums)

    # Step 2: Calculate the deviations of each data point from the mean
    deviations = nums - mean

    # Step 3: Square the deviations
    squared_deviations = deviations ** 2

    # Step 4: Sum the squared deviations and divide by N-1 (unbiased variance)
    variance = np.sum(squared_deviations) / (len(nums) - 1)

    # Step 5: Take the square root to get the standard deviation
    std_dev = np.sqrt(variance)

    return std_dev

In [51]:
my_unbiased_std(speed)

np.float64(9.636336148089395)

In [52]:
np.std(speed) # ddof=0, (stands for degree of freedom) this is the biased one

np.float64(9.258292301032677)

In [53]:
help(np.std)

Help on _ArrayFunctionDispatcher in module numpy:

std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>, mean=<no value>, correction=<no value>)
    Compute the standard deviation along the specified axis.
    
    Returns the standard deviation, a measure of the spread of a distribution,
    of the array elements. The standard deviation is computed for the
    flattened array by default, otherwise over the specified axis.
    
    Parameters
    ----------
    a : array_like
        Calculate the standard deviation of these values.
    axis : None or int or tuple of ints, optional
        Axis or axes along which the standard deviation is computed. The
        default is to compute the standard deviation of the flattened array.
    
        .. versionadded:: 1.7.0
    
        If this is a tuple of ints, a standard deviation is performed over
        multiple axes, instead of a single axis or all the axes as before.
    dtype : dtype, optional
     

In [54]:
np.std(speed, ddof=1) # unbiased one

np.float64(9.636336148089395)

In [55]:
# variance
np.var(speed)

np.float64(85.71597633136093)

In [56]:
df[col].var()

np.float64(92500000.0)

In [57]:
df[col].std()

np.float64(9617.692030835673)

### correlation

In [58]:
cols_ = ['Income', 'Age']

In [59]:
df[cols_].corr()

Unnamed: 0,Income,Age
Income,1.0,0.894369
Age,0.894369,1.0


In [60]:
df.corr

<bound method DataFrame.corr of    Age  Gender Education_Level  Income
0   25  Female        Bachelor   50000
1   30    Male          Master   50000
2   22  Female        Bachelor   45000
3   35    Male             PhD   70000
4   29  Female          Master   55000>

In [61]:
# TODO

https://www.w3schools.com/python/python_ml_percentile.asp