# Classical statistical hypothesis testing
Approaches to determine correlation by analysing statistical probabilities purely in the data

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and variable types,
- set up the statistical tests,
- read dataset,
- present results from the tests.

By default, the notebook is set up to run with an example (wine quality). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to classical_ht.board for detailed instructions. The headers in this notebook follow the cards on the board.

In [0]:
# <halerium id="392e51df-d54f-472b-9662-b097e769a655">
# Link to classical_ht.board
# </halerium id="392e51df-d54f-472b-9662-b097e769a655">


### Imports

In [0]:
import numpy as np
import pandas as pd

### 2. Import the Dataset

In [0]:
# <halerium id="b3efd83e-8e68-4c0c-b19b-106a635fd8e7">
# Set to True for time-series data: the loading cell then parses the 'Date' column and uses it as the index.
time_series = False
# The value 'default example' makes the loading cell fetch the hosted wine-quality example CSV instead.
path = 'default example' # Specify the filepath of the data, e.g. './data/file.csv'
# </halerium id="b3efd83e-8e68-4c0c-b19b-106a635fd8e7">


Importing the dataset

In [0]:
# Resolve the placeholder value to the hosted wine-quality example dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/hypothesis_testing/WineQT.csv'

if time_series:
    # Time-series data: parse the 'Date' column as datetimes and use it as the index.
    df = pd.read_csv(path, parse_dates=['Date'], index_col="Date")
else:
    # sep=None asks pandas to detect the delimiter, which only the Python
    # parser supports. Naming the engine explicitly avoids the ParserWarning
    # pandas emits when it has to fall back from the default C engine.
    df = pd.read_csv(path, sep=None, engine='python')

Visualising the dataset

In [0]:
# Bare expression as the last statement of the cell, so the notebook renders the loaded DataFrame.
df

### 3. Specify the Features

In [0]:
# Array of ['feature name', 'type'] where type is 'continuous', 'binary_categorical', 'multi_categorical'
# The type strings must match exactly: the "Run the Tests" cells select features by comparing against these three values.
# <halerium id="52769062-0a39-4288-9eb3-b4d75d7b0224">
x = [['residual sugar', 'continuous'], ['pH', 'continuous'], ['quality', 'multi_categorical']]
y = [['fixed acidity', 'continuous'], ['volatile acidity', 'continuous']]
# </halerium id="52769062-0a39-4288-9eb3-b4d75d7b0224">

# Features handed to time_series_test; while this list is empty, no time-series results table is built.
# <halerium id="52769062-0a39-4288-9eb3-b4d75d7b0224">
time_features = []
# </halerium id="52769062-0a39-4288-9eb3-b4d75d7b0224">


### 4. Specify Level of Significance

In [0]:
# The same value is passed to every test function called in the "Run the Tests" section.
# <halerium id="c92e4730-01e4-4180-b9b3-bcfeb311360e">
significance = 0.05 # Level of significance
# </halerium id="c92e4730-01e4-4180-b9b3-bcfeb311360e">


### 6. Run the Tests
Some tests may be skipped if there are no x-y pairs that correspond to the test

In [0]:
def _names_of(features, kind):
    """Return the names of the [name, type] entries whose type equals `kind`."""
    return [spec[0] for spec in features if spec[1] == kind]


# Split the declared x and y features into one name list per feature type.
x_cont, y_cont = _names_of(x, 'continuous'), _names_of(y, 'continuous')
x_binary, y_binary = _names_of(x, 'binary_categorical'), _names_of(y, 'binary_categorical')
x_multi, y_multi = _names_of(x, 'multi_categorical'), _names_of(y, 'multi_categorical')

In [0]:
# Number of rows in the loaded dataset.
num_samples = len(df.index)
print('Number of samples:', num_samples)

In [0]:
# One empty list per declared feature name, kept separately for x and y.
results_x = dict((spec[0], []) for spec in x)
results_y = dict((spec[0], []) for spec in y)
# Column-oriented accumulator handed to every test call; later turned into the results DataFrame.
results = dict(x=[], y=[], test=[], passed=[])

### Time series Hypothesis Test
1. Check if stationary (Dickey-Fuller test)
2. Look at the residuals of the time series and check whether they follow a normal distribution (D'Agostino and Pearson's test)

In [0]:
from functions.statistical_tests import time_series_test

# Run the project's time-series test for the features listed in time_features.
# The shared `results` dict is passed in, presumably so the helper can record its outcomes there (TODO confirm in functions/statistical_tests.py).
# NOTE(review): this call is made even when time_features is empty.
# <halerium id="4c743736-24b8-4bd3-8407-0dad24bfa284">
time_results = time_series_test(df, time_features, significance, results)
# </halerium id="4c743736-24b8-4bd3-8407-0dad24bfa284">


In [0]:
# Tabulate the time-series results with one row per entry of time_features.
# The frame is built only when time features were specified; otherwise
# time_df is None and the cell shows nothing.
time_df = pd.DataFrame(time_results, index=time_features) if time_features else None
# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell, so the display has to sit outside any `if` block.
time_df

#### Linear Correlation
For continuous-continuous features

In [0]:
# Print the correlation of every continuous x feature (rows) with every
# continuous y feature (columns); report which side is missing otherwise.
if x_cont and y_cont:
    # Correlate numeric and boolean columns only, so that text or date columns
    # elsewhere in the dataset cannot make corr() raise.
    df_corr = df.select_dtypes(include=['number', 'bool']).corr()
# <halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">
    print(df_corr[y_cont].loc[x_cont])
# </halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">
elif not x_cont:
    print("No continuous x features")
else:
    print("No continuous y features")

Plots of linear correlation

In [0]:
from functions.plotting import plot_linear_corr

# Project plotting helper, given the dataset plus the continuous x and y feature names.
# NOTE(review): unlike the correlation table cell, this call is not guarded against
# x_cont or y_cont being empty; confirm the helper copes with that.
plot_linear_corr(df, x_cont, y_cont)

Univariate approach

In [0]:
from functions.statistical_tests import univariate_lin_corr

# Univariate (open to selection bias)
# Called with the continuous x and y feature names; the shared `results` dict is passed in,
# presumably so the helper can record its outcomes there (TODO confirm).
# <halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">
univariate_lin_corr(df, x_cont, y_cont, significance, results)
# </halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">


Multivariate Approach

In [0]:
from functions.statistical_tests import multivariate_lin_corr

# Multivariate (open to confounding bias)
# Same inputs as the univariate call, plus the per-y-feature `results_y` dict;
# how the helper fills either container is not visible here (TODO confirm).
# <halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">
multivariate_lin_corr(df, x_cont, y_cont, significance, results, results_y)
# </halerium id="5aa9dc77-efa9-4bc7-88a4-e50b47cdfbae">


#### ANOVA (Analysis of Variance)
For a multi-categorical (non-binary discrete) x feature and a continuous y feature

Null hypothesis: All group means are equal (the categorical variable has no effect on the continuous variable)
Alternative hypothesis: At least one group mean differs from the others (the categorical variable has an effect on the continuous variable)

Note: A pairwise comparison using Tukey's honestly significant difference (HSD) test is used to find which of the discrete groups (treatments) differ significantly

In [0]:
from functions.statistical_tests import anova

# One-way ANOVA
# Called with the multi-categorical x features and the continuous y features.
# <halerium id="d23b832f-a251-4a6c-8d89-f9e68f65c779">
anova(df, x_multi, y_cont, significance, results)
# </halerium id="d23b832f-a251-4a6c-8d89-f9e68f65c779">


#### t-test
For binary categorical - continuous

In [0]:
from functions.statistical_tests import t_test

# Called with the binary categorical x features and the continuous y features.
# <halerium id="a20f2f0c-aeac-446b-bce8-b99de87a8804">
t_test(df, x_binary, y_cont, significance, results)
# </halerium id="a20f2f0c-aeac-446b-bce8-b99de87a8804">


#### Chi-square Test of independence
For multi-categorical x and y features, using a contingency table

In [0]:
from functions.statistical_tests import chi_square_test

# Called with the multi-categorical features on both the x and the y side.
# <halerium id="01bf4ebf-742b-424c-bf9a-b86bb12254c0">
chi_square_test(df, x_multi, y_multi, significance, results)
# </halerium id="01bf4ebf-742b-424c-bf9a-b86bb12254c0">


### 7. View Results
Note that the dataframes in 'Sorted by x' and 'Sorted by y' present the SAME results, only sorted differently

In [0]:
# Turn the accumulated column lists into a table with one row per executed test.
results_df = pd.DataFrame(data=results)
# <halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">
results_df
# </halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">


Sorted by x

In [0]:
# Order by x first and y second in a single multi-key sort. Two chained
# single-column sorts do not guarantee this, because the default sort
# algorithm is not stable and may reshuffle the y order within equal x.
# The (x, y) pair becomes the row index; only the test outcome columns remain.
results_sort_x = (
    results_df
    .sort_values(by=['x', 'y'])
    .set_index(['x', 'y'])[['test', 'passed']]
)
# <halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">
results_sort_x
# </halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">


Sorted by y

In [0]:
# Order by y first and x second in a single multi-key sort. Two chained
# single-column sorts do not guarantee this, because the default sort
# algorithm is not stable and may reshuffle the x order within equal y.
# The (y, x) pair becomes the row index; only the test outcome columns remain.
results_sort_y = (
    results_df
    .sort_values(by=['y', 'x'])
    .set_index(['y', 'x'])[['test', 'passed']]
)
# <halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">
results_sort_y
# </halerium id="1aba5ccb-a1b5-42c9-9615-267354b87fb7">
