In [4]:
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
import pandas as pd
import seaborn as sns
import math

sns.set_style('whitegrid')

# Data Mining

Data mining is the process of discovering meaningful patterns, trends, and relationships in large datasets using statistical, machine learning, and database management techniques. It goes beyond simple data analysis by automatically extracting hidden knowledge that can support decision-making, and helps us understand complex phenomena. Common applications include customer behavior analysis, fraud detection, medical diagnosis, or market trend prediction. By turning raw data into actionable insights, data mining serves as a critical tool in today’s data-driven world.

# Data

The most convenient way to think of the datasets that the majority
of data mining algorithms operate upon is the tabular view. In this
analogy the problem at hand can be treated as (a potentially gigantic)
spreadsheet with several rows – corresponding to data objects – and
columns, each of which includes observed attributes with respect to the
different aspects of these data objects.

Another important aspect of the datasets we work with is the
measurement scale of the individual columns in the data matrix
(each corresponding to a random variable). A concise summary of
the different measurement scales and some of the most prototypical
statistics which can be calculated for them:

| Type of attribute | Description | Examples | Statistics |
|-------------------|-------------|----------|------------|
| **Categorical**   |             |          |            |
| Nominal           | Variables can be checked for equality only; | names of cities, hair color | mode, entropy, correlation, χ²-test |
| Ordinal           |  `>` relation can be interpreted among variables; | grades {fail, pass, excellent} | median, percentiles |
| **Numerical**     |             |          |            |
| Interval          | The difference of two variables can be formed and interpreted | shoe sizes, dates, °C | mean, deviation, significance (e.g., F-, t- tests) |
| Ratio             | Ratios can be formed from values of the variables of this kind | age, length, temperature in Kelvin | percent, geometric/harmonic mean, variation |

Let's load the [Bike Rental Dataset](https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset). A slightly modified version that we are going to use can be found at `/data/rental.csv`.

Dataset features:
- `season`: Season
- `yr`: Year
- `mnth`: Month
- `holiday`: Indicator whether the day was a holiday or not.
- `weekday`: Day of the week.
- `workingday`: Indicator whether the day was a working day or weekend.
- `weathersit`: The weather situation on that day. One of:
  - 1: clear, few clouds, partly cloudy, cloudy
  - 2: mist + clouds, mist + broken clouds, mist + few clouds, mist
  - 3: light snow, light rain + thunderstorm + scattered clouds, light rain + scattered clouds
  - 4: heavy rain + ice pellets + thunderstorm + mist, snow + mist
- `temp`: Temperature in degrees Celsius.
- `atemp`: Felt temperature in Celsius.
- `hum`: Relative humidity in percent (0 to 100).
- `windspeed`: Wind speed in km per hour.
- `cnt`: Count of bicycles including both casual and registered users. The count is used as the target in the regression task.


In [81]:
# Load the bike rental dataset. The URL embeds a fixed commit-like hash, so the
# same file contents are fetched on every run; the first CSV column becomes the
# row index (index_col=0).
df = pd.read_csv("https://github.com/ficstamas/data-mining/raw/b76d5b7913c446878fa47de8861c83e26780828f/data/rental.csv", index_col=0)
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0
...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0


In [85]:
# Make the train-test splits and separate the target variable.
# 20% of the rows are held out for testing; random_state is fixed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Features are every column except "cnt"; the target is kept as a one-column
# DataFrame (double brackets) rather than a Series.
train_X, train_y = train[train.columns.difference(["cnt"])], train[["cnt"]]
test_X, test_y = test[test.columns.difference(["cnt"])], test[["cnt"]]

## Visualization

In [None]:
# using a histogram visualize the distribution of numerical features


In [None]:
# visualize the joint behaviour of two variables using scatter plot
# extra: include the histograms of the points next to the x and y axes


In [None]:
# show the distribution of windspeed for each season (using box or violin plots)


In [None]:
# visualize any interesting aspect of the dataset


## Modelling

In [87]:
# model fitting and evaluation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def fit_and_eval(trainX, trainY, testX, testY):
    """Fit a linear regression on the training data and score it on the test data.

    trainX, trainY: features and target used to fit the model.
    testX, testY: features and target used for evaluation.

    Returns the R^2 score of the predictions for `testX` measured against `testY`.
    """
    regressor = LinearRegression()
    regressor.fit(trainX, trainY)
    return r2_score(testY, regressor.predict(testX))

# Pre-processing

Data preprocessing is a crucial step in any data analysis or machine learning workflow because raw data is often incomplete, inconsistent, noisy, or in a format unsuitable for modeling. Preprocessing transforms this raw data into a clean, well-structured form, ensuring that the analysis or model can extract meaningful patterns rather than being misled by errors or irrelevant information. This means that we perform some transformation over the data matrix, either in a column or a row-oriented manner.

## Categorical data

We should transform each nominal feature into a numerical one, since not every model can handle categorical values.

### Numeric mapping

It means that we map each categorical value to a numerical value; for example, instead of representing `season` as strings, we remap each value in the following manner: `{'spring': 0, 'summer': 1, 'fall': 2, 'winter': 3}`.

In [None]:
# Let's transform each categorical feature to its numerical representation.
# The function below is applied to every row (axis=1) of the train and test features.
def _transform_categorical_to_numerical(row):
    """
    Exercise stub: map the categorical values of a single row to numbers.

    row: A row of the dataset. You can access a column by `row.column_name` or row["column_name"]

    Returns the row. As written, the row is returned unchanged until the
    exercise code is filled in.
    """
    # code here
    return row


numeric_mapping_train_X = train_X.apply(_transform_categorical_to_numerical, axis=1)
numeric_mapping_test_X = test_X.apply(_transform_categorical_to_numerical, axis=1)

In [None]:
# evaluate the model
# The targets are bound as `train_y` / `test_y` by the train-test split cell;
# `trainY` / `testY` are only parameter names inside `fit_and_eval`.
fit_and_eval(numeric_mapping_train_X, train_y, numeric_mapping_test_X, test_y)

### One-hot encoding

In [None]:
# Let's transform each categorical feature to its one-hot encoded representation.
# Exercise placeholders: replace the `None` values below with the encoded
# train and test feature frames.


one_hot_mapping_train_X = None
one_hot_mapping_test_X = None

In [None]:
# Evaluate the model on the one-hot encoded features. The targets are bound as
# `train_y` / `test_y` by the train-test split cell.
fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

In [None]:
# try to remove dummy variables


## Numerical data

In [131]:
# Utility functions for plotting
from scipy.spatial import ConvexHull
from matplotlib.patches import Ellipse

# Khachiyan Algorithm for MVEE
def mvee(points, tol=1e-3):
    """Approximate the minimum-volume enclosing ellipse of a set of points.

    The point weights are refined iteratively (Khachiyan's algorithm): in every
    round the point with the largest distance score receives more weight, until
    the weight vector moves by less than `tol`.

    points: array of shape (N, d), one point per row.
    tol: threshold on the Euclidean norm of the weight update.

    Returns `(center, A)`, where `center` is the weighted mean of the points and
    `A` is a (d, d) matrix describing the ellipse as
    (p - center)^T A (p - center) = 1.
    """
    n_points, dim = points.shape
    # Append a constant 1 to every point (affine lifting)
    lifted = np.column_stack((points, np.ones(n_points)))
    weights = np.full(n_points, 1.0 / n_points)  # start from uniform weights

    converged = False
    while not converged:
        # Weighted scatter matrix of the lifted points
        scatter = lifted.T @ np.diag(weights) @ lifted
        # Per-point distance scores (diagonal of the projection matrix)
        scores = np.diag(lifted @ np.linalg.inv(scatter) @ lifted.T)
        worst = np.argmax(scores)
        peak = scores[worst]
        step = (peak - dim - 1) / ((dim + 1) * (peak - 1))
        candidate = (1 - step) * weights
        candidate[worst] += step
        converged = np.linalg.norm(candidate - weights) < tol
        if not converged:
            # The last candidate is discarded once the update is small enough
            weights = candidate

    # Ellipse centre: weighted mean of the input points
    center = weights @ points
    # Ellipse matrix: inverse of the weighted covariance, scaled by 1/d
    weighted_cov = points.T @ np.diag(weights) @ points - np.outer(center, center)
    A = np.linalg.inv(weighted_cov) / dim
    return center, A


# Function to plot ellipse from center and matrix A
def plot_enclosing_ellipse(x, ax, eps=0, **kwargs):
    """Draw the enclosing ellipse of a 2-D point cloud on `ax`.

    x: array of 2-D points, one per row.
    ax: matplotlib axes that receives the ellipse patch.
    eps: amount added to both full axis lengths of the ellipse.
    **kwargs: forwarded to `matplotlib.patches.Ellipse`.
    """
    # Only the convex-hull vertices are passed on to mvee
    hull_vertices = x[ConvexHull(x).vertices]
    center, shape_matrix = mvee(hull_vertices)

    directions, singular_values, _ = np.linalg.svd(shape_matrix)
    # Orientation of the first singular direction, in degrees
    first_dir = directions[:, 0]
    angle = np.degrees(np.arctan2(first_dir[1], first_dir[0]))
    # Full axis lengths (2 * semi-axis), optionally padded by eps
    width, height = 2 / np.sqrt(singular_values) + eps

    patch = Ellipse(xy=center, width=width, height=height, angle=angle, **kwargs)
    ax.add_patch(patch)

### Mean centering

In [None]:
# Calculate the mean of a random vector without using any builtin or numpy functions
np.random.seed(0)
X = np.random.randn(50)

# code here; X_mean should contain the final result
X_mean = None

# np.mean is only used to check the hand-written result
assert np.allclose(np.mean(X), X_mean)

In [200]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=5),
    std=widgets.FloatSlider(min=1, max=50, value=2),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=5, max=50, step=1, value=10)
)
def mean_centering_interact_plot(loc=20, std=2.0, seed=42, n_samples=10):
    """Plot a random 2-D point cloud together with its mean-centered copy.

    loc, std: mean and standard deviation of the normal distribution sampled
        for both coordinates.
    seed: value passed to `np.random.seed`.
    n_samples: number of 2-D points drawn.
    """
    np.random.seed(seed)

    # Sample the cloud and shift it so that its mean moves to the origin
    cloud = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    cloud_mean = cloud.mean(axis=0)
    shifted = cloud - cloud_mean
    shifted_mean = shifted.mean(axis=0)

    fig, ax = plt.subplots(1, 1)

    # Each layer: points, their mean, colour, and the two legend labels
    layers = [
        (cloud, cloud_mean, "red", "Original Data", "Mean"),
        (shifted, shifted_mean, "yellow", "Centered Data", None),
    ]
    for pts, mu, colour, pts_label, mu_label in layers:
        ax.scatter(pts[:, 0], pts[:, 1], color=colour, label=pts_label)
        ax.scatter(mu[0], mu[1], marker="x", color="blue", label=mu_label)
        plot_enclosing_ellipse(pts, ax, fill=False, color=colour, linewidth=1)

    # Arrow from the original mean to the mean of the centered data
    ax.annotate(
        "",
        xytext=(cloud_mean[0], cloud_mean[1]),
        xy=(shifted_mean[0], shifted_mean[1]),
        arrowprops=dict(arrowstyle="->", color="red"),
    )
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=5.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=2.0, d…

In [None]:
# Center the numerical features in our dataset


In [None]:
# Evaluate the model with the new feature values


### Standardization

In [None]:
# Calculate the standard deviation of a random vector without using any builtin or numpy functions
np.random.seed(0)
X = np.random.randn(50)

# code here; X_std should contain the final result
X_std = None

# np.std is only used to check the hand-written result. It is called with its
# default arguments here (per the NumPy docs that means ddof=0, i.e. divide by N).
assert np.allclose(np.std(X), X_std)

In [198]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=3),
    std=widgets.FloatSlider(min=1, max=50, value=2),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def standardization_interact_plot(loc=20, std=2.0, seed=42, n_samples=20):
    """Plot a random 2-D point cloud together with its standardized copy.

    loc, std: mean and standard deviation of the normal distribution sampled
        for both coordinates.
    seed: value passed to `np.random.seed`.
    n_samples: number of 2-D points drawn.
    """
    np.random.seed(seed)

    # Sample the cloud, remove the mean, then divide each column by its
    # standard deviation
    cloud = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    cloud_mean = cloud.mean(axis=0)
    shifted = cloud - cloud_mean
    scaled = shifted / shifted.std(axis=0)
    scaled_mean = scaled.mean(axis=0)

    fig, ax = plt.subplots(1, 1)

    # Each layer: points, their mean, colour, and the two legend labels
    layers = [
        (cloud, cloud_mean, "red", "Original Data", "Mean"),
        (scaled, scaled_mean, "yellow", "Standardized Data", None),
    ]
    for pts, mu, colour, pts_label, mu_label in layers:
        ax.scatter(pts[:, 0], pts[:, 1], color=colour, label=pts_label)
        ax.scatter(mu[0], mu[1], marker="x", color="blue", label=mu_label)
        plot_enclosing_ellipse(pts, ax, fill=False, color=colour, linewidth=1)

    # Arrow from the original mean to the mean of the standardized data
    ax.annotate(
        "",
        xytext=(cloud_mean[0], cloud_mean[1]),
        xy=(scaled_mean[0], scaled_mean[1]),
        arrowprops=dict(arrowstyle="->", color="red"),
    )
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=3.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=2.0, d…

In [None]:
# Standardize the numerical features in our dataset


In [None]:
# Evaluate the model with the new feature values


### Whitening

In [212]:
# Calculate the covariance matrix without using any builtin functions
np.random.seed(0)
X = np.random.randn(100, 30)

# code here; X_cov should contain the final result
X_cov = None

# NOTE(review): the reference below calls np.cov(X) without `rowvar=False`,
# whereas the whitening demo in this notebook passes `rowvar=False`. Per the
# NumPy docs the default treats each ROW of X as a variable, which would make
# the expected result 100 x 100 rather than 30 x 30 -- confirm which
# orientation the exercise intends.
assert np.allclose(np.cov(X), X_cov)

(100, 30)

In [226]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-3),
    cov=widgets.FloatSlider(min=0, max=0.9, step=0.05, value=0.8),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def whitening_interact_plot(loc=20, cov=0.8, seed=42, n_samples=20):
    """Compare Cholesky, PCA and ZCA whitening on correlated 2-D Gaussian samples.

    loc: value used for both coordinates of the sampling mean.
    cov: off-diagonal entry of the sampling covariance matrix (the diagonal is 1).
    seed: value passed to `np.random.seed`.
    n_samples: number of 2-D points drawn.

    One panel is drawn per whitening variant; every panel shows the original
    cloud (red), the whitened cloud (yellow), their means and enclosing ellipses.
    """
    np.random.seed(seed)

    # Generate example data and remove its mean
    x = np.random.multivariate_normal([loc, loc], [[1, cov], [cov, 1]], n_samples)
    center = np.mean(x, axis=0)
    x_centered = x - center
    # Sample covariance with columns as variables
    cov_matrix = np.cov(x_centered, rowvar=False)

    # Eigen decomposition of the (symmetric) covariance matrix
    eigvals, eigvecs = np.linalg.eigh(cov_matrix)

    # PCA whitening: decorrelates & scales to unit variance
    D_inv_sqrt = np.diag(1.0 / np.sqrt(eigvals))
    X_pca_white = x_centered @ eigvecs @ D_inv_sqrt

    # ZCA whitening: rotates the vectors back to the original orientation
    X_zca_white = X_pca_white @ eigvecs.T

    # Cholesky whitening: multiply by the lower Cholesky factor of the inverse
    # covariance. The lower factor is numpy's default; the explicit `upper`
    # keyword is omitted because it only exists in NumPy >= 2.0.
    L = np.linalg.cholesky(np.linalg.pinv(cov_matrix))
    X_chol_white = x_centered @ L

    fig, axs = plt.subplots(1, 3, sharex=True, sharey=True)
    fig.set_size_inches(10.5, 4)

    panels = (
        ("Cholesky whitening", X_chol_white),
        ("PCA whitening", X_pca_white),
        ("ZCA whitening", X_zca_white),
    )
    for idx, (ax, (title, whitened)) in enumerate(zip(axs, panels)):
        # Only one panel carries legend labels so that the shared figure
        # legend lists every entry exactly once.
        if idx == 1:
            original_label, mean_label, whitened_label = "Original Data", "Mean", "Whitened Data"
        else:
            original_label = mean_label = whitened_label = None

        ax.set_aspect('equal')
        ax.set_title(title)

        # Original cloud
        ax.scatter(x[:, 0], x[:, 1], color="red", label=original_label)
        ax.scatter(center[0], center[1], marker="x", color="blue", label=mean_label)
        plot_enclosing_ellipse(x, ax, fill=False, color='red', linewidth=1)

        # Whitened cloud
        whitened_mean = np.mean(whitened, axis=0)
        ax.scatter(whitened[:, 0], whitened[:, 1], color="yellow", label=whitened_label)
        ax.scatter(whitened_mean[0], whitened_mean[1], marker="x", color="blue")
        plot_enclosing_ellipse(whitened, ax, fill=False, color='yellow', linewidth=1)

        # Arrow from the original mean to the mean of the whitened data
        ax.annotate(
            "",
            xytext=(center[0], center[1]),
            xy=(whitened_mean[0], whitened_mean[1]),
            arrowprops=dict(arrowstyle="->", color="red"),
        )

    # A single figure-level legend (it was previously added twice)
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-3.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=0.8, …

In [None]:
# Apply whitening transformation to the numeric features


In [None]:
# Evaluate the model with the new feature values


### Min-max scaling

In [195]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-1),
    std=widgets.FloatSlider(min=0.1, max=2, step=0.1, value=0.5),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def min_max_interact_plot(loc=-1, std=0.5, seed=42, n_samples=20):
    """Plot a random 2-D point cloud together with its min-max scaled copy.

    loc, std: mean and standard deviation of the normal distribution sampled
        for both coordinates.
    seed: value passed to `np.random.seed`.
    n_samples: number of 2-D points drawn.
    """
    np.random.seed(seed)

    # Sample the cloud and rescale every column linearly using its min and max
    cloud = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    cloud_mean = cloud.mean(axis=0)
    col_min = cloud.min(axis=0)
    col_max = cloud.max(axis=0)
    rescaled = (cloud - col_min) / (col_max - col_min)
    rescaled_mean = rescaled.mean(axis=0)

    fig, ax = plt.subplots(1, 1)

    # Each layer: points, their mean, colour, and the two legend labels
    layers = [
        (cloud, cloud_mean, "red", "Original Data", "Mean"),
        (rescaled, rescaled_mean, "yellow", "Min-Max Normalized Data", None),
    ]
    for pts, mu, colour, pts_label, mu_label in layers:
        ax.scatter(pts[:, 0], pts[:, 1], color=colour, label=pts_label)
        ax.scatter(mu[0], mu[1], marker="x", color="blue", label=mu_label)
        plot_enclosing_ellipse(pts, ax, fill=False, color=colour, linewidth=1)

    # Arrow from the original mean to the mean of the rescaled data
    ax.annotate(
        "",
        xytext=(cloud_mean[0], cloud_mean[1]),
        xy=(rescaled_mean[0], rescaled_mean[1]),
        arrowprops=dict(arrowstyle="->", color="red"),
    )
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-1.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=0.5, …

In [None]:
# Apply min-max transformation to the numeric features


In [None]:
# Evaluate the model with the new feature values


### Unit normalization

In [208]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-1),
    std=widgets.FloatSlider(min=0.1, max=2, step=0.1, value=0.5),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def unit_norm_interact_plot(loc=-1, std=0.5, seed=42, n_samples=20):
    """Plot a random 2-D point cloud together with its unit-normalized copy.

    loc, std: mean and standard deviation of the normal distribution sampled
        for both coordinates.
    seed: value passed to `np.random.seed`.
    n_samples: number of 2-D points drawn.
    """
    np.random.seed(seed)

    # Generate example data
    x = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    center = np.mean(x, axis=0)
    # Row-wise scaling: every point is divided by its own Euclidean norm
    x_unit = x / np.linalg.norm(x, axis=1)[:, None]

    # Plot original vs unit-normalized data
    fig, axs = plt.subplots(1, 1)

    axs.scatter(x[:, 0], x[:, 1], color="red", label="Original Data")
    axs.scatter(center[0], center[1], marker="x", color="blue", label="Mean")
    plot_enclosing_ellipse(x, axs, fill=False, color='red', linewidth=1)

    _m_unit = np.mean(x_unit, axis=0)
    # The legend label names the transformation actually shown here
    # (it previously read "Min-Max Normalized Data").
    axs.scatter(x_unit[:, 0], x_unit[:, 1], color="yellow", label="Unit Normalized Data")
    axs.scatter(_m_unit[0], _m_unit[1], marker="x", color="blue")
    plot_enclosing_ellipse(x_unit, axs, fill=False, color='yellow', linewidth=1)

    # Arrow from the original mean to the mean of the normalized data
    axs.annotate("", xytext=(center[0], center[1]), xy=(_m_unit[0], _m_unit[1]), arrowprops=dict(arrowstyle="->", color="red"))
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-1.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=0.5, …

In [None]:
# Apply unit normalization to the numeric features


In [None]:
# Evaluate the model with the new feature values


# Basic Data Analysis

## Correlation and Causality

**Correlation refers to a statistical relationship between two variables** - when changes in one variable are associated with changes in another. For example, ice cream sales and beach attendance often rise together, showing a positive correlation. **Causality, on the other hand, means that one event directly influences or produces another.** If A causes B, then changing A will lead to a predictable change in B. While correlation can hint at possible causal links, it does not prove them.

The key difference is that correlation simply describes a relationship, while causality explains the underlying mechanism of that relationship. Many correlated events share a common cause or are influenced by other variables (confounders). For instance, both ice cream sales and drowning incidents increase in summer, but the cause is warmer weather - not ice cream itself.

**A common misconception is assuming that "correlation implies causation".** This error, sometimes called the [post hoc fallacy](https://en.wikipedia.org/wiki/Post_hoc_ergo_propter_hoc), can lead to flawed conclusions in research, business, and policy-making. Proper causal inference requires careful experimental design, statistical controls, or methods like randomized controlled trials, not just observational data. In short: correlation can point you toward possible causes, but causality must be proven through deeper investigation.

[It is not that hard to find misleading examples!](https://www.tylervigen.com/spurious-correlations)

In [210]:
# perform correlation analysis on the features
# from scipy.stats import pearsonr

## Bonferroni’s principle 

Bonferroni’s principle is a statistical caution that says:
> If you keep looking for patterns in data without adjusting your criteria, you’re bound to find "significant" results purely by chance.

It reminds us that if you search a large enough dataset for correlations, patterns, or anomalies without proper statistical controls, you will almost certainly find patterns that are just random noise. This is especially important when working with high-dimensional data, where the number of possible comparisons is huge.

For example, if you test $m=20$ independent hypotheses at a $5\%$ significance level ($p < 0.05$), you should expect about $20 \times 0.05 = 1$ false positive even if none of the hypotheses are actually true. Bonferroni’s correction addresses this by lowering the threshold for each test:

$$\alpha'=\frac{\alpha}{m}$$

In [49]:
import numpy as np
from scipy.stats import pearsonr

# Upper bounds of the two sliders below
_max_n_samples=200
_max_n_variables=50

@interact(
    n_samples=widgets.IntSlider(min=5, max=_max_n_samples, step=1, value=100),
    n_variables=widgets.IntSlider(min=5, max=_max_n_variables, step=1, value=5)
)
def bonferroni(n_samples=100, n_variables=20):
    """Count spurious correlations among random variables, with and without
    Bonferroni correction.

    n_samples: number of observations drawn per variable.
    n_variables: number of independent random variables.

    Every pair of variables is tested with `pearsonr`. The function prints how
    many p-values fall below alpha=0.05 and below alpha divided by the number
    of tests, and plots the p-value distribution (full range and the [0, 0.1]
    range).
    """
    np.random.seed(42)
    # Generate random data (completely uncorrelated); one column per variable
    data = np.random.randn(n_variables, n_samples).T

    alpha = 0.05  # significance level

    # Number of variable pairs: 0 + 1 + ... + (n_variables - 1)
    total_tests = np.sum(np.arange(n_variables))

    # Apply Bonferroni correction
    alpha_bonferroni = alpha / total_tests

    # Test all pairs of variables
    p_values = []
    for i in range(n_variables):
        for j in range(i+1, n_variables):
            _, p_value = pearsonr(data[:, i], data[:, j])
            p_values.append(p_value)
    p_values = np.array(p_values)

    # The data is random, so every "significant" pair is a false positive
    false_positives = int(np.sum(p_values < alpha))
    false_positives_corrected = int(np.sum(p_values < alpha_bonferroni))

    print(f"Total tests: {total_tests}")
    print(f"False positives at α={alpha}: {false_positives}")
    print(f"False positives after Bonferroni correction (α={alpha_bonferroni}): {false_positives_corrected}")
    print(f"Adjusted alpha: {alpha_bonferroni:.6f}")

    fig, axs = plt.subplots(1, 2)

    # axvline's ymin/ymax are fractions of the axes height (0..1), so the
    # defaults already span the whole panel; no data-scale values are passed.
    axs[0].hist(p_values)
    axs[0].axvline(alpha, color="orange", label=f"α={alpha}")
    axs[0].axvline(alpha_bonferroni, color="red", label="Bonferroni Corrected")
    axs[0].set_title("Distribution of P-values")

    axs[1].hist(p_values[p_values<=0.1])
    axs[1].axvline(alpha, color="orange")
    axs[1].axvline(alpha_bonferroni, color="red")
    axs[1].set_xlim(-1e-3, 0.1)
    axs[1].set_title("Zoomed in [0, 0.1]")
    fig.legend(loc='lower center', ncols=2, bbox_to_anchor=(0.5, -0.05))

interactive(children=(IntSlider(value=100, description='n_samples', max=200, min=5), IntSlider(value=5, descri…

In [None]:
# following Bonferroni's principle, how does the number of correlated features change?


## Mutual Information

# Distances