In [1]:
import numpy as np
import pandas as pd
import math

from ipywidgets import interact, widgets

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

# Data Mining

Data mining is the process of discovering meaningful patterns, trends, and relationships in large datasets using statistical, machine learning, and database management techniques. It goes beyond simple data analysis by automatically extracting hidden knowledge that can support decision-making, and helps us understand complex phenomena. Common applications include customer behavior analysis, fraud detection, medical diagnosis, or market trend prediction. By turning raw data into actionable insights, data mining serves as a critical tool in today’s data-driven world.

# Data

The most convenient way to think of the datasets that the majority
of data mining algorithms operate upon is the tabular view. In this
analogy the problem at hand can be treated as (a potentially gigantic)
spreadsheet with several rows – corresponding to data objects – and
columns, each of which includes observed attributes with respect to the
different aspects of these data objects.

Another important aspect of the datasets we work with is the
measurement scale of the individual columns in the data matrix
(each corresponding to a random variable). A concise summary of
the different measurement scales and some of the most prototypical
statistics which can be calculated for them:

| Type of attribute | Description | Examples | Statistics |
|-------------------|-------------|----------|------------|
| **Categorical**   |             |          |            |
| Nominal           | Variables can be checked for equality only; | names of cities, hair color | mode, entropy, correlation, χ²-test |
| Ordinal           |  `>` relation can be interpreted among variables; | grades {fail, pass, excellent} | median, percentiles |
| **Numerical**     |             |          |            |
| Interval          | The difference of two variables can be formed and interpreted | shoe sizes, dates, °C | mean, deviation, significance (e.g., F-, t- tests) |
| Ratio             | Ratios can be formed from values of the variables of this kind | age, length, temperature in Kelvin | percent, geometric/harmonic mean, variation |

Let's load the [Bike Rental Dataset](https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset). A slightly modified version that we are going to use can be found at `/data/rental.csv`.

Dataset features:
- `season`: Season
- `yr`: Year
- `mnth`: Month
- `holiday`: Indicator whether the day was a holiday or not.
- `weekday`: Day of the week.
- `workingday`: Indicator whether the day was a working day or weekend.
- `weathersit`: The weather situation on that day. One of:
  - 1: clear, few clouds, partly cloudy, cloudy
  - 2: mist + clouds, mist + broken clouds, mist + few clouds, mist
  - 3: light snow, light rain + thunderstorm + scattered clouds, light rain + scattered clouds
  - 4: heavy rain + ice pellets + thunderstorm + mist, snow + mist
- `temp`: Temperature in degrees Celsius.
- `atemp`: Felt temperature in Celsius.
- `hum`: Relative humidity in percent (0 to 100).
- `windspeed`: Wind speed in km per hour.
- `cnt`: Count of bicycles including both casual and registered users. The count is used as the target in the regression task.


In [12]:
# Load the bike rental dataset from a commit-pinned URL, so the file cannot change underneath us
RENTAL_CSV_URL = (
    "https://github.com/ficstamas/data-mining/raw/"
    "b76d5b7913c446878fa47de8861c83e26780828f/data/rental.csv"
)
# the first (unnamed) CSV column holds the row labels, so it becomes the index
df = pd.read_csv(RENTAL_CSV_URL, index_col=0)
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0
...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0


In [13]:
# Hold out 20% of the rows for testing (fixed random_state for reproducibility),
# then separate the regression target from the features.
from sklearn.model_selection import train_test_split

target_columns = ["cnt"]
train, test = train_test_split(df, test_size=0.2, random_state=42)

# every column except the target is a feature; the target is kept as a one-column DataFrame
train_features = train.columns.difference(target_columns)
test_features = test.columns.difference(target_columns)

train_X, train_y = train[train_features], train[target_columns]
test_X, test_y = test[test_features], test[target_columns]

## Visualization

In [14]:
# using a histogram visualize the distribution of numerical features


In [15]:
# visualize the joint behaviour of two variables using scatter plot
# extra: include the marginal histograms of the points next to the x and y axes


In [16]:
# show the distribution of windspeed for each season (using box or violin plots)


In [17]:
# visualize any interesting aspect of the dataset


## Modelling

In [18]:
# model fitting and evaluation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def fit_and_eval(trainX, trainY, testX, testY):
    """Fit a linear regression on the training split and score it on the test split.

    Parameters
    ----------
    trainX, trainY : features and target used to fit the model.
    testX, testY : features and target used for evaluation.

    Returns
    -------
    The R² score of the model's predictions for `testX` measured against `testY`.
    """
    regressor = LinearRegression()
    regressor.fit(trainX, trainY)
    return r2_score(testY, regressor.predict(testX))

# Pre-processing

Data preprocessing is a crucial step in any data analysis or machine learning workflow because raw data is often incomplete, inconsistent, noisy, or in a format unsuitable for modeling. Preprocessing transforms this raw data into a clean, well-structured form, ensuring that the analysis or model can extract meaningful patterns rather than being misled by errors or irrelevant information. This means that we perform some transformation over the data matrix, either in a column or a row-oriented manner.

## Basic Concepts

The **mean (or expected) value** can be calculated as:

$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} \mathbf{x}_i$$

In [None]:
# Calculate the mean of a random vector without using any builtin or numpy functions
# (the seed is fixed so that every run draws the same 50 samples)
np.random.seed(0)
X = np.random.randn(50)

# code here, X_mean should contain the final result
X_mean = None

# the reference value is computed with numpy; the check does not pass until X_mean is filled in
assert np.allclose(np.mean(X), X_mean)

The **variance** measures how much a set of **values deviates from their mean**. A small variance indicates that the data points are close to the mean, while a large variance indicates more spread. It can be seen as the average squared distance from the mean.
$$\sigma^2 = \frac{1}{n}\sum_{i=1}^{n} (x_i - \bar{x})^2$$

In [None]:
# Calculate the variance of a random vector without using any builtin or numpy functions
# you can reuse code from previous tasks
np.random.seed(0)
X = np.random.multivariate_normal([0], [[3]], 50)

# code here, X_var should contain the final result
X_var = None

# the reference must be the variance (np.var), not the standard deviation (np.std);
# the check does not pass until X_var is filled in
assert np.allclose(np.var(X), X_var)

The **standard deviation** is the square root of the variance. It is useful when you want to measure the spread in the same units as your input.

$$\sigma = \sqrt{\frac{1}{n}\sum_{i=1}^{n} (x_i - \bar{x})^2}$$

In [None]:
# Calculate the standard deviation of a random vector without using any builtin or numpy functions
# you can reuse code from previous tasks
np.random.seed(0)
X = np.random.multivariate_normal([0], [[3]], 50)

# code here, X_std should contain the final result
X_std = None

# the reference value is computed with numpy; the check does not pass until X_std is filled in
assert np.allclose(np.std(X), X_std)

The covariance matrix generalizes the concept of variance to multiple dimensions, measuring how pairs of features vary together.
$$\Sigma = \mathrm{Cov}(\mathbf{X}) = \frac{1}{n} (\mathbf{X} - \bar{\mathbf{X}})^\top (\mathbf{X} - \bar{\mathbf{X}})$$

Each element ($\Sigma_{ij}$) represents the covariance between features $i$ and $j$, while the diagonal entries correspond to the variances of individual features. A positive covariance indicates that the features increase together, while a negative covariance indicates an inverse relationship.

In [None]:
# calculate the covariance matrix without using any builtin functions
# no bias correction (normalise by n, exactly as in the formula above)
# you can reuse code from previous tasks
np.random.seed(0)
X = np.random.multivariate_normal([0, 0, 0], [[1, 0.3, 0.1], [0.3, 1, -0.5], [0.1, -0.5, 1]], 50)

# code here, X_cov should contain the final result
X_cov = None

# np.cov normalises by (n - 1) unless bias=True is given; bias=True selects the 1/n
# normalisation used in the formula, so a solution that follows the formula passes.
# The transpose is needed because np.cov expects one variable per row.
assert np.allclose(X_cov, np.cov(X.T, bias=True))

### Correlation and Causality

**Correlation refers to a statistical relationship between two variables** - when changes in one variable are associated with changes in another. For example, ice cream sales and beach attendance often rise together, showing a positive correlation. **Causality, on the other hand, means that one event directly influences or produces another.** If A causes B, then changing A will lead to a predictable change in B. While correlation can hint at possible causal links, it does not prove them.

The key difference is that correlation simply describes a relationship, while causality explains the underlying mechanism of that relationship. Many correlated events share a common cause or are influenced by other variables (confounders). For instance, both ice cream sales and drowning incidents increase in summer, but the cause is warmer weather - not ice cream itself.

**A common misconception is assuming that "correlation implies causation".** This error, sometimes called the [post hoc fallacy](https://en.wikipedia.org/wiki/Post_hoc_ergo_propter_hoc), can lead to flawed conclusions in research, business, and policy-making. Proper causal inference requires careful experimental design, statistical controls, or methods like randomized controlled trials, not just observational data. In short: correlation can point you toward possible causes, but causality must be proven through deeper investigation.

[It is not that hard to find misleading examples!](https://www.tylervigen.com/spurious-correlations)

The linear correlation coefficient (Pearson correlation coefficient) can be calculated as:
$$\rho_{x,y} = \frac{\mathrm{Cov}(x, y)}{\sigma_x \sigma_y} $$

In [132]:
# calculate the correlation matrix without using any builtin functions
# no bias correction
# you can reuse code from previous tasks
np.random.seed(0)
# 50 draws from a three-dimensional normal distribution with the given covariance structure
X = np.random.multivariate_normal([0, 0, 0], [[1, 0.3, 0.1], [0.3, 1, -0.5], [0.1, -0.5, 1]], 50)

# code here, X_cor should contain the final result
X_cor = None

# the reference value is computed with numpy on the transposed data;
# the check does not pass until X_cor is filled in
assert np.allclose(X_cor, np.corrcoef(X.T))

In [None]:
# perform correlation analysis on the features
# from scipy.stats import pearsonr
# np.corrcoef

### Mutual Information

Mutual information is a fundamental concept in information theory that quantifies the amount of information shared between two random variables. Unlike correlation, which only captures linear relationships, mutual information measures all types of dependencies, whether linear or nonlinear. It essentially tells us how much knowing the value of one variable reduces the uncertainty (entropy) about the other.

Entropy can be seen as a measurement that quantifies the amount of uncertainty in a feature, which can be formulated as:
$$H(X) = - \sum_{x \in \mathcal{X}} p(x) \log p(x).$$
Mutual information can be expressed between two variables $X$ and $Y$ as:
$$I(X;Y) = \sum_{x \in \mathcal{X}} \sum_{y \in \mathcal{Y}} p(x,y) \log \frac{p(x,y)}{p(x)p(y)},$$
which can be also expressed by entropy:
$$I(X;Y) = H(X) + H(Y) - H(X,Y).$$

In [None]:
# Calculate the Shannon entropy of numerical features
# 

In [None]:
# Calculate the mutual information between `season` and `weathersit`
# 

In [None]:
# By relying on mutual information, try to discretize the `windspeed` feature into 4 different categories ('low', 'medium', 'high', 'very high')
# 

###  Eigenvalues and Eigenvectors

Eigenvalues and eigenvectors are central concepts in linear algebra that provide deep insights into the behavior of linear transformations represented by matrices. Given a square matrix $A$, an eigenvector $v$ is a non-zero vector that, when multiplied by $A$, only changes in magnitude and not in direction. The factor by which it is scaled is called the eigenvalue $\lambda$, satisfying the equation $Av=\lambda v$. Eigenvalues and eigenvectors reveal fundamental properties of a matrix, such as its invertibility, stability, and the principal directions along which transformations act.

In [33]:
@interact(
    a11=widgets.FloatSlider(min=-3, max=3, step=0.1, value=1),
    a12=widgets.FloatSlider(min=0, max=3, step=0.1, value=0.5),
    a21=widgets.FloatSlider(min=0, max=3, step=0.1, value=0.5),
    a22=widgets.FloatSlider(min=-3, max=3, step=0.1, value=1),
)
def plot_eigen(a11=2.0, a12=0.0, a21=0.0, a22=1.0):
    """Show how the 2x2 matrix A = [[a11, a12], [a21, a22]] deforms the unit circle.

    The unit circle and its image under A are drawn together with one arrow per
    real eigenvalue; each arrow points along the (unit-length) eigenvector and is
    scaled by the eigenvalue. Complex eigenvalues are skipped because they have
    no direction that can be drawn in the real plane.

    Parameters
    ----------
    a11, a12, a21, a22 : float
        Entries of the matrix A (row-major). They are driven by the sliders
        declared in the `interact` decorator.
    """
    A = np.array([[a11, a12],
                  [a21, a22]])

    # Unit circle, sampled as a 2 x 200 array of column vectors
    theta = np.linspace(0, 2*np.pi, 200)
    circle = np.vstack([np.cos(theta), np.sin(theta)])

    # Image of the circle under the linear map
    ellipse = A @ circle

    # If the decomposition fails, fall back to *empty arrays* (not lists), so the
    # `vecs.T` iteration below is simply skipped instead of raising AttributeError.
    try:
        vals, vecs = np.linalg.eig(A)
    except np.linalg.LinAlgError:
        vals, vecs = np.empty(0), np.empty((2, 0))

    plt.figure(figsize=(6,6))

    # Original circle
    plt.plot(circle[0], circle[1], 'k--', alpha=0.5, label="Unit circle")

    # Transformed ellipse
    plt.plot(ellipse[0], ellipse[1], 'b-', label="Transformed circle")

    # Plot eigenvectors (eigenvectors are the columns of `vecs`, hence the transpose)
    for val, vec in zip(vals, vecs.T):
        if np.iscomplex(val):  # skip complex eigenvalues
            continue
        # drop a possible complex dtype with zero imaginary part before plotting
        val = float(np.real(val))
        vec = np.real(vec)
        vec = vec / np.linalg.norm(vec)  # normalize
        plt.arrow(0, 0, val*vec[0], val*vec[1],
                  head_width=0.1, color='r', alpha=0.8,
                  length_includes_head=True)
        plt.text(val*vec[0]*1.1, val*vec[1]*1.1, f"λ={val:.2f}", color='r')

    plt.axhline(0, color='gray', lw=0.5)
    plt.axvline(0, color='gray', lw=0.5)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.xlim(-3, 3)
    plt.ylim(-3, 3)
    plt.legend()
    plt.title(f"Matrix A = [[{a11:.1f}, {a12:.1f}], [{a21:.1f}, {a22:.1f}]]")
    plt.show()

interactive(children=(FloatSlider(value=1.0, description='a11', max=3.0, min=-3.0), FloatSlider(value=0.5, des…

## Categorical data

Categorical data transformation is a critical step in data preprocessing, as it converts nominal variables into numerical formats that machine learning algorithms can interpret. Many statistical and predictive models rely on mathematical operations, which cannot be directly applied to raw categorical values.

### Numeric mapping

Numeric mapping ensures that each categorical value is assigned a numerical value. For example, instead of representing `season` as strings, we remap each value in the following manner: `{'spring': 0, 'summer': 1, 'fall': 2, 'winter': 3}`.

In [19]:
# lets transform each categorical feature to their numerical representation
# 
def _transform_categorical_to_numerical(row):
    """
    Map the categorical values of a single dataset row to numbers.

    row: A row of the dataset. You can access a column by `row.column_name` or row["column_name"]

    Returns the (transformed) row. The exercise body is still a placeholder, so
    the row is currently returned unchanged.
    """
    # code here
    return row


# axis=1 makes `apply` call the function once per row
numeric_mapping_train_X = train_X.apply(_transform_categorical_to_numerical, axis=1)
numeric_mapping_test_X = test_X.apply(_transform_categorical_to_numerical, axis=1)

In [20]:
# evaluate the model
# fit_and_eval(numeric_mapping_train_X, train_y, numeric_mapping_test_X, test_y)

### One-hot encoding

One-hot encoding is a common method for representing categorical variables as binary vectors, where each category is assigned its own column and marked with a `1` for presence and `0` for absence. This approach avoids imposing any false ordinal relationship between categories, making it ideal for nominal data. While it preserves category distinctions, it can significantly increase dimensionality, especially for features with many unique values.

In [21]:
# lets transform each categorical feature to their one-hot encoded representation
# 


# placeholders for the exercise: assign the one-hot encoded train and test feature matrices here
one_hot_mapping_train_X = None
one_hot_mapping_test_X = None

In [22]:
# evaluate the model
# fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

In [23]:
# try to remove dummy variables


## Numerical data

Numerical data transformation involves modifying quantitative variables to improve model performance, interpretability, or compliance with algorithmic assumptions. Common techniques include normalization, standardization, scaling, and non-linear transformations, which help adjust for differences in magnitude, distribution skewness, or outliers. By transforming numerical data appropriately, we can enhance learning efficiency, reduce bias from extreme values, and ensure that each feature contributes proportionately to the model. In short, we can help the model fit better by scaling the numerical features to similar scales (i.e. `windspeed` in km/h scales differently than `temp` in celsius).

In [4]:
# Utility functions for plotting
from scipy.spatial import ConvexHull
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms


def plot_enclosing_ellipse(A, ax, std_levels=(1.0, 2.0, 3.0), styles=('-', '--', ':'), facecolor='none', **kwargs):
    """Draw one covariance ellipse per standard-deviation level around 2-D data.

    Parameters
    ----------
    A : array whose first two columns (`A[:, 0]`, `A[:, 1]`) are the x and y coordinates.
    ax : matplotlib axes the ellipses are added to.
    std_levels : sequence of multipliers; one ellipse is drawn per entry, scaled
        to that many standard deviations.
    styles : line styles for the ellipses. When there are more `std_levels`
        than styles, the styles are reused cyclically.
    facecolor : currently not forwarded to the patches (kept for backward
        compatibility); control the fill through `**kwargs`, e.g. `fill=False`.
    **kwargs : passed on to `matplotlib.patches.Ellipse`.
    """
    x, y = A[:, 0], A[:, 1]
    cov = np.cov(x, y)
    pearson = cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)

    # These depend only on the data, so compute them once outside the loop.
    std_x, std_y = np.sqrt(cov[0, 0]), np.sqrt(cov[1, 1])
    mean_x, mean_y = np.mean(x), np.mean(y)

    for i, sigma in enumerate(std_levels):
        # cycle through the styles instead of raising IndexError on extra levels
        ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                          linestyle=styles[i % len(styles)], **kwargs)

        # Rotate the normalized ellipse by 45 degrees, stretch it to `sigma`
        # standard deviations along each axis, then move it onto the data mean.
        transf = transforms.Affine2D() \
            .rotate_deg(45) \
            .scale(std_x * sigma, std_y * sigma) \
            .translate(mean_x, mean_y)

        ellipse.set_transform(transf + ax.transData)
        ax.add_patch(ellipse)

In [5]:
def plot_transformation(x, x_altered, altered_label, axs):
    """Plot a 2-D point cloud next to its transformed version on the same axes.

    Parameters
    ----------
    x : original points; the first two columns are plotted.
    x_altered : transformed points; the first two columns are plotted.
    altered_label : legend label for the transformed points.
    axs : matplotlib axes to draw on.

    The original data is drawn in red and the transformed data in yellow, each
    with its mean marker and enclosing ellipses. An arrow connects the original
    mean to the transformed mean, and black lines mark the coordinate axes.
    """
    original_mean = np.mean(x, axis=0)

    # original data, its mean and its ellipses
    axs.scatter(x[:, 0], x[:, 1], color="red", label="Original Data")
    axs.scatter(original_mean[0], original_mean[1], marker="x", color="blue", label="Mean")
    plot_enclosing_ellipse(x, axs, fill=False, color='red', linewidth=1)

    # transformed data, its mean and its ellipses
    altered_mean = np.mean(x_altered, axis=0)
    axs.scatter(x_altered[:, 0], x_altered[:, 1], color="yellow", label=altered_label)
    axs.scatter(altered_mean[0], altered_mean[1], marker="x", color="blue")
    plot_enclosing_ellipse(x_altered, axs, fill=False, color='yellow', linewidth=1)

    # arrow from the original mean to the transformed mean
    axs.annotate(
        "",
        xytext=(original_mean[0], original_mean[1]),
        xy=(altered_mean[0], altered_mean[1]),
        arrowprops=dict(arrowstyle="->", color="red"),
    )

    # Remember the current view before drawing the very long axis lines, then
    # restore it so those lines do not change the visible range.
    saved_xlim, saved_ylim = axs.get_xlim(), axs.get_ylim()
    axs.vlines(0, -1000, 1000, color="black", lw=1)
    axs.hlines(0, -1000, 1000, color="black", lw=1)
    axs.set_xlim(*saved_xlim)
    axs.set_ylim(*saved_ylim)

### Mean centering

Mean centering is a technique in which the average value of a variable is subtracted from each data point, resulting in a new variable with a mean of zero.

$$x' = x - \bar{x}, \quad \bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i$$

In [97]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=5),
    std=widgets.FloatSlider(min=1, max=50, value=2),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=5, max=50, step=1, value=10)
)
def mean_centering_interact_plot(loc=20, std=2.0, seed=42, n_samples=10):
    """Interactive demo of mean centering on random 2-D normal samples.

    Parameters
    ----------
    loc : mean of the normal distribution both coordinates are drawn from.
    std : standard deviation of that distribution.
    seed : seed for numpy's global random generator.
    n_samples : number of 2-D points to draw.
    """
    np.random.seed(seed)

    # draw the sample and subtract its column-wise mean
    samples = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    samples_centered = samples - np.mean(samples, axis=0)

    # original vs. centered data
    fig, axs = plt.subplots()
    plot_transformation(samples, samples_centered, "Centered Data", axs)
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=5.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=2.0, d…

In [None]:
# Center the numerical features in our dataset


In [None]:
# Evaluate the model with the new feature values


### Standardization

Standardization is a data transformation technique that rescales numerical features so they have a mean of zero and a standard deviation of one.

$$x' = \frac{x - \bar{x}}{\sigma}$$

In [98]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=3),
    std=widgets.FloatSlider(min=1, max=50, value=2),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def standardization_interact_plot(loc=20, std=2.0, seed=42, n_samples=20):
    """Interactive demo of standardization on random 2-D normal samples.

    Parameters
    ----------
    loc : mean of the normal distribution both coordinates are drawn from.
    std : standard deviation of that distribution.
    seed : seed for numpy's global random generator.
    n_samples : number of 2-D points to draw.
    """
    np.random.seed(seed)

    samples = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    # subtract the column-wise mean, then divide by the column-wise standard deviation
    deviations = samples - np.mean(samples, axis=0)
    samples_standardized = deviations / np.std(deviations, axis=0)

    # original vs. standardized data
    fig, axs = plt.subplots()
    plot_transformation(samples, samples_standardized, "Standardized Data", axs)
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=3.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=2.0, d…

In [None]:
# Standardize the numerical features in our dataset


In [None]:
# Evaluate the model with the new feature values
# fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

### Whitening

Whitening is a data transformation technique that not only standardizes features to have zero mean and unit variance but also removes correlations between them, producing uncorrelated variables with identity covariance. Thus we are looking for a transformation $W$ that satisfies $W\Sigma W^T=I$.

Whitening can be achieved through several techniques, each with its own approach to decorrelate and scale data. Here, we are going to focus on the Cholesky whitening transformation, but more can be seen in the [Additional Materials](#More-Whitening) section. Cholesky whitening finds such a transformation by decomposing $\Sigma^{-1}$ with the Cholesky decomposition.

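The Cholesky decomposition factorizes a positive definite matrix $\Sigma^{-1}$ as:
$$\Sigma^{-1}=LL^T,$$
where $L$ is a lower-triangular matrix with positive diagonal entries. Then the whitening transformation $W$ is defined as:
$$W=L^T,$$
since $W\Sigma W^T = L^T (LL^T)^{-1} L = I$. For a data matrix $X$ whose rows are the (centered) samples, this corresponds to computing $XL$.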

In [6]:
@interact(
    loc_x=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-3),
    loc_y=widgets.FloatSlider(min=-20, max=20, step=0.1, value=3),
    cov=widgets.FloatSlider(min=-0.95, max=0.95, step=0.05, value=0.8),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def whitening_interact_plot(loc_x=10, loc_y=10, cov=0.8, seed=42, n_samples=20):
    """Interactive demo of Cholesky whitening on correlated 2-D normal samples.

    The sample covariance matrix is printed before and after whitening; after
    whitening it should be (numerically) the identity matrix.

    Parameters
    ----------
    loc_x, loc_y : means of the two coordinates.
    cov : off-diagonal entry of the generating covariance matrix [[1, cov], [cov, 1]].
    seed : seed for numpy's global random generator.
    n_samples : number of 2-D points to draw.
    """
    np.random.seed(seed)

    # Generate example data
    x = np.random.multivariate_normal([loc_x, loc_y], [[1, cov], [cov, 1]], n_samples)
    # Print the covariance (as the label says), not the correlation matrix: the
    # correlation matrix always has a unit diagonal and would hide the scale.
    print("Covariance matrix of the original data:")
    print(np.array2string(np.cov(x, rowvar=False), precision=2, floatmode='fixed'))

    x_centered = x - np.mean(x, axis=0)  # mean-centered
    # Covariance matrix (rows are samples, columns are features)
    cov_matrix = np.cov(x_centered, rowvar=False)

    # Cholesky: factor the inverse covariance as L @ L.T with L lower-triangular.
    # The lower factor is numpy's default, so no `upper` keyword is passed; that
    # keyword is not accepted by older NumPy releases.
    L = np.linalg.cholesky(np.linalg.pinv(cov_matrix))
    # samples are rows, so the whitening matrix is applied from the right
    X_chol_white = x_centered @ L
    print("Covariance matrix of Cholesky whitening:")
    print(np.array2string(np.cov(X_chol_white, rowvar=False), precision=2, floatmode='fixed'))

    # Plot original vs whitened
    fig, axs = plt.subplots(1, 1)
    axs.set_aspect('equal')
    fig.set_size_inches(5.5, 4)
    axs.set_title('Cholesky whitening')

    plot_transformation(x, X_chol_white, "Whitened Data", axs)
    # remove duplicated legend labels
    handles, labels = fig.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    fig.legend(by_label.values(), by_label.keys(), loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-3.0, description='loc_x', max=20.0, min=-20.0), FloatSlider(value=3.0…

In [None]:
# Apply whitening transformation to the numeric features


In [None]:
# Evaluate the model with the new feature values
# fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

### Min-max scaling

Min-max normalization is a technique that rescales the values of a feature to a fixed range, usually `[0, 1]`, by subtracting the minimum value and dividing by the range of the data. This ensures that all features contribute proportionately, preventing variables with larger scales from dominating models.

$$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$

In [7]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-1),
    std=widgets.FloatSlider(min=0.1, max=2, step=0.1, value=0.5),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def min_max_interact_plot(loc=-1, std=0.5, seed=42, n_samples=20):
    """Interactive demo of min-max scaling on random 2-D normal samples.

    Parameters
    ----------
    loc : mean of the normal distribution both coordinates are drawn from.
    std : standard deviation of that distribution.
    seed : seed for numpy's global random generator.
    n_samples : number of 2-D points to draw.
    """
    np.random.seed(seed)

    # Generate example data
    x = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))

    # rescale every column to [0, 1]: subtract its minimum, divide by its range
    col_min = np.min(x, axis=0)
    col_max = np.max(x, axis=0)
    x_mm = (x - col_min) / (col_max - col_min)

    # Plot original vs min-max normalized
    fig, axs = plt.subplots(1, 1)
    plot_transformation(x, x_mm, "Min-Max Normalized Data", axs)
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-1.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=0.5, …

In [None]:
# Apply min-max transformation to the numeric features


In [None]:
# Evaluate the model with the new feature values
# fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

### Unit normalization

Unit normalization is a data transformation technique that scales each data vector to have a unit norm, typically a length of 1, by dividing the vector by its magnitude. This emphasizes the direction of the data rather than its absolute scale.

$$\mathbf{x}' = \frac{\mathbf{x}}{\|\mathbf{x}\|_2} = \frac{\mathbf{x}}{\sqrt{\sum_{i=1}^{d} x_i^2}}$$

In [165]:
@interact(
    loc=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-1),
    std=widgets.FloatSlider(min=0.1, max=2, step=0.1, value=0.5),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def unit_norm_interact_plot(loc=-1, std=0.5, seed=42, n_samples=20, center=False):
    """Interactive demo of unit (L2) normalization on random 2-D normal samples.

    Parameters
    ----------
    loc : mean of the normal distribution both coordinates are drawn from.
    std : standard deviation of that distribution.
    seed : seed for numpy's global random generator.
    n_samples : number of 2-D points to draw.
    center : when True, the column-wise mean is subtracted before normalizing.
    """
    np.random.seed(seed)

    # Generate example data
    x = np.random.normal(loc=loc, scale=std, size=(n_samples, 2))
    xc = x
    if center:
        xc = x - np.mean(x, axis=0)
    # divide every row by its own Euclidean length
    x_unit = xc / np.linalg.norm(xc, axis=1)[:, None]

    # Plot original vs unit normalized
    fig, axs = plt.subplots(1, 1)
    plot_transformation(xc, x_unit, "Unit Normalized Data", axs)
    # use the same tick positions on both axes
    ticks = axs.get_xticks()
    axs.set_xticks(ticks)
    axs.set_yticks(ticks)
    fig.legend(loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-1.0, description='loc', max=20.0, min=-20.0), FloatSlider(value=0.5, …

In [None]:
# Apply unit normalization to the numeric features


In [None]:
# Evaluate the model with the new feature values
# fit_and_eval(one_hot_mapping_train_X, train_y, one_hot_mapping_test_X, test_y)

# Additional Materials

## Bonferroni’s principle 

Bonferroni’s principle is a statistical caution that says:
> If you keep looking for patterns in data without adjusting your criteria, you’re bound to find "significant" results purely by chance.

It reminds us that if you search a large enough dataset for correlations, patterns, or anomalies without proper statistical controls, you will almost certainly find patterns that are just random noise. This is especially important when working with high-dimensional data, where the number of possible comparisons is huge.

For example, if you test $m=20$ independent hypotheses at a $5\%$ significance level ($p < 0.05$), you should expect about 1 false positive even if none of the hypotheses are actually true. Bonferroni’s correction addresses this by lowering the threshold for each test:

$$\alpha'=\frac{\alpha}{m}$$

In [None]:
import numpy as np
from scipy.stats import pearsonr

# upper bounds of the two sliders below
_max_n_samples=200
_max_n_variables=50

@interact(
    n_samples=widgets.IntSlider(min=5, max=_max_n_samples, step=1, value=100),
    n_variables=widgets.IntSlider(min=5, max=_max_n_variables, step=1, value=5)
)
def bonferroni(n_samples=100, n_variables=20):
    """Count spurious correlations among independent random variables.

    Draws `n_variables` independent standard-normal variables with `n_samples`
    observations each, runs a Pearson correlation test on every pair, and
    reports how many pairs look "significant" at alpha = 0.05 with and without
    Bonferroni correction. The p-value distribution is shown as two histograms
    (full range, and zoomed in on [0, 0.1]).

    Parameters
    ----------
    n_samples : number of observations per variable.
    n_variables : number of independent variables to generate.
    """
    np.random.seed(42)
    # Generate random data (completely uncorrelated); columns are variables
    data = np.random.randn(n_variables, n_samples).T

    alpha = 0.05  # significance level

    # number of unordered pairs of variables
    total_tests = n_variables * (n_variables - 1) // 2

    # Apply Bonferroni correction (guarded so that "no tests" cannot divide by zero)
    alpha_bonferroni = alpha / max(total_tests, 1)

    # Test all pairs of variables
    p_values = []
    for i in range(n_variables):
        for j in range(i+1, n_variables):
            _, p_value = pearsonr(data[:, i], data[:, j])
            p_values.append(p_value)
    p_values = np.array(p_values)

    # the variables are independent by construction, so every hit is a false positive
    false_positives = int(np.sum(p_values < alpha))
    false_positives_corrected = int(np.sum(p_values < alpha_bonferroni))

    print(f"Total tests: {total_tests}")
    print(f"False positives at α={alpha}: {false_positives}")
    print(f"False positives after Bonferroni correction (α={alpha_bonferroni}): {false_positives_corrected}")
    print(f"Adjusted alpha: {alpha_bonferroni:.6f}")

    fig, axs = plt.subplots(1, 2)

    # axvline's ymin/ymax are fractions of the axes height (0..1), not data
    # values, so the defaults are used to span the full height.
    axs[0].hist(p_values)
    axs[0].axvline(alpha, color="orange", label="α=0.05")
    axs[0].axvline(alpha_bonferroni, color="red", label="Bonferroni Corrected")
    axs[0].set_title("Distribution of P-values")

    axs[1].hist(p_values[p_values<=0.1])
    axs[1].axvline(alpha, color="orange")
    axs[1].axvline(alpha_bonferroni, color="red")
    axs[1].set_xlim(-1e-3, 0.1)
    axs[1].set_title("Zoomed in [0, 0.1]")
    fig.legend(loc='lower center', ncols=2, bbox_to_anchor=(0.5, -0.05))

In [None]:
# following Bonferroni's principle, how does the number of correlated features change?


## More Whitening

| Feature / Aspect           | Cholesky Whitening                                                                 | PCA Whitening                                                                 | ZCA Whitening                                                                 |
|----------------------------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------------|
| **Purpose**                | Transform correlated variables into uncorrelated ones using covariance factorization | Decorrelate and normalize data by projecting onto principal components        | Decorrelate and normalize data while keeping it as close as possible to original space |
| **Decomposition** | Cholesky: $\Sigma = LL^T$ | PCA: $\Sigma = V \Lambda V^T $ | PCA: $\Sigma = V \Lambda V^T $ |
| **Transformation** | $x_{\text{whitened}} = L^{-1}x$               | $x_{\text{whitened}} = \Lambda^{-1/2} V^T x $ | $x_{\text{whitened}} = V \Lambda^{-1/2} V^T x$                                  |
| **Output Correlation**      | Zero (uncorrelated)                                                               | Zero (uncorrelated)                                                           | Zero (uncorrelated)                                                           |
| **Output Orientation**      | Depends on Cholesky factor; not aligned with original axes                        | Aligned with principal components; rotated axes                               | Closest to original data orientation; minimal rotation                        |
| **Preserves Original Structure?** | No                                                                          | Partially (rotated along principal axes)                                      | Yes, maintains overall data structure                                   |

In [164]:
@interact(
    loc_x=widgets.FloatSlider(min=-20, max=20, step=0.1, value=-3),
    loc_y=widgets.FloatSlider(min=-20, max=20, step=0.1, value=3),
    cov=widgets.FloatSlider(min=-0.95, max=0.95, step=0.05, value=0.8),
    seed=widgets.IntSlider(min=0, max=50, step=1, value=5),
    n_samples=widgets.IntSlider(min=10, max=100, step=1, value=20)
)
def whitening_interact_plot(loc_x=10, loc_y=10, cov=0.8, seed=42, n_samples=20):
    """Compare Cholesky, PCA and ZCA whitening on a random 2-D Gaussian sample.

    Draws ``n_samples`` points from a bivariate normal distribution with mean
    ``[loc_x, loc_y]``, unit variances and off-diagonal covariance ``cov``,
    whitens the centered sample in three different ways, prints the sample
    covariance matrix before and after each transformation, and plots the
    results side by side.

    Parameters
    ----------
    loc_x, loc_y : float
        Mean of the generated data along the first / second coordinate.
    cov : float
        Off-diagonal entry of the generating covariance matrix
        ``[[1, cov], [cov, 1]]``. Must satisfy ``|cov| < 1`` for the matrix
        to be positive definite (the slider is limited to [-0.95, 0.95]).
    seed : int
        Seed passed to ``np.random.seed`` so that a slider position always
        reproduces the same sample.
    n_samples : int
        Number of points to draw.

    Returns
    -------
    None
        The function only prints and plots.
    """
    def print_covariance(label, data):
        # np.cov (not np.corrcoef): a correlation matrix is scale invariant, so
        # it could not show that whitening also normalizes the variances to 1.
        print(f"Covariance matrix of {label}:")
        print(np.array2string(np.cov(data, rowvar=False), precision=2, floatmode='fixed'))

    np.random.seed(seed)

    # Generate example data: one row per sample, one column per coordinate.
    x = np.random.multivariate_normal([loc_x, loc_y], [[1, cov], [cov, 1]], n_samples)
    print_covariance("the original data", x)

    # Whitening operates on mean-centered data.
    x_centered = x - np.mean(x, axis=0)
    cov_matrix = np.cov(x_centered, rowvar=False)

    # Eigendecomposition of the symmetric sample covariance: Sigma = V diag(eigvals) V^T.
    eigvals, eigvecs = np.linalg.eigh(cov_matrix)

    # PCA whitening: rotate onto the principal axes, then scale each axis to unit variance.
    # NOTE(review): assumes all eigenvalues are strictly positive, i.e. the sample
    # covariance is non-singular; a degenerate sample would divide by zero here.
    D_inv_sqrt = np.diag(1.0 / np.sqrt(eigvals))
    X_pca_white = x_centered @ eigvecs @ D_inv_sqrt

    # ZCA whitening: rotate the PCA-whitened data back to the original orientation.
    X_zca_white = X_pca_white @ eigvecs.T

    # Cholesky whitening via the precision matrix: with Sigma^{-1} = L L^T, the
    # row-vector transform x @ L has covariance L^T Sigma L = I.
    # The lower factor is the default; the `upper` keyword is omitted because it
    # only exists in NumPy >= 2.0.
    L = np.linalg.cholesky(np.linalg.pinv(cov_matrix))
    X_chol_white = x_centered @ L

    print_covariance("Cholesky whitening", X_chol_white)
    print_covariance("PCA whitening", X_pca_white)
    print_covariance("ZCA whitening", X_zca_white)

    # Plot the original data against each whitened version, one panel per method.
    fig, axs = plt.subplots(1, 3, sharex=True, sharey=True)
    fig.set_size_inches(10.5, 4)
    panels = [
        ('Cholesky whitening', X_chol_white),
        ('PCA whitening', X_pca_white),
        ('ZCA whitening', X_zca_white),
    ]
    for ax, (title, whitened) in zip(axs, panels):
        ax.set_aspect('equal')
        ax.set_title(title)
        # plot_transformation is defined elsewhere in the notebook.
        plot_transformation(x, whitened, "Whitened Data", ax)

    # All panels share the same legend entries: keep one handle per label and
    # show a single figure-level legend.
    handles, labels = fig.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    fig.legend(by_label.values(), by_label.keys(), loc='lower center', ncols=3, bbox_to_anchor=(0.5, -0.05))

interactive(children=(FloatSlider(value=-3.0, description='loc_x', max=20.0, min=-20.0), FloatSlider(value=3.0…

In [None]:
# Apply whitening transformation to the numeric features


In [None]:
# Evaluate the model with the new feature values
# fit_and_eval(one_hot_mapping_train_X, trainY, one_hot_mapping_test_X, testY)