In [1]:
import numpy as np

#### 1. Apply a min-max normalization (feature-wise)

Given the following matrix

$M = \begin{bmatrix}
2 & 8 & 6\\
1 & 7 & 4\\
3 & 6 & 5
\end{bmatrix}$

In [2]:
# Matrix M from the exercise statement, written one row per line.
arr = np.array(
    [
        [2.0, 8.0, 6.0],
        [1.0, 7.0, 4.0],
        [3.0, 6.0, 5.0],
    ]
)


def min_max_normalization(arr: np.ndarray) -> np.ndarray:
    """Rescale every feature (column) to the [0, 1] range independently.

    Each column is mapped through ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. A constant column (``max == min``)
    is mapped to all zeros instead of dividing by zero.

    The input is never modified: the result is always a new float array,
    so integer inputs are normalized correctly instead of being truncated.

    Args:
        arr (np.ndarray): 2-D array of shape (n_samples, n_features) to be normalized.

    Returns:
        np.ndarray: New float array with the same shape as the input.
    """
    data = np.asarray(arr, dtype=float)

    # No rows means no min/max to compute; hand back an empty copy.
    if data.shape[0] == 0:
        return data.copy()

    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min

    # Constant columns have a zero range; dividing their (all-zero) numerator
    # by 1 yields the intended zeros without a division-by-zero warning.
    safe_range = np.where(col_range == 0, 1.0, col_range)

    return (data - col_min) / safe_range


# Normalize a copy so `arr` is guaranteed to stay intact for the comparison below.
normalized_arr = min_max_normalization(arr.copy())
print("Original array:", arr, sep="\n")
print("Normalized array:", normalized_arr, sep="\n")

Original array:
[[2. 8. 6.]
 [1. 7. 4.]
 [3. 6. 5.]]
Normalized array:
[[0.5 1.  1. ]
 [0.  0.5 0. ]
 [1.  0.  0.5]]


#### 2. One-hot encoding

Compute the one-hot encoding of the following array

$v = [0, 2, 1, 3]$

[What is one-hot encoding?](https://dataheroes.ai/glossary/one-hot-encoding/)

In [None]:
# Integer class labels to encode (vector v from the exercise statement).
arr = np.array([0, 2, 1, 3])


def one_hot_encoding(arr: np.ndarray) -> np.ndarray:
    """One-hot encode a vector of non-negative integer class labels.

    The number of classes is inferred as ``arr.max() + 1``; row ``i`` of the
    result holds a single 1 in column ``arr[i]`` and 0 everywhere else.

    Args:
        arr (np.ndarray): 1-D array of non-negative integer labels.

    Returns:
        np.ndarray: Float array of shape (n_samples, n_classes). An empty
            input yields an array of shape (0, 0).

    Raises:
        ValueError: If any label is negative. A negative label would
            otherwise index from the end of the row and mark the wrong class.
    """
    labels = np.asarray(arr)

    # max() is undefined on an empty array, so answer that case directly.
    if labels.size == 0:
        return np.zeros((0, 0))

    if labels.min() < 0:
        raise ValueError("one-hot encoding requires non-negative labels")

    n_classes = labels.max() + 1
    one_hot = np.zeros((labels.size, n_classes))

    # Fancy indexing pairs row i with column labels[i] in a single assignment.
    one_hot[np.arange(labels.size), labels] = 1

    return one_hot


# Encode a copy of the labels and show input and result one after the other.
one_hot_arr = one_hot_encoding(arr.copy())
print("Original array:", arr, sep="\n")
print("One-hot encoded array:", one_hot_arr, sep="\n")


def one_hot_encoding_with_broadcasting(arr: np.ndarray) -> np.ndarray:
    """One-hot encode non-negative integer labels using broadcasting only.

    Args:
        arr (np.ndarray): 1-D array of non-negative integer labels.

    Returns:
        np.ndarray: Float array of shape (n_samples, n_classes), where
            ``n_classes`` is ``arr.max() + 1``. An empty input yields an
            array of shape (0, 0).

    Raises:
        ValueError: If any label is negative (it cannot match any class
            index and would silently produce an all-zero row).
    """
    labels = np.asarray(arr)

    # max() is undefined on an empty array, so answer that case directly.
    if labels.size == 0:
        return np.zeros((0, 0))

    if labels.min() < 0:
        raise ValueError("one-hot encoding requires non-negative labels")

    # class_ids has shape (n_classes,); labels[:, np.newaxis] is a column of
    # shape (n_samples, 1). Comparing them broadcasts to a boolean array of
    # shape (n_samples, n_classes) whose row i is True only at column labels[i].
    class_ids = np.arange(labels.max() + 1)
    return (class_ids == labels[:, np.newaxis]).astype(float)


# Same demo as for the loop-based encoder, this time through the broadcasting variant.
one_hot_arr_broadcasting = one_hot_encoding_with_broadcasting(arr.copy())
print("Original array:", arr, sep="\n")
print("One-hot encoded array with broadcasting:", one_hot_arr_broadcasting, sep="\n")

Original array:
[0 2 1 3]
One-hot encoded array:
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]
Original array:
[0 2 1 3]
One-hot encoded array with broadcasting:
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]


#### 3. Cosine similarity

Get the cosine similarity for the following arrays

$v_1 = [1.0, 2.0, 3.0]$

$v_2 = [4.0, 5.0, 6.0]$

Resources:

- [Definition of cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
- [Why is it useful](https://www.datastax.com/guides/what-is-cosine-similarity)

In [14]:
# The two example vectors; they are built from integer literals, so they hold integers.
v_1 = np.array([1, 2, 3])
v_2 = np.array([4, 5, 6])


def cosine_similarity(arr1: np.ndarray, arr2: np.ndarray) -> float:
    """Compute the cosine similarity between two 1-D vectors.

    The similarity is ``dot(arr1, arr2) / (||arr1|| * ||arr2||)``. When either
    vector has zero norm the angle is undefined and 0.0 is returned.

    Args:
        arr1 (np.ndarray): First vector (1-D).
        arr2 (np.ndarray): Second vector (1-D), same length as ``arr1``.

    Returns:
        float: Cosine similarity between the two vectors, as a plain Python float.

    Raises:
        ValueError: If either input is not 1-D, or (raised by ``np.dot``)
            if the vectors differ in length.
    """
    vec_a = np.asarray(arr1)
    vec_b = np.asarray(arr2)

    if vec_a.ndim != 1 or vec_b.ndim != 1:
        raise ValueError("cosine_similarity expects two 1-D vectors")

    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # Avoid division by zero: a zero vector has no direction.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # float() keeps the return type uniform with the zero-norm branch above.
    return float(dot_product / (norm_a * norm_b))


# Compute the similarity once, then echo both inputs followed by the result.
similarity = cosine_similarity(v_1, v_2)
print("Vector 1:", v_1, sep="\n")
print("Vector 2:", v_2, sep="\n")
print("Cosine similarity:", similarity, sep="\n")

Vector 1:
[1 2 3]
Vector 2:
[4 5 6]
Cosine similarity:
0.9746318461970762


#### 4. Outlier detection

Given the following array

$v = [10, 12, 11, 13, 10, 200, 9, 11]$ 

Get the indices of all elements that lie outside the interval $avg(v) \pm 2\sigma$.

In [None]:
# Sample data from the exercise statement; 200 is far larger than every other value.
v = np.array([10, 12, 11, 13, 10, 200, 9, 11])


def get_outliers_indexes(arr: np.ndarray, n_std: float = 2.0) -> np.ndarray:
    """Return the positions of values far from the mean.

    A value is an outlier when its absolute distance from the mean is
    strictly greater than ``n_std`` times the standard deviation
    (``np.std`` default, i.e. the population standard deviation).

    Args:
        arr (np.ndarray): 1-D array of numeric values.
        n_std (float): Number of standard deviations that defines the
            cut-off. Defaults to 2.0.

    Returns:
        np.ndarray: Integer indexes of the outliers, in ascending order.
            Empty when there are no outliers or the input is empty.

    Raises:
        ValueError: If ``n_std`` is negative.
    """
    if n_std < 0:
        raise ValueError("n_std must be non-negative")

    data = np.asarray(arr)

    # Mean and std of an empty array are NaN (with warnings); there is
    # nothing to flag, so return an empty index array directly.
    if data.size == 0:
        return np.array([], dtype=np.intp)

    threshold = n_std * np.std(data)
    is_outlier = np.abs(data - np.mean(data)) > threshold

    # Convert the boolean mask into the indexes where it is True.
    return np.nonzero(is_outlier)[0]


# Look for outliers in a copy of `v`, then print the data next to the flagged indexes.
outliers = get_outliers_indexes(v.copy())
print("Original array:", v, sep="\n")
print("Outliers indexes:", outliers, sep="\n")

Original array:
[ 10  12  11  13  10 200   9  11]
Outliers indexes:
[5]


#### 5. Grouping by key

Given the following arrays

$k = ["user1", "user2", "user1", "user3", "user2"]$

$v = [100, 200, 150, 50, 300]$

Return a dictionary mapping each user to the average of their values.

In [None]:
# legacy="1.25" makes NumPy scalars print without their type wrapper
# (e.g. 125.0 rather than np.float64(125.0)), which keeps printed dicts readable.
np.set_printoptions(legacy="1.25")

# Parallel arrays: values[i] belongs to the user named in keys[i].
keys = np.array(["user1", "user2", "user1", "user3", "user2"])
values = np.array([100, 200, 150, 50, 300])


def group_by_keys(keys: np.ndarray, values: np.ndarray) -> dict:
    """Average the values that share the same key.

    ``keys`` and ``values`` are parallel arrays: ``values[i]`` belongs to
    ``keys[i]``. The result maps every distinct key to the mean of its values.
    Keys and means are returned as native Python objects (e.g. ``str`` and
    ``float``) rather than NumPy scalars, so the dictionary prints cleanly
    regardless of NumPy's print options.

    Args:
        keys (np.ndarray): 1-D array of keys.
        values (np.ndarray): 1-D array of numeric values, same length as ``keys``.

    Returns:
        dict: Mapping ``key -> mean of the values with that key``, ordered by
            sorted key (the order produced by ``np.unique``).

    Raises:
        ValueError: If ``keys`` and ``values`` differ in length.
    """

    def _native(scalar):
        # NumPy scalars expose .item(); anything else is already a Python object.
        return scalar.item() if isinstance(scalar, np.generic) else scalar

    keys = np.asarray(keys)
    values = np.asarray(values)

    # Misaligned inputs would silently pair values with the wrong keys.
    if keys.shape[0] != values.shape[0]:
        raise ValueError("keys and values must have the same length")

    return {
        _native(key): _native(values[keys == key].mean())
        for key in np.unique(keys)
    }


# Aggregate copies of the inputs, then print keys, values and the per-key result.
grouped_dict = group_by_keys(keys.copy(), values.copy())
print("Keys:", keys, sep="\n")
print("Values:", values, sep="\n")
print("Grouped dictionary:", grouped_dict, sep="\n")

Keys:
['user1' 'user2' 'user1' 'user3' 'user2']
Values:
[100 200 150  50 300]
Grouped dictionary:
{'user1': 125.0, 'user2': 250.0, 'user3': 50.0}
