In [66]:
import numpy as np
import pandas as pd
from typing import *
import sys
sys.path.append('/Users/emicatx/Downloads/CMOR_438_FALL_2025_Emilia/src')
from rice_ml.preprocess.datatype import *

In [67]:
# Type alias used to annotate vector/matrix-like inputs in this notebook:
# NumPy arrays, flat or nested sequences of floats, and pandas DataFrame/Series.
ArrayLike = Union[np.ndarray, Sequence[float], Sequence[Sequence[float]], pd.DataFrame, pd.Series]

def _ensure_numeric(data_vector: ArrayLike, name: str = 'Data') -> np.ndarray:
    """Return ``data_vector`` as a float NumPy array.

    The input is first handed to the project helper ``_1D_vectorized``
    (presumably it validates/flattens to a 1-D array -- see its definition),
    and the result is then cast to ``float``.

    Parameters
    ----------
    data_vector : ArrayLike
        The data to convert.
    name : str, default 'Data'
        Label forwarded to ``_1D_vectorized`` and used in the error message.

    Returns
    -------
    np.ndarray
        The array produced by ``_1D_vectorized``, cast to ``float``
        (``copy=False``, so no copy is made when the dtype is already float).

    Raises
    ------
    TypeError
        If the array does not have a numeric dtype and cannot be cast to float.

    Examples
    --------
    >>> _ensure_numeric(np.array([True, False, True, False]))
    array([1., 0., 1., 0.])
    """
    flat = _1D_vectorized(data_vector, name)

    # Numeric dtypes are cast directly, without the error translation below.
    if np.issubdtype(flat.dtype, np.number):
        return flat.astype(float, copy = False)

    # Non-numeric dtypes (bool, object, str, ...) get one conversion attempt;
    # a failed cast is reported as a TypeError naming the offending input.
    try:
        return flat.astype(float, copy = False)
    except (TypeError, ValueError) as conversion_error:
        raise TypeError(f'All entries in {name} must be numeric') from conversion_error

In [68]:
# Sanity check: a boolean array should come back as floats (1.0 / 0.0).
test_array = np.array([True, False, True, False])
_ensure_numeric(test_array)

array([1., 0., 1., 0.])

In [69]:
# Check what _1D_vectorized returns for a plain Python list of ints.
x = [1, 1, 1]
_1D_vectorized(x, 'x')

array([1, 1, 1])

In [70]:
def euclidean_distance(data_vector_1: ArrayLike,
                       data_vector_2: ArrayLike,
                       name_1: str = 'data_vector_1',
                       name_2: str = 'data_vector_2') -> float:
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted with ``_ensure_numeric`` and then passed to the
    project helper ``_shape_match`` (presumably it raises when the shapes
    differ -- confirm against its definition).

    Parameters
    ----------
    data_vector_1, data_vector_2 : ArrayLike
        The two vectors to compare.
    name_1, name_2 : str
        Labels forwarded to ``_ensure_numeric`` for error messages.

    Returns
    -------
    float
        The 2-norm of ``data_vector_2 - data_vector_1``.

    Examples
    --------
    >>> euclidean_distance([-1, -2], [2, 2])
    5.0
    """
    first = _ensure_numeric(data_vector_1, name_1)
    second = _ensure_numeric(data_vector_2, name_2)

    _shape_match(first, second)

    difference = second - first
    return float(np.linalg.norm(difference))

In [71]:
def manhattan_distance(data_vector_1: ArrayLike,
                       data_vector_2: ArrayLike,
                       name_1: str = 'data_vector_1',
                       name_2: str = 'data_vector_2') -> float:
    """Return the Manhattan (L1) distance between two vectors.

    Both inputs are converted with ``_ensure_numeric`` and then passed to the
    project helper ``_shape_match`` (presumably it raises when the shapes
    differ -- confirm against its definition).

    Parameters
    ----------
    data_vector_1, data_vector_2 : ArrayLike
        The two vectors to compare.
    name_1, name_2 : str
        Labels forwarded to ``_ensure_numeric`` for error messages.

    Returns
    -------
    float
        The sum of the absolute component-wise differences.

    Examples
    --------
    >>> manhattan_distance([-1, -2], [2, 2])
    7.0
    """
    first = _ensure_numeric(data_vector_1, name_1)
    second = _ensure_numeric(data_vector_2, name_2)

    _shape_match(first, second)

    absolute_difference = np.abs(second - first)
    return float(np.sum(absolute_difference))

In [72]:
def minkowski_distance(data_vector_1: ArrayLike, 
                       data_vector_2: ArrayLike, 
                       p: int,
                       name_1: str = 'data_vector_1', 
                       name_2: str = 'data_vector_2') -> float:
    """Return the Minkowski distance of order ``p`` between two vectors.

    Computes ``(sum(|v2 - v1| ** p)) ** (1 / p)``. ``p = 1`` gives the
    Manhattan distance and ``p = 2`` the Euclidean distance.

    Both inputs are converted with ``_ensure_numeric`` and then passed to the
    project helper ``_shape_match`` (presumably it raises when the shapes
    differ -- confirm against its definition).

    Parameters
    ----------
    data_vector_1, data_vector_2 : ArrayLike
        The two vectors to compare.
    p : int
        Order of the distance. Must be a positive integer; NumPy integer
        scalars (e.g. ``np.int64(3)``) are accepted as well as Python ints.
    name_1, name_2 : str
        Labels forwarded to ``_ensure_numeric`` for error messages.

    Returns
    -------
    float
        The order-``p`` Minkowski distance.

    Raises
    ------
    TypeError
        If ``p`` is not an integer.
    ValueError
        If ``p`` is not greater than zero.

    Examples
    --------
    >>> minkowski_distance([-1, -2], [2, 2], 3)
    4.497941445275415
    """
    # np.integer is not a subclass of int, so a bare isinstance(p, int) check
    # would reject orders coming from NumPy (np.arange sweeps, array elements).
    if not isinstance(p, (int, np.integer)):
        raise TypeError('p parameter must be an integer')
    if p <= 0:
        raise ValueError('p parameter must be greater than zero')
    order = int(p)

    vector_1 = _ensure_numeric(data_vector_1, name_1)
    vector_2 = _ensure_numeric(data_vector_2, name_2)

    _shape_match(vector_1, vector_2)

    absolute_difference = np.abs(vector_2 - vector_1)
    distance = float(np.sum(absolute_difference ** order) ** (1 / order))

    return distance

In [73]:
# Two 2-D sample points; their component-wise differences are 3 and 4.
a = [-1, -2]
b = [2, 2]

In [74]:
# Smoke-test all three metrics on the same pair of points.
# Only the last expression's value is displayed by the notebook; the first two
# results are computed and discarded.
euclidean_distance(a, b, 'a', 'b')
manhattan_distance(a,b)
minkowski_distance(a, b, 3)

4.497941445275415

In [75]:
# Check that a pandas Series of ints is converted to a float NumPy array.
input_1 = pd.Series([1, 1, 1])

output = _ensure_numeric(input_1)

output

array([1., 1., 1.])

In [104]:
# Small 2x2 fixtures (two 2-D points each) for the pairwise-distance experiments.
X_query = np.array([[1, 2], [3, 4]])
X_train = np.array([[4, 6], [0, 0]])

In [115]:
def _distance_calculations(training_array: np.ndarray, query_array: np.ndarray, metric: str, p: Optional[int] = 3) -> np.ndarray:

    # TODO: docstrings, examples (query is row, training is column)

    query_array = _2D_numeric(query_array)
    training_array = _2D_numeric(training_array)

    distance_matrix = np.full((query_array.shape[0], training_array.shape[0]), np.nan)
    for index_1, point_1 in enumerate(query_array):
        for index_2, point_2 in enumerate(training_array):
            if metric == 'euclidean':
                distance = euclidean_distance(point_1, point_2)
            elif metric == 'manhattan':
                distance = manhattan_distance(point_1, point_2)
            elif metric == 'minkowski':
                distance = minkowski_distance(point_1, point_2, p = p)
            else:
                raise ValueError(f"Unsupported metric: {metric}")
            distance_matrix[index_1, index_2] = distance

    return distance_matrix

In [116]:

def _neighbor_finding(training_array: np.ndarray, query_array: np.ndarray, k: int, metric: str, p: Optional[int] = 3) -> Tuple[np.ndarray, np.ndarray]:
    
    # TODO: docstrings and examples, potentially add further checks for other inputs

    if k > training_array.shape[0]:
        raise ValueError(f'Number of neighbors (k = {k}) cannot be greater than number of training samples ({training_array.shape[0]})')
    
    distance_matrix = _distance_calculations(training_array, query_array, metric = metric, p = p)
    indices = np.argpartition(distance_matrix, kth = k - 1, axis = 1)[:, 0:k]

    query_indices = np.arange(distance_matrix.shape[0])[:, None]
    neighbor_distances = distance_matrix[query_indices, indices]
    ordering = np.argsort(neighbor_distances, axis = 1)
    sorted_indices = indices[query_indices, ordering]
    distances_sorted = neighbor_distances[query_indices, ordering]

    return distances_sorted, sorted_indices

In [130]:
# Scratch: accuracy computed as the fraction of positions where two arrays agree.
one = np.array([1, 2, 3])
two = np.array([1, 2, 4])

# Element-wise equality -> 1.0 / 0.0, then the mean is the match rate.
overlap = (one == two).astype(float)
mean_accuracy = float(np.mean(overlap))
mean_accuracy

0.6666666666666666

In [146]:
# Scratch: look up labels through an index matrix and weight them element-wise.
labels = np.array([1, 2, 3, 4])
indices = np.array([[1, 2], [2, 3]])
n_neighbor = np.array([[0.5, 1], [1, 2]])
# labels[indices] has the same shape as indices (one label per index entry).
print(labels[indices])
print(n_neighbor)
weighted = labels[indices] * n_neighbor
print(weighted)
# Row-wise mean of the weighted labels.
(np.mean(weighted, axis = 1)).astype(float, copy = False)

[[2 3]
 [3 4]]
[[0.5 1. ]
 [1.  2. ]]
[[1. 3.]
 [3. 8.]]


array([2. , 5.5])

In [158]:
# Scratch: detect zero weights and select the targets whose weight is non-zero.
weight = np.array([[0, 1, 1]])
targets = np.array([[1, 2, 3]])
# np.any already returns a boolean, so the .astype(bool) is redundant here.
if np.any(weight == 0).astype(bool):
    print('yay')

# np.where with a single argument returns one index array per dimension.
print(np.where(weight != 0))
targets[np.where(weight != 0)]

yay
(array([0, 0]), array([1, 2]))


array([2, 3])

In [118]:
# Training data (3 points in 2D)
X_train = np.array([
    [0, 0],
    [2, 2],
    [1, 3],
])

# Query data (2 points in 2D)
X_query = np.array([
    [1, 1],
    [3, 2],
])

k = 2
metric = "euclidean"

# Pass k itself (not a repeated literal) so that editing k above actually
# changes the number of neighbors searched for.
distances, indices = _neighbor_finding(X_train, X_query, k, metric = metric)
print(distances)
print(indices)

[[1.41421356 1.41421356]
 [1.         2.23606798]]
[[0 1]
 [1 2]]


In [168]:
# Seeded generator; draw three standard-normal samples and shape them as a column.
rng = np.random.default_rng(42)
rng.standard_normal(3).reshape(-1, 1)

array([[ 0.30471708],
       [-1.03998411],
       [ 0.7504512 ]])

In [None]:
# NOTE(review): the signature is (training_array, query_array, metric, p), so here
# X_query is used as the training set and X_train as the query set; rows of the
# result therefore correspond to X_train. Confirm this order is intended.
_distance_calculations(X_query, X_train, 'minkowski', 3)

[1. 2.]
[4. 6.]
[0. 0.]
[3. 4.]
[4. 6.]
[0. 0.]


array([[4.49794145, 2.08008382],
       [2.08008382, 4.49794145]])

In [348]:
# Earlier experiment (disabled): noisy synthetic line y = -1.5 * x + 10.
# np.random.seed(0)
# x = np.linspace(-5, 5, 50)
# noise = np.random.normal(0, 1.5, size=x.shape)
# y = -1.5 * x + 10 + noise
# x = x.reshape(-1,1)
# print(x.shape)

# Tiny hand-made regression data set: six samples with one feature each.
x = np.array([[1, 2, 3, 0, 7, 5]])
y = np.array([1, 0, 2, 2, 10, 3])
# Turn the (1, 6) row into a (6, 1) column, then prepend a column of ones so the
# first weight acts as the bias / intercept.
x = x.reshape(-1, 1)
x = np.hstack([np.ones_like(x), x])
# y is left 1-D because the reshape below is disabled.
# y = y.reshape(-1, 1)
print(x)
print(y)

[[1 1]
 [1 2]
 [1 3]
 [1 0]
 [1 7]
 [1 5]]
[ 1  0  2  2 10  3]


In [353]:
# Per-sample gradient descent for least-squares linear regression on (x, y).
# NOTE(review): rng and training_array are assigned but not used in this cell.
rng = np.random.default_rng()
training_array = x
train_array = x
train_targets = y
# NOTE(review): 1,000,000 epochs over every row in a pure-Python loop is slow.
epochs = 1000000
learning_rate = 0.0001
weights = np.zeros((2,))
for iteration in range(epochs):
    # Samples are visited in their stored order on every epoch (no shuffling).
    for entry in range(train_array.shape[0]):
        # Residual of the current sample: prediction minus target.
        error = np.matmul(train_array[entry], weights) - train_targets[entry]
        # Step along the negative gradient of the squared error for this sample.
        weights -= learning_rate * (error) * (train_array[entry]).reshape(-1)

# Closed-form normal-equations solution, kept for comparison (disabled).
# first_matrix = np.matmul(train_array.T, train_array)
# second_matrix = np.matmul(train_array.T, train_targets)
# theta = np.linalg.solve(first_matrix, second_matrix)

# print(theta)

print(weights)

[0. 0.]
[-0.35344034  1.11782379]


In [336]:
# Display the current value of weights.
weights

array([[-0.35344034],
       [ 1.11782379]])

In [341]:
# Flatten weights to 1-D before printing.
print(weights.reshape(-1))

[-0.35344034  1.11782379]


In [317]:
# Batch gradient descent for least-squares linear regression on (x, y).
weights = np.zeros((2,1))  # including bias as first column
learning_rate = 0.01

# Force the targets into a column vector. If y is 1-D, subtracting it from the
# (n, 1) predictions would broadcast to an (n, n) matrix and the in-place
# update of the (2, 1) weights would then fail with a shape error.
targets_column = np.asarray(y).reshape(-1, 1)

for epoch in range(10000):
    y_pred = x @ weights                  # (n, 1) predictions
    error = y_pred - targets_column       # (n, 1) residuals
    # Mean gradient of the squared error over all samples.
    weights -= learning_rate * (x.T @ error) / len(y)

weights

array([[-0.35294118],
       [ 1.11764706]])