<a href="https://colab.research.google.com/github/coldsober-irene/ASSIGNMENTS/blob/main/k_nearest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np

# Euclidean distance function
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# K-NN classifier
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

# Example usage
if __name__ == "__main":
    # Sample multivariable data
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
    y = np.array([0, 0, 1, 1, 1])

    # Initialize and train the K-NN classifier
    knn = KNN(k=3)
    knn.fit(X, y)

    # Make predictions on new data
    X_new = np.array([[2.5, 3.5], [4.5, 5.5]])
    predictions = knn.predict(X_new)

    print("Predictions:", predictions)


In [38]:
import numpy as np

# Euclidean distance function
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# K-NN classifier
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

# Example usage
if __name__ == "__main__":
    # Sample dataset with 4 feature variables
    X = np.array([[2, 3, 4, 5], [3, 4, 5, 6], [5, 6, 7, 8], [1, 1, 2, 2], [4, 5, 6, 7]])
    y = np.array([0, 0, 1, 1, 1])  # Binary class labels (0 or 1)

    # Initialize and train the K-NN classifier
    knn = KNN(k=3)
    knn.fit(X, y)

    # Make predictions on new data with 4 features
    X_new = np.array([[0.3, .4, .40, .5], [4, .50, 59, 6]])
    predictions = knn.predict(X_new)

    print("Predictions:", predictions)


Predictions: [0 1]


In [39]:
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# Sample dataset with two features and two target variables
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([[10, 20], [15, 25], [20, 30], [25, 35], [30, 40]])

# Initialize K-NN regressor
k = 3
knn = KNeighborsRegressor(n_neighbors=k)

# Fit the model to the data
knn.fit(X, y)

# Make predictions for a new data point
new_data = np.array([[1, 2]])
predictions = knn.predict(new_data)
print("Predicted values:", predictions)


Predicted values: [[15. 25.]]


In [40]:
import numpy as np
import pandas as pd

# Create a synthetic dataset
np.random.seed(0)
num_samples = 250
num_features = 100
num_labels = 4
best_k = []
accuraccies = []
# Generate random features
X = np.random.rand(num_samples, num_features)
df_x = pd.DataFrame(X)
# Generate random labels (4 classes)
y = np.random.choice(num_labels, num_samples)

def standadardize_data(data):
    data_with_mean_0 = data - data.mean()
    data_with_std_1 = data_with_mean_0 / data.std()
    standardized_data = data_with_std_1
    return standardized_data

# Define a function to calculate the Euclidean distance between two data points
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

# Define the KNN function for classification
def knn_classification(X_train, y_train, X_test, k=3):
    y_pred = []
    index = 0
    for test_point in X_test:
        # Calculate distances to all training points
        distances = [euclidean_distance(test_point, train_point) for train_point in X_train]

        # Get indices of k-nearest neighbors
        k_indices = np.argsort(distances)[:k]

        # Get the labels of the k-nearest neighbors
        k_nearest_labels = [y_train[i] for i in k_indices]

        # Predict the class by majority voting
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    best_k.append(k)

    return np.array(y_pred)

# Split the data into training and testing sets (80% training, 20% testing)
def predict(data, split_ratio = 0.8):
  split_index = int(split_ratio * num_samples)
  # data = standadardize_data(data = X)
  X_train = data[:split_index]
  y_train = y[:split_index]
  X_test = data[split_index:]
  y_test = y[split_index:]
  print(X_train)
  # Set the value of k for KNN
  k_value = list(range(2, 70))

  # Use the KNN function to make predictions
  for index, k_v in enumerate(k_value):
    y_pred = knn_classification(X_train, y_train, X_test, k=k_v)

    # Calculate accuracy
    accuracy = np.mean(y_pred == y_test)
    error_rate = 1 - accuracy

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Error Rate: {error_rate:.2%}")
    accuraccies.append(accuracy)
    print(f"{index}[{k_v}]/{len(k_value)}")


In [41]:
data = standadardize_data(X)

In [42]:
def compute_covariance_matrix(standardized_data):
    standardized_data = pd.DataFrame(standardized_data, columns = [str(i) for i in range(len(list(standardized_data[0])))])
    multiplied = pd.DataFrame()
    cols = list(standardized_data.columns)
    print(cols)
    for col in cols:
        for j, COL in enumerate(cols):
            if j != len(cols):
                multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]

    sums = multiplied.sum()
    c = [i for i in range(0, len(sums), len(cols))]

    Covariance_matrix = []
    for i, index in enumerate(c):
        try:
            Covariance_matrix.append(list(sums[index:c[i+1]]))
        except IndexError:
            Covariance_matrix.append(list(sums[index:]))

    covariance_df = pd.DataFrame(Covariance_matrix, columns=cols, index = cols)
    cov_matrix = np.array(Covariance_matrix)
    return cov_matrix

cv = compute_covariance_matrix(standardized_data = data)
print(cv)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[col] * standardized_data[cols[j]]
  multiplied[col + " " + cols[j]] = standardized_data[c

[[234.0546465   -2.97171882  13.85489899 ...   2.80748565   3.22703962
   25.25804064]
 [ -2.97171882 241.38908908  -7.0713016  ... -21.56020017  12.24578275
  -16.08337369]
 [ 13.85489899  -7.0713016  239.85160913 ...   7.83103965  11.02661669
    3.35050246]
 ...
 [  2.80748565 -21.56020017   7.83103965 ... 241.68309619  18.3752386
  -18.98695594]
 [  3.22703962  12.24578275  11.02661669 ...  18.3752386  275.56616965
    7.81325978]
 [ 25.25804064 -16.08337369   3.35050246 ... -18.98695594   7.81325978
  233.46930248]]


In [43]:
def Compute_principal_components(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    eigenvalues_eugenvectors = {eigenvalue:eigenvectors[index] for index, eigenvalue in enumerate(eigenvalues)}
    eigenvalues.sort()

    # CHOOSE ONLY TOP 3 MAXIMUM EIGENVALUES AND GET THEIR EIGENVECTORS
    max_3_eigenvectors = eigenvalues[::-1][:3]
    eigenvectors_of_top_3_eigenvalues = np.array([eigenvalues_eugenvectors[eigen] for eigen in max_3_eigenvectors])

    eigenvectors_of_top_3_eigenvalues = np.transpose(eigenvectors_of_top_3_eigenvalues)
    reduced_dimension_wine_dataset = np.dot(data, eigenvectors_of_top_3_eigenvalues)
    pc_df = pd.DataFrame(reduced_dimension_wine_dataset,
                        columns = ['Principal comp1', 'Principal comp2', 'Principal comp3'])
    # p = pc_df
    # dfi.export(pc_df.head(15).style.set_table_styles(), "pca.png", table_conversion="matplotlib")
    return reduced_dimension_wine_dataset, pd.DataFrame(reduced_dimension_wine_dataset,
                                                        columns = ['Principal comp1', 'Principal comp2', 'Principal comp3'])

In [44]:
pca = Compute_principal_components(cv)

In [45]:
predict(data = data, split_ratio = 0.8)

[[ 0.17857571  0.75124463  0.36427221 ... -1.64124038  1.1427766
  -1.69428902]
 [ 0.62260663 -0.7810783   0.82010104 ... -0.83495103 -1.51071345
  -0.21518055]
 [-0.63724351  0.68637676 -0.41022183 ...  1.25722887  1.63835668
   1.59676046]
 ...
 [ 0.69764342 -0.28594561 -0.13286819 ...  0.91953381 -1.22560588
  -0.72933418]
 [ 0.41959478 -0.271391    0.76103327 ...  0.82082912  1.63095633
   1.16979841]
 [ 1.1692548  -1.05719489 -1.46313689 ... -0.17374162 -0.4708882
   0.44386295]]
Accuracy: 28.00%
Error Rate: 72.00%
0[2]/68
Accuracy: 34.00%
Error Rate: 66.00%
1[3]/68
Accuracy: 28.00%
Error Rate: 72.00%
2[4]/68
Accuracy: 22.00%
Error Rate: 78.00%
3[5]/68
Accuracy: 30.00%
Error Rate: 70.00%
4[6]/68
Accuracy: 32.00%
Error Rate: 68.00%
5[7]/68
Accuracy: 30.00%
Error Rate: 70.00%
6[8]/68
Accuracy: 28.00%
Error Rate: 72.00%
7[9]/68
Accuracy: 24.00%
Error Rate: 76.00%
8[10]/68
Accuracy: 34.00%
Error Rate: 66.00%
9[11]/68
Accuracy: 30.00%
Error Rate: 70.00%
10[12]/68
Accuracy: 26.00%
Error

In [46]:
predict(data = pca, split_ratio = 0.8)

(array([[-1.07538116,  0.13826784,  1.78494279],
       [ 1.93753223, -1.08998864,  0.7767748 ],
       [-0.50913195, -1.39587187,  0.12487171],
       [-1.32209731,  0.85526034, -0.59374049],
       [-1.41003698, -0.32617686, -0.45318986],
       [ 0.3024943 ,  0.73847966, -1.02866201],
       [-0.5424349 ,  1.09522826, -0.3719633 ],
       [ 0.42103562, -1.95683573, -2.69479955],
       [-0.30533238,  1.13926276, -0.0454781 ],
       [ 1.23229353,  0.12157696, -0.17357414],
       [ 0.7630207 , -1.17607763, -1.62322047],
       [-0.11104622,  0.21824082, -1.52760204],
       [-1.5421152 , -0.21421316,  0.11917136],
       [-0.66033476, -1.15775671,  0.03838337],
       [ 0.65257899, -1.58782342,  0.15990296],
       [ 0.53638267, -0.3872028 ,  0.58742762],
       [ 0.2968275 ,  0.13195255, -2.40676199],
       [ 2.02601085, -0.66915811, -1.87097411],
       [ 1.68470797,  1.37274391, -0.46251298],
       [ 2.01047326,  0.38027742,  1.60464249],
       [-0.81231706,  0.53181509, -0.36

  accuracy = np.mean(y_pred == y_test)
