In [None]:
import numpy as np
import pandas as pd

In [None]:
def scale_data(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    scaled_data = (data - mean) / std
    return scaled_data

In [None]:
def euclidean_distance(x1, x2):

    return np.sqrt(np.sum((x1 - x2) ** 2))

In [None]:
def k_nearest_neighbors(X_train, y_train, X_test, k):
    predictions = []

    # Scale the data using the custom scaling function
    X_train_scaled = scale_data(X_train)
    X_test_scaled = scale_data(X_test)

    for i, x_test in enumerate(X_test_scaled):
        # Compute distances between the current test instance and all training instances
        distances = [euclidean_distance(x_test, x_train) for x_train in X_train_scaled]

        # Get the indices of the k-nearest neighbors
        k_neighbors_indices = np.argsort(distances)[:k]

        # Get the labels of the k-nearest neighbors
        k_neighbors_labels = y_train[k_neighbors_indices].flatten()  # Flatten the labels to 1D

        # Find the most common label among the k-nearest neighbors
        predicted_label = np.bincount(k_neighbors_labels).argmax()

        predictions.append(predicted_label)

        # Print cost every 100 iterations
        if (i + 1) % 100 == 0:
            accuracy = np.mean(predictions == y_test[:i + 1].flatten())
            cost = 1 - accuracy
            print(f"Iteration {i + 1}, Cost: {cost:.4f}")

    return np.array(predictions)

In [None]:
df=pd.read_csv('/content/sample_data/Classification_train.csv')
df

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29998,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_np=df.to_numpy()
df_np.shape

(30000, 785)

In [None]:
X,y=df_np[:,1:],df_np[:,0:1]
X.shape,y.shape

((30000, 784), (30000, 1))

In [None]:
def train_test_split(X,y,test_ratio=0.2,random_seed=None):
  if random_seed is not None:
    np.random.seed(random_seed)
  #Shuffle indices
  indices=np.arange(X.shape[0])
  np.random.shuffle(indices)
  #Calculate the number of samples for testing
  test_size=int(X.shape[0]*test_ratio)
  #Split the data
  test_indices=indices[:test_size]
  train_indices=indices[test_size:]
  X_train,X_test=X[train_indices],X[test_indices]
  y_train,y_test=y[train_indices],y[test_indices]
  return X_train,X_test,y_train,y_test

In [None]:
#Split the data into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_ratio=0.2,random_seed=42)

In [None]:
# Set the value of k
k_value = 5


In [None]:
print("X_train shape:", X_train.shape, "dtype:", X_train.dtype)
print("y_train shape:", y_train.shape, "dtype:", y_train.dtype)
print("X_test shape:", X_test.shape, "dtype:", X_test.dtype)
print("y_test shape:", y_test.shape, "dtype:", y_test.dtype)

X_train shape: (24000, 784) dtype: int64
y_train shape: (24000, 1) dtype: int64
X_test shape: (6000, 784) dtype: int64
y_test shape: (6000, 1) dtype: int64


In [None]:
# Perform k-Nearest Neighbors classification
predictions = k_nearest_neighbors(X_train, y_train, X_test, k_value)


  scaled_data = (data - mean) / std


Iteration 100, Cost: 0.9500
Iteration 200, Cost: 0.9100
Iteration 300, Cost: 0.9067
Iteration 400, Cost: 0.9025
Iteration 500, Cost: 0.8940
Iteration 600, Cost: 0.8950
Iteration 700, Cost: 0.8957
Iteration 800, Cost: 0.8938
Iteration 900, Cost: 0.8944
Iteration 1000, Cost: 0.8910
Iteration 1100, Cost: 0.8891
Iteration 1200, Cost: 0.8850
Iteration 1300, Cost: 0.8854
Iteration 1400, Cost: 0.8886
Iteration 1500, Cost: 0.8907
Iteration 1600, Cost: 0.8894
Iteration 1700, Cost: 0.8912
Iteration 1800, Cost: 0.8939
Iteration 1900, Cost: 0.8968
Iteration 2000, Cost: 0.8930
Iteration 2100, Cost: 0.8962
Iteration 2200, Cost: 0.8977
Iteration 2400, Cost: 0.9000
Iteration 2500, Cost: 0.9004
Iteration 2600, Cost: 0.9004
Iteration 2700, Cost: 0.9007
Iteration 2800, Cost: 0.9011
Iteration 2900, Cost: 0.9028
Iteration 3000, Cost: 0.9030
Iteration 3100, Cost: 0.9029
Iteration 3200, Cost: 0.9022
Iteration 3300, Cost: 0.9033
Iteration 3400, Cost: 0.9024
Iteration 3500, Cost: 0.9017
Iteration 3600, Cost: 0

In [None]:
# Evaluate accuracy
accuracy = np.mean(predictions == y_test)
print(f"Final Accuracy: {accuracy:.4f}")

Final Accuracy: 0.0987


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score #For assessing my model I have imported sklearn module

# Add zero_division=1 to handle the warning
precision = precision_score(y_test,predictions, average='weighted', zero_division=1)
recall = recall_score(y_test,predictions, average='weighted', zero_division=1)
f1 = f1_score(y_test, predictions, average='weighted', zero_division=1)


In [None]:
precision

0.9110684444444445

In [None]:
recall

0.09866666666666667

In [None]:
f1

0.01772168284789644

On the test dataset

In [None]:
pf=pd.read_csv('/content/sample_data/Classification_test.csv')
pf

Unnamed: 0,ID,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,15795,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,860,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5390,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11964,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11284,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,11260,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,18563,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,634,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,10057,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
pf_np=pf.to_numpy()
pf_np.shape

(10000, 785)

In [None]:
P_test,q_test=pf_np[:,1:],pf_np[:,0:1]
P_test.shape,q_test.shape

((10000, 784), (10000, 1))

In [None]:
# Perform k-Nearest Neighbors classification
predictions_test = k_nearest_neighbors(X_train, y_train, P_test, k_value)

In [None]:
# Evaluate the accuracy on the test dataset
accuracy_classification = np.mean(predictions_test == q_test.flatten())
print(f"Accuracy on Classification_test: {accuracy_classification:.4f}")